In [None]:
# Generate metadata CSV files for the training/validation and test sets
# Structure of metadata.csv
# train
#      -
#      -
#      -
#      .
#      .
# valid
#      -
#      -
#      -
#      -
#      .
#      .

In [1]:
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split

def create_metadata_csv(base_directory, output_file, categories=None,
                        test_size=0.1, random_state=42):
    """Build a train/valid metadata CSV from labelled audio sub-folders.

    Every file with a recognised extension (.wav, .flac, .mp3, .npz) found
    directly inside a category folder becomes one CSV row with the columns
    ``path``, ``label`` and ``split`` ('train' or 'valid').

    Args:
        base_directory: Directory that contains one sub-folder per category.
        output_file: Destination path of the CSV file.
        categories: Optional mapping of sub-folder name -> integer label.
            Defaults to the four first-stage training folders listed below.
        test_size: Forwarded to ``train_test_split``; share of rows that go
            to the 'valid' split.
        random_state: Seed forwarded to ``train_test_split``.

    Raises:
        FileNotFoundError: If a category folder does not exist (raised by
            ``os.listdir``).
        ValueError: If no matching file is found in any category folder.
    """
    if categories is None:
        # NOTE(review): 'synhesized' (sic) is kept verbatim; presumably it
        # matches the folder name on disk -- confirm before renaming.
        categories = {
            'synhesized_samples_positive_pps_tortoise': 1,
            'synthesized_samples_POSITIVE_waveglov': 1,
            'negative_pps': 0,
            'synthesized_samples_NEGATIVE_waveglov': 0
        }

    # Audio files plus pre-computed .npz feature files
    extensions = ('.wav', '.flac', '.mp3', '.npz')

    data_entries = []
    for category, label in categories.items():
        directory_path = os.path.join(base_directory, category)
        # os.listdir() returns names in arbitrary order; sorting keeps the row
        # order -- and therefore the seeded split below -- reproducible.
        for file in sorted(os.listdir(directory_path)):
            if file.endswith(extensions):
                data_entries.append({
                    'path': os.path.join(directory_path, file),
                    'label': label
                })

    if not data_entries:
        # Fail early with a message that names the real problem instead of
        # letting the split step complain about an empty frame.
        raise ValueError(
            f"No files ending in {extensions} found under {base_directory}"
        )

    df = pd.DataFrame(data_entries)

    # Split into train and validation sets
    train_df, valid_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # .assign() returns new frames, so nothing is written into the objects
    # handed back by train_test_split (avoids SettingWithCopyWarning).
    result_df = pd.concat([
        train_df.assign(split='train'),
        valid_df.assign(split='valid'),
    ])

    result_df.to_csv(output_file, index=False)
    print(f"Metadata CSV file created at {output_file}")

# CSV is written relative to the current working directory
output_file = 'metadata_new_train_set.csv'
# Root folder holding one sub-folder per category
base_directory = '/home/ai01/Desktop/marin_work_folder/new_dataset_wwd/'

# Scan the category folders and write the train/valid metadata CSV
create_metadata_csv(base_directory=base_directory, output_file=output_file)



Metadata CSV file created at metadata_new_train_set.csv


In [8]:
#Testing samples
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split

def create_metadata_csv(base_directory, output_file, categories=None):
    """Build a test-set metadata CSV from labelled sub-folders.

    Every file with a recognised extension (.wav, .flac, .mp3, .npz) found
    directly inside a category folder becomes one CSV row with the columns
    ``path``, ``label`` and ``split``; ``split`` is always 'test'.

    Args:
        base_directory: Directory that contains one sub-folder per category.
        output_file: Destination path of the CSV file.
        categories: Optional mapping of sub-folder name -> integer label.
            Defaults to ``{'heyjules': 1, 'negative': 0}``.

    Raises:
        FileNotFoundError: If a category folder does not exist (raised by
            ``os.listdir``).
    """
    if categories is None:
        categories = {
            'heyjules': 1,
            'negative': 0
        }

    # Same extension set as the training metadata builder, so a .flac test
    # file is not silently dropped.
    extensions = ('.wav', '.flac', '.mp3', '.npz')

    data_entries = []
    for category, label in categories.items():
        directory_path = os.path.join(base_directory, category)
        # os.listdir() returns names in arbitrary order; sort so the CSV is
        # identical from run to run.
        for file in sorted(os.listdir(directory_path)):
            if file.endswith(extensions):
                data_entries.append({
                    'path': os.path.join(directory_path, file),
                    'label': label
                })

    # Naming the columns keeps the CSV header intact even when no file matched.
    df = pd.DataFrame(data_entries, columns=['path', 'label']).assign(split='test')

    df.to_csv(output_file, index=False)
    print(f"Metadata CSV file created at {output_file}")

# CSV is written relative to the current working directory
output_file = 'metadata_test_clean_npz.csv'
# Root folder holding one sub-folder per test category
base_directory = '/home/ai01/Desktop/marin_work_folder/test_clean_npz'

# Scan the category folders and write the test metadata CSV
create_metadata_csv(base_directory=base_directory, output_file=output_file)

Metadata CSV file created at metadata_test_clean_npz.csv


In [8]:
### Split each input recording into overlapping chunks: 1.2-second windows, advanced by 0.2 of the window length (0.24 s) per step
##
#

import os
import librosa
import soundfile as sf

def chunk_audio_and_save(audio_folder, output_folder, chunk_duration=1.2, step_size=0.2,
                         sample_rate=16000):
    """Cut every audio file in a folder into fixed-length, overlapping WAV chunks.

    Each .wav/.mp3/.flac file directly inside ``audio_folder`` is loaded with
    ``librosa.load`` and sliced into windows of ``chunk_duration`` seconds.
    Chunks are written to ``output_folder`` as ``<original stem>_<index>.wav``.
    Recordings shorter than one chunk produce no output.

    Args:
        audio_folder: Folder containing the source audio files.
        output_folder: Folder the chunks are written to (created if missing).
        chunk_duration: Length of each chunk in seconds.
        step_size: Hop between consecutive chunk starts, expressed as a
            fraction of the chunk length (0.2 of a 1.2 s chunk = 0.24 s),
            not in seconds.
        sample_rate: Passed to ``librosa.load`` as ``sr``.

    Raises:
        ValueError: If ``chunk_duration`` or ``step_size`` is not positive, or
            is so small that the hop rounds down to zero samples.
    """
    if chunk_duration <= 0 or step_size <= 0:
        raise ValueError("chunk_duration and step_size must both be positive")

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # NOTE: two inputs that share a stem (e.g. a.wav and a.mp3) write to the
    # same output names, so the later one overwrites the earlier one.
    for file_name in os.listdir(audio_folder):
        if not file_name.endswith(('.wav', '.mp3', '.flac')):
            continue

        file_path = os.path.join(audio_folder, file_name)
        y, sr = librosa.load(file_path, sr=sample_rate)

        samples_per_chunk = int(sr * chunk_duration)
        step = int(samples_per_chunk * step_size)
        if step < 1:
            # A zero hop would never advance and would write files forever.
            raise ValueError(
                "step_size is too small: the hop between chunks rounds to 0 samples"
            )

        stem = os.path.splitext(file_name)[0]

        # Last valid start is len(y) - samples_per_chunk, inclusive, so a
        # recording that is exactly one chunk long still yields one chunk.
        last_start = len(y) - samples_per_chunk
        for chunk_index, start_index in enumerate(range(0, last_start + 1, step)):
            chunk = y[start_index:start_index + samples_per_chunk]
            output_file = os.path.join(output_folder, f"{stem}_{chunk_index}.wav")
            sf.write(output_file, chunk, sr)

# Example usage
input_folder = '/home/ai01/Desktop/marin_work_folder/Wake_word_dataset_v1/samples'  # Folder containing the audio files
output_folder = '/home/ai01/Desktop/marin_work_folder/Wake_word_dataset_v1/samples_chunked'  # Folder where the chunks will be saved
chunk_audio_and_save(input_folder, output_folder)



In [1]:
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split

def create_metadata_csv(base_directory, output_file, categories=None,
                        test_size=0.1, random_state=42):
    """Build a train/valid metadata CSV for the second-stage dataset.

    Every file with a recognised extension (.wav, .flac, .mp3, .npz) found
    directly inside a category folder becomes one CSV row with the columns
    ``path``, ``label`` and ``split`` ('train' or 'valid').

    Args:
        base_directory: Directory that contains one sub-folder per category.
        output_file: Destination path of the CSV file.
        categories: Optional mapping of sub-folder name -> integer label.
            Defaults to the four second-stage folders listed below.
        test_size: Forwarded to ``train_test_split``; share of rows that go
            to the 'valid' split.
        random_state: Seed forwarded to ``train_test_split``.

    Raises:
        FileNotFoundError: If a category folder does not exist (raised by
            ``os.listdir``).
        ValueError: If no matching file is found in any category folder.
    """
    if categories is None:
        # NOTE(review): 'synhesized' (sic) is kept verbatim; presumably it
        # matches the folder name on disk -- confirm before renaming.
        categories = {
            'synhesized_samples_positive_pps_tortoise': 1,
            'synthesized_samples_POSITIVE_waveglov': 1,
            'confusion_phrases_tortoise': 0,
            'short_negative': 0
        }

    # Audio files plus pre-computed .npz feature files
    extensions = ('.wav', '.flac', '.mp3', '.npz')

    data_entries = []
    for category, label in categories.items():
        directory_path = os.path.join(base_directory, category)
        # os.listdir() returns names in arbitrary order; sorting keeps the row
        # order -- and therefore the seeded split below -- reproducible.
        for file in sorted(os.listdir(directory_path)):
            if file.endswith(extensions):
                data_entries.append({
                    'path': os.path.join(directory_path, file),
                    'label': label
                })

    if not data_entries:
        # Fail early with a message that names the real problem instead of
        # letting the split step complain about an empty frame.
        raise ValueError(
            f"No files ending in {extensions} found under {base_directory}"
        )

    df = pd.DataFrame(data_entries)

    # Split into train and validation sets
    train_df, valid_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # .assign() returns new frames, so nothing is written into the objects
    # handed back by train_test_split (avoids SettingWithCopyWarning).
    result_df = pd.concat([
        train_df.assign(split='train'),
        valid_df.assign(split='valid'),
    ])

    result_df.to_csv(output_file, index=False)
    print(f"Metadata CSV file created at {output_file}")

# Directory containing the dataset subfolders
base_directory = '/home/ai01/Desktop/marin_work_folder/second_stage_dataset/'
output_file = 'metadata_second_stage_train_set.csv'

# Create the metadata CSV
create_metadata_csv(base_directory, output_file)


Metadata CSV file created at metadata_second_stage_train_set.csv
