In [2]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

We will go through all the audio files, load each one as an array of numbers, generate MFCCs for it, and save all the MFCCs and labels to a CSV file and a pickle file.

In [2]:

def extract_features(file_path, n_mfcc=40):
    """Return a fixed-length MFCC summary vector for one audio file.

    The file is decoded with ``librosa.load`` (librosa's default settings),
    MFCCs are computed for it, and the coefficients are averaged over the
    time frames so that every file yields a vector of the same length
    regardless of its duration.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40, the value
        that used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    numpy.ndarray
        The per-coefficient mean over all time frames (one value per MFCC).
    """
    # Load the audio file
    audio, sample_rate = librosa.load(file_path)
    # Extract MFCCs (one row per coefficient, one column per time frame)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Average MFCCs over time frames: after the transpose, axis 0 is time
    mfccs_processed = np.mean(mfccs.T, axis=0)

    return mfccs_processed

def load_data(data_path):
    """Collect feature vectors and labels for every MP3 under ``data_path``.

    The expected layout is ``data_path/<label>/<subfolder>/<name>.mp3``:
    the name of each top-level directory becomes the label of every MP3
    found in its immediate sub-directories. Entries that are not
    directories at either level, and files whose name does not end in
    ``.mp3``, are ignored.

    Directory listings are sorted because ``os.listdir`` returns entries in
    arbitrary order; sorting makes the row order of the result (and of any
    file saved from it) reproducible across runs and machines.

    Parameters
    ----------
    data_path : str
        Root directory of the dataset.

    Returns
    -------
    tuple of (list, list)
        ``(features, labels)``: parallel lists holding, for each MP3, the
        value returned by ``extract_features`` and the top-level folder name.
    """
    features = []
    labels = []
    for folder in sorted(os.listdir(data_path)):
        folder_path = os.path.join(data_path, folder)
        if not os.path.isdir(folder_path):
            continue
        # One progress bar per label folder, advancing once per sub-directory.
        for subfolder in tqdm(sorted(os.listdir(folder_path))):
            subfolder_path = os.path.join(folder_path, subfolder)
            if not os.path.isdir(subfolder_path):
                continue
            for file in sorted(os.listdir(subfolder_path)):
                if file.endswith('.mp3'):  # Assuming files are in MP3 format
                    file_path = os.path.join(subfolder_path, file)
                    features.append(extract_features(file_path))
                    labels.append(folder)
    return features, labels

# Load the dataset
data_path = r'C:\Users\neel2\Code\data\data255'  # Change this to your dataset's path
features, labels = load_data(data_path)

# One row per audio file: the feature vector spread over the columns created
# by the DataFrame constructor, plus the top-level folder name in 'label'.
df = pd.DataFrame(features)
df['label'] = labels

# Save the same table in both formats.
df.to_csv('all_data.csv', index=False)
df.to_pickle('all_data.pkl')

# Nothing is split into train/test sets in this cell, so the message reports
# only what actually happened, and only after both files have been written.
print('Data loaded and saved to all_data.csv and all_data.pkl.')

100%|██████████| 441/441 [19:01<00:00,  2.59s/it]
100%|██████████| 751/751 [36:32<00:00,  2.92s/it]
100%|██████████| 367/367 [22:51<00:00,  3.74s/it]


Data loaded and split into training and testing sets.


##### The code below is computationally very expensive and will generate 3 pickle files that are each over 20 GB in size, so execute it with caution.

We will go through all the audio files, load each one as an array of numbers, extract metadata such as the sampling rate and audio length, and save everything to a CSV file as well as a pickle file.

In [3]:
def extract_meta_features(file_path, sr=22050):
    """Load one audio file and return its samples, sample rate and duration.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.
    sr : int or None, optional
        Target sample rate forwarded to ``librosa.load``. The default of
        22050 is librosa's own default, so calling this function without
        ``sr`` behaves exactly as before: the audio is resampled and the
        returned rate is the same for every file. Pass ``None`` to keep each
        file's native rate, which is what makes the returned ``sample_rate``
        informative as per-file metadata.

    Returns
    -------
    tuple
        ``(audio, sample_rate, file_length)`` where ``audio`` and
        ``sample_rate`` come from ``librosa.load`` and ``file_length`` is
        the duration reported by ``librosa.get_duration`` for them.
    """
    # Load the audio file at the requested rate
    audio, sample_rate = librosa.load(file_path, sr=sr)
    # Duration is derived from the loaded samples and the rate they were loaded at
    file_length = librosa.get_duration(y=audio, sr=sample_rate)
    return audio, sample_rate, file_length

def load_meta_data(data_path, ln):
    """Collect raw audio and metadata for the MP3 files of one label folder.

    Only the top-level directory of ``data_path`` whose name is exactly
    ``ln`` is visited. Every file ending in ``.mp3`` inside its immediate
    sub-directories is passed to ``extract_meta_features``; anything else
    is ignored. If no such directory exists, four empty lists are returned.

    Directory listings are sorted because ``os.listdir`` returns entries in
    arbitrary order; sorting makes the row order of the result reproducible.

    Parameters
    ----------
    data_path : str
        Root directory of the dataset.
    ln : str
        Name of the top-level folder to process; it is also used as the
        label of every file found.

    Returns
    -------
    tuple of four lists
        ``(audios, sample_rates, file_lengths, labels)``, parallel lists
        with one entry per MP3 file.
    """
    audios = []
    sample_rates = []
    file_lengths = []
    labels = []

    folder_path = os.path.join(data_path, ln)
    # Keep the exact-name match against the listing so that a name differing
    # only in case is treated as "not found" regardless of the file system.
    if ln not in os.listdir(data_path) or not os.path.isdir(folder_path):
        return audios, sample_rates, file_lengths, labels

    # The progress bar advances once per sub-directory of the label folder.
    for subfolder in tqdm(sorted(os.listdir(folder_path))):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        for file in sorted(os.listdir(subfolder_path)):
            if file.endswith('.mp3'):  # Assuming files are in MP3 format
                file_path = os.path.join(subfolder_path, file)
                audio, sample_rate, file_length = extract_meta_features(file_path)
                audios.append(audio)
                sample_rates.append(sample_rate)
                file_lengths.append(file_length)
                labels.append(ln)
    return audios, sample_rates, file_lengths, labels


    

Extracting data from all English files

In [4]:
# Root directory of the dataset (nothing is loaded in this cell).
# load_meta_data() looks inside it for a folder whose name equals the
# requested label.
data_path = r'C:\Users\neel2\Code\data\data255'  # Change this to your dataset's path


In [None]:
# Gather audio, sample rate, duration and label for the MP3 files found in
# the sub-directories of the 'english' folder under data_path.
audios, sample_rates, file_lengths, labels = load_meta_data(data_path,'english')

In [5]:
# Assemble one row per audio file from the four parallel lists returned by
# load_meta_data().
df_meta_english = pd.DataFrame()
df_meta_english['audio'] = audios
df_meta_english['sample_rate'] = sample_rates
df_meta_english['file_length'] = file_lengths
df_meta_english['label'] = labels
# NOTE(review): each 'audio' cell holds the full value returned by
# librosa.load for that file. to_csv has to turn it into text, and long NumPy
# arrays are presumably abbreviated with '...' when stringified, so the CSV
# copy of this column is likely not recoverable - confirm, and rely on the
# pickle below for the samples themselves.
df_meta_english.to_csv('meta_data_english.csv', index=False)
# Pickle of the same frame, written alongside the CSV.
df_meta_english.to_pickle('all_meta_english.pkl')


Memory Management

In [6]:

# Remove the four list names from the namespace before the next language is loaded.
# NOTE(review): df_meta_english was filled from these lists, so it presumably
# still references the same audio arrays; deleting the lists alone may free
# little memory - confirm whether the DataFrame should be released as well.
del audios, sample_rates, file_lengths, labels

In [8]:
import gc
# Run a garbage-collection pass now; the number shown as the cell output is
# the value returned by gc.collect().
gc.collect()

473

Extracting data from all French files

In [8]:
# Gather audio, sample rate, duration and label for the MP3 files found in
# the sub-directories of the 'french' folder under data_path.
audios, sample_rates, file_lengths, labels = load_meta_data(data_path,'french')

100%|██████████| 751/751 [1:12:31<00:00,  5.79s/it]


In [9]:
# Assemble one row per audio file from the four parallel lists returned by
# load_meta_data().
df_meta_french = pd.DataFrame()
df_meta_french['audio'] = audios
df_meta_french['sample_rate'] = sample_rates
df_meta_french['file_length'] = file_lengths
df_meta_french['label'] = labels
# NOTE(review): each 'audio' cell holds the full value returned by
# librosa.load for that file. to_csv has to turn it into text, and long NumPy
# arrays are presumably abbreviated with '...' when stringified, so the CSV
# copy of this column is likely not recoverable - confirm, and rely on the
# pickle below for the samples themselves.
df_meta_french.to_csv('meta_data_french.csv', index=False)
# Pickle of the same frame, written alongside the CSV.
df_meta_french.to_pickle('all_meta_french.pkl')


In [10]:
# Remove the four list names from the namespace before the next language is loaded.
# NOTE(review): df_meta_french was filled from these lists, so it presumably
# still references the same audio arrays; deleting the lists alone may free
# little memory - confirm whether the DataFrame should be released as well.
del audios, sample_rates, file_lengths, labels
# Relies on `import gc` from an earlier cell having run in this kernel session.
gc.collect()

0

Extracting data from all Spanish files

In [5]:
# Gather audio, sample rate, duration and label for the MP3 files found in
# the sub-directories of the 'spanish' folder under data_path.
audios, sample_rates, file_lengths, labels = load_meta_data(data_path,'spanish')

100%|██████████| 367/367 [16:42<00:00,  2.73s/it]


In [6]:
# Assemble one row per audio file from the four parallel lists returned by
# load_meta_data().
df_meta_spanish = pd.DataFrame()
df_meta_spanish['audio'] = audios
df_meta_spanish['sample_rate'] = sample_rates
df_meta_spanish['file_length'] = file_lengths
df_meta_spanish['label'] = labels
# NOTE(review): each 'audio' cell holds the full value returned by
# librosa.load for that file. to_csv has to turn it into text, and long NumPy
# arrays are presumably abbreviated with '...' when stringified, so the CSV
# copy of this column is likely not recoverable - confirm, and rely on the
# pickle below for the samples themselves.
df_meta_spanish.to_csv('meta_data_spanish.csv', index=False)
# Pickle of the same frame, written alongside the CSV.
df_meta_spanish.to_pickle('all_meta_spanish.pkl')


In [None]:
# Remove the four list names from the namespace now that the files are written.
# NOTE(review): df_meta_spanish was filled from these lists, so it presumably
# still references the same audio arrays; deleting the lists alone may free
# little memory - confirm whether the DataFrame should be released as well.
del audios, sample_rates, file_lengths, labels
# Relies on `import gc` from an earlier cell having run in this kernel session.
gc.collect()

Combine the audio metadata from all 3 language files

In [9]:
# Read the three per-language CSV exports back in (English, French, Spanish,
# in that order) and stack them into one frame with a fresh 0..n-1 index.
en, fr, sp = (
    pd.read_csv(csv_name)
    for csv_name in (
        'meta_data_english.csv',
        'meta_data_french.csv',
        'meta_data_spanish.csv',
    )
)

df_meta = pd.concat([en, fr, sp], ignore_index=True)

In [11]:
# Write the combined frame to a single CSV, leaving out the index column.
df_meta.to_csv('all_meta_data.csv', index=False)