In [2]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
def extract_features(file_path):
    """Summarise one audio file as a fixed-length vector of mean MFCCs.

    The file is decoded with ``librosa.load`` (default arguments), 40 MFCCs
    are computed from the decoded signal, and each coefficient is averaged
    over the time frames.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        Array of per-coefficient means, as produced by averaging the
        transposed MFCC matrix along its first axis.
    """
    signal, sr = librosa.load(file_path)
    coeffs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
    # Transpose so the averaged axis is axis 0, then collapse it.
    return coeffs.T.mean(axis=0)

def load_data(data_path):
    """Collect feature vectors and labels for every MP3 file under ``data_path``.

    The directory layout walked is ``data_path/<folder>/<subfolder>/<file>``.
    The name of each first-level directory is used as the label for every
    ``.mp3`` file found directly inside its subdirectories. Entries that are
    not directories (at either level) and files without a ``.mp3`` suffix are
    skipped.

    Directory listings are sorted because ``os.listdir`` returns entries in
    arbitrary order; without sorting, the row order of the result (and so any
    seeded split performed on it) could differ between runs or machines.

    Args:
        data_path: Root directory of the dataset.

    Returns:
        Tuple ``(features, labels)`` of two equal-length lists: the value
        returned by ``extract_features`` for each file, and the name of the
        first-level directory the file was found under.
    """
    features = []
    labels = []
    for folder in sorted(os.listdir(data_path)):
        folder_path = os.path.join(data_path, folder)
        if not os.path.isdir(folder_path):
            continue
        # One progress bar per first-level folder, advancing once per subfolder.
        for subfolder in tqdm(sorted(os.listdir(folder_path))):
            subfolder_path = os.path.join(folder_path, subfolder)
            if not os.path.isdir(subfolder_path):
                continue
            for file in sorted(os.listdir(subfolder_path)):
                if not file.endswith('.mp3'):  # Assuming files are in MP3 format
                    continue
                file_path = os.path.join(subfolder_path, file)
                features.append(extract_features(file_path))
                labels.append(folder)
    return features, labels

# Root directory of the dataset on disk.
data_path = r'C:\Users\neel2\Code\data\data255'  # Change this to your dataset's path

# Walk the dataset and collect one feature vector and one label per audio file.
features, labels = load_data(data_path)

# Build a single table: feature columns first, then the 'label' column.
df = pd.DataFrame(features).assign(label=labels)

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='label'),
    df['label'],
    test_size=0.2,
    random_state=0,
)

print('Data loaded and split into training and testing sets.')

# Persist the full (unsplit) table in both CSV and pickle form.
df.to_csv('all_data.csv', index=False)
df.to_pickle('all_data.pkl')

100%|██████████| 441/441 [19:01<00:00,  2.59s/it]
100%|██████████| 751/751 [36:32<00:00,  2.92s/it]
100%|██████████| 367/367 [22:51<00:00,  3.74s/it]


Data loaded and split into training and testing sets.


In [3]:
# Dimensions of the assembled table as (rows, columns); the columns include 'label'.
df.shape

(155712, 41)

In [5]:
# Total number of missing entries across the whole table (per-column counts, summed).
df.isna().sum().sum()

0

In [None]:
# Get the raw data for each audio file.
# A raw string literal cannot end with a single backslash (the final \' would
# not close the string), so the trailing path separator is appended separately.
data_path = r'C:\Users\neel2\Code\data\data255\english' + '\\'  # Change this to your dataset's path