EXTRACTING THE AUDIO FILES FROM .TAR

In [2]:
import os
import numpy as np 
import pandas as pd

In [3]:
import tarfile

# Define the path to the .tar.gz file
tar_file_path = "speech_commands_v0.02.tar.gz"

# Define the extraction directory
extraction_dir = "data"

# Create the extraction directory if it doesn't exist
os.makedirs(extraction_dir, exist_ok=True)

# Open the tar.gz file
with tarfile.open(tar_file_path, "r:gz") as tar:
    # Extract all contents
    tar.extractall(path=extraction_dir)

print(f"Extraction completed! Files are saved in {extraction_dir}")


Extraction completed! Files are saved in data


MAKING A DATAFRAME OF ALL THE AUDIO FILES, WITH THEIR CATEGORY

In [16]:
import os
import pandas as pd

def create_dataframe_from_folder(main_folder_path):
    # Initialize lists to hold file paths and categories
    file_paths = []
    categories = []

    # Iterate through subfolders in the main folder
    for subfolder in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, subfolder)
        
        # Check if it's a directory
        if os.path.isdir(subfolder_path):
            # Check if there are any .wav files in the subfolder
            wav_files = [f for f in os.listdir(subfolder_path) if f.endswith('.wav')]
            
            if wav_files:  # If there are .wav files
                # Add each .wav file's path and category to lists
                for wav_file in wav_files:
                    file_paths.append(os.path.join(subfolder_path, wav_file))
                    categories.append(subfolder)
    
    # Create a DataFrame
    df = pd.DataFrame({
        'file_path': file_paths,
        'category': categories
    })

    # Count unique categories
    unique_categories = df['category'].nunique()
    
    # Print the number of unique categories
    print(f"Number of unique categories: {unique_categories}")
    
    return df

# Example usage
main_folder_path = 'data'
df = create_dataframe_from_folder(main_folder_path)



Number of unique categories: 36


In [18]:
df.head()
df.shape

(105835, 2)

In [21]:
import pandas as pd
import numpy as np
import librosa

# Function to preprocess the audio files
def preprocess_audio(audio, sr, target_sr=16000, max_length=5):
    # Resample audio if needed
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    
    # Normalize the audio
    audio = librosa.util.normalize(audio)
    
    # Trim leading and trailing silence
    audio, _ = librosa.effects.trim(audio)
    
    # Calculate the required padding
    target_length = target_sr * max_length
    if len(audio) < target_length:
        padding = target_length - len(audio)
        # Pad the audio file
        audio = np.pad(audio, (0, padding), 'constant')
    else:
        # If audio is longer than target length, truncate it
        audio = audio[:target_length]
    
    return audio, sr

# Function to preprocess a chunk of the DataFrame
def preprocess_chunk(df_chunk, target_sr=16000, max_length=5):
    processed_data = []
    
    for idx, row in df_chunk.iterrows():
        file_path = row['file_path']
        category = row['category']
        
        try:
            # Load the audio file
            audio, sr = librosa.load(file_path, sr=None)
            
            # Preprocess the audio
            preprocessed_audio, preprocessed_sr = preprocess_audio(audio, sr, target_sr, max_length)
            
            # Append preprocessed data
            processed_data.append({
                'file_path': file_path,
                'category': category,
                'audio': preprocessed_audio,
                'sr': preprocessed_sr
            })
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
    
    return pd.DataFrame(processed_data)

# Function to process the DataFrame in chunks
def process_dataframe_in_chunks(df, chunk_size=100, target_sr=16000, max_length=5):
    processed_chunks = []
    
    # Split DataFrame into chunks and process each chunk
    for chunk_idx, df_chunk in enumerate(np.array_split(df, len(df) // chunk_size + 1)):
        print(f"Processing chunk {chunk_idx + 1}")
        processed_chunk = preprocess_chunk(df_chunk, target_sr, max_length)
        processed_chunks.append(processed_chunk)
        del df_chunk  # Free memory
    
    # Concatenate all processed chunks into a single DataFrame
    processed_df = pd.concat(processed_chunks, ignore_index=True)
    
    return processed_df

# Example of how to use the function
# Assume df has the columns: file_path, category
preprocessed_df = process_dataframe_in_chunks(df)


Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processin

In [22]:
df.head()

Unnamed: 0,file_path,category
0,data\backward\0165e0e8_nohash_0.wav,backward
1,data\backward\017c4098_nohash_0.wav,backward
2,data\backward\017c4098_nohash_1.wav,backward
3,data\backward\017c4098_nohash_2.wav,backward
4,data\backward\017c4098_nohash_3.wav,backward


In [23]:
import pandas as pd
import numpy as np
import librosa


In [24]:
def extract_mfcc_features(audio, sr, n_mfcc=13):
    # Extract MFCC features
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    # Take the mean of the MFCC coefficients over time
    mfcc_mean = mfcc.mean(axis=1)
    return mfcc_mean


In [25]:
def process_audio(file_path, target_sr=16000, n_mfcc=13):
    # Load the audio file
    audio, sr = librosa.load(file_path, sr=None)
    
    # Resample and preprocess audio
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    
    audio = librosa.util.normalize(audio)
    audio, _ = librosa.effects.trim(audio)
    
    target_length = target_sr * 5  # Max length in seconds (e.g., 5 seconds)
    if len(audio) < target_length:
        padding = target_length - len(audio)
        audio = np.pad(audio, (0, padding), 'constant')
    else:
        audio = audio[:target_length]
    
    # Extract MFCC features
    mfcc_features = extract_mfcc_features(audio, sr, n_mfcc)
    
    return mfcc_features


In [26]:
def process_dataframe_in_chunks(df, chunk_size=100, target_sr=16000, n_mfcc=13):
    # Initialize lists to store processed data
    mfcc_features_list = []
    file_paths = []
    
    for chunk_idx, df_chunk in enumerate(np.array_split(df, len(df) // chunk_size + 1)):
        print(f"Processing chunk {chunk_idx + 1}")
        
        for idx, row in df_chunk.iterrows():
            file_path = row['file_path']
            
            try:
                # Process audio file and extract MFCC features
                mfcc_features = process_audio(file_path, target_sr, n_mfcc)
                mfcc_features_list.append(mfcc_features)
                file_paths.append(file_path)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
        
        # Free memory
        del df_chunk

    # Convert lists to DataFrame
    mfcc_df = pd.DataFrame(mfcc_features_list, columns=[f'mfcc_{i}' for i in range(n_mfcc)])
    result_df = pd.DataFrame({'file_path': file_paths}).join(mfcc_df)

    return result_df


In [27]:
# Example usage
# Assume df has the columns: file_path, category
mfcc_df = process_dataframe_in_chunks(df)

# Merge MFCC features with the original DataFrame
df = df.merge(mfcc_df, on='file_path', how='left')


  return bound(*args, **kwds)


Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Processing chunk 20
Processing chunk 21
Processing chunk 22
Processing chunk 23
Processing chunk 24
Processing chunk 25
Processing chunk 26
Processing chunk 27
Processing chunk 28
Processing chunk 29
Processing chunk 30
Processing chunk 31
Processing chunk 32
Processing chunk 33
Processing chunk 34
Processing chunk 35
Processing chunk 36
Processing chunk 37
Processing chunk 38
Processing chunk 39
Processing chunk 40
Processing chunk 41
Processing chunk 42
Processing chunk 43
Processing chunk 44
Processing chunk 45
Processing chunk 46
Processing chunk 47
Processing chunk 48
Processing chunk 49
Processing chunk 50
Processin

In [28]:
df.head()

Unnamed: 0,file_path,category,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12
0,data\backward\0165e0e8_nohash_0.wav,backward,-543.961487,29.465919,-0.224312,-2.614496,1.775066,5.812069,-0.475146,1.754618,1.698112,1.871549,0.78031,0.555312,0.740507
1,data\backward\017c4098_nohash_0.wav,backward,-514.944763,22.943722,-2.943335,0.380538,-1.639417,-0.303708,-3.245466,-1.0019,0.160957,-0.656518,-0.744881,-1.379814,0.494081
2,data\backward\017c4098_nohash_1.wav,backward,-564.731384,24.316267,-4.805779,1.355843,-3.137849,-0.731808,-3.90259,-1.036058,0.867936,-1.381379,-0.920068,-0.432483,0.251636
3,data\backward\017c4098_nohash_2.wav,backward,-519.679565,24.08087,-3.222754,1.211598,-0.023732,-0.661048,-3.239309,-1.242702,-0.020317,-0.827836,-0.380631,-0.757278,0.135732
4,data\backward\017c4098_nohash_3.wav,backward,-523.218933,20.803392,-5.167766,-0.93255,-2.657047,0.153993,-4.686269,-1.573421,0.026947,-2.08637,-1.115192,0.045962,-0.660579


In [1]:
# Save as CSV
df.to_csv('processed_data.csv', index=False)



NameError: name 'df' is not defined

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('processed_data.csv')

In [5]:
df.head()

Unnamed: 0,file_path,category,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12
0,data\backward\0165e0e8_nohash_0.wav,backward,-543.9615,29.46592,-0.224312,-2.614497,1.775066,5.812069,-0.475146,1.754618,1.698111,1.871549,0.78031,0.555312,0.740507
1,data\backward\017c4098_nohash_0.wav,backward,-514.94476,22.943722,-2.943335,0.380538,-1.639417,-0.303708,-3.245466,-1.0019,0.160957,-0.656518,-0.744881,-1.379814,0.494081
2,data\backward\017c4098_nohash_1.wav,backward,-564.7314,24.316267,-4.805779,1.355843,-3.137849,-0.731808,-3.90259,-1.036058,0.867936,-1.381379,-0.920068,-0.432484,0.251636
3,data\backward\017c4098_nohash_2.wav,backward,-519.67957,24.08087,-3.222754,1.211598,-0.023732,-0.661048,-3.239309,-1.242702,-0.020317,-0.827836,-0.380631,-0.757278,0.135732
4,data\backward\017c4098_nohash_3.wav,backward,-523.21893,20.803392,-5.167766,-0.93255,-2.657047,0.153993,-4.686269,-1.573421,0.026947,-2.086371,-1.115192,0.045962,-0.660579


In [6]:
df.shape

(105835, 15)

In [42]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, Flatten,BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [43]:
# Assume df has the columns: file_path, category, mfcc_0 to mfcc_12

# Extract MFCC features and labels
X = df.iloc[:, 2:].values  # MFCC features (columns mfcc_0 to mfcc_12)
y = df['category'].values  # Labels (category)

# Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Convert labels to one-hot encoding
NUM_CLASSES = 36  # Assuming there are 36 unique categories
y_one_hot = to_categorical(y, num_classes=NUM_CLASSES)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)

# Reshape X to be compatible with Conv1D layers (samples, timesteps, features)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

print("Data preparation complete. Ready for training.")


Data preparation complete. Ready for training.


In [44]:
# Define a more complex TDNN model
def build_improved_tdnn(input_shape, num_classes):
    model = Sequential()

    # First TDNN Layer (Conv1D)
    model.add(Conv1D(filters=128, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Second TDNN Layer (Conv1D)
    model.add(Conv1D(filters=256, kernel_size=3, strides=1, padding='same', activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Third TDNN Layer (Conv1D)
    model.add(Conv1D(filters=256, kernel_size=3, strides=1, padding='same', activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # Flatten for fully connected layers
    model.add(Flatten())

    # First fully connected layer
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.4))

    # Second fully connected layer
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.4))

    # Output layer for classification
    model.add(Dense(num_classes, activation='softmax'))

    return model


In [45]:
# Input shape (for each MFCC feature)
input_shape = (X_train.shape[1], 1)

# Build the model
model = build_improved_tdnn(input_shape, NUM_CLASSES)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Add early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with a higher number of epochs and more verbose output
history = model.fit(X_train, y_train, 
                    epochs=50,  # Increase the number of epochs
                    batch_size=128,  # Adjust based on available memory
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping],
                    verbose=2)


Epoch 1/50
662/662 - 38s - loss: 1.9871 - accuracy: 0.3387 - val_loss: 1.0158 - val_accuracy: 0.6090 - 38s/epoch - 58ms/step
Epoch 2/50
662/662 - 37s - loss: 1.1863 - accuracy: 0.5514 - val_loss: 0.8801 - val_accuracy: 0.6313 - 37s/epoch - 56ms/step
Epoch 3/50
662/662 - 37s - loss: 1.0076 - accuracy: 0.6131 - val_loss: 0.6784 - val_accuracy: 0.7296 - 37s/epoch - 56ms/step
Epoch 4/50
662/662 - 37s - loss: 0.9151 - accuracy: 0.6481 - val_loss: 0.6090 - val_accuracy: 0.7767 - 37s/epoch - 56ms/step
Epoch 5/50
662/662 - 37s - loss: 0.8675 - accuracy: 0.6669 - val_loss: 0.7059 - val_accuracy: 0.7011 - 37s/epoch - 56ms/step
Epoch 6/50
662/662 - 38s - loss: 0.7913 - accuracy: 0.6964 - val_loss: 0.4383 - val_accuracy: 0.8472 - 38s/epoch - 57ms/step
Epoch 7/50
662/662 - 40s - loss: 0.7619 - accuracy: 0.7093 - val_loss: 0.3568 - val_accuracy: 0.9024 - 40s/epoch - 61ms/step
Epoch 8/50
662/662 - 41s - loss: 0.7052 - accuracy: 0.7313 - val_loss: 0.3772 - val_accuracy: 0.8882 - 41s/epoch - 61ms/step


In [46]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


662/662 - 7s - loss: 0.0853 - accuracy: 0.9790 - 7s/epoch - 11ms/step
Test Accuracy: 97.90%


In [21]:
%pip install h5py


Note: you may need to restart the kernel to use updated packages.


In [47]:
# Save the trained model in TensorFlow SavedModel format
model.save('model')
print("Model saved to 'complex_tdnn_model'.")




INFO:tensorflow:Assets written to: model\assets


INFO:tensorflow:Assets written to: model\assets


Model saved to 'complex_tdnn_model'.
