## **Importing Libraries**


In [None]:
#Installing the required packages
# NOTE(review): no versions are pinned, so a fresh runtime may install releases
# that differ from the ones that produced the saved outputs in this notebook.
!pip install librosa
!pip install keras
!pip install tensorflow
# NOTE(review): presumably meant to provide FFmpeg for decoding the .mp3 clips;
# confirm that this PyPI package actually supplies the decoder librosa needs.
!pip install ffmpeg
# NOTE(review): noisereduce and hmmlearn are installed but not imported by any
# cell visible in this notebook - confirm they are still required.
!pip install noisereduce
!pip install hmmlearn




In [None]:
#Importing the required libraries
import os
import glob
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy import fftpack
from scipy.stats import kurtosis,skew,mode
import sklearn.preprocessing,sklearn.decomposition
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit,StratifiedKFold,train_test_split
from keras import utils
import keras
from keras import layers
from keras.layers import Activation, Dense, Dropout, Conv1D, Conv2D, Flatten,Reshape, BatchNormalization, ZeroPadding2D,MaxPooling1D,AveragePooling1D, MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling1D, AveragePooling2D, Input, Add
from keras.models import Sequential
from keras import regularizers,optimizers
from tensorflow.keras.optimizers import SGD,Adam
from tensorflow.keras.utils import to_categorical
import keras.backend as K
from keras.models import load_model
from keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings("ignore")

## **Google Drive Link: https://drive.google.com/drive/folders/1rqxai0B95AHMcaJUUfA2CHMF5bCwHPr2?usp=sharing**

In [None]:
# Mount Google Drive into the Colab runtime so the dataset folder used by the
# following cells is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Sanity check: list the dataset root to confirm the mount worked and to see
# which metadata files and extracted folders are present.
print(os.listdir('/content/drive/MyDrive/HW5_Dataset-main/HW5_Dataset-main/en'))


['train.tsv', 'test.tsv', 'clips', 'train_extracted', 'test_extracted', 'train_extracted_denoised', 'test_extracted_denoised']


## **Extracting Training Data from Common Voice. Saves the processed audio files in numpy format**


In [None]:
# Function to extract the training data for Common Voice
def get_training(original_path, sr=22050):
    """Decode every training clip and cache its waveform as a ``.npy`` file.

    Reads the tab-separated ``train.tsv`` in ``original_path``, loads each file
    named in its ``path`` column from the ``clips`` sub-folder and saves the
    mono waveform to ``<original_path>/train_extracted`` under the same file
    name with ``.mp3`` replaced by ``.npy``.

    Parameters
    ----------
    original_path : str
        Dataset root containing ``train.tsv`` and the ``clips`` folder.
    sr : int, optional
        Sampling rate the audio is loaded at. The default, 22050, is the rate
        hard-coded by this notebook's MFCC step (and, per librosa's
        documentation, the default of ``librosa.load``), so existing callers
        get the same waveforms as before.

    Returns
    -------
    None
        The function only writes files and prints progress.
    """
    df = pd.read_csv(os.path.join(original_path, 'train.tsv'), sep='\t')

    # Creating a folder to store the Numpy arrays (no error if it already exists)
    extracted_path = os.path.join(original_path, 'train_extracted')
    os.makedirs(extracted_path, exist_ok=True)

    # Getting the file names of audios from the dataframe
    audio_files = np.array(df['path'])
    total = len(audio_files)

    # Loading each audio file and saving it as a numpy array.
    # Counting from 1 so the progress message states how many clips are done.
    for done, audio_file in enumerate(audio_files, start=1):
        audio_path = os.path.join(original_path, 'clips', audio_file)
        d, _ = librosa.load(audio_path, sr=sr, mono=True)

        # Saving the audio as a numpy array with the same filename (but with .npy extension)
        np.save(os.path.join(extracted_path, audio_file.replace('.mp3', '.npy')), d)

        # Report every 1000 clips and once more when the last clip is written.
        if done % 1000 == 0 or done == total:
            print(f'Processed {done}/{total} files')

# Run the extraction for the English subset; waveforms are written to
# <dataset root>/train_extracted. get_training has no skip-if-exists check,
# so every clip is decoded again each time this cell runs.
get_training(r'/content/drive/MyDrive/HW5_Dataset-main/HW5_Dataset-main/en')

## **Extracting Testing Data from Common Voice. Saves the processed audio files in numpy format**


In [None]:
# Function to extract the testing data for Common Voice
def get_testing(original_path, sr=22050):
    """Decode every test clip and cache its waveform as a ``.npy`` file.

    Reads the tab-separated ``test.tsv`` in ``original_path``, loads each file
    named in its ``path`` column from the ``clips`` sub-folder and saves the
    mono waveform to ``<original_path>/test_extracted`` under the same file
    name with ``.mp3`` replaced by ``.npy``.

    Parameters
    ----------
    original_path : str
        Dataset root containing ``test.tsv`` and the ``clips`` folder.
    sr : int, optional
        Sampling rate the audio is loaded at. The default, 22050, is the rate
        hard-coded by this notebook's MFCC step (and, per librosa's
        documentation, the default of ``librosa.load``), so existing callers
        get the same waveforms as before.

    Returns
    -------
    None
        The function only writes files and prints progress.
    """
    df = pd.read_csv(os.path.join(original_path, 'test.tsv'), sep='\t')

    # Creating a folder to store the Numpy arrays (no error if it already exists)
    extracted_path = os.path.join(original_path, 'test_extracted')
    os.makedirs(extracted_path, exist_ok=True)

    # Getting the file names of audios from the dataframe
    audio_files = np.array(df['path'])
    total = len(audio_files)

    # Loading each audio file and saving it as a numpy array.
    # Counting from 1 so the progress message states how many clips are done.
    for done, audio_file in enumerate(audio_files, start=1):
        audio_path = os.path.join(original_path, 'clips', audio_file)
        d, _ = librosa.load(audio_path, sr=sr, mono=True)

        # Saving the audio as a numpy array with the same filename (but with .npy extension)
        np.save(os.path.join(extracted_path, audio_file.replace('.mp3', '.npy')), d)

        # Report every 1000 clips and once more when the last clip is written.
        if done % 1000 == 0 or done == total:
            print(f'Processed {done}/{total} files')

# Run the extraction for the English test split; waveforms are written to
# <dataset root>/test_extracted. get_testing has no skip-if-exists check,
# so every clip is decoded again each time this cell runs.
get_testing(r'/content/drive/MyDrive/HW5_Dataset-main/HW5_Dataset-main/en')


## **Extracting MFCC Features**


In [None]:
# Function to extract MFCC features which takes a CSV file and the extracted folder as arguments
def get_mfcc_features(original_path, csv_file, extracted_folder, sr=22050, n_fft=441, hop_length=220):
    """Build one fixed-length MFCC summary vector per clip listed in a TSV file.

    For every row of ``csv_file`` the cached waveform
    ``<original_path>/<extracted_folder>/<path with .mp3 -> .npy>`` is loaded,
    its MFCC matrix is computed, and each coefficient row is summarised over
    time. The summaries are concatenated in this order: mean, median, standard
    deviation, skewness, kurtosis, maximum, minimum.

    Parameters
    ----------
    original_path : str
        Dataset root containing ``csv_file`` and ``extracted_folder``.
    csv_file : str
        Tab-separated metadata file with ``path`` and ``accents`` columns.
    extracted_folder : str
        Sub-folder of ``original_path`` holding the ``.npy`` waveforms.
    sr : int, optional
        Sampling rate of the cached waveforms (default 22050).
    n_fft : int, optional
        FFT window length in samples (default 441, i.e. 20 ms at 22050 Hz).
    hop_length : int, optional
        Hop between frames in samples (default 220, about 10 ms at 22050 Hz).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(features, labels)``: one row of ``features`` per clip, holding
        7 statistics for each MFCC coefficient, and the matching values of the
        ``accents`` column.
    """
    # Loading the CSV file into a dataframe.
    df = pd.read_csv(os.path.join(original_path, csv_file), sep='\t')
    labels = np.array(df['accents'])

    # One summary vector per clip, in the order the clips appear in the file.
    mfcc_features = []
    for audio_name in df['path']:
        waveform_path = os.path.join(original_path, extracted_folder, audio_name.replace('.mp3', '.npy'))
        audio_file_data = np.load(waveform_path)

        # Rows are coefficients, columns are time frames.
        mfcc_data = librosa.feature.mfcc(y=audio_file_data, sr=sr, n_fft=n_fft, hop_length=hop_length)

        # Collapse the time axis (axis=1) with each statistic and stack them.
        feature_vector = np.concatenate((
            np.mean(mfcc_data, axis=1),
            np.median(mfcc_data, axis=1),
            np.std(mfcc_data, axis=1),
            skew(mfcc_data, axis=1),
            kurtosis(mfcc_data, axis=1),
            np.amax(mfcc_data, axis=1),
            np.amin(mfcc_data, axis=1),
        ))
        mfcc_features.append(feature_vector)

    return np.array(mfcc_features), labels

# Extract features for the training split. get_mfcc_features returns a
# (features, labels) pair, so `mfcc_features` is that 2-tuple: index 0 is the
# feature matrix and index 1 is the array of accent labels.
mfcc_features = get_mfcc_features(original_path=r'/content/drive/MyDrive/HW5_Dataset-main/HW5_Dataset-main/en', csv_file='train.tsv', extracted_folder='train_extracted')


## **Printing Out MFCC Features**

In [None]:
# `mfcc_features` is the (features, labels) tuple returned by get_mfcc_features,
# so len() of it is always 2 and [0] is the whole feature matrix. Unpack it
# first so the checks below inspect what their comments describe.
feature_matrix, accent_labels = mfcc_features
print(len(feature_matrix))  # Checking how many audio files were processed
print(feature_matrix[0])    # Checking the features of the first audio file

2
[[-395.44125    82.187065   23.269756 ...  -32.778545  -59.210373
   -19.197433]
 [-413.54953    85.24704    12.00551  ...  -31.131939  -51.434082
   -24.725624]
 [-455.73132   114.03998    18.569351 ...  -35.483437  -31.239515
   -32.562225]
 ...
 [-539.7993     89.63088   -12.189858 ...  -49.5084    -41.546036
   -44.69294 ]
 [-520.63873    83.46228   -10.559923 ...  -42.985275  -32.64163
   -36.955837]
 [-493.47556   101.157814  -10.478767 ...  -47.128277  -44.514133
   -35.5579  ]]


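## The code below lists the distinct accent classes ('United States English', 'England English', 'German English,Non native speaker'). `LabelEncoder` assigns integer labels in sorted (alphabetical) order, so in the model cells 'England English' → 0, 'German English,Non native speaker' → 1 and 'United States English' → 2.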

In [None]:
import pandas as pd
import os

# Define the path and file name
original_path = '/content/drive/MyDrive/HW5_Dataset-main/HW5_Dataset-main/en'
csv_file = 'train.tsv'

# Folder of cached waveforms that the model and ANOVA cells pass to
# get_mfcc_features. Those cells reference `extracted_folder`, but no cell
# assigned it, so a fresh "Restart & Run All" stopped with a NameError.
# NOTE(review): 'train_extracted' is the folder this notebook itself creates;
# switch to 'train_extracted_denoised' if the saved results came from that data.
extracted_folder = 'train_extracted'

# Load the CSV file into a dataframe
df = pd.read_csv(os.path.join(original_path, csv_file), sep='\t')

# Check the unique values in the 'accents' column
unique_accents = df['accents'].unique()
print(unique_accents)

## **Training and Evaluating a Random Forest Model on Processed Data**


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

# ---- Data: MFCC summary vectors (X) and accent names (y) ----
X, y = get_mfcc_features(original_path, csv_file, extracted_folder)

# Map each accent name to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Show how many clips fall into each class id.
print(f"Class distribution:\n{pd.Series(y_encoded).value_counts()}")

# ---- Hold-out split: 80% train / 20% test, fixed seed for repeatability ----
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# ---- Model: 200 trees, class weights balanced against the class imbalance ----
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# ---- Evaluation on the hold-out set ----
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {rf_accuracy:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---- 5-fold cross-validation on the full data set ----
# NOTE(review): the saved output shows the first fold scoring far below the
# others, which may reflect ordering in train.tsv - confirm before relying on
# the hold-out accuracy alone.
cv_scores = cross_val_score(rf_model, X, y_encoded, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Cross-Validation Score: {cv_scores.mean():.4f}")


Class distribution:
2    2436
0     657
1     629
Name: count, dtype: int64
Accuracy: 0.9879

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.97       127
           1       1.00      0.99      1.00       111
           2       0.99      1.00      0.99       507

    accuracy                           0.99       745
   macro avg       0.99      0.98      0.98       745
weighted avg       0.99      0.99      0.99       745

Cross-Validation Scores: [0.76510067 0.97315436 0.98521505 0.98655914 0.93010753]
Average Cross-Validation Score: 0.9280


## **Training and Evaluating a Decision Tree Model on Processed Data**

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# ---- Data: MFCC summary vectors (X) and accent names (y) ----
X, y = get_mfcc_features(original_path, csv_file, extracted_folder)

# Map each accent name to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Show how many clips fall into each class id.
print(f"Class distribution:\n{pd.Series(y_encoded).value_counts()}")

# ---- Hold-out split: 80% train / 20% test, fixed seed for repeatability ----
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# ---- Model: a single decision tree with a fixed seed ----
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# ---- Evaluation on the hold-out set ----
y_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {dt_accuracy:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---- 5-fold cross-validation on the full data set ----
cv_scores = cross_val_score(dt_model, X, y_encoded, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Cross-Validation Score: {cv_scores.mean():.4f}")

Class distribution:
2    2436
0     657
1     629
Name: count, dtype: int64
Accuracy: 0.9584

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.90       127
           1       0.98      0.97      0.98       111
           2       0.98      0.96      0.97       507

    accuracy                           0.96       745
   macro avg       0.94      0.95      0.95       745
weighted avg       0.96      0.96      0.96       745

Cross-Validation Scores: [0.64966443 0.90067114 0.90994624 0.9233871  0.90860215]
Average Cross-Validation Score: 0.8585


## **Training and Evaluating a KNN Model on Processed Data**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Extracting the MFCC features and labels from the dataset
X, y = get_mfcc_features(original_path, csv_file, extracted_folder)

# Encoding the accent names as integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Displaying the class distribution to understand the dataset
print(f"Class distribution:\n{pd.Series(y_encoded).value_counts()}")

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# KNN compares clips by distance in feature space, and the MFCC summary columns
# live on very different scales (coefficient means in the hundreds next to
# skewness/kurtosis values near 1). Without scaling, the large-magnitude columns
# dominate the distance, so the features are standardised first. Putting the
# scaler in a pipeline means it is fitted on training data only, both here and
# inside every cross-validation fold.
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = knn_model.predict(X_test)

# Evaluating the performance of the model using accuracy score
knn_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {knn_accuracy:.4f}")

# Printing the classification report to evaluate the model's performance in detail
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Checking the model's performance using cross-validation
cv_scores = cross_val_score(knn_model, X, y_encoded, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average Cross-Validation Score: {np.mean(cv_scores):.4f}")

Class distribution:
2    2436
0     657
1     629
Name: count, dtype: int64
Accuracy on test set: 0.9799

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       127
           1       1.00      0.99      1.00       111
           2       0.98      0.99      0.99       507

    accuracy                           0.98       745
   macro avg       0.98      0.97      0.97       745
weighted avg       0.98      0.98      0.98       745

Cross-Validation Scores: [0.62818792 0.95302013 0.96505376 0.95833333 0.94489247]
Average Cross-Validation Score: 0.8899


## **Model's Performance Summary**

In [None]:
# Hold-out accuracy of each classifier, in the order the models were trained.
model_summary = dict(
    Model=['Random Forest', 'Decision Tree', 'K-Nearest Neighbors'],
    Accuracy=[rf_accuracy, dt_accuracy, knn_accuracy],
)

# One row per model; the bare name on the last line renders the table.
summary_df = pd.DataFrame(model_summary)
summary_df

Unnamed: 0,Model,Accuracy
0,Random Forest,0.987919
1,Decision Tree,0.958389
2,K-Nearest Neighbors,0.979866


## **Statistical ANOVA Test on MFCC Coefficients**

In [None]:
from scipy.stats import f_oneway
import pandas as pd
import numpy as np

# Features (one row per clip) and accent labels from get_mfcc_features
X, y = get_mfcc_features(original_path, csv_file, extracted_folder)

# Converting features and labels into a DataFrame for easier manipulation
df = pd.DataFrame(X)
df['label'] = y

# Feature column to test. get_mfcc_features puts the per-coefficient means
# first, so column 0 is the mean over time of the first MFCC coefficient.
mfcc_index = 0

# One array of values per accent class, in order of first appearance.
grouped_data = [df.loc[df['label'] == label, mfcc_index].values for label in df['label'].unique()]

# Performing ANOVA to test differences in means across groups
f_stat, p_value = f_oneway(*grouped_data)

# Output results. The p-value uses scientific notation because fixed-point
# formatting rounds very small values to a misleading "0.0000".
print("ANOVA Results on MFCC Coefficients:")
print(f"F-statistic: {f_stat:.4f}, p-value: {p_value:.4e}")

# Interpretation at the 5% significance level (applies to the tested column only)
if p_value < 0.05:
    print("Significant differences exist in MFCC values across accent classes.")
else:
    print("No significant differences found in MFCC values across accent classes.")


ANOVA Results on MFCC Coefficients:
F-statistic: 180.5076, p-value: 0.0000
Significant differences exist in MFCC values across accent classes.
