In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#libraries
import csv
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.exceptions import NotFittedError

"""
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import os
from sklearn.preprocessing import LabelEncoder

# Load the dataset
dataset_path = '/path/to/mozila_common_voice'
metadata = pd.read_csv(os.path.join(dataset_path, 'validated.tsv'), sep='\t')

# Display the first few rows
print(metadata.head())

# Clean the dataset by dropping rows with missing values
metadata_clean = metadata.dropna()

# Encode the 'client_id' or any other categorical features
label_encoder = LabelEncoder()
metadata_clean['client_id_encoded'] = label_encoder.fit_transform(metadata_clean['client_id'])
"""

In [None]:
# Define a custom label encoder to handle unseen labels
class ExtendedLabelEncoder(LabelEncoder):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.classes_ = None

    def fit(self, y):
        super().fit(y)
        self.classes_ = super().classes_

    def transform(self, y):
        try:
            return super().transform(y)
        except NotFittedError:
            raise
        except ValueError as e:
            unseen_label = max(self.classes_) + 1
            self.classes_ = np.append(self.classes_, unseen_label)
            return np.where(y == e.args[0], unseen_label, super().transform(y))

In [None]:
# dataset
chunk_size = 50000
#Number of Rows per Chunk
chunks = pd.read_csv('/content/drive/MyDrive/dataset/TeamDeepwave/dataset/combined_file.csv', low_memory = False, chunksize = chunk_size)

for i, chunks in enumerate(chunks):
  chunks.to_csv(f'chunks_{i}.csv', index = False)

In [None]:
dataset_path = '/content/drive/MyDrive/dataset/TeamDeepwave/dataset/cv-corpus-17.0-delta-2024-03-15/en'
all_filenames = glob.glob(dataset_path + '/*.tsv')
dataframes = [pd.read_csv(filename, sep='\t', encoding='utf-8') for filename in all_filenames]
combined_csv = pd.concat(dataframes)
combined_csv.to_csv('/content/drive/MyDrive/dataset/TeamDeepwave/dataset/combined_file.csv', index=False, encoding='utf-8-sig')
"""
for filename in all_filenames:
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')
    dataframes.append(df)
    combined_csv = pd.concat(dataframes)
    combined_csv.to_csv('/content/drive/MyDrive/dataset/TeamDeepwave/dataset/combined_file.csv', index=False, encoding='utf-8-sig')
    """


"\nfor filename in all_filenames:\n    df = pd.read_csv(filename, sep='\t', encoding='utf-8')\n    dataframes.append(df)\n    combined_csv = pd.concat(dataframes)\n    combined_csv.to_csv('/content/drive/MyDrive/dataset/TeamDeepwave/dataset/combined_file.csv', index=False, encoding='utf-8-sig')\n    "

In [None]:
print(combined_csv.head(10))

                           clip  duration[ms] client_id path sentence_id  \
0  common_voice_en_39863408.mp3        4140.0       NaN  NaN         NaN   
1  common_voice_en_39944676.mp3        5832.0       NaN  NaN         NaN   
2  common_voice_en_39762970.mp3        4176.0       NaN  NaN         NaN   
3  common_voice_en_39958361.mp3        9036.0       NaN  NaN         NaN   
4  common_voice_en_39916734.mp3        6984.0       NaN  NaN         NaN   
5  common_voice_en_39590171.mp3        5148.0       NaN  NaN         NaN   
6  common_voice_en_39939743.mp3        4068.0       NaN  NaN         NaN   
7  common_voice_en_40103066.mp3        6840.0       NaN  NaN         NaN   
8  common_voice_en_39634391.mp3        4176.0       NaN  NaN         NaN   
9  common_voice_en_39859338.mp3        3636.0       NaN  NaN         NaN   

  sentence sentence_domain  up_votes  down_votes  age gender accents  variant  \
0      NaN             NaN       NaN         NaN  NaN    NaN     NaN      NaN   
1

In [None]:
dropna_csv = combined_csv.dropna()

cleansed_csv = combined_csv.dropna(how = 'all')
print("DataFrame with rows dropped where any column has NaN:")
print(dropna_csv.head(10))

print("\nDataFrame with rows dropped where all columns are NaN:")
print(cleansed_csv.head(10))

DataFrame with rows dropped where any column has NaN:
Empty DataFrame
Columns: [clip, duration[ms], client_id, path, sentence_id, sentence, sentence_domain, up_votes, down_votes, age, gender, accents, variant, locale, segment, reason, source, is_used, clips_count]
Index: []

DataFrame with rows dropped where all columns are NaN:
                           clip  duration[ms] client_id path sentence_id  \
0  common_voice_en_39863408.mp3        4140.0       NaN  NaN         NaN   
1  common_voice_en_39944676.mp3        5832.0       NaN  NaN         NaN   
2  common_voice_en_39762970.mp3        4176.0       NaN  NaN         NaN   
3  common_voice_en_39958361.mp3        9036.0       NaN  NaN         NaN   
4  common_voice_en_39916734.mp3        6984.0       NaN  NaN         NaN   
5  common_voice_en_39590171.mp3        5148.0       NaN  NaN         NaN   
6  common_voice_en_39939743.mp3        4068.0       NaN  NaN         NaN   
7  common_voice_en_40103066.mp3        6840.0       NaN  NaN 

In [None]:
tsv_read = '/content/drive/MyDrive/dataset/TeamDeepwave/dataset/cv-corpus-17.0-delta-2024-03-15/en/validated.tsv'
df = pd.read_csv(tsv_read, sep='\t', encoding='utf-8')
tsv_filename = glob.glob(tsv_read + '/*.tsv')

for filename in tsv_filename:
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')

print(f"Inspecting{filename}:")
print(df.head(10))
print(df.shape)
print(df.info())

Inspecting/content/drive/MyDrive/dataset/TeamDeepwave/dataset/cv-corpus-17.0-delta-2024-03-15/en/unvalidated_sentences.tsv:
                                           client_id  \
0  01e8ea298cdecf26e273f5baac3915eb992c493f229686...   
1  02cbc1fe01fc67fa72c6e067fbe020399082efbeb57a2b...   
2  03b62f72067ec967c423852bef03d1b61e63c156d86f6e...   
3  05112cb5965431bbd47abd29b4faea9fb009b5a2e320e0...   
4  05d33ad00cc2754da8e542a33a5255f9346535ef1d8619...   
5  08072f2de4dcc2bfec5058dca41eb9535b61ccd193ecc4...   
6  083af8bc921baf15ad5d8c8c876f4ecaf4f52bf6370161...   
7  09c60e79113b346bbc1009fe21c46f273d09e62b0a73af...   
8  0a0db76f30e3d011216425efa204fa0d3064767656b913...   
9  0d6f10e8503355612e903ececf61788f7bfa18aee68ebc...   

                           path  \
0  common_voice_en_39751075.mp3   
1  common_voice_en_39589864.mp3   
2  common_voice_en_40087973.mp3   
3  common_voice_en_39587246.mp3   
4  common_voice_en_40117514.mp3   
5  common_voice_en_39603786.mp3   
6  common_voic

In [None]:
df['sentence_domain'] = df['sentence_domain'].fillna('Unknown')
df_cleaned = df.dropna()
print(combined_csv.columns)

Index(['clip', 'duration[ms]', 'client_id', 'path', 'sentence_id', 'sentence',
       'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents',
       'variant', 'locale', 'segment', 'reason', 'source', 'is_used',
       'clips_count'],
      dtype='object')


In [None]:
# Assuming 'label' is the gender column and other columns are acoustic features
X = combined_csv.drop(columns=['clips_count'])
y = combined_csv['client_id']


In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=124)

In [None]:
extended_label_encoder = ExtendedLabelEncoder()

In [None]:
# Fit and transform the 'sentence_id' column in the training set
X_train['sentence_id'] = extended_label_encoder.fit_transform(X_train['sentence_id'])


In [None]:
# Metric function
def print_metrics(y_true, y_pred):
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='weighted'):.4f}")
