In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os


In [None]:
#Location of Unzipped audio clips
project_path = "/content/drive/MyDrive/Voice project/unzipped_dataset/cv-corpus-23.0-2025-09-05/en/"

In [None]:
os.listdir(project_path)

['clips',
 'clip_durations.tsv',
 'other.tsv',
 'invalidated.tsv',
 'validated.tsv',
 'train.tsv',
 'dev.tsv',
 'test.tsv',
 'reported.tsv',
 'validated_sentences.tsv',
 'unvalidated_sentences.tsv']

In [None]:

train_path = os.path.join(project_path, "train.tsv")
clips_path = os.path.join(project_path, "clips")
other_path = os.path.join(project_path, "other.tsv")
invalidated_path = os.path.join(project_path, "invalidated.tsv")
validated_path = os.path.join(project_path, "validated.tsv")
dev_path = os.path.join(project_path, "dev.tsv")
test_path = os.path.join(project_path, "test.tsv")

print(f"Clips path: {clips_path}")
print(f"Train path: {train_path}")
print(f"Test path: {test_path}")

Clips path: /content/drive/MyDrive/Voice project/unzipped_dataset/cv-corpus-23.0-2025-09-05/en/clips
Train path: /content/drive/MyDrive/Voice project/unzipped_dataset/cv-corpus-23.0-2025-09-05/en/train.tsv
Test path: /content/drive/MyDrive/Voice project/unzipped_dataset/cv-corpus-23.0-2025-09-05/en/test.tsv


In [None]:

try:
    train = pd.read_csv(train_path, sep="\t", low_memory=False, on_bad_lines='skip')
    test = pd.read_csv(test_path, sep="\t", on_bad_lines='skip')
    dev = pd.read_csv(dev_path, sep="\t", on_bad_lines='skip')
    other = pd.read_csv(other_path, sep="\t", low_memory=False, on_bad_lines='skip')
    validated = pd.read_csv(validated_path, sep="\t", low_memory=False, on_bad_lines='skip')
    invalidated = pd.read_csv(invalidated_path, sep="\t", low_memory=False, on_bad_lines='skip')

    print(f"Loaded 'train' with {len(train)} rows.")
    print(f"Loaded 'test' with {len(test)} rows.")
    print(f"Loaded 'dev' with {len(dev)} rows.")
    print(f"Loaded 'other' with {len(other)} rows.")
    print(f"Loaded 'validated' with {len(validated)} rows.")
    print(f"Loaded 'invalidated' with {len(invalidated)} rows.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check your file paths.")

Loaded 'train' with 1142930 rows.
Loaded 'test' with 16397 rows.
Loaded 'dev' with 16401 rows.
Loaded 'other' with 371534 rows.
Loaded 'validated' with 1861780 rows.
Loaded 'invalidated' with 307933 rows.


In [None]:
# Merging all the files into single dataset file
data_dfs = [train, test, dev, other, validated, invalidated]
master_df = pd.concat(data_dfs, ignore_index=True)
print(f"Master list has {len(master_df)} total rows.")

Master list has 3716975 total rows.


In [None]:
# Dropping the missing fields
clean_master_df = master_df.dropna(subset=['age', 'gender', 'path'])

In [None]:
# Additional cleaning for undefined
clean_master_df = clean_master_df[
    (clean_master_df['age'].astype(str).str.strip() != '') &
    (clean_master_df['age'].astype(str).str.strip() != 'Undefined') &
    (clean_master_df['gender'].astype(str).str.strip() != '') &
    (clean_master_df['gender'].astype(str).str.strip() != 'Undefined')
]
print(f"Found {len(clean_master_df)} total clips with good labels.")

Found 2338008 total clips with good labels.


In [None]:
# Checking for missing values
clean_master_df.isnull().sum()

Unnamed: 0,0
client_id,0
path,0
sentence_id,0
sentence,28
sentence_domain,2337378
up_votes,0
down_votes,0
age,0
gender,0
accents,532175


In [None]:
# Dropping unnecessay features
clean_master_df.drop(columns=["sentence_domain","variant","segment"], inplace=True)

In [None]:
clean_master_df.isnull().sum()

Unnamed: 0,0
client_id,0
path,0
sentence_id,0
sentence,28
up_votes,0
down_votes,0
age,0
gender,0
accents,532175
locale,0


In [None]:
clips_path = os.path.join(project_path,"clips")

In [None]:
clips_list = os.listdir(clips_path)

In [None]:
# Total number of clips that Where initially extracted
print(len(clips_list))

154243


In [None]:
# Checking how many audio clips are present in the clean merged dataset
final_list_path = "/content/drive/MyDrive/voice_project/final_master_gold_list.csv"


print(f"Listing the files we have in the 'clips' folder...")

actual_clips_we_have = set(os.listdir(clips_path))
print(f"Found {len(actual_clips_we_have)} actual MP3 files.")

print(f"Your clean MASTER list ('clean_master_df') has {len(clean_master_df)} rows.")
print("Checking which of these files we *actually* have...")


final_df = clean_master_df[clean_master_df['path'].isin(actual_clips_we_have)]




print(f"Out of your {len(clean_master_df)} total clean rows, {len(final_df)} clips actually exist in the folder.")
print(f"This is our final, usable dataset.")


final_df.to_csv(final_list_path, index=False)
print(f"This final list has been saved to: {final_list_path}")

Listing the files we have in the 'clips' folder...
Found 154243 actual MP3 files.
Your clean MASTER list ('clean_master_df') has 2338008 rows.
Checking which of these files we *actually* have...
Out of your 2338008 total clean rows, 84694 clips actually exist in the folder.
This is our final, usable dataset.
This final list has been saved to: /content/drive/MyDrive/voice_project/final_master_gold_list.csv
