In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The CSV paths read and written by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Location of the master label list on the mounted Drive.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
path = '/content/drive/MyDrive/voice_project/final_master_gold_list.csv'

In [None]:
# Load the master label list into a DataFrame; later cells filter and relabel
# its 'age' and 'gender' columns.
df = pd.read_csv(path)

In [None]:
# Preview the first rows to sanity-check the columns and the raw label values.
df.head()


In [None]:
# (rows, columns) of the table as currently held in df.
df.shape

In [None]:

# Print the per-label row counts for both target columns, separated by blank lines.
for position, column in enumerate(('age', 'gender')):
    if position:
        # Separator between the two tables.
        print("\n")
    print(df[column].value_counts())

In [None]:
import matplotlib.pyplot as plt

# Age distribution: one bar per age label, bars ordered by label via sort_index().
age_counts = df['age'].value_counts().sort_index()

age_fig, age_ax = plt.subplots(figsize=(10, 6))
age_ax.bar(age_counts.index.astype(str), age_counts.values)
age_ax.set(xlabel="Age", ylabel="Count", title="Distribution of Age Labels")

# Tilt the tick labels so the age names do not overlap.
for tick_label in age_ax.get_xticklabels():
    tick_label.set_rotation(45)

age_fig.tight_layout()
plt.show()

# Gender distribution: one bar per gender label, in value_counts() order
# (most frequent label first).
gender_counts = df['gender'].value_counts()

gender_fig, gender_ax = plt.subplots(figsize=(8, 6))
gender_ax.bar(gender_counts.index.astype(str), gender_counts.values)
gender_ax.set(xlabel="Gender", ylabel="Count", title="Distribution of Gender Labels")

gender_fig.tight_layout()
plt.show()


In [None]:
# Drop the 'nineties' age group: the dataset holds only 3 such rows, which is
# too few to be useful.
is_nineties = df['age'] == 'nineties'
df = df.loc[~is_nineties].copy()

In [None]:
# List the distinct gender labels currently in df, so the relabelling in the
# next cell can be checked against them.
df['gender'].unique()

In [None]:
# Normalise the raw gender labels: 'male_masculine' -> 'male' and
# 'female_feminine' -> 'female'.
gender_map = {
    'male_masculine': 'male',
    'female_feminine': 'female',
    # Identity entries keep this cell idempotent: without them a second run
    # would find no matching key for 'male'/'female' and turn the whole column
    # into NaN.
    'male': 'male',
    'female': 'female',
}

# Series.map returns NaN for every label that is not a key of gender_map, so
# any other gender value ends up as NaN here. Rows with a NaN key are left out
# by the default groupby used for balancing later in the notebook.
df['gender'] = df['gender'].map(gender_map)

In [None]:
# 'sixties' and 'seventies' have very few samples, so both are merged into a
# single '60plus' bucket. Every other listed age group keeps its own label.
age_map = {
    'teens': 'teens',
    'twenties': 'twenties',
    'thirties': 'thirties',
    # NOTE(review): spelling presumably mirrors the label stored in the CSV -
    # confirm before "correcting" it.
    'fourties': 'fourties',
    'fifties': 'fifties',
    'sixties': '60plus',
    'seventies': '60plus',
    # Identity entry keeps this cell idempotent: without it a second run would
    # find no key for the already-merged '60plus' rows and turn them into NaN.
    '60plus': '60plus',
}

# Series.map returns NaN for every label that is not a key of age_map.
# NOTE(review): any age label not listed above (for example an 'eighties'
# bucket, if the CSV contains one) silently becomes NaN here - confirm that
# is intended.
df['age'] = df['age'].map(age_map)

In [None]:
# Count the rows in every observed (age, gender) combination. groupby drops
# rows whose age or gender is NaN by default, so unmapped labels are not counted.
group_counts = df.groupby(['age', 'gender']).size()

# Fail here with a clear message rather than later in the sampling cell:
# on an empty result, min() would yield NaN and there would be nothing to sample.
if group_counts.empty:
    raise ValueError(
        "No (age, gender) groups found: df is empty or every row is missing "
        "an age or gender label."
    )

# N is the size of the smallest group; the balancing cell samples N rows from
# each group. Stored as a plain int so it can be passed straight to sample().
N = int(group_counts.min())

print("\n--- Group Counts (Age x Gender) ---")
print(group_counts)
print(f"\nSmallest group (N) has: {N} samples")

In [None]:
# Down-sample every (age, gender) group to exactly N rows. N is the size of
# the smallest group, so sampling without replacement always succeeds. The
# group keys themselves are not needed, only each group's rows.
balanced_df_list = [
    members.sample(N, random_state=42)
    for _, members in df.groupby(['age', 'gender'])
]

# Stack the per-group samples, then shuffle so rows from the same group are
# not contiguous. The fixed seed keeps the row order reproducible.
final_balanced_df = (
    pd.concat(balanced_df_list)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("\n--- New Balanced Dataset ---")

# Report the label counts of the balanced table for both target columns.
for heading, column in (
    ("\nNew Age Distribution:", 'age'),
    ("\nNew Gender Distribution:", 'gender'),
):
    print(heading)
    print(final_balanced_df[column].value_counts())

# Save the balanced list to Drive, without the positional index.
balanced_list_path = "/content/drive/MyDrive/voice_project/final_balanced_list.csv"
final_balanced_df.to_csv(balanced_list_path, index=False)

print(f"\nSuccessfully saved balanced dataset to: {balanced_list_path}")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


# Balanced dataset - age distribution: one bar per age bucket, bars ordered
# by label via sort_index().
age_counts_bal = final_balanced_df['age'].value_counts().sort_index()

age_bal_fig, age_bal_ax = plt.subplots(figsize=(10, 6))
age_bal_ax.bar(age_counts_bal.index.astype(str), age_counts_bal.values)
age_bal_ax.set(xlabel="Age", ylabel="Count", title="Balanced Dataset: Age Distribution")

# Tilt the tick labels so the bucket names do not overlap.
for tick_label in age_bal_ax.get_xticklabels():
    tick_label.set_rotation(45)

age_bal_fig.tight_layout()
plt.show()

# Balanced dataset - gender distribution: one bar per gender label.
gender_counts_bal = final_balanced_df['gender'].value_counts()

gender_bal_fig, gender_bal_ax = plt.subplots(figsize=(8, 6))
gender_bal_ax.bar(gender_counts_bal.index.astype(str), gender_counts_bal.values)
gender_bal_ax.set(
    xlabel="Gender",
    ylabel="Count",
    title="Balanced Dataset: Gender Distribution",
)

gender_bal_fig.tight_layout()
plt.show()



# Age x gender heatmap: rows are age buckets, columns are genders, and every
# cell is annotated with the number of rows in that combination.
cross_tab = pd.crosstab(final_balanced_df['age'], final_balanced_df['gender'])

heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
heat_ax.imshow(cross_tab, aspect='auto')

# Label the ticks with the crosstab's column (gender) and row (age) names.
heat_ax.set_xticks(range(len(cross_tab.columns)))
heat_ax.set_xticklabels(cross_tab.columns)
heat_ax.set_yticks(range(len(cross_tab.index)))
heat_ax.set_yticklabels(cross_tab.index)

heat_ax.set(
    xlabel="Gender",
    ylabel="Age",
    title="Balanced Dataset: Age × Gender Distribution Heatmap",
)

# Write each count in the centre of its cell; x is the column position and
# y is the row position.
for row_pos, row_values in enumerate(cross_tab.to_numpy()):
    for col_pos, cell_count in enumerate(row_values):
        heat_ax.text(col_pos, row_pos, cell_count, ha='center', va='center')

heat_fig.tight_layout()
plt.show()
