In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Location (on the mounted Drive) of the CSV that lists the balanced dataset.
# It is loaded into `df` and split into train/val/test lists further down.
path = "/content/drive/MyDrive/voice_project/final_balanced_list.csv"

In [None]:
# Load the dataset list. Later cells read its 'age' and 'gender' columns (for
# stratification) and its 'path' column (to locate the feature files).
df = pd.read_csv(path)

In [None]:
# Build a single "<age>_<gender>" key per row so the splits below can be
# stratified on both attributes at once. The column is temporary and is
# dropped again once the splits have been made.
df['stratify_label'] = df['age'].str.cat(df['gender'], sep="_")

In [None]:
# First cut: 30% of the rows are held out in temp_df (divided again in the
# next step); the remaining 70% become the training split. Stratifying on the
# combined age/gender key keeps each group's share the same on both sides, and
# the fixed seed makes the split repeatable.
first_split = train_test_split(
    df,
    stratify=df['stratify_label'],
    test_size=0.3,
    random_state=42,
)
train_df, temp_df = first_split

In [None]:
# Second cut: halve the held-out rows into validation and test splits, again
# stratified on the combined age/gender key and using the same fixed seed.
halving_options = dict(
    test_size=0.5,
    random_state=42,
    stratify=temp_df['stratify_label'],
)
val_df, test_df = train_test_split(temp_df, **halving_options)

In [None]:
# The helper column was only needed for stratification; strip it from all
# three splits so it does not end up in the saved lists.
train_df, val_df, test_df = (
    split.drop(columns=['stratify_label'])
    for split in (train_df, val_df, test_df)
)

In [None]:
# Quick sanity check: show (rows, columns) of the training split.
train_df.shape

In [None]:
# Output locations (on the mounted Drive) for the three split lists.
train_path = "/content/drive/MyDrive/voice_project/train_list.csv"
val_path = "/content/drive/MyDrive/voice_project/val_list.csv"
test_path = "/content/drive/MyDrive/voice_project/test_list.csv"

In [None]:
# Persist each split next to the source list; the DataFrame index is not
# written, so the files contain only the original columns.
for split_df, out_path in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    split_df.to_csv(out_path, index=False)

In [None]:
import os

In [None]:
# Directory holding one .npy feature file per audio clip (presumably 5-second
# mel-spectrograms, judging by the directory name — confirm against the
# feature-extraction notebook).
feature_path = "/content/drive/MyDrive/voice_project/features_melspec_5sec_all"
# Same file as `train_path` written above; the statistics are computed from
# the training split only.
train_list_path = "/content/drive/MyDrive/voice_project/train_list.csv"
# Directory that receives global_mean.npy and global_std.npy.
stats_path = "/content/drive/MyDrive/voice_project/scaling_stats"

In [None]:
# Make sure the output directory for the statistics exists (no error if it already does).
os.makedirs(stats_path, exist_ok=True)
# Re-read the training list from disk; this replaces the in-memory train_df
# produced by the split above with the saved copy.
train_df = pd.read_csv(train_list_path)


In [None]:
# Running totals for the streaming mean/variance computation:
#   count - number of values folded in so far
#   mean  - running mean of those values
#   M2    - running sum of squared deviations from the running mean
count, mean, M2 = 0, 0.0, 0.0

In [None]:
from tqdm import tqdm
import numpy as np

In [None]:
# Stream over the training files and build the global mean and the sum of
# squared deviations (M2) of every spectrogram value, holding only one file in
# memory at a time.
#
# Each file is reduced with vectorised NumPy calls and then merged into the
# running totals with the pairwise (Chan et al.) form of Welford's update.
# This gives the same statistics as updating one value at a time, without a
# Python-level loop over every element.
#
# The totals are (re)initialised here so that re-running this cell starts from
# scratch instead of adding the same files on top of a previous pass.
count = 0
mean = 0.0
M2 = 0.0
failed_files = []  # paths that could not be read; summarised after the loop

for rel_path in tqdm(train_df['path'], total=len(train_df)):

    # The feature file name is the listed path with ".mp3" replaced by ".npy"
    filename = rel_path.replace(".mp3", ".npy")
    file_path = os.path.join(feature_path, filename)

    try:
        spec = np.load(file_path)

        # Flatten to 1D and promote to float64 so the reduction below (and
        # therefore the running totals) is never carried out in a narrower
        # dtype inherited from the stored array.
        spec_flat = np.asarray(spec, dtype=np.float64).ravel()
    except Exception as e:
        # Best effort: a missing or unreadable file is reported and skipped so
        # one bad file does not abort the whole pass.
        print(f"Error loading {file_path}: {e}")
        failed_files.append(file_path)
        continue

    batch_count = spec_flat.size
    if batch_count == 0:
        # Nothing to fold in; also avoids taking the mean of an empty array.
        continue

    # Statistics of this file on its own
    batch_mean = float(spec_flat.mean())
    batch_M2 = float(np.square(spec_flat - batch_mean).sum())

    # Merge the file's (count, mean, M2) into the running totals
    delta = batch_mean - mean
    new_count = count + batch_count
    mean += delta * batch_count / new_count
    M2 += batch_M2 + delta * delta * (count * batch_count / new_count)
    count = new_count

if failed_files:
    print(f"Skipped {len(failed_files)} of {len(train_df)} files that could not be loaded.")

In [None]:
# Turn the running totals into the final statistics and write them to disk.
# At least two values are required, because the sample variance divides by
# (count - 1).
if count <= 1:
    print("Error: No data processed.")

else:
    mean_val = mean
    # M2 is the accumulated sum of squared deviations from the mean
    variance_val = M2 / (count - 1)
    std_val = np.sqrt(variance_val)

    print(
        "\n--- Stats Calculation Complete ---",
        f"Global Mean:   {mean_val}",
        f"Global Std:    {std_val}",
        sep="\n",
    )

    # Each statistic goes into its own .npy file inside stats_path
    mean_file = os.path.join(stats_path, "global_mean.npy")
    std_file = os.path.join(stats_path, "global_std.npy")

    for target_file, value in ((mean_file, mean_val), (std_file, std_val)):
        np.save(target_file, value)

    print(f"\nSuccessfully saved stats to: {stats_path}")