In [1]:
!unzip '/content/drive/MyDrive/sps/timit_dataset.zip'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SX145.TXT  
  inflating: __MACOSX/timit_dataset/TRAIN/DR6/MSMR0/._SX145.TXT  
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SI1150.WRD  
  inflating: __MACOSX/timit_dataset/TRAIN/DR6/MSMR0/._SI1150.WRD  
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SX415.WRD  
  inflating: __MACOSX/timit_dataset/TRAIN/DR6/MSMR0/._SX415.WRD  
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SI1150.PHN  
  inflating: __MACOSX/timit_dataset/TRAIN/DR6/MSMR0/._SI1150.PHN  
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SX415.PHN  
  inflating: __MACOSX/timit_dataset/TRAIN/DR6/MSMR0/._SX415.PHN  
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SX145.WAV  
  inflating: __MACOSX/timit_dataset/TRAIN/DR6/MSMR0/._SX145.WAV  
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SA2.PHN  
  inflating: __MACOSX/timit_dataset/TRAIN/DR6/MSMR0/._SA2.PHN  
  inflating: timit_dataset/TRAIN/DR6/MSMR0/SX55.TXT  
  inflating: __MACOSX/timit_datase

### Preprocessing CSVs

In [2]:
import numpy as np
import pandas as pd
import os

# --- Paths ---
SPKR_INFO_CSV = '/content/drive/MyDrive/sps/spkr_info.csv'
TRAIN_CSV     = '/content/drive/MyDrive/sps/train_data.csv'
TEST_CSV      = '/content/drive/MyDrive/sps/test_data.csv'

spkr_df = pd.read_csv(SPKR_INFO_CSV)
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [3]:
spkr_df.drop(columns=['Unnamed: 0', 'RecDate', 'BirthDate','Ht', 'Edu'], inplace=True)

In [4]:
spkr_df.head()

Unnamed: 0,ID,Sex,DR,Use,Race,age,height
0,ABC0,M,6,TRN,WHT,25.71,180.34
1,ABW0,M,2,TST,WHT,26.9,175.26
2,ADC0,M,3,TRN,WHT,31.83,175.26
3,ADD0,M,7,TRN,WHT,27.85,187.96
4,ADG0,F,4,TST,WHT,26.84,162.56


In [5]:
spkr_df.rename(columns={
    'ID': 'SpeakerID',
    'Sex': 'Gender',
    'Race': 'Ethnicity'
}, inplace=True)

In [6]:
spkr_df.head()

Unnamed: 0,SpeakerID,Gender,DR,Use,Ethnicity,age,height
0,ABC0,M,6,TRN,WHT,25.71,180.34
1,ABW0,M,2,TST,WHT,26.9,175.26
2,ADC0,M,3,TRN,WHT,31.83,175.26
3,ADD0,M,7,TRN,WHT,27.85,187.96
4,ADG0,F,4,TST,WHT,26.84,162.56


In [7]:
spkr_df['SpeakerID'] = spkr_df['SpeakerID'].astype(str).str.upper()

# Filter out unknown/invalid Ethnicity ('???')
initial_len = len(spkr_df)
spkr_df = spkr_df[spkr_df['Ethnicity'].astype(str).str.upper() != '???']

print(f"Filtered {initial_len - len(spkr_df)} rows.")

Filtered 17 rows.


In [8]:
spkr_df['age'] = pd.to_numeric(spkr_df['age'], errors='coerce')
spkr_df['height'] = pd.to_numeric(spkr_df['height'], errors='coerce')

spkr_df.dropna(subset=['age', 'height'], inplace=True)

print(f"Rows dropped {initial_len - len(spkr_df)}")

Rows dropped 17


In [9]:
spkr_df.dropna(subset=['Gender', 'Use', 'Ethnicity', 'DR'], inplace=True)
print(f"Rows dropped {initial_len - len(spkr_df)} with missing Gender/Use/Ethnicity/DR.")

Rows dropped 17 with missing Gender/Use/Ethnicity/DR.


In [None]:
spkr_df['SpeakerID'] = spkr_df['Gender'] + spkr_df['SpeakerID']

In [10]:
spkr_df.set_index('SpeakerID', inplace=True)

In [11]:
preprocessed_spkr_df = spkr_df.copy()

In [12]:
preprocessed_spkr_df.head()

Unnamed: 0_level_0,Gender,DR,Use,Ethnicity,age,height
SpeakerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABC0,M,6,TRN,WHT,25.71,180.34
ABW0,M,2,TST,WHT,26.9,175.26
ADC0,M,3,TRN,WHT,31.83,175.26
ADD0,M,7,TRN,WHT,27.85,187.96
ADG0,F,4,TST,WHT,26.84,162.56


In [13]:
train_df.drop(columns=['path_from_data_dir_windows'], inplace=True)
test_df.drop(columns=['path_from_data_dir_windows'], inplace=True)

In [14]:
train_df.rename(columns={
    'speaker_id': 'SpeakerID',
    'Sex': 'Gender',
    'test_or_train': 'Use',
    'path_from_data_dir': 'FilePath',
    'dialect_region': 'DR'
}, inplace=True)

test_df.rename(columns={
    'speaker_id': 'SpeakerID',
    'Sex': 'Gender',
    'test_or_train': 'Use',
    'path_from_data_dir': 'FilePath',
    'dialect_region': 'DR'
}, inplace=True)

In [15]:
train_df.head()

Unnamed: 0,index,Use,DR,SpeakerID,filename,FilePath,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
0,1,TRAIN,DR4,MMDM0,SI681.WAV.wav,TRAIN/DR4/MMDM0/SI681.WAV.wav,True,True,False,False,False
1,2,TRAIN,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,False,False,False,True,False
2,3,TRAIN,DR4,MMDM0,SI1311.WRD,TRAIN/DR4/MMDM0/SI1311.WRD,False,False,True,False,False
3,4,TRAIN,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,False,False,False,True,False
4,5,TRAIN,DR4,MMDM0,SX321.WRD,TRAIN/DR4/MMDM0/SX321.WRD,False,False,True,False,False


In [16]:
test_df.head()

Unnamed: 0,index,Use,DR,SpeakerID,filename,FilePath,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
0,1,TEST,DR4,MGMM0,SX139.WAV,TEST/DR4/MGMM0/SX139.WAV,False,True,False,False,False
1,2,TEST,DR4,MGMM0,SX139.WAV.wav,TEST/DR4/MGMM0/SX139.WAV.wav,True,True,False,False,False
2,3,TEST,DR4,MGMM0,SX139.TXT,TEST/DR4/MGMM0/SX139.TXT,False,False,False,False,True
3,4,TEST,DR4,MGMM0,SI499.WRD,TEST/DR4/MGMM0/SI499.WRD,False,False,True,False,False
4,5,TEST,DR4,MGMM0,SX319.WRD,TEST/DR4/MGMM0/SX319.WRD,False,False,True,False,False


In [17]:
train_df['SpeakerID'] = train_df['SpeakerID'].astype(str).str.upper()
train_df['FilePath'] = train_df['FilePath'].astype(str).str.strip()

# Keep rows where FilePath ends with .WAV (case-insensitive)
initial_len = len(train_df)

train_df = train_df[~train_df['filename'].str.endswith('.WAV.wav')]
preprocessed_train_df = train_df[train_df['FilePath'].str.contains(r'\.WAV$', case=False, regex=True, na=False)].copy()

print(f"Filtered {initial_len - len(preprocessed_train_df)} non-.WAV entries.")
print(f"Cleaned file list ready: {len(preprocessed_train_df)} audio utterances.")

preprocessed_train_df.head()

Filtered 18480 non-.WAV entries.
Cleaned file list ready: 4620 audio utterances.


Unnamed: 0,index,Use,DR,SpeakerID,filename,FilePath,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
7,8,TRAIN,DR4,MMDM0,SI681.WAV,TRAIN/DR4/MMDM0/SI681.WAV,False,True,False,False,False
17,18,TRAIN,DR4,MMDM0,SA2.WAV,TRAIN/DR4/MMDM0/SA2.WAV,False,True,False,False,False
19,20,TRAIN,DR4,MMDM0,SX411.WAV,TRAIN/DR4/MMDM0/SX411.WAV,False,True,False,False,False
22,23,TRAIN,DR4,MMDM0,SA1.WAV,TRAIN/DR4/MMDM0/SA1.WAV,False,True,False,False,False
25,26,TRAIN,DR4,MMDM0,SX231.WAV,TRAIN/DR4/MMDM0/SX231.WAV,False,True,False,False,False


In [18]:
test_df['SpeakerID'] = test_df['SpeakerID'].astype(str).str.upper()
test_df['FilePath'] = test_df['FilePath'].astype(str).str.strip()

# Keep rows where FilePath ends with .WAV (case-insensitive)
initial_len = len(test_df)

test_df = test_df[~test_df['FilePath'].str.endswith('.WAV.wav')]
preprocessed_test_df = test_df[test_df['FilePath'].str.contains(r'\.WAV$', case=False, regex=True, na=False)].copy()

print(f"Filtered {initial_len - len(preprocessed_test_df)} non-.WAV entries.")
print(f"Cleaned file list ready: {len(preprocessed_test_df)} audio utterances.")

preprocessed_test_df.head()

Filtered 6720 non-.WAV entries.
Cleaned file list ready: 1680 audio utterances.


Unnamed: 0,index,Use,DR,SpeakerID,filename,FilePath,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
0,1,TEST,DR4,MGMM0,SX139.WAV,TEST/DR4/MGMM0/SX139.WAV,False,True,False,False,False
17,18,TEST,DR4,MGMM0,SA2.WAV,TEST/DR4/MGMM0/SA2.WAV,False,True,False,False,False
20,21,TEST,DR4,MGMM0,SX229.WAV,TEST/DR4/MGMM0/SX229.WAV,False,True,False,False,False
23,24,TEST,DR4,MGMM0,SA1.WAV,TEST/DR4/MGMM0/SA1.WAV,False,True,False,False,False
24,25,TEST,DR4,MGMM0,SX49.WAV,TEST/DR4/MGMM0/SX49.WAV,False,True,False,False,False


In [None]:
if 'preprocessed_spkr_df' not in locals() or 'preprocessed_train_df' not in locals() or 'preprocessed_test_df' not in locals():
    raise NameError("One or more preprocessed dataframes (spkr, train, test) not found.")

if preprocessed_spkr_df.index.name != 'SpeakerID':
    if 'SpeakerID' in preprocessed_spkr_df.columns:
         preprocessed_spkr_df.set_index('SpeakerID', inplace=True)
    else:
        raise ValueError("SpeakerID not found as index or column in preprocessed_spkr_df")

if 'SpeakerID' not in preprocessed_train_df.columns or 'SpeakerID' not in preprocessed_test_df.columns:
    raise ValueError("SpeakerID column not found in preprocessed train/test df.")

In [None]:
valid_speakers = set(preprocessed_spkr_df.index)
train_files_valid_spk = preprocessed_train_df[preprocessed_train_df['SpeakerID'].isin(valid_speakers)].copy()
test_files_valid_spk = preprocessed_test_df[preprocessed_test_df['SpeakerID'].isin(valid_speakers)].copy()
print(f"Utterances with valid speakers before merge: Train={len(train_files_valid_spk)}, Test={len(test_files_valid_spk)}")


In [None]:
train_df_merged = pd.merge(
    train_files_valid_spk,
    preprocessed_spkr_df, # Contains targets and 'Use' column
    left_on='SpeakerID',
    right_index=True,     # Merge on spkr_df's index
    how='left'
)

test_df_merged = pd.merge(
    test_files_valid_spk,
    preprocessed_spkr_df,
    left_on='SpeakerID',
    right_index=True,
    how='left'
)

In [None]:
# --- Final Filtering based on 'Use' flag and NaNs ---
target_cols = ['age', 'Gender', 'height', 'Ethnicity']
required_final_cols = ['FilePath', 'SpeakerID', 'Use'] + target_cols
initial_train_len = len(train_df_merged)
initial_test_len = len(test_df_merged)

train_df_merged.dropna(subset=required_final_cols, inplace=True)
test_df_merged.dropna(subset=required_final_cols, inplace=True)

In [None]:
# Separate final train/test based on the 'Use' column from speaker info
train_df_final = train_df_merged[train_df_merged['Use'].str.upper() == 'TRN'].copy()
test_df_final = test_df_merged[test_df_merged['Use'].str.upper() == 'TST'].copy()

print(f"Final Merged Train DF: {len(train_df_final)} utterances (removed {initial_train_len - len(train_df_final)} non-TRN or NaN rows).")
print(f"Final Merged Test DF : {len(test_df_final)} utterances (removed {initial_test_len - len(test_df_final)} non-TST or NaN rows).")

In [None]:
train_df_final.head()

In [None]:
test_df_final.head()

In [None]:
os.remove('/content/drive/MyDrive/sps/preprocessed_spkr_info.csv')
os.remove('/content/drive/MyDrive/sps/preprocessed_train_data.csv')
os.remove('/content/drive/MyDrive/sps/preprocessed_test_data.csv')

In [19]:
train_df_final.to_csv('/content/drive/MyDrive/sps/final_train_data_merged.csv', index=False)
test_df_final.to_csv('/content/drive/MyDrive/sps/final_test_data_merged.csv', index=False)

### Preprocessing TIMIT

In [20]:
import pandas as pd
import os

SPKR_INFO_CSV = '/content/drive/MyDrive/sps/preprocessed_spkr_info.csv'
TRAIN_CSV     = '/content/drive/MyDrive/sps/preprocessed_train_data.csv'
TEST_CSV      = '/content/drive/MyDrive/sps/preprocessed_test_data.csv'
AUDIO_ROOT_DIR = '/content/timit_dataset/'

In [21]:
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

all_filepaths = set(train_df_final['FilePath']).union(set(test_df_final['FilePath']))
print(f"Total unique file paths to keep: {len(all_filepaths)}")

6300


In [22]:
# print(all_filepaths)

In [23]:
# count = 0
# for filename in all_filepaths:
#     filepath = os.path.join(AUDIO_ROOT_DIR, filename)
#     # print(filepath)

#     if os.path.isfile(filepath) and filename not in all_filepaths:
#       try:
#           os.remove(filepath)
#           print(f"Deleted file: {filepath}")
#           count = count + 1
#       except OSError as e:
#           print(f"Error deleting file {filepath}: {e}")

# print(f"Files deleted = {count}")

In [24]:
from tqdm.notebook import tqdm

if 'AUDIO_ROOT_DIR' not in locals() or not os.path.isdir(AUDIO_ROOT_DIR):
    print(f"ERROR: AUDIO_ROOT_DIR ('{AUDIO_ROOT_DIR}') is not defined or not found.")

In [25]:
try:
    if 'FilePath' not in train_df.columns or 'FilePath' not in test_df.columns:
        raise KeyError("Column 'FilePath' not found in preprocessed dataframes.")

    valid_paths_train = set(train_df['FilePath'])
    valid_paths_test = set(test_df['FilePath'])
    valid_relative_paths = valid_paths_train.union(valid_paths_test)
    print(f"  Found {len(valid_relative_paths)} unique valid relative WAV paths to keep.")

    if not valid_relative_paths:
        raise ValueError("Set of valid paths is empty. Cannot proceed.")

except Exception as e:
    print(f"Error creating valid path set: {e}")
    valid_relative_paths = set()

print(f"\nScanning directory tree: {AUDIO_ROOT_DIR}...")
files_to_remove = []
files_to_keep_count = 0

for root, dirs, files in os.walk(AUDIO_ROOT_DIR):
    for filename in files:
        full_path = os.path.join(root, filename)
        relative_path = os.path.relpath(full_path, AUDIO_ROOT_DIR).replace(os.path.sep, '/')

        is_valid_wav = filename.upper().endswith('.WAV') and relative_path in valid_relative_paths

        if not is_valid_wav:
            files_to_remove.append(full_path)
        else:
            files_to_keep_count += 1

print(f"\nScan complete.")
print(f"  Found {files_to_keep_count} WAV files matching the preprocessed CSVs.")
print(f"  Identified {len(files_to_remove)} files to remove (non-WAV or unused WAV).")

  Found 6300 unique valid relative WAV paths to keep.

Scanning directory tree: /content/timit_dataset/...

Scan complete.
  Found 6300 WAV files matching the preprocessed CSVs.
  Identified 18927 files to remove (non-WAV or unused WAV).


In [26]:
perform_deletion = True # True to enable deletion
if files_to_remove:
    if perform_deletion:
        print("\n--- DELETING FILES ---")
        deleted_count = 0
        error_count = 0
        for file_path in tqdm(files_to_remove, desc="Deleting files"):
            try:
                os.remove(file_path)
                deleted_count += 1
            except OSError as e:
                print(f"  Error deleting {file_path}: {e}")
                error_count += 1
        print(f"--- File Deletion Finished ---")
        print(f"  Successfully deleted: {deleted_count} files.")
        print(f"  Errors occurred for: {error_count} files.")

    else:
        print("\n--- Dry Run: Files Marked for Removal ---")
        for i, f in enumerate(files_to_remove):
             if i < 20:
                 print(f"  Would delete: {f}")
             elif i == 20:
                 print("    ...")
                 break
        print(f"\nTotal files marked for removal: {len(files_to_remove)}")
        print("\nSet 'perform_deletion = True' in the script to actually delete these files.")
else:
    print("\nNo files marked for removal. Directory seems clean according to CSVs.")

print("\n--- Directory Cleaning Script Finished ---")


--- DELETING FILES ---


Deleting files:   0%|          | 0/18927 [00:00<?, ?it/s]

--- File Deletion Finished ---
  Successfully deleted: 18927 files.
  Errors occurred for: 0 files.

--- Directory Cleaning Script Finished ---


In [27]:
import shutil
shutil.make_archive('/content/drive/MyDrive/sps/preprocessed_timit_dataset', 'zip', '/content/timit_dataset')

'/content/drive/MyDrive/sps/preprocessed_timit_dataset.zip'