# Introduction
This notebook restructures the files of the TIMIT dataset so that they can be used with the SpeechBrain and SpeechT5 models. The original TIMIT dataset stores each speaker's files in its own subdirectory (grouped by dialect region), a layout that does not work with the embedding and speech-generation models in notebooks one and two.

# Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset paths used in the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Inspecting the directory of one speaker (FCJF0, dialect region DR1) in the TIMIT dataset

In [None]:
# List the raw files of a single speaker directory (FCJF0 under TRAIN/DR1).
!ls /content/drive/MyDrive/timit/raw/TIMIT/TRAIN/DR1/FCJF0/

SA1.PHN  SA2.TXT     SI1027.WAV  SI1657.WRD  SX127.PHN	SX217.TXT  SX307.WAV  SX37.WRD
SA1.TXT  SA2.WAV     SI1027.WRD  SI648.PHN   SX127.TXT	SX217.WAV  SX307.WRD  SX397.PHN
SA1.WAV  SA2.WRD     SI1657.PHN  SI648.TXT   SX127.WAV	SX217.WRD  SX37.PHN   SX397.TXT
SA1.WRD  SI1027.PHN  SI1657.TXT  SI648.WAV   SX127.WRD	SX307.PHN  SX37.TXT   SX397.WAV
SA2.PHN  SI1027.TXT  SI1657.WAV  SI648.WRD   SX217.PHN	SX307.TXT  SX37.WAV   SX397.WRD


- Create a function that collects all text and audio files from each speaker subdirectory
- Place all text files for every speaker in a main directory called 'txt' (under 'txt/speaker_transcript')
- Place all audio files for every speaker in a main directory called 'wav48' (under 'wav48/speaker_audio')

In [None]:
import os
import shutil

def _list_subdirectories(parent_dir):
    """Return ``(name, path)`` pairs for the immediate subdirectories of
    ``parent_dir``, sorted by name so traversal order is deterministic."""
    subdirectories = []
    for name in sorted(os.listdir(parent_dir)):
        path = os.path.join(parent_dir, name)
        if os.path.isdir(path):
            subdirectories.append((name, path))
    return subdirectories


def process_timit_train_directory(timit_train_dir, tts_dataset_dir):
    """Consolidate per-speaker TIMIT transcripts and audio into two flat folders.

    The input is expected to be laid out as
    ``<timit_train_dir>/<region>/<speaker>/<files>``. Every file whose name
    ends in ``.txt`` (case-insensitive) is copied to
    ``<tts_dataset_dir>/txt/speaker_transcript`` and every file whose name
    ends in ``.wav`` (case-insensitive) is copied to
    ``<tts_dataset_dir>/wav48/speaker_audio``. Each copy is renamed to
    ``<speaker>_<original file name>``; the original case of the file name
    is kept. All other files are ignored.

    Because the destination name depends only on the speaker directory name
    and the file name, two regions containing a speaker directory with the
    same name would write to the same destination; regions are visited in
    sorted order, so the last region wins.

    Args:
        timit_train_dir: Path of the directory holding the region directories.
        tts_dataset_dir: Root of the output dataset; created if missing.

    Raises:
        FileNotFoundError: If ``timit_train_dir`` does not exist.
        NotADirectoryError: If ``timit_train_dir`` is not a directory.
    """
    # Read the input first: a wrong or missing input path must fail before
    # any output directory is created.
    region_dirs = _list_subdirectories(timit_train_dir)

    # Directories for consolidated text and audio files. makedirs creates the
    # missing parents (including tts_dataset_dir) and tolerates existing ones.
    txt_dir = os.path.join(tts_dataset_dir, 'txt', 'speaker_transcript')
    wav_dir = os.path.join(tts_dataset_dir, 'wav48', 'speaker_audio')
    os.makedirs(txt_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    for _region_name, region_path in region_dirs:
        for speaker_dir, speaker_path in _list_subdirectories(region_path):
            for file_name in sorted(os.listdir(speaker_path)):
                lowered_name = file_name.lower()
                if lowered_name.endswith('.txt'):
                    destination_dir = txt_dir
                elif lowered_name.endswith('.wav'):
                    destination_dir = wav_dir
                else:
                    continue

                file_path = os.path.join(speaker_path, file_name)
                # A directory that merely looks like a transcript/audio file
                # cannot be copied with copyfile; skip it instead of crashing.
                if not os.path.isfile(file_path):
                    continue

                destination_path = os.path.join(
                    destination_dir, f"{speaker_dir}_{file_name}"
                )
                shutil.copyfile(file_path, destination_path)


In [None]:
# Input: the TIMIT TRAIN directory on the mounted Drive.
timit_train_dir = '/content/drive/MyDrive/timit/raw/TIMIT/TRAIN/'
# Output: root directory that will receive the consolidated txt/ and wav48/ folders.
tts_dataset_dir = '/content/drive/MyDrive/MyTTSDataset_6'
# Copy every speaker's .txt and .wav files into the consolidated layout.
process_timit_train_directory(timit_train_dir, tts_dataset_dir)

In [None]:
# Sanity check: show the top-level entries of the output dataset root
# (os.listdir returns them in arbitrary order).
print(os.listdir(tts_dataset_dir))

['txt', 'wav48']
