# Get Data By Dialect
- Modified version of `GetData.ipynb` that segments the data by dialect region
- See that version for more detailed comments and code descriptions

In [8]:
# Packages
from transformers import HubertForCTC, Wav2Vec2Processor
import numpy as np
import glob
import os
from collections import defaultdict 

# Custom Helper Libraries
from helper_scripts.TenseLax import TenseLax
from helper_scripts.AudioProcessing import AudioProcessing
from helper_scripts.Constants import *
from helper_scripts.Pathing import Pathing

In [10]:
# For each TIMIT dialect region, collect the hidden states of every utterance
# segment whose phoneme is in TenseLax.getSet(), group them by phoneme, and
# save one .npy file per (dialect, phoneme) pair.
#
# Path is TIMIT/<TEST or TRAIN>/<DIALECT>/<SPEAKER ID>/<SEGMENT ID>.wav
DATASET_PATH = "../Timit-Database/TIMIT/"
for dialect in range(1, Constants.TIMIT.NUM_DIALECTS+1):
    ALL_WAVS_PATH = os.path.join(
        DATASET_PATH, "TEST", f"DR{dialect}", "*", "*.wav")

    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # makes the column order of the saved arrays reproducible between runs.
    speech_paths = sorted(glob.glob(ALL_WAVS_PATH))
    print(f"Importing {len(speech_paths)} speech samples for phoneme for dialect DR{dialect}")

    # Maps phoneme -> list of hidden-state arrays, one entry per matching segment
    all_hidden_states = defaultdict(list)

    if not speech_paths:
        # Make an empty match visible instead of silently producing no files.
        # NOTE(review): an empty result usually means DATASET_PATH or the
        # directory-name casing does not match the local TIMIT copy - confirm.
        print(f"No .wav files matched {ALL_WAVS_PATH}; skipping DR{dialect}")
        continue

    for path in speech_paths:
        # Step 1) Generate the hidden states and boundaries
        embedded_audio, num_speech_frames, sequence_length = AudioProcessing.process_audio(
            wav_path=path,
            embedding_model=Constants.EXPERIMENTATION.EMBEDDING_MODEL,
            inference_model=Constants.EXPERIMENTATION.INFERENCE_MODEL,
            sampling_rate=16000
        )

        scaled_segmentation = AudioProcessing.get_sequence_boundary(
            TIMIT_wav_path=path,
            num_speech_frames=num_speech_frames,
            num_speech_vec=sequence_length
        )

        # Step 2) Select boundaries for matching phonemes
        filtered_segmentation = AudioProcessing.filter_segmentation(
            combined_df=scaled_segmentation,
            desired_phonemes=TenseLax.getSet()
        )

        # Step 3) Place Hidden State into output matrix
        # Each row unpacks as (index, start vector idx, end vector idx, phoneme).
        for row in filtered_segmentation.itertuples():
            _, seq_start_vec_idx, seq_end_vec_idx, phoneme = row

            # Step 3a) Get the Hidden States per encoder for the entire speech segment
            utterance_hidden_states = AudioProcessing.get_hidden_states(
                input_embedding=embedded_audio,
                inference_model=Constants.EXPERIMENTATION.INFERENCE_MODEL,
                start_idx=seq_start_vec_idx,
                end_idx=seq_end_vec_idx
            )

            # Step 3b) Append hidden States to the existing hidden states for this row
            all_hidden_states[phoneme].append(
                utterance_hidden_states
            )

    # Step 4) Merge all segments of each phoneme and write them to disk
    for phoneme, hidden_state in all_hidden_states.items():
        # Segments are joined along axis 1
        # (presumably the per-segment vector axis - TODO confirm).
        combined_per_segment = np.concatenate(hidden_state, axis=1)

        # Computed outside the f-string: a replacement field that spans several
        # lines in a single-quoted f-string is a SyntaxError before Python 3.12.
        num_vectors = combined_per_segment.shape[1]
        Pathing.save_file_np(
            save_dir=os.path.join(
                Constants.PATHING.hidden_state_save_path, f"DR{dialect}"),
            save_file_name=f"HS_{phoneme}_{num_vectors}.npy",
            to_save=combined_per_segment
        )

Importing 0 speech samples for phoneme for dialect DR7
Importing 0 speech samples for phoneme for dialect DR8
