## Import Speech Processing Libraries
- `Librosa` for Audio Processing
- `torch` for Pytorch
- `CTC` stands for Connectionist Temporal Classification
    - Self-supervised method to map input sound sequences to an ouput (embedding) when alignment is not clear or available
- Wav2Vec2Processor preprocess audio for HuBERT model

In [96]:
# Core Libraries
import librosa
import torch
from transformers import HubertForCTC, Wav2Vec2Processor

# Utilitiies
import numpy as np
import matplotlib.pyplot as plt
import glob 
import os

# Helper Functions
from TIMIT_utils.TIMIT_Constants import TIMIT_Constants

## Import Data

In [97]:
def import_data(TIMIT_PATH: str = "./TIMIT-Database/TIMIT/TEST/DR3/*/*.wav", NUM_SAMPLES = 200) -> list[str]:
    wav_list = sorted(glob.glob("./TIMIT-Database/TIMIT/TEST/DR3/*/*.wav"))
    sampled_indexes = np.random.choice(len(wav_list), NUM_SAMPLES, replace=False)
    sampled_wavs = [wav_list[rand_idx] for rand_idx in sampled_indexes]
    return sampled_wavs

In [98]:
def get_seg_frames(seg_str, num_layers, embedding_size=1024, save_folder="./"):
    # Note: we set cols to 0 (just wrote the row ptrs) since we don't know the num cols at compile time?
    seg_frames = num_layers * [np.empty((embedding_size, 0), float)]

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_path = os.path.join(save_folder, f"HS_{seg_str}.npy")
    np.save(save_path, np.array(seg_frames))

    return seg_frames

def init_all_seg_frames(SEG_LIST, save_folder="./"):
    all_seg_frames = []
    for segment in SEG_LIST:
        all_seg_frames.append(
            get_seg_frames(segment, num_layers=25, embedding_size=1024,save_folder=save_folder)
        )
    return all_seg_frames

### Wav2Vec2Processor
- `960h` refers to 960 hours of Librispeech training
- Librispeech is an ASR corpus sampled at 60kHz

In [99]:
# Import HuBERT Model
processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h"
)

In [100]:
model = HubertForCTC.from_pretrained(
    "facebook/hubert-large-ls960-ft", 
    output_attentions=True, 
    output_hidden_states=True
)

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForCTC: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForCTC were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-

## Map TIMIT sampling boundaries to HuBERT frames 

In [101]:
segments = TIMIT_Constants.getCommonSegments()

# TODO rename the output of import_data function
wav_paths = import_data(
    TIMIT_PATH="./TIMIT-Database/TIMIT/TEST/DR3/", NUM_SAMPLES=200)

hidden_states = init_all_seg_frames(segments, save_folder="./data")

In [102]:
for seg_idx in range(len(segments)):
    speech, rate = librosa.load(wav_paths[seg_idx], sr=16000)
    # TODO what is return_tensors, where to find these parameters
    input_values = processor(speech, return_tensors="pt", padding="longest",
                             sampling_rate=rate, output_hidden_states=True, output_atentions=True).input_values

    num_speech_frames = model(input_values).hidden_states[24].shape[1]

    # Convert to `PHON` path
    phon_path = wav_paths[seg_idx].replace(".wav", ".PHN")

    beginning_frame, ending_frame, phonetic_items = [], [], []

    file = open(phon_path, "r")

    for line in file:
        beg_frame, end_frame, phon_item = line.split()

        beginning_frame.append(beg_frame)
        ending_frame.append(end_frame)
        phonetic_items.append(phon_item)

    # Convert the Starting Frame numbers in terms of HuBERT hidden state
    # TODO check the different NP types, see if float is needed
    beginning_frame = np.rint(
        (np.array(beginning_frame, np.double) / float(len(speech)))
        * num_speech_frames
    )

    ending_frame = np.rint(
        (np.array(ending_frame, np.double) / float(len(speech)))
        * num_speech_frames
    )

    num_layers = 25
    for layer_idx in range(num_layers):
        # hidden_state = model(input_values).hidden_states
        # print(len(hidden_state))
        # print(hidden_state[layer_idx].shape) # TODO document what each shape means
        hidden_state = model(
            input_values).hidden_states[layer_idx][0, :, :].detach().numpy()
        # print(hidden_state.shape)

        # Optimize
        for phoneme in phonetic_items:
            if phoneme not in segments:
                continue

            phoneme_idx_in_phonetic_items = phonetic_items.index(phoneme)
            phoneme_idx_in_segs = segments.index(phoneme)

            # Use numpy alternative that does not make copies
                # TODO Check for inclusivity
            phoneme_start_idx = int(beginning_frame[phoneme_idx_in_phonetic_items])
            phoneme_end_idx = int(ending_frame[phoneme_idx_in_phonetic_items])

            # print(f"phoneme_idx_in_segments: {phoneme_idx_in_segs}", f"layer_idx: {layer_idx}")
            hidden_states[phoneme_idx_in_segs][layer_idx] = np.append(
                hidden_states[phoneme_idx_in_segs][layer_idx],

                # TODO check the python list indexing syntax
                hidden_state[phoneme_start_idx: phoneme_end_idx, :].T,

                axis=1
            )
            # print(hidden_states[phoneme_idx_in_segs][layer_idx])
            # print(hidden_state[phoneme_start_idx: phoneme_idx_in_phonetic_items, :].T)

In [103]:
save_folder = "./data"
for seg_idx in range(len(segments)):
    save_path = os.path.join(save_folder, f"HS_{segments[seg_idx]}.npy")
    print(hidden_states[seg_idx])
    np.save(save_path, np.array(hidden_states[seg_idx]))

[array([[ 2.37007785e+00,  2.31564879e+00,  2.19898343e+00, ...,
         1.47802985e+00,  1.98719025e+00,  2.01455259e+00],
       [ 4.80834198e+01,  4.77894897e+01,  4.81552849e+01, ...,
         4.83022041e+01,  4.71989594e+01,  4.84683304e+01],
       [-1.68278637e+01, -1.58116961e+01, -1.25907555e+01, ...,
        -9.30018234e+00, -7.60963726e+00, -5.84744596e+00],
       ...,
       [ 8.06751072e-01,  3.98570871e+00,  3.32807589e+00, ...,
        -1.86912477e-01,  6.79978430e-01,  6.31428719e-01],
       [ 2.98888236e-01, -1.62151828e-02, -1.76743031e-01, ...,
        -6.19354956e-02,  5.10944307e-01,  6.38680637e-01],
       [-3.69313955e+00, -2.11538839e+00, -2.34297872e+00, ...,
         1.22525215e-01, -6.45668507e-01,  1.76968098e-01]]), array([[-1.2084868 , -1.26045561, -1.30145693, ..., -1.57143426,
        -1.51643157, -0.97429514],
       [61.9037056 , 62.97336197, 61.32131958, ..., 63.49118042,
        61.83036804, 67.77278137],
       [-9.03580856, -8.4313736 , -5.9573

## Extract Hidden layer representations from HuBERT