# Extracting HuBERT Hidden Representations

In [19]:
# Packages
from transformers import HubertForCTC, Wav2Vec2Processor
import numpy as np
import glob
import os
from collections import defaultdict 

# Custom Helper Libraries
from helper_scripts.TenseLax import TenseLax
from helper_scripts.AudioProcessing import AudioProcessing
from helper_scripts.Constants import *

# Roadmap
```
let n = number of samples of audio files
let s = number of segments/phonemes to be segmented
let k = number of encoders in LLM
let l[i] = number of speech vectors per input sequence
```

1) Select n=200 samples of audio files from a specific subset of dialects from `TIMIT`
2) Create the output `hidden_states`
    - s entries
    - each entry is a size k=25 array 
    - each array index is a numpy array representing `l[i]` speech vectors--where each speech vector is of size 1024
3) Load in each of the n samples using `librosa`
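4) For each sample, compute the phoneme boundaries, extract the hidden states of the target phonemes, and collect them per phoneme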

# 1) Import Audio Files
- Extract 200 paths to audio samples to save computation

In [20]:
# Expected layout: TIMIT/<TEST or TRAIN>/<DIALECT>/<SPEAKER ID>/<SEGMENT ID>.wav
DATASET_PATH = "../Timit-Database/TIMIT/"
# One wildcard per directory level below the dataset root, then every .wav file.
wildcard_dirs = ["*"] * 3
ALL_WAVS_PATH = os.path.join(DATASET_PATH, *wildcard_dirs, "*.wav")

# Gather every recording that matches the pattern.
speech_paths = glob.glob(ALL_WAVS_PATH)
print(f"Importing {len(speech_paths)} speech samples")

# Narrow the full listing down to the configured number of samples.
num_wanted = Constants.EXPERIMENTATION.NUM_SPEECH_SAMPLES
speech_paths = AudioProcessing.select_samples(speech_paths, num_samples=num_wanted)

print(f"Succesfully randomly sampled {len(speech_paths)} speech samples")

Importing 4969 speech samples
Succesfully randomly sampled 200 speech samples


# 2) Create the Data Structure for Saving the Hidden States

```
For Each Phoneme
    For Each Encoder 
        For Each Sequence of 1024-Dimensional Vectors
            Append Hidden State
```

Hidden States maps <`phoneme string`, a list of `Hidden State Representation`>

In [21]:
# Earlier design, kept for reference: start each phoneme with a single empty
# array of shape (NUM_ENCODERS, EMBEDDING_SIZE, 0) and grow it along the last
# axis as speech vectors arrive.
# all_hidden_states = defaultdict(lambda: [np.empty(
#     (Constants.LLM.NUM_ENCODERS,
#      Constants.LLM.EMBEDDING_SIZE,
#      0  # To append a variable size # of speech vectors determined by length of speech input
#      ),
#     dtype=float
# )])

# Current design: map each phoneme label to a plain list. The extraction loop
# appends one hidden-state result per matching segment, so a phoneme that never
# occurs simply has no key.
all_hidden_states = defaultdict(list)

# 2b) Calculate Boundaries and Extract Hidden States for each Audio File

In [22]:
# Walk over every sampled recording and file the hidden states of each selected
# segment under that segment's phoneme label in `all_hidden_states`.
for path in speech_paths:
    # Step 1) Process the audio, then derive the segment boundaries for it
    audio_outputs = AudioProcessing.process_audio(
        wav_path=path,
        embedding_model=Constants.EXPERIMENTATION.EMBEDDING_MODEL,
        inference_model=Constants.EXPERIMENTATION.INFERENCE_MODEL,
        sampling_rate=16000
    )
    embedded_audio, num_speech_frames, sequence_length = audio_outputs

    scaled_segmentation = AudioProcessing.get_sequence_boundary(
        TIMIT_wav_path=path,
        num_speech_frames=num_speech_frames,
        num_speech_vec=sequence_length
    )

    # Step 2) Keep only the rows whose phoneme is in the set from TenseLax.getSet()
    wanted_phonemes = TenseLax.getSet()
    filtered_segmentation = AudioProcessing.filter_segmentation(
        combined_df=scaled_segmentation,
        desired_phonemes=wanted_phonemes
    )

    # Step 3) Store the hidden states of every remaining segment
    for row in filtered_segmentation.itertuples():
        print(row)
        # A row unpacks as (index, start vector idx, end vector idx, phoneme)
        _, start_vec, end_vec, phoneme = row

        # Step 3a) Hidden states for this segment's span of speech vectors
        segment_hidden_states = AudioProcessing.get_hidden_states(
            input_embedding=embedded_audio,
            inference_model=Constants.EXPERIMENTATION.INFERENCE_MODEL,
            start_idx=start_vec,
            end_idx=end_vec
        )

        # Step 3b) Add them to whatever is already stored for this phoneme
        all_hidden_states[phoneme].append(segment_hidden_states)
    

Pandas(Index=2, _1=10, _2=14, _3='iy')
(25, 4, 1024)
Pandas(Index=4, _1=19, _2=27, _3='ao')
(25, 8, 1024)
Pandas(Index=10, _1=40, _2=47, _3='ae')
(25, 7, 1024)
Pandas(Index=23, _1=81, _2=88, _3='ow')
(25, 7, 1024)


# 3) Save Phoneme Hidden States

In [23]:
# Inspect the hidden states collected per phoneme (the save itself is disabled).
# NOTE(review): this cell is still a debugging probe. The `break` below exits
# after the first phoneme and the `np.save` call is commented out, so nothing is
# written to disk and `save_path` is computed but never used.
for phoneme, hidden_state in all_hidden_states.items():
    # Target file for this phoneme: <hidden_state_save_path>/HS_<phoneme>.npy
    save_path = os.path.join(Constants.PATHING.hidden_state_save_path, f"HS_{phoneme}.npy")
    # Print the shape of every entry collected for this phoneme.
    for entry in hidden_state:
        print(entry.shape)
    # Turn the list of entries into one array; in the recorded run a single
    # (25, 4, 1024) entry became (1, 25, 4, 1024).
    # NOTE(review): the extraction cell's recorded output shows entries whose
    # middle dimension differs (4, 8, 7). Presumably entries for one phoneme can
    # differ the same way, in which case they will not stack into a regular
    # array. TODO: decide between stacking and concatenating along that axis
    # before enabling the save.
    combined_per_segment = np.array(hidden_state)
    print(combined_per_segment.shape)
    break
    # np.save(save_path, combined_per_segment)

(25, 4, 1024)
(1, 25, 4, 1024)


In [24]:
# x1 = np.array([
#     [1, 2, 3],
#     [4, 5, 6]
# ])

# x2 = np.array([
#     [10, 20, 30],
#     [40, 50, 60]
# ])
# print(x1.shape)
# print(x2.shape)

# print(np.append(x1, x2))
# print(np.append(x1, x2, axis=0))
# print(np.append(x1, x2, axis=1))

# Scratch check: transposing a 2-D array swaps its axes, so (2, 3) becomes (3, 2).
x3 = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(x3.shape, x3, sep="\n")

# Blank line between the original and the transposed view.
print()
x3_transposed = x3.T
print(x3_transposed.shape, x3_transposed, sep="\n")

(2, 3)
[[1 2 3]
 [4 5 6]]

(3, 2)
[[1 4]
 [2 5]
 [3 6]]


In [25]:
import numpy as np

# Scratch check of np.random.choice on a small pool of values.
my_array = np.array([1, 100, 3, 4, 5])

# Draw three values; replace=True lets the same element be picked more than once.
random_sample = np.random.choice(my_array, size=3, replace=True)

print("Random sample:", random_sample)


Random sample: [  4   3 100]


In [26]:
import pandas as pd

# Scratch check: keep only the rows whose 'Name' value belongs to a given set.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'IsStudent': [True, False, True, False],
}
df = pd.DataFrame(data)

# Names that should survive the filter
names_to_filter = {'Alice', 'Charlie'}

# Build the boolean mask first, then select the matching rows with it
keep_row = df['Name'].isin(names_to_filter)
filtered_df = df.loc[keep_row]

print(filtered_df)


      Name  Age  IsStudent
0    Alice   25       True
2  Charlie   35       True


In [27]:
import pandas as pd

# Scratch check: pull one column out of a header-less DataFrame by position.
data = [
    ['Alice', 25, True],
    ['Bob', 30, False],
    ['Charlie', 35, True],
    ['David', 40, False],
]
df = pd.DataFrame(data)

# Positions are zero-based, so position 2 is the third column
column_position = 2
third_column = df.iloc[:, column_position]

print(third_column)


0     True
1    False
2     True
3    False
Name: 2, dtype: bool
