# Extracting HuBERT Hidden Representations

In [2]:
# Packages
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor
import numpy as np
import glob
import os

# Custom Helper Libraries
from helper_scripts.TenseLax import TenseLax
from helper_scripts.AudioProcessing import AudioProcessing
from helper_scripts.Constants import *

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForCTC: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForCTC were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-

# Roadmap
```
let n = number of samples of audio files
let s = number of segments/phonemes to be segmented
let k = number of hidden-state layers output by the HuBERT encoder
let l[i] = number of speech vectors per input sequence
```

1) Select n=200 samples of audio files from a specific subset of dialects from `TIMIT`
2) Create the output `hidden_states`
    - s entries
    - each entry is a size k=25 array 
    - each array element is a numpy array holding `l[i]` speech vectors, where each speech vector has size 1024
3) Load in each of the n samples using `librosa`
4) _TODO: describe the remaining processing steps._

# 1) Import Audio Files
- Extract the paths of 200 audio samples to limit computation

In [3]:
# Flatten the vowel pairs returned by TenseLax.getPairs() into one flat list
# of phoneme labels. Order follows the pairs as returned, and a phoneme that
# appears in more than one pair is kept once per pair (no de-duplication).
DESIRED_PHONEMES = []
for vowel_pair in TenseLax.getPairs():
    DESIRED_PHONEMES.extend(vowel_pair)
print(DESIRED_PHONEMES)

['iy', 'ih', 'eh', 'ey', 'eh', 'ae', 'ow', 'ao', 'uw', 'uh']


In [4]:
# Path is TIMIT/<TEST or TRAIN>/<DIALECT>/<SPEAKER ID>/<SEGMENT ID>.wav
DATASET_PATH = "../Timit-Database/TIMIT/"
ALL_WAVS_PATH = os.path.join(DATASET_PATH, "*", "*", "*", "*.wav")

# glob.glob() returns matches in a filesystem-dependent order. Sorting gives
# the sampler below the same candidate list on every machine and every run,
# which is a precondition for a reproducible sample.
speech_paths = sorted(glob.glob(ALL_WAVS_PATH))
print(f"Importing {len(speech_paths)} speech samples")

# Keep only a subset of the recordings; the subset size comes from the
# project-wide experiment constants.
speech_paths = AudioProcessing.select_samples(
    speech_paths,
    num_samples=Constants.EXPERIMENTATION.NUM_SPEECH_SAMPLES
)

print(f"Succesfully randomly sampled {len(speech_paths)} speech samples")

Importing 4969 speech samples
Succesfully randomly sampled 200 speech samples


# 2) Calculate Boundaries for each Audio File

In [5]:
# Run every sampled recording through AudioProcessing.process_audio using the
# embedding and inference models configured in the experiment constants.
# NOTE(review): both results are rebound on each iteration and nothing is
# accumulated, so only the values for the last path remain after the loop.
for path in speech_paths:
    (embedded_audio, sequence_length) = AudioProcessing.process_audio(
        wav_path=path,
        sampling_rate=16000,
        inference_model=Constants.EXPERIMENTATION.INFERENCE_MODEL,
        embedding_model=Constants.EXPERIMENTATION.EMBEDDING_MODEL,
    )
    

In [6]:
# x1 = np.array([
#     [1, 2, 3],
#     [4, 5, 6]
# ])

# x2 = np.array([
#     [10, 20, 30],
#     [40, 50, 60]
# ])
# print(x1.shape)
# print(x2.shape)

# print(np.append(x1, x2))
# print(np.append(x1, x2, axis=0))
# print(np.append(x1, x2, axis=1))

# Scratch check of how transposition swaps the axes of a 2-D array:
# build the 2x3 matrix [[1, 2, 3], [4, 5, 6]] and show it next to its
# 3x2 transpose, with a blank line separating the two views.
x3 = np.arange(1, 7).reshape(2, 3)
print(x3.shape, x3, "", x3.T.shape, x3.T, sep="\n")

(2, 3)
[[1 2 3]
 [4 5 6]]

(3, 2)
[[1 4]
 [2 5]
 [3 6]]


In [7]:
import numpy as np

# Population to draw from
my_array = np.array((1, 100, 3, 4, 5))

# Draw three values with replacement (replace=True is the default for
# np.random.choice), so the same value may appear more than once.
# No seed is set, so the draw differs from run to run.
random_sample = np.random.choice(my_array, size=3)

print("Random sample:", random_sample)


Random sample: [  4   1 100]
