In [56]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import os
import textgrid
import librosa
from numba import njit
import numpy as np
from transformers import AutoProcessor, AutoModelForCTC
processor_H = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
model_H = AutoModelForCTC.from_pretrained("facebook/hubert-large-ls960-ft")

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForCTC: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForCTC were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-

# 1. ASR model Latent Space Representation Similarity Calculation


### 1.1 Introduction HuBERT

We utilize the representations from different layers of an ASR (Automatic Speech Recognition) model to abstractly represent waveform signals, aiming to simulate the paradigm of how the brain stores auditory information. Here we use HuBERT, **HuBERT** (Hidden-Unit BERT) is a ​**self-supervised speech representation model** proposed by Meta AI (formerly Facebook AI Research). It learns hierarchical representations of speech signals through pre-training on large-scale unlabeled audio data. The core innovation lies in its ​**masked prediction task**, where the model predicts acoustically meaningful units (e.g., phonemes or subword units) for randomly masked speech segments, thereby capturing both acoustic and semantic dependencies.  

Key features of HuBERT include:  
- ​**Iterative clustering**: Pseudo-labels generated through clustering refine the learning objective during training.  
- ​**Hierarchical Transformer architecture**:  
  - ​**Lower layers** (e.g., layers 1-6) encode ​**acoustic details** (e.g., pitch, spectral patterns)  
  - ​**Middle layers** (7-12) capture ​**phoneme-level speech units**  
  - ​**Higher layers** (13-24) integrate ​**semantic and syntactic information**  

This layered structure aligns with hypothesized ​**hierarchical processing in the human auditory system**:  
- Primary auditory cortex (acoustic feature extraction) →  
- Secondary auditory regions (phoneme processing) →  
- Association cortex (semantic integration)  

By leveraging HuBERT's layer-wise representations as "abstract acoustic templates", we establish a computational framework to explore how the brain might store and process auditory information, bridging neuroscience hypotheses with machine learning interpretability.


In [16]:
# Since the ASR we are using now is implemented in Pytorch, you can easily export our model structure by print(you_model_name)
print(model_H)

HubertForCTC(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in

In [17]:
# Use "." you are able to access subcomponents inside you ASR model
# e.g. The 12th layer of transformer-layers from Encoder
print(model_H.hubert.encoder.layers[12])

HubertEncoderLayerStableLayerNorm(
  (attention): HubertAttention(
    (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
    (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
    (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
    (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (feed_forward): HubertFeedForward(
    (intermediate_dropout): Dropout(p=0.1, inplace=False)
    (intermediate_dense): Linear(in_features=1024, out_features=4096, bias=True)
    (intermediate_act_fn): GELUActivation()
    (output_dense): Linear(in_features=4096, out_features=1024, bias=True)
    (output_dropout): Dropout(p=0.1, inplace=False)
  )
  (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)


### 1.2 Loading audio data as Input

In [11]:

def get_pathset(paths,file_format=".wav"):
    return [os.path.join(dir, each_file) for dir, mid, files in os.walk(paths) for each_file in files if each_file.endswith(file_format)]
audio_dir =r".\data\speech_files"


In [34]:
# We use audio dataset from ALLSSTAR.
# A single data record contains two files, the .wav file contains the original sound record, and the .TextGrid contains the annotation record. 
audio_path=get_pathset(audio_dir,".wav")
print("Audio data: ",audio_path)
textgrid_path=get_pathset(audio_dir,".TextGrid")
print("Annotation data: ",textgrid_path)
print("\n")


# The annotation is performed at three levels, namely, sentence, word, and phoneme. At each level, the start, end, and mark of a single tag are recorded.
tg = textgrid.TextGrid.fromFile(textgrid_path[0])
print("sentence-level annotation:  ",tg[0][1])
print("word-level annotation:  ",tg[1][3])
print("phoneme-level annotation:  ",tg[2][3], " ",tg[2][4])
print("\n")


# You could read Interval() data by following method:
Interval_sample=tg[0][1]
print(Interval_sample.minTime)
print(Interval_sample.maxTime)
print(Interval_sample.mark)

Audio data:  ['.\\data\\speech_files\\ALL_016_M_CMN_ENG_HT1.wav', '.\\data\\speech_files\\ALL_021_M_CMN_ENG_HT1.wav', '.\\data\\speech_files\\ALL_032_M_CMN_ENG_HT1.wav', '.\\data\\speech_files\\ALL_035_M_CMN_ENG_HT1.wav', '.\\data\\speech_files\\ALL_037_M_CMN_ENG_HT1.wav', '.\\data\\speech_files\\ALL_043_M_CMN_ENG_HT1.wav', '.\\data\\speech_files\\ALL_055_M_ENG_ENG_HT1.wav', '.\\data\\speech_files\\ALL_066_M_ENG_ENG_HT1.wav', '.\\data\\speech_files\\ALL_070_M_ENG_ENG_HT1.wav', '.\\data\\speech_files\\ALL_131_M_ENG_ENG_HT1.wav', '.\\data\\speech_files\\ALL_133_M_ENG_ENG_HT1.wav']
Annotation data:  ['.\\data\\speech_files\\ALL_016_M_CMN_ENG_HT1.TextGrid', '.\\data\\speech_files\\ALL_021_M_CMN_ENG_HT1.TextGrid', '.\\data\\speech_files\\ALL_032_M_CMN_ENG_HT1.TextGrid', '.\\data\\speech_files\\ALL_035_M_CMN_ENG_HT1.TextGrid', '.\\data\\speech_files\\ALL_037_M_CMN_ENG_HT1.TextGrid', '.\\data\\speech_files\\ALL_043_M_CMN_ENG_HT1.TextGrid', '.\\data\\speech_files\\ALL_055_M_ENG_ENG_HT1.TextGri

In [41]:
# According to this start-time and end-time with marks, we may cut some waveform we want:


# NOTE:The HuBERT model fine-tuned on 960h of Librispeech on 16kHz sampled speech audio. When using the model make sure that your speech input is also sampled at 16Khz.
audio, sr = librosa.load(audio_path[0])
target_sr=16000
wave_res = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
tg_sentence = tg[0][1]


start_sentence = int(tg_sentence.minTime*target_sr) 
end_sentence = int(tg_sentence.maxTime*target_sr)
waveform_input=wave_res[start_sentence:end_sentence]
print(waveform_input.shape)



(24160,)


###  1.3 Get the feature output from different layers

In [None]:
# Since HuBERT model has a fixed torch structure, we need also normalize input format:
input=processor_H(wave_res[start_sentence:end_sentence], sampling_rate=16000, return_tensors="pt").input_values

# First, we could get result from CNN-layers by simply feature extractor functions
model_H.eval() 
conv_outputs = []
x = input.clone() # avoid change the original data
for conv_layer in model_H.hubert.feature_extractor.conv_layers:
    x = conv_layer(x)
    conv_outputs.append(x.unsqueeze(0).transpose(2,1))
    
for _,i in enumerate(conv_outputs):
    print(f"Feature output from {_+1}-CNN-Layer: ",i.shape)

Feature output from 1-CNN-Layer:  torch.Size([1, 4831, 512])
Feature output from 2-CNN-Layer:  torch.Size([1, 2415, 512])
Feature output from 3-CNN-Layer:  torch.Size([1, 1207, 512])
Feature output from 4-CNN-Layer:  torch.Size([1, 603, 512])
Feature output from 5-CNN-Layer:  torch.Size([1, 301, 512])
Feature output from 6-CNN-Layer:  torch.Size([1, 150, 512])
Feature output from 7-CNN-Layer:  torch.Size([1, 75, 512])


In [54]:
# 
with torch.no_grad():
    outputs = model_H(input, output_hidden_states=True)
    all_layer_outputs = outputs.hidden_states
for _,i in enumerate(all_layer_outputs):
    print(f"Feature output from {_+0}-Transformers-Layer: ",i.shape)


Feature output from 0-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 1-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 2-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 3-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 4-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 5-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 6-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 7-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 8-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 9-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 10-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 11-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 12-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 13-Transformers-Layer:  torch.Size([1, 75, 1024])
Feature output from 14-Transfo

### 1.4 Get the prediction of audio data

In [63]:
input=processor_H(wave_res[start_sentence:end_sentence], sampling_rate=16000, return_tensors="pt").input_values
model_H.eval() 
with torch.no_grad():
    output_encoder = model_H(input).logits
outind=torch.argmax(output_encoder,dim=-1).cpu().numpy()
transcription = processor_H.batch_decode(outind)[0]
print("Prediciton from HuBERT: ", transcription)
print("Actual Result: ", tg_sentence.mark)
print("Prediction Correct!")

Prediciton from HuBERT:  A BOY FELL FROM THE WINDOW
Actual Result:  A BOY FELL FROM A WINDOW
Prediction Correct!
