In [33]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
import pandas as pd
import os

def create_dataframe(input_folder):
  data = []

  for folder_name in os.listdir(input_folder):
    folder_path = os.path.join(input_folder, folder_name)

    if os.path.isdir(folder_path):

      for file_name in os.listdir(folder_path):
        if file_name.endswith('.wav') and '16khz.wav' not in file_name:
          source_path = os.path.join(folder_path, file_name)

          category = "Unknown"
          if '_C1_' in folder_name:
            category = 'C1'
          if '_C2_' in folder_name:
            category = 'C2'
          if '_C3_' in folder_name:
            category = 'C3'
          if '_C4_' in folder_name:
            category = 'C4'
          if '_C5_' in folder_name:
            category = 'C5'
          if '_C6_' in folder_name:
            category = 'C6'

          if file_name.endswith('.wav'): 
            for valence_pattern in ['_Valence_A_Aligned', '_Valence_V_Aligned', '_Valence_AV_Aligned']:
              if valence_pattern in file_name:
                valence_files.append(file_name)
                break  


          data.append({
              'Culture ID': category,  
              'Audio Filename': file_name,
              'Source Path': source_path  
          })

  df = pd.DataFrame(data)
  return df

input_folder = 'SEWAv02'
df = create_dataframe(input_folder)
print(df)


    Culture ID                     Audio Filename  \
0           C3  SSD_C3_S092_VC1_003651_004631.wav   
1           C3  SVH_C3_S085_VC1_000901_001561.wav   
2           C2  SVL_C2_S062_VC1_000489_001586.wav   
3           C1  SSD_C1_S001_VC1_004201_005201.wav   
4           C2  SVL_C2_S048_VC1_000837_001711.wav   
..         ...                                ...   
533         C2  SAH_C2_S045_VC1_003626_005086.wav   
534         C1  SVH_C1_S194_VC1_005201_005901.wav   
535         C3  SSD_C3_S064_VC1_002051_002701.wav   
536         C1  SVH_C1_S192_VC1_006801_007351.wav   
537         C4  SVH_C4_S119_VC1_005251_005850.wav   

                                           Source Path  
0    SEWAv02/SSD_C3_S092_P183_VC1_003651_004631/SSD...  
1    SEWAv02/SVH_C3_S085_P170_VC1_000901_001561/SVH...  
2    SEWAv02/SVL_C2_S062_P123_VC1_000489_001586/SVL...  
3    SEWAv02/SSD_C1_S001_P001_VC1_004201_005201/SSD...  
4    SEWAv02/SVL_C2_S048_P096_VC1_000837_001711/SVL...  
..                   

In [35]:
df_C1 = df[df['Culture ID']=='C1']
df_C2 = df[df['Culture ID']=='C2']
df_C3 = df[df['Culture ID']=='C3']
df_C4 = df[df['Culture ID']=='C4']
df_C5 = df[df['Culture ID']=='C5']
df_C6 = df[df['Culture ID']=='C6']

In [36]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from transformers import Wav2Vec2Processor, HubertModel
import torch
from sklearn.decomposition import PCA

In [37]:
# Load processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
def extract_features_from_folders(input_folder, output_folder='HuBERT_extracted_features'):
    os.makedirs(output_folder, exist_ok=True) 
    n_components = 128
    pca = PCA(n_components=n_components)

    for folder_name in os.listdir(input_folder):
        folder_path = os.path.join(input_folder, folder_name)
        
        if os.path.isdir(folder_path):
            for file_name in os.listdir(folder_path):
                if file_name.endswith('.wav'):
                    audio_path = os.path.join(folder_path, file_name)
                    try:
                        audio, sr = librosa.load(audio_path, sr=16000)
                        inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

                        with torch.no_grad():
                            features = model(inputs.input_values).last_hidden_state.squeeze(0).numpy()  # Remove batch dimension
                        
                        # Apply PCA to reduce feature dimensions
                        reduced_features = pca.fit_transform(features.reshape(features.shape[0], -1))
                        
                        features_tensor = torch.tensor(reduced_features, dtype=torch.float32)
                        
                        tensor_file_path = os.path.join(output_folder, f"{folder_name}.pt")
                        torch.save(features_tensor, tensor_file_path)
                        print(f"Saved features for {folder_name} to {tensor_file_path}")

                    except Exception as e:
                        print(f"Error processing {audio_path}: {e}")
                    break  


input_folder = 'SEWAv02'  
extract_features_from_folders(input_folder)

Saved features for SSD_C3_S092_P183_VC1_003651_004631 to extracted_features/SSD_C3_S092_P183_VC1_003651_004631.pt
Saved features for SVH_C3_S085_P170_VC1_000901_001561 to extracted_features/SVH_C3_S085_P170_VC1_000901_001561.pt
Saved features for SVL_C2_S062_P123_VC1_000489_001586 to extracted_features/SVL_C2_S062_P123_VC1_000489_001586.pt
Saved features for SSD_C1_S001_P001_VC1_004201_005201 to extracted_features/SSD_C1_S001_P001_VC1_004201_005201.pt
Saved features for SVL_C2_S048_P096_VC1_000837_001711 to extracted_features/SVL_C2_S048_P096_VC1_000837_001711.pt
Saved features for SSD_C4_S111_P222_VC1_000901_001550 to extracted_features/SSD_C4_S111_P222_VC1_000901_001550.pt
Saved features for SSD_C5_S143_P285_VC1_002401_002651 to extracted_features/SSD_C5_S143_P285_VC1_002401_002651.pt
Saved features for SSD_C3_S074_P148_VC1_001201_001631 to extracted_features/SSD_C3_S074_P148_VC1_001201_001631.pt
Saved features for SSD_C6_S190_P380_VC1_001527_006847 to extracted_features/SSD_C6_S190_