In [None]:
import os
import csv
import noisereduce as nr
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import scipy.io.wavfile as wav
import librosa
import pandas as pd
from transformers import Wav2Vec2Processor, TFWav2Vec2Model

# Identifiers of the pretrained artefacts fetched when this cell runs.
_W2V2_CHECKPOINT = "facebook/wav2vec2-base-960h"
_SPICE_CLASSIFIER_URL = 'https://tfhub.dev/google/euphonia_spice/classification/1'

# Sample rate (Hz) that audio is resampled to before it is embedded.
TARGET_SR = 16000

# The wav2vec2 processor and encoder are loaded from the same checkpoint;
# the classifier is wrapped as a Keras layer from TF-Hub.
processor = Wav2Vec2Processor.from_pretrained(_W2V2_CHECKPOINT)
hf_w2v2_model = TFWav2Vec2Model.from_pretrained(_W2V2_CHECKPOINT)
spice_w2v2_cls_model = hub.KerasLayer(_SPICE_CLASSIFIER_URL)

# --- Helper functions ---
def wavread(filename):
    """Read a WAV file and return ``(samples, sample_rate)``.

    Samples are returned as ``float32`` scaled to roughly [-1.0, 1.0),
    whatever the on-disk sample format is. The array keeps the shape that
    ``scipy.io.wavfile.read`` produced.

    Args:
        filename: Path (or file-like object) accepted by
            ``scipy.io.wavfile.read``.

    Returns:
        A tuple ``(data, samplerate)`` where ``data`` is a ``float32``
        NumPy array and ``samplerate`` is the rate stored in the file.
    """
    samplerate, wave_data = wav.read(filename)
    raw = np.asarray(wave_data)

    if raw.dtype == np.uint8:
        # 8-bit PCM is unsigned with its zero level at 128.
        data = (raw.astype(np.float32) - 128.0) / 128.0
    elif np.issubdtype(raw.dtype, np.integer):
        # Signed PCM: divide by the magnitude of the most negative value
        # (32768 for int16, 2**31 for int32) instead of always 32768.
        full_scale = float(-int(np.iinfo(raw.dtype).min))
        data = raw.astype(np.float32) / np.float32(full_scale)
    else:
        # Floating-point WAVs are already normalised; only cast the dtype.
        data = raw.astype(np.float32)
    return data, samplerate

def resample_aud(audio, sample_rate, target_sr=16000):
    """Resample ``audio`` from ``sample_rate`` to ``target_sr``.

    The ``'kaiser_best'`` resampler is tried first. If the package that
    implements it (``resampy``) is not installed, librosa raises
    ``ModuleNotFoundError``; in that case the call is retried with
    librosa's default resampler so one missing optional dependency does
    not fail every file.

    Args:
        audio: Waveform array accepted by ``librosa.resample``.
        sample_rate: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        The resampled waveform as returned by librosa.
    """
    try:
        return librosa.core.resample(
            audio, orig_sr=sample_rate, target_sr=target_sr, res_type='kaiser_best'
        )
    except ModuleNotFoundError as err:
        import warnings

        # Make the change of resampling filter visible rather than silent.
        warnings.warn(
            f"'kaiser_best' resampling unavailable ({err.name or 'resampy'} not installed); "
            "falling back to librosa's default resampler.",
            RuntimeWarning,
            stacklevel=2,
        )
        return librosa.core.resample(audio, orig_sr=sample_rate, target_sr=target_sr)

def read_wav_resample(filename):
    """Load a WAV file as a mono ``float32`` waveform at ``TARGET_SR``.

    Args:
        filename: Path of the WAV file, passed on to ``wavread``.

    Returns:
        A 1-D ``float32`` NumPy array sampled at ``TARGET_SR``.
    """
    audio, sample_rate = wavread(filename)

    if audio.ndim > 1:
        # Multi-channel files are read as (n_samples, n_channels). Average the
        # channels before resampling: librosa resamples along the last axis,
        # which would be the channel axis here, and the embedding step
        # expects a single-channel signal.
        audio = audio.mean(axis=1)

    if sample_rate != TARGET_SR:
        audio = resample_aud(audio, sample_rate, target_sr=TARGET_SR)

    # Resampling or averaging may have changed the dtype.
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)
    return audio

def samples_to_embedding(audio, processor, model, sample_rate=16000):
    """Run a waveform through the processor and model; return the last hidden state.

    Args:
        audio: Waveform as a TensorFlow tensor or anything
            ``tf.convert_to_tensor`` accepts. Size-1 dimensions are squeezed
            away when the rank is greater than one.
        processor: Callable feature extractor; invoked with the NumPy
            waveform, ``sampling_rate`` and ``return_tensors='tf'``.
        model: Callable model whose output exposes ``last_hidden_state``.
        sample_rate: Sampling rate in Hz reported to ``processor``.

    Returns:
        ``last_hidden_state`` of the model output converted to a NumPy array.
    """
    waveform = audio if tf.is_tensor(audio) else tf.convert_to_tensor(audio)
    if waveform.shape.rank > 1:
        waveform = tf.squeeze(waveform)

    encoded = processor(waveform.numpy(), sampling_rate=sample_rate, return_tensors='tf')
    outputs = model(encoded.input_values)
    return outputs.last_hidden_state.numpy()

def get_prediction(filepath):
    """Return the index of the highest-scoring class for one WAV file.

    The file is loaded and resampled, embedded with the wav2vec2 model,
    and the embedding is passed to the classifier layer.

    Args:
        filepath: Path of the WAV file to classify.

    Returns:
        The arg-max index of the classifier output as a Python ``int``.
    """
    waveform = read_wav_resample(filepath)
    # Noise reduction (nr.reduce_noise) is currently disabled: the resampled
    # audio is embedded as-is.
    embedding = samples_to_embedding(waveform, processor, hf_w2v2_model)
    # First element of the classifier output; presumably per-class scores for
    # classes 0-4 — TODO confirm against the model card.
    scores = spice_w2v2_cls_model(embedding)[0].numpy()
    return int(np.argmax(scores))

# --- Run inference over all files ---
# The input CSV must provide the columns 'name', 'path' and 'category'.
INPUT_CSV = "C:/Users/YIDAN/Desktop/projects/dysarthria/dataset_youtube.csv"

# NOTE(review): the file name says "noiseReduce", but the noise-reduction call
# in get_prediction is commented out — confirm the name matches the run.
OUTPUT_CSV = "./spice_results_youtube_noiseReduce.csv"

# Fixed column order so the CSV has a header even if no file succeeds.
RESULT_COLUMNS = ["name", "path", "category", "predicted_score"]

df = pd.read_csv(INPUT_CSV)
results = []
failed = 0

for _, row in df.iterrows():
    name = row['name']
    path = row['path']
    category = row['category']

    # Best effort: one unreadable file must not abort the whole run, so the
    # error is reported and counted, and the loop moves on.
    try:
        pred = get_prediction(path)
    except Exception as e:
        failed += 1
        print(f"[!] Error processing {name} at {path}: {e}")
        continue

    results.append({
        "name": name,
        "path": path,
        "category": category,
        "predicted_score": pred
    })

# === Save results ===
out_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
out_df.to_csv(OUTPUT_CSV, index=False)
print(f"\n✅ Done! Results saved to: {OUTPUT_CSV}")
# Make failures visible: the line above is printed even if every file failed.
print(f"Processed {len(results)} of {len(df)} files; {failed} failed.")


TFWav2Vec2Model has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2Model: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing TFWav2Vec2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFWav2Vec2Model were not initialized from the PyTorch model and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

[!] Error processing audio 16_speaker 1 sev_segment 1 chunk 1 at E:\labelled_speech\YouTube Audio\dysarthric speech\audio 16\speaker 1 sev\segment 1 chunk 1.wav: No module named 'resampy'

This error is lazily reported, having originally occured in
  File c:\Users\YIDAN\Desktop\projects\dysarthria-mtl-steal\venvspice\lib\site-packages\librosa\core\audio.py, line 33, in <module>

----> resampy = lazy.load("resampy")
[!] Error processing audio 16_speaker 1 sev_segment 1 chunk 2 at E:\labelled_speech\YouTube Audio\dysarthric speech\audio 16\speaker 1 sev\segment 1 chunk 2.wav: No module named 'resampy'

This error is lazily reported, having originally occured in
  File c:\Users\YIDAN\Desktop\projects\dysarthria-mtl-steal\venvspice\lib\site-packages\librosa\core\audio.py, line 33, in <module>

----> resampy = lazy.load("resampy")
[!] Error processing audio 16_speaker 1 sev_segment 1 chunk 3 at E:\labelled_speech\YouTube Audio\dysarthric speech\audio 16\speaker 1 sev\segment 1 chunk 3.wav: 