## Hidden States 

In [8]:
import torch
# from distilAlhubert.src.upstream.alhubert.expert import UpstreamExpert
# when using in s3prl, you can use 
from s3prl.s3prl.upstream.alhubert.expert import UpstreamExpert
model_ckpt_path = "small.ckpt"
model = UpstreamExpert(model_ckpt_path)
data = [torch.randn(10000) for _ in range(2)] # 16KHz
states = model(data)
print(states["last_hidden_state"].shape) # torch.Tensor: hidden state of the last layer
print(len(states["hidden_states"])) # list[torch.Tensor] hidden states of each layer

# please note that if layer_norm_first=False (default), "hidden_states" will be the outputs of transformer layer 0,1,...11
# layer_norm_first=True (for HuBERT Large teachers), "hidden_states" will be the outputs of the CNN feature estractor and transformer layer 0,1,...10.
# in that case, the output of transformer layer 11 is in states["last_hidden_state"].
# This is because that the feature after layer norm is better for distillation.

  WeightNorm.apply(module, name, dim)


torch.Size([2, 31, 768])
12


In [10]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
print(torch.cuda.is_available())

if torch.cuda.is_available():
    print("CUDA is available!")
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("CUDA Version (PyTorch compiled):", torch.version.cuda)
    print("CUDA Version (runtime):", torch.cuda.get_device_capability(0))
else:
    print("CUDA not available.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device : {device}")


2.7.1+cu126
12.6
90701
True
CUDA is available!
GPU Name: NVIDIA GeForce RTX 4060
CUDA Version (PyTorch compiled): 12.6
CUDA Version (runtime): (8, 9)
Device : cuda


## Dataset

In [24]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio  # For loading audio
import os
import librosa

class AudioDataset(Dataset):
    def __init__(self, csv_file, split="train"):
        self.df = pd.read_csv(csv_file)
        self.df = self.df[self.df['split'] == split].reset_index(drop=True)
        self.sr = 16000

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = row['path']
        waveform, _ = librosa.load(path, sr=self.sr)
        waveform = torch.tensor(waveform, dtype=torch.float)
        label = torch.tensor(int(row['category']), dtype=torch.float)
        return row['name'], row['path'], waveform, label

def collate_fn(batch):
    names, paths, waveforms, labels = zip(*batch)
    # pad to longest in batch
    max_len = max([x.shape[0] for x in waveforms])
    padded_waveforms = [torch.nn.functional.pad(x, (0, max_len - x.shape[0])) for x in waveforms]
    return names, paths, torch.stack(padded_waveforms), torch.tensor(labels)

## Model

In [25]:
import torch.nn as nn

class AudioClassifier(nn.Module):
    def __init__(self, upstream_model, hidden_dim=768):
        super().__init__()
        self.upstream = upstream_model
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)  # binary classification
        )

    def forward(self, wavs):
        with torch.no_grad():
            states = self.upstream(wavs)
            features = states["last_hidden_state"]  # (B, T, 768)

        x = features.mean(dim=1)  # (B, 768), mean pooling across time
        out = self.classifier(x)  # (B, 1)
        return out.squeeze(dim=1)  # (B,)


## Training Loop

In [32]:
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for names, paths, wavs, labels in dataloader:
        wavs, labels = wavs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(wavs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Accuracy
        preds = (torch.sigmoid(outputs) > 0.5).int()
        correct += (preds == labels.int()).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy


## Evaluation + Save Predictions to CSV

In [33]:
def evaluate(model, dataloader, device, criterion=None, output_csv=None):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    all_rows = []

    with torch.no_grad():
        for names, paths, wavs, labels in dataloader:
            wavs, labels = wavs.to(device), labels.to(device)
            outputs = model(wavs)

            if criterion:
                loss = criterion(outputs, labels)
                total_loss += loss.item()

            preds = (torch.sigmoid(outputs) > 0.5).int()
            correct += (preds == labels.int()).sum().item()
            total += labels.size(0)

            if output_csv:
                preds_cpu = preds.cpu().tolist()
                for name, path, cat, pred in zip(names, paths, labels.cpu(), preds_cpu):
                    all_rows.append({
                        "name": name,
                        "path": path,
                        "category": int(cat),
                        "predicted_score": int(pred)
                    })

    if output_csv:
        pd.DataFrame(all_rows).to_csv(output_csv, index=False)

    avg_loss = total_loss / len(dataloader) if criterion else None
    accuracy = correct / total
    return avg_loss, accuracy


## Full training Script

In [36]:
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

upstream = UpstreamExpert("small.ckpt")
upstream.to(device)
for p in upstream.parameters():
    p.requires_grad = False

model = AudioClassifier(upstream).to(device)

csv_path = r"C:\Users\YIDAN\Desktop\projects\dysarthria\dataset_youtube_splits_binary.csv"

train_ds = AudioDataset(csv_path, split="train")
val_ds = AudioDataset(csv_path, split="valid")
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn=collate_fn, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size=2, shuffle=False, collate_fn=collate_fn, pin_memory=True)

optimizer = torch.optim.Adam(model.classifier.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()

best_val_loss = float("inf")
best_model_path = "best_model.pth"
training_stats = []

for epoch in range(25):
    train_loss, train_acc = train(model, train_dl, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_dl, device, criterion=criterion)

    print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | "
          f"Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    training_stats.append({
        "epoch": epoch,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "train_acc": train_acc,
        "val_acc": val_acc,
    })

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"✅ New best model saved at epoch {epoch}")

df_stats = pd.DataFrame(training_stats)
df_stats.to_csv("training_log.csv", index=False)
print("✅ Training stats saved to training_log.csv")

# Final evaluation on test set
test_ds = AudioDataset(csv_path, split="test")
test_dl = DataLoader(test_ds, batch_size=2, shuffle=False, collate_fn=collate_fn)

# Load best model
model.load_state_dict(torch.load(best_model_path))
model.to(device)

# Evaluate on test set
test_loss, test_acc = evaluate(model, test_dl, device, criterion=criterion, output_csv="test_predictions.csv")

print("✅ Test predictions saved to test_predictions.csv")
print(f"🧪 Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.4f}")




  WeightNorm.apply(module, name, dim)


Epoch 0 | Train Loss: 0.6704 | Val Loss: 0.6632 | Train Acc: 0.6838 | Val Acc: 0.6154
✅ New best model saved at epoch 0
Epoch 1 | Train Loss: 0.6025 | Val Loss: 0.6318 | Train Acc: 0.8376 | Val Acc: 0.6667
✅ New best model saved at epoch 1
Epoch 2 | Train Loss: 0.5353 | Val Loss: 0.5789 | Train Acc: 0.8376 | Val Acc: 0.8462
✅ New best model saved at epoch 2
Epoch 3 | Train Loss: 0.4591 | Val Loss: 0.5549 | Train Acc: 0.8504 | Val Acc: 0.8462
✅ New best model saved at epoch 3
Epoch 4 | Train Loss: 0.4012 | Val Loss: 0.5309 | Train Acc: 0.8761 | Val Acc: 0.8205
✅ New best model saved at epoch 4
Epoch 5 | Train Loss: 0.3551 | Val Loss: 0.5038 | Train Acc: 0.8974 | Val Acc: 0.8718
✅ New best model saved at epoch 5
Epoch 6 | Train Loss: 0.3143 | Val Loss: 0.5033 | Train Acc: 0.8803 | Val Acc: 0.8718
✅ New best model saved at epoch 6
Epoch 7 | Train Loss: 0.2854 | Val Loss: 0.4803 | Train Acc: 0.9103 | Val Acc: 0.8974
✅ New best model saved at epoch 7
Epoch 8 | Train Loss: 0.2678 | Val Loss: