In [1]:
import pathlib
from torch.utils.data import DataLoader 
from src.data_load import load_speech_commands, SpeechCommandsDataset
from src.train_model import train_model
from src.ast_model import ASTModel

In [None]:
# !pip install kaggle --upgrade

# import os


# !kaggle competitions download -c tensorflow-speech-recognition-challenge -p data/
# !pip install py7zr --quiet
# !pip install librosa

In [3]:
# import zipfile
# from pathlib import Path
# import py7zr

# DATA_ROOT = Path("data")
# DATA_ROOT.mkdir(exist_ok=True)

# zip_path = DATA_ROOT / "tensorflow-speech-recognition-challenge.zip"
# with zipfile.ZipFile(zip_path, "r") as z:
#     z.extractall(DATA_ROOT)
# print("Files in data:", list(DATA_ROOT.iterdir()))

# for archive_name, out_subdir in [("train.7z", "train"), ("test.7z", "test"), ("sample_submission.7z", "sample_submission")]:
#     in_path  = DATA_ROOT / archive_name
#     out_path = DATA_ROOT / out_subdir
#     out_path.mkdir(exist_ok=True)
#     print(f"Extracting {in_path} - {out_path}")
#     with py7zr.SevenZipFile(in_path, mode="r") as archive:
#         archive.extractall(path=out_path)

# for d in DATA_ROOT.iterdir():
#     print(d.name, "-", list(d.glob("*"))[:5])

In [4]:
train_data = load_speech_commands("data", split="train", max_files=32, other_label="unknown")
val_data   = load_speech_commands("data", split="validation", max_files=32, other_label="unknown")

In [5]:
# test_data  = load_speech_commands("data", split="test", max_files=1)

In [6]:
all_labels = sorted({lbl for _, lbl in train_data} | {lbl for _, lbl in val_data})
label2id   = {lbl: i for i, lbl in enumerate(all_labels)}
num_labels = len(set(lbl for _, lbl in train_data) | set(lbl for _, lbl in val_data))

train_ds = SpeechCommandsDataset(train_data, label2id)
val_ds   = SpeechCommandsDataset(val_data,   label2id)

train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)
val_loader = DataLoader(
    val_ds,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    pin_memory=True
)

In [7]:
model = ASTModel(num_labels)

In [8]:
model.config

ASTConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "ASTForAudioClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "frequency_stride": 10,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_length": 128,
  "model_type": "audio-spectrogram-transformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_mel_bins": 128,
  "patch_size": 16,
  "qkv_bias": true,
  "time_stride": 10,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

In [12]:
import os
import random
import json

param_space = {
    'num_attention_heads':    [6, 12, 18],  
    'head_dim':               [32, 64, 128],  
    'num_hidden_layers':      [6, 12, 18],
    'hidden_dropout_prob':    [0.0, 0.1, 0.2, 0.3], 
    'attention_probs_dropout_prob': [0.0, 0.1, 0.2, 0.3],
    'classifier_dropout':     [0.0, 0.1, 0.2, 0.3],  
    'initializer_range':      [0.01, 0.02]
}

training_params = {
    'lr': [1e-3, 1e-4]
}

def sample_hparams():
    hp = {k: random.choice(v) for k, v in param_space.items()}
    hp['hidden_size']       = hp['num_attention_heads'] * hp['head_dim']
    hp['intermediate_size'] = hp['hidden_size'] * 4
    return hp

results = []

for trial in range(1, 2):
    hparams = sample_hparams()
    lr = random.choice(training_params['lr'])
    print(f'Model {trial}, params {hparams}, learning rate {lr}')
    run_dir = f"runs/trial_{trial}"
    os.makedirs(run_dir, exist_ok=True)
    with open(os.path.join(run_dir, "hparams.json"), "w") as fp:
        json.dump({**hparams, 'lr': lr}, fp, indent=2)

    model = ASTModel(num_labels=num_labels, **hparams)

    train_model(
        model,
        train_loader,
        val_loader,
        epochs=1,
        lr=lr,         
        device=None,
        output_dir=run_dir,
        patience = 5,
        label2id=label2id
    )

    results.append({'trial': trial, **hparams, 'lr': lr})


Model 1, params {'num_attention_heads': 12, 'head_dim': 128, 'num_hidden_layers': 6, 'hidden_dropout_prob': 0.0, 'attention_probs_dropout_prob': 0.1, 'classifier_dropout': 0.1, 'initializer_range': 0.01, 'hidden_size': 1536, 'intermediate_size': 6144}, learning rate 0.0001
Setting attribute num_attention_heads to 12
Setting attribute num_hidden_layers to 6
Setting attribute hidden_dropout_prob to 0.0
Setting attribute attention_probs_dropout_prob to 0.1
Setting attribute initializer_range to 0.01
Setting attribute hidden_size to 1536
Setting attribute intermediate_size to 6144


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/1 - Train loss: 2.2553, Train F1: 0.0185 | Val loss: 3.1728, Val Acc: 0.6562, Val F1: 0.0881


In [11]:
trained_model = train_model(model, train_loader, val_loader, epochs=2, lr=1e-4)

TypeError: train_model() missing 1 required positional argument: 'label2id'