In [1]:
!pip -q install soundfile

# %%
import os, sys, math, random, time, json, gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import torchaudio
import torchaudio.transforms as T

from sklearn.metrics import roc_curve, accuracy_score


In [2]:
SEED = 1337
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = Path('/kaggle/input/asvpoof-2019-dataset/LA/LA')

# Protocol files (chuẩn ASVspoof2019 LA)
PROTO_DIR = DATA_ROOT / 'ASVspoof2019_LA_cm_protocols'
TRAIN_PROTO = PROTO_DIR / 'ASVspoof2019.LA.cm.train.trn.txt'
DEV_PROTO   = PROTO_DIR / 'ASVspoof2019.LA.cm.dev.trl.txt'
EVAL_PROTO  = PROTO_DIR / 'ASVspoof2019.LA.cm.eval.trl.txt'

# Audio dirs
TRAIN_AUDIO_DIR = DATA_ROOT / 'ASVspoof2019_LA_train'
DEV_AUDIO_DIR   = DATA_ROOT / 'ASVspoof2019_LA_dev'
EVAL_AUDIO_DIR  = DATA_ROOT / 'ASVspoof2019_LA_eval'

# Tập tin audio thường nằm trong subfolder 'flac' theo layout chuẩn
def with_flac_dir(p: Path) -> Path:
    return p / 'flac' if (p / 'flac').exists() else p

TRAIN_AUDIO_DIR = with_flac_dir(TRAIN_AUDIO_DIR)
DEV_AUDIO_DIR   = with_flac_dir(DEV_AUDIO_DIR)
EVAL_AUDIO_DIR  = with_flac_dir(EVAL_AUDIO_DIR)

SAMPLE_RATE = 16_000
TARGET_SEC = 4.0
TARGET_SAMPLES = int(TARGET_SEC * SAMPLE_RATE)

# Feature config cho log-Mel
N_MELS = 80
N_FFT = 1024
HOP_LENGTH = 160
WIN_LENGTH = 400

# Huấn luyện
BATCH_SIZE = 32
LR = 2e-4
EPOCHS = 12  # có thể tăng 20-30 để tối ưu EER
WEIGHT_DECAY = 1e-4
GRAD_CLIP = 5.0


In [3]:
# # 2) Protocol → DataFrame
# Format chuẩn: speaker_id, utterance_id, system_id, key (bonafide/spoof)
# File audio: {split}/flac/{utterance_id}.flac

def parse_protocol(proto_path: Path, split: str, audio_root: Path):
    rows = []
    with open(proto_path, 'r') as f:
        for line in f:
            if not line.strip() or line.startswith('#'):
                continue
            parts = line.strip().split()
            # Hỗ trợ cả format 4-5 cột
            # thường là: speaker_id, utt_id, _, sys_id, key  hoặc  speaker_id, utt_id, sys_id, key
            speaker_id = parts[0]
            utt_id = parts[1]
            sys_id = parts[-2]
            key = parts[-1]
            label = 1 if key.lower() == 'spoof' else 0  # positive=spoof cho EER ROC
            audio_path = audio_root / f"{utt_id}.flac"
            rows.append({
                'split': split,
                'speaker_id': speaker_id,
                'utt_id': utt_id,
                'sys_id': sys_id,
                'key': key.lower(),
                'label': label,
                'path': str(audio_path)
            })
    return pd.DataFrame(rows)

train_df = parse_protocol(TRAIN_PROTO, 'train', TRAIN_AUDIO_DIR)
dev_df   = parse_protocol(DEV_PROTO, 'dev', DEV_AUDIO_DIR)
eval_df  = parse_protocol(EVAL_PROTO, 'eval', EVAL_AUDIO_DIR)

print(train_df.head(), '\n', dev_df.head(), '\n', eval_df.head())
print('Counts:', train_df.key.value_counts())


   split speaker_id        utt_id sys_id       key  label  \
0  train    LA_0079  LA_T_1138215      -  bonafide      0   
1  train    LA_0079  LA_T_1271820      -  bonafide      0   
2  train    LA_0079  LA_T_1272637      -  bonafide      0   
3  train    LA_0079  LA_T_1276960      -  bonafide      0   
4  train    LA_0079  LA_T_1341447      -  bonafide      0   

                                                path  
0  /kaggle/input/asvpoof-2019-dataset/LA/LA/ASVsp...  
1  /kaggle/input/asvpoof-2019-dataset/LA/LA/ASVsp...  
2  /kaggle/input/asvpoof-2019-dataset/LA/LA/ASVsp...  
3  /kaggle/input/asvpoof-2019-dataset/LA/LA/ASVsp...  
4  /kaggle/input/asvpoof-2019-dataset/LA/LA/ASVsp...   
   split speaker_id        utt_id sys_id       key  label  \
0   dev    LA_0069  LA_D_1047731      -  bonafide      0   
1   dev    LA_0069  LA_D_1105538      -  bonafide      0   
2   dev    LA_0069  LA_D_1125976      -  bonafide      0   
3   dev    LA_0069  LA_D_1293230      -  bonafide      0   
4

In [4]:
# # 3) Dataset/Dataloader
# - Đọc .flac bằng torchaudio, resample về 16k nếu cần
# - Chuẩn hoá gain, crop/pad về 4s
# - Tạo log-Mel cho nhánh SPEC
# - Augment nhẹ: gain jitter, noise rất nhỏ, SpecAugment

# %%
resampler = T.Resample(orig_freq=SAMPLE_RATE, new_freq=SAMPLE_RATE)  # no-op nếu đã 16k
mel_extractor = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH,
    f_min=20, f_max=7600, n_mels=N_MELS, center=True, power=2.0
)
amplitude_to_db = T.AmplitudeToDB(stype='power')

# SpecAugment (nhẹ)
spec_time_mask = T.TimeMasking(time_mask_param=20)
spec_freq_mask = T.FrequencyMasking(freq_mask_param=8)

def pre_emphasis(wav, coeff=0.97):
    # wav: (T,)
    y = torch.clone(wav)
    y[1:] = wav[1:] - coeff * wav[:-1]
    return y

def crop_or_pad(wav, target_len=TARGET_SAMPLES):
    L = wav.shape[-1]
    if L > target_len:
        start = random.randint(0, L - target_len)
        return wav[:, start:start+target_len]
    elif L < target_len:
        # lặp/zero-pad về đúng độ dài
        repeat = (target_len // L) + 1
        wav = wav.repeat(1, repeat)[:, :target_len]
        return wav
    else:
        return wav

class LA19Dataset(Dataset):
    def __init__(self, df: pd.DataFrame, training: bool):
        self.df = df.reset_index(drop=True)
        self.training = training

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = row['path']
        wav, sr = torchaudio.load(path)  # (C, T)
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if sr != SAMPLE_RATE:
            wav = resampler(wav)
        wav = wav / (wav.abs().max() + 1e-9)

        if self.training:
            # Gain jitter nhỏ
            gain = 10 ** (random.uniform(-1.0, 1.0) / 20)
            wav = wav * gain
        # Pre-emphasis
        wav = pre_emphasis(wav.squeeze(0)).unsqueeze(0)
        # Crop/Pad
        wav = crop_or_pad(wav, TARGET_SAMPLES)

        # Log-Mel
        with torch.no_grad():
            mel = mel_extractor(wav)  # (1, n_mels, Tm)
            mel = amplitude_to_db(mel)
            if self.training:
                mel = spec_freq_mask(spec_time_mask(mel))

        label = row['label'] if row['split'] != 'eval' else -1
        return {
            'wav': wav,             # (1, T)
            'mel': mel,             # (1, M, Tm)
            'label': label,
            'utt_id': row['utt_id'],
            'sys_id': row['sys_id']
        }

# Sampler có trọng số để cân bằng spoof/bonafide
cls_counts = train_df['label'].value_counts().to_dict()
pos_w = 1.0 / cls_counts.get(1,1)
neg_w = 1.0 / cls_counts.get(0,1)
weights = train_df['label'].map({1: pos_w, 0: neg_w}).values
sampler = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)

train_ds = LA19Dataset(train_df, training=True)
dev_ds   = LA19Dataset(dev_df,   training=False)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler,
                          num_workers=2, pin_memory=True, drop_last=True)
dev_loader   = DataLoader(dev_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=2, pin_memory=True, drop_last=False)


In [5]:
# # 4) Mô hình DRSAS‑Net 
# - Nhánh RAW: Res1D + SE
# - Nhánh SPEC: DS-Conv2D + SE
# - Fusion bằng gating attention
# - Head nhị phân: output logit (spoof score)

class SE1D(nn.Module):
    def __init__(self, c, r=8):
        super().__init__()
        self.fc1 = nn.Conv1d(c, c//r, 1)
        self.fc2 = nn.Conv1d(c//r, c, 1)
    def forward(self, x):
        w = x.mean(-1, keepdim=True)
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))
        return x * w

class ResBlock1D(nn.Module):
    def __init__(self, c, k=5, d=1):
        super().__init__()
        pad = (k//2) * d
        self.conv1 = nn.Conv1d(c, c, k, padding=pad, dilation=d, bias=False)
        self.bn1 = nn.BatchNorm1d(c)
        self.conv2 = nn.Conv1d(c, c, k, padding=pad, dilation=d, bias=False)
        self.bn2 = nn.BatchNorm1d(c)
        self.se = SE1D(c)
        self.act = nn.PReLU(c)
    def forward(self, x):
        r = x
        x = self.act(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        x = self.se(x) + r
        x = self.act(x)
        return x

class RAWBranch(nn.Module):
    def __init__(self, emb_dim=256):
        super().__init__()
        self.stem = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm1d(64),
            nn.PReLU(64),
        )
        self.layer1 = ResBlock1D(64, k=5, d=1)
        self.down1  = nn.Conv1d(64, 128, 3, stride=2, padding=1)
        self.layer2 = ResBlock1D(128, k=5, d=2)
        self.down2  = nn.Conv1d(128, 192, 3, stride=2, padding=1)
        self.layer3 = ResBlock1D(192, k=3, d=3)
        self.pool   = nn.AdaptiveAvgPool1d(1)
        self.fc     = nn.Linear(192, emb_dim)

    def forward(self, wav):  # (B, 1, T)
        x = self.stem(wav)
        x = self.layer1(x)
        x = self.down1(x)
        x = self.layer2(x)
        x = self.down2(x)
        x = self.layer3(x)
        x = self.pool(x).squeeze(-1)   # (B, C)
        e = self.fc(x)
        return e

class SE2D(nn.Module):
    def __init__(self, c, r=8):
        super().__init__()
        self.fc1 = nn.Conv2d(c, c//r, 1)
        self.fc2 = nn.Conv2d(c//r, c, 1)
    def forward(self, x):
        w = x.mean((-2,-1), keepdim=True)
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))
        return x * w

class DSConv2dBlock(nn.Module):
    def __init__(self, c_in, c_out, stride=1):
        super().__init__()
        self.dw = nn.Conv2d(c_in, c_in, 3, stride=stride, padding=1, groups=c_in, bias=False)
        self.pw = nn.Conv2d(c_in, c_out, 1, bias=False)
        self.bn = nn.BatchNorm2d(c_out)
        self.act = nn.PReLU(c_out)
        self.se = SE2D(c_out)
        self.skip = None
        if stride != 1 or c_in != c_out:
            self.skip = nn.Sequential(
                nn.Conv2d(c_in, c_out, 1, stride=stride, bias=False),
                nn.BatchNorm2d(c_out)
            )
    def forward(self, x):
        r = x if self.skip is None else self.skip(x)
        x = self.dw(x)
        x = self.pw(x)
        x = self.bn(x)
        x = self.se(x)
        x = self.act(x + r)
        return x

class SPECBranch(nn.Module):
    def __init__(self, emb_dim=256):
        super().__init__()
        self.stem = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.PReLU(32)
        )
        self.b1 = DSConv2dBlock(32, 64, stride=2)
        self.b2 = DSConv2dBlock(64, 96, stride=2)
        self.b3 = DSConv2dBlock(96, 128, stride=2)
        self.pool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(128, emb_dim)

    def forward(self, mel):  # (B, 1, M, Tm)
        x = self.stem(mel)
        x = self.b1(x)
        x = self.b2(x)
        x = self.b3(x)
        x = self.pool(x).flatten(1)
        e = self.fc(x)
        return e

class DRSASNet(nn.Module):
    def __init__(self, emb_dim=256, dropout=0.2):
        super().__init__()
        self.raw = RAWBranch(emb_dim)
        self.spec = SPECBranch(emb_dim)
        self.fuse_gate = nn.Sequential(
            nn.Linear(emb_dim*2, emb_dim),
            nn.PReLU(emb_dim),
            nn.Linear(emb_dim, 2),   # 2 logit -> softmax -> gate cho [raw, spec]
        )
        self.head = nn.Sequential(
            nn.Linear(emb_dim, emb_dim//2),
            nn.PReLU(emb_dim//2),
            nn.Dropout(dropout),
            nn.Linear(emb_dim//2, 1) # logit spoof
        )

    def forward(self, wav, mel):
        e_raw = self.raw(wav)              # (B, D)
        e_spec = self.spec(mel)            # (B, D)
        z = torch.cat([e_raw, e_spec], dim=1)
        g = F.softmax(self.fuse_gate(z), dim=1)  # (B,2)
        fused = g[:,0:1]*e_raw + g[:,1:2]*e_spec
        logit = self.head(fused).squeeze(1)
        return logit, fused, g


In [6]:
# %% [markdown]
# # 5) Vòng lặp train + EER
# - Loss: BCEWithLogitsLoss với pos_weight để cân bằng class (positive=spoof)
# - AMP + gradient clip
# - Đánh giá EER/ACC trên dev sau mỗi epoch, lưu best model

# %%
def compute_eer(y_true, scores):
    # y_true: 1=spoof (positive), 0=bonafide
    fpr, tpr, th = roc_curve(y_true, scores, pos_label=1)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fnr - fpr))
    eer = (fnr[idx] + fpr[idx]) / 2.0
    return float(eer*100), float(th[idx])

model = DRSASNet().to(DEVICE)

# pos_weight: tỷ lệ bonafide/spoof để cân bằng; ở LA train, spoof thường nhiều hơn
num_pos = train_df['label'].sum()
num_neg = len(train_df) - num_pos
pos_weight = torch.tensor([ (num_neg + 1e-6) / (num_pos + 1e-6) ], device=DEVICE)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=LR/50)

scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

best_eer = 1e9
best_path = '/kaggle/working/drsasnet_best.pt'

for epoch in range(1, EPOCHS+1):
    model.train()
    tr_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")
    for batch in pbar:
        wav = batch['wav'].to(DEVICE)      # (B, 1, T)
        mel = batch['mel'].to(DEVICE)      # (B, 1, M, Tm)
        y = batch['label'].float().to(DEVICE)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logit, _, _ = model(wav, mel)
            loss = criterion(logit, y)
        scaler.scale(loss).backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        scaler.step(optimizer)
        scaler.update()
        tr_loss += loss.item()
        pbar.set_postfix(loss=f"{loss.item():.4f}", lr=f"{scheduler.get_last_lr()[0]:.2e}")
    scheduler.step()

    # Evaluate
    model.eval()
    all_scores, all_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            wav = batch['wav'].to(DEVICE)
            mel = batch['mel'].to(DEVICE)
            logit, _, _ = model(wav, mel)
            score = torch.sigmoid(logit).detach().cpu().numpy()  # spoof probability
            all_scores.extend(score.tolist())
            all_labels.extend(batch['label'].numpy().tolist())
    eer, th = compute_eer(np.array(all_labels), np.array(all_scores))
    preds = (np.array(all_scores) >= 0.5).astype(int)
    acc = accuracy_score(all_labels, preds) * 100.0

    print(f"[Epoch {epoch}] train_loss={tr_loss/len(train_loader):.4f} | DEV: EER={eer:.3f}% @th={th:.3f} | ACC={acc:.2f}%")

    if eer < best_eer:
        best_eer = eer
        torch.save({'model': model.state_dict(),
                    'eer': best_eer,
                    'threshold_eer': th}, best_path)
        print(f"  ↳ Saved best to {best_path} (EER={best_eer:.3f}%)")


  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 1/12: 100%|██████████| 793/793 [04:05<00:00,  3.23it/s, loss=0.0430, lr=2.00e-04]


[Epoch 1] train_loss=0.1231 | DEV: EER=14.834% @th=0.096 | ACC=77.73%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=14.834%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 2/12: 100%|██████████| 793/793 [04:09<00:00,  3.18it/s, loss=0.0585, lr=1.97e-04]


[Epoch 2] train_loss=0.0849 | DEV: EER=13.328% @th=0.048 | ACC=78.78%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=13.328%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 3/12: 100%|██████████| 793/793 [04:11<00:00,  3.15it/s, loss=0.0362, lr=1.87e-04]


[Epoch 3] train_loss=0.0698 | DEV: EER=11.946% @th=0.039 | ACC=78.81%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=11.946%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 4/12: 100%|██████████| 793/793 [04:21<00:00,  3.04it/s, loss=0.0362, lr=1.71e-04]


[Epoch 4] train_loss=0.0662 | DEV: EER=7.643% @th=0.201 | ACC=89.06%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=7.643%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 5/12: 100%|██████████| 793/793 [04:29<00:00,  2.94it/s, loss=0.0426, lr=1.51e-04]


[Epoch 5] train_loss=0.0546 | DEV: EER=2.874% @th=0.938 | ACC=97.89%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=2.874%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 6/12: 100%|██████████| 793/793 [04:18<00:00,  3.07it/s, loss=0.0000, lr=1.27e-04]


[Epoch 6] train_loss=0.0397 | DEV: EER=2.507% @th=0.000 | ACC=50.63%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=2.507%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 7/12: 100%|██████████| 793/793 [04:04<00:00,  3.25it/s, loss=0.0000, lr=1.02e-04]


[Epoch 7] train_loss=0.0108 | DEV: EER=0.893% @th=1.000 | ACC=99.13%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=0.893%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 8/12: 100%|██████████| 793/793 [03:58<00:00,  3.32it/s, loss=0.0000, lr=7.66e-05]


[Epoch 8] train_loss=0.0080 | DEV: EER=0.309% @th=1.000 | ACC=99.90%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=0.309%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 9/12: 100%|██████████| 793/793 [03:55<00:00,  3.36it/s, loss=0.0000, lr=5.30e-05]


[Epoch 9] train_loss=0.0013 | DEV: EER=0.029% @th=0.966 | ACC=99.98%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=0.029%)


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 10/12: 100%|██████████| 793/793 [03:54<00:00,  3.38it/s, loss=0.0000, lr=3.27e-05]


[Epoch 10] train_loss=0.0011 | DEV: EER=0.031% @th=0.989 | ACC=99.99%


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 11/12: 100%|██████████| 793/793 [03:53<00:00,  3.40it/s, loss=0.0000, lr=1.71e-05]


[Epoch 11] train_loss=0.0004 | DEV: EER=0.084% @th=1.000 | ACC=99.98%


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Epoch 12/12: 100%|██████████| 793/793 [03:53<00:00,  3.40it/s, loss=0.0000, lr=7.34e-06]


[Epoch 12] train_loss=0.0001 | DEV: EER=0.024% @th=0.950 | ACC=99.98%
  ↳ Saved best to /kaggle/working/drsasnet_best.pt (EER=0.024%)


In [7]:
# %% [markdown]
# # 6) Đánh giá cuối cùng (dùng best checkpoint)

# %%
ckpt = torch.load('/kaggle/working/drsasnet_best.pt', map_location='cpu')
model.load_state_dict(ckpt['model'])
threshold_eer = ckpt.get('threshold_eer', 0.5)
best_eer = ckpt.get('eer', None)
print('Best EER (dev):', best_eer, 'Threshold@EER:', threshold_eer)

# Tính ACC tại ngưỡng EER
model.eval()
scores, labels = [], []
with torch.no_grad():
    for batch in dev_loader:
        wav = batch['wav'].to(DEVICE)
        mel = batch['mel'].to(DEVICE)
        logit, _, _ = model(wav, mel)
        prob = torch.sigmoid(logit).cpu().numpy()
        scores.extend(prob.tolist())
        labels.extend(batch['label'].numpy().tolist())

y_true = np.array(labels)
y_score = np.array(scores)
eer, th = compute_eer(y_true, y_score)
y_pred = (y_score >= th).astype(int)  # positive=spoof
acc = accuracy_score(y_true, y_pred)*100
print(f"DEV summary → EER={eer:.3f}% @th={th:.3f} | ACC={acc:.2f}%")


Best EER (dev): 0.02410834334561755 Threshold@EER: 0.9496035575866699
DEV summary → EER=0.004% @th=0.998 | ACC=99.99%


In [12]:
# %% [markdown]
# ## 8) Full EVAL scoring → CSV
# - Dùng DRSASNet đã train
# - Xuất spoof probability cho từng utt_id

from torch.utils.data import DataLoader
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torch, numpy as np

try:
    eval_df
except NameError:
    # Nếu bạn đã có parse_protocol ở trên
    PROTO_DIR = DATA_ROOT / 'ASVspoof2019_LA_cm_protocols'
    EVAL_PROTO  = PROTO_DIR / 'ASVspoof2019.LA.cm.eval.trl.txt'
    EVAL_AUDIO_DIR  = (DATA_ROOT / 'ASVspoof2019_LA_eval' / 'flac'
                       if (DATA_ROOT / 'ASVspoof2019_LA_eval' / 'flac').exists()
                       else DATA_ROOT / 'ASVspoof2019_LA_eval')
    eval_df  = parse_protocol(EVAL_PROTO, 'eval', EVAL_AUDIO_DIR)

# Dataloader cho eval
eval_ds = LA19Dataset(eval_df, training=False)
eval_loader = DataLoader(eval_ds, batch_size=BATCH_SIZE, shuffle=False,
                         num_workers=min(8, os.cpu_count() or 2),
                         pin_memory=True, persistent_workers=True)

# Load best checkpoint nếu cần
best_path = '/kaggle/working/drsasnet_best.pt'
if Path(best_path).exists():
    ckpt = torch.load(best_path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
model.to(DEVICE).eval()

# Chấm điểm
utt_ids, scores = [], []
with torch.no_grad():
    for batch in tqdm(eval_loader, desc='Scoring EVAL'):
        wav = batch['wav'].to(DEVICE, non_blocking=True)
        mel = batch['mel'].to(DEVICE, non_blocking=True)
        logit, _, _ = model(wav, mel)
        prob = torch.sigmoid(logit).float().cpu().numpy()
        utt_ids.extend(batch['utt_id'])
        scores.extend(prob.tolist())

eval_scores = pd.DataFrame({'utt_id': utt_ids, 'cm_score': scores})
save_path = '/kaggle/working/drsasnet_eval_scores.csv'
eval_scores.to_csv(save_path, index=False)
print(f'✔ Saved: {save_path}')
eval_scores.head()


Scoring EVAL: 100%|██████████| 2227/2227 [07:33<00:00,  4.91it/s]

✔ Saved: /kaggle/working/drsasnet_eval_scores.csv





Unnamed: 0,utt_id,cm_score
0,LA_E_2834763,0.999998
1,LA_E_8877452,1.0
2,LA_E_6828287,1.0
3,LA_E_6977360,1.0
4,LA_E_5932896,1.0
