In [1]:
import os
import pandas as pd
import numpy as np


In [None]:
# Root folder of the ASVspoof 2019 LA audio files. The audio has already been
# extracted into numeric and image formats, so it does not need to be
# extracted again.
# Built with os.path.join instead of a backslash literal: '\d', '\A' and '\L'
# are invalid escape sequences, and a hard-coded '\' separator does not
# resolve on non-Windows systems.
base_path = os.path.join('..', 'data', 'ASVSPOOF 2019', 'LA')
protocol_dir = os.path.join(base_path, 'ASVspoof2019_LA_cm_protocols')
train_dir = os.path.join(base_path, 'ASVspoof2019_LA_train', 'flac')
dev_dir = os.path.join(base_path, 'ASVspoof2019_LA_dev', 'flac')
eval_dir = os.path.join(base_path, 'ASVspoof2019_LA_eval', 'flac')

In [3]:
def get_file_path(directory, filename):
    """Return the path of ``<filename>.flac`` inside ``directory``."""
    flac_name = '{}.flac'.format(filename)
    return os.path.join(directory, flac_name)

In [4]:
def read_dataset(protocol_path, directory):
    """Load a CM protocol file into a DataFrame, one row per protocol line.

    The file is parsed as whitespace-separated text without a header row.
    Its five columns are named speaker_id, filename, null, system_id and
    class_name. The unused 'null' column is dropped and 'system_id' is
    renamed to 'attack_id' with its values left as-is.

    Args:
        protocol_path: Path of the protocol text file to read.
        directory: Directory used to build each row's 'filepath'.

    Returns:
        DataFrame with the columns speaker_id, filename, attack_id,
        class_name and filepath.
    """
    column_names = ['speaker_id', 'filename', 'null', 'system_id', 'class_name']
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and splits on the same runs of whitespace.
    df = pd.read_csv(protocol_path,
                     sep=r'\s+',
                     header=None,
                     names=column_names)

    df['filepath'] = df['filename'].apply(lambda x: get_file_path(directory, x))

    # The third column is not used; 'attack_id' is a more descriptive name
    # than 'system_id'.
    return df.drop(columns='null').rename(columns={'system_id': 'attack_id'})


In [5]:

def label_to_int(class_name):
    """Encode a class label as an integer: 0 for 'bonafide', 1 for anything else."""
    return 0 if class_name == 'bonafide' else 1
    
def add_columns(df, subset):
    """Add 'target' and 'subset' columns to ``df`` in place and return it.

    'target' is the integer encoding of 'class_name' produced by
    ``label_to_int``; 'subset' holds the given ``subset`` value on every row.
    """
    encoded_labels = df['class_name'].apply(label_to_int)
    df['target'] = encoded_labels
    df['subset'] = subset
    return df

In [7]:
# Load each CM protocol file and tag its rows with the split they belong to.
train_df = add_columns(
    read_dataset(os.path.join(protocol_dir, 'ASVspoof2019.LA.cm.train.trn.txt'), train_dir),
    'training',
)
dev_df = add_columns(
    read_dataset(os.path.join(protocol_dir, 'ASVspoof2019.LA.cm.dev.trl.txt'), dev_dir),
    'validation',
)
eval_df = add_columns(
    read_dataset(os.path.join(protocol_dir, 'ASVspoof2019.LA.cm.eval.trl.txt'), eval_dir),
    'testing',
)


  df = pd.read_csv(protocol_path,
  df = pd.read_csv(protocol_path,
  df = pd.read_csv(protocol_path,


In [7]:
# Rows per attack_id in the training split ('-' is the value carried by bonafide rows).
train_df["attack_id"].value_counts()

attack_id
A01    3800
A02    3800
A03    3800
A04    3800
A05    3800
A06    3800
-      2580
Name: count, dtype: int64

In [8]:
# Rows per attack_id in the validation split ('-' is the value carried by bonafide rows).
dev_df["attack_id"].value_counts()

attack_id
A01    3716
A02    3716
A03    3716
A04    3716
A05    3716
A06    3716
-      2548
Name: count, dtype: int64

In [9]:
# Rows per attack_id in the testing split ('-' is the value carried by bonafide rows).
eval_df["attack_id"].value_counts()

attack_id
-      7355
A11    4914
A14    4914
A16    4914
A09    4914
A13    4914
A12    4914
A18    4914
A15    4914
A08    4914
A17    4914
A10    4914
A07    4914
A19    4914
Name: count, dtype: int64

In [8]:
def balanced_resample(df, n_bonafide, n_per_attack, random_state=42):
    """Downsample ``df`` to a fixed number of bonafide rows and rows per attack.

    Rows whose 'attack_id' is '-' are treated as bonafide; every other
    distinct 'attack_id' value is treated as one spoof attack. Sampling is
    without replacement and uses the same seed for every group, so the
    result is reproducible.

    Args:
        df: DataFrame with an 'attack_id' column.
        n_bonafide: Number of bonafide rows to keep.
        n_per_attack: Number of rows to keep for each spoof attack id.
        random_state: Seed passed to every ``DataFrame.sample`` call.
            Defaults to 42, the value that was previously hard-coded.

    Returns:
        DataFrame with the bonafide sample first, followed by the sample of
        each attack id in order of first appearance, with a fresh 0..n-1
        index.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a group has fewer
            rows than the number requested.
    """
    # Separate bonafide rows from spoof rows.
    bonafide = df[df['attack_id'] == '-'].sample(n=n_bonafide, random_state=random_state)

    spoof_ids = [aid for aid in df['attack_id'].unique() if aid != '-']

    # Collect everything in one list so a frame without any spoof rows still
    # concatenates; pd.concat([]) on an empty spoof list would raise
    # "No objects to concatenate".
    samples = [bonafide]
    samples.extend(
        df[df['attack_id'] == aid].sample(n=n_per_attack, random_state=random_state)
        for aid in spoof_ids
    )

    return pd.concat(samples).reset_index(drop=True)


In [9]:
# Class distribution of the training split before resampling.
train_df["class_name"].value_counts()

class_name
spoof       22800
bonafide     2580
Name: count, dtype: int64

In [11]:
# Downsample the training split: keep all 2580 bonafide rows and 860 rows per
# attack id (6 attack ids x 860 = 5160 spoof rows, twice the bonafide count).
train_df_balanced = balanced_resample(train_df, n_bonafide=2580, n_per_attack=860)

# Check the result
print(train_df_balanced["class_name"].value_counts())


class_name
spoof       5160
bonafide    2580
Name: count, dtype: int64


In [13]:
# Rows per attack_id in the resampled training split.
train_df_balanced["attack_id"].value_counts()

attack_id
-      2580
A01     860
A02     860
A03     860
A04     860
A05     860
A06     860
Name: count, dtype: int64

In [14]:
# Class distribution of the validation split before resampling.
dev_df["class_name"].value_counts()

class_name
spoof       22296
bonafide     2548
Name: count, dtype: int64

In [18]:
# Downsample the validation split: keep all 2548 bonafide rows and 849 rows per
# attack id (6 attack ids x 849 = 5094 spoof rows, roughly twice the bonafide count).
dev_df_balanced = balanced_resample(dev_df, n_bonafide=2548, n_per_attack=849)

# Check the result
print(dev_df_balanced["class_name"].value_counts())


class_name
spoof       5094
bonafide    2548
Name: count, dtype: int64


In [19]:
# Rows per attack_id in the resampled validation split.
dev_df_balanced["attack_id"].value_counts()

attack_id
-      2548
A01     849
A02     849
A03     849
A04     849
A05     849
A06     849
Name: count, dtype: int64

In [20]:
# Class distribution of the testing split before resampling.
eval_df["class_name"].value_counts()

class_name
spoof       63882
bonafide     7355
Name: count, dtype: int64

In [23]:
# Downsample the testing split (eval_df): keep all 7355 bonafide rows and 1132
# rows per attack id (13 attack ids x 1132 = 14716 spoof rows, roughly twice
# the bonafide count).
test_df_balanced = balanced_resample(eval_df, n_bonafide=7355, n_per_attack=1132)

# Check the result
print(test_df_balanced["class_name"].value_counts())


class_name
spoof       14716
bonafide     7355
Name: count, dtype: int64


In [24]:
# Rows per attack_id in the resampled testing split.
test_df_balanced["attack_id"].value_counts()

attack_id
-      7355
A11    1132
A14    1132
A16    1132
A09    1132
A13    1132
A12    1132
A18    1132
A15    1132
A08    1132
A17    1132
A10    1132
A07    1132
A19    1132
Name: count, dtype: int64

In [35]:
# Stack the three resampled splits row-wise under a fresh 0..n-1 index.
resampled_splits = [train_df_balanced, dev_df_balanced, test_df_balanced]
final_full_df = pd.concat(resampled_splits, ignore_index=True)
final_full_df

Unnamed: 0,speaker_id,filename,attack_id,class_name,filepath,target,subset
0,LA_0085,LA_T_3891170,-,bonafide,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,0,training
1,LA_0097,LA_T_2948101,-,bonafide,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,0,training
2,LA_0084,LA_T_8919107,-,bonafide,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,0,training
3,LA_0081,LA_T_6650735,-,bonafide,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,0,training
4,LA_0086,LA_T_5362691,-,bonafide,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,0,training
...,...,...,...,...,...,...,...
37448,LA_0013,LA_E_7178035,A19,spoof,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,1,testing
37449,LA_0013,LA_E_5285610,A19,spoof,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,1,testing
37450,LA_0007,LA_E_5912220,A19,spoof,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,1,testing
37451,LA_0015,LA_E_5036389,A19,spoof,D:\sem 7\Bismillah Skripsi\Voice Recog\Data\AS...,1,testing


In [36]:
# Row counts for every (subset, class_name) combination in the combined frame.
final_full_df[["subset", "class_name"]].value_counts()

subset      class_name
testing     spoof         14716
            bonafide       7355
training    spoof          5160
validation  spoof          5094
training    bonafide       2580
validation  bonafide       2548
Name: count, dtype: int64

In [None]:
# Create the output folder first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs("../data/processed", exist_ok=True)
# Write the combined frame without the index column.
final_full_df.to_csv("../data/processed/data_asv_spoof.csv", index=False)