In [1]:
import os
import torch
import librosa
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchaudio import load
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import torch.optim as optim

In [2]:
# ! pip install wandb

In [3]:
import wandb

In [4]:
wandb.init(project="Speech Assignment 3", entity="m23csa014", name = "task 4")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mm23csa014[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
class AudioDataset(Dataset):
    def __init__(self, root):
        self.root = root
        self.classes, self.class_to_idx = self._find_classes()
        self.samples = self._make_dataset()

    def _find_classes(self):
        classes = [d.name for d in os.scandir(self.root) if d.is_dir()]
        classes.sort()
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def _make_dataset(self):
        samples = []
        for target_class in self.classes:
            class_index = self.class_to_idx[target_class]
            target_dir = os.path.join(self.root, target_class)
            for root_dir, _, file_names in os.walk(target_dir):
                for file_name in file_names:
                    if file_name.endswith('.wav') or file_name.endswith('.mp3') or file_name.endswith('.ogg'):
                        file_path = os.path.join(root_dir, file_name)
                        samples.append((file_path, class_index))
        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        audio_path, class_index = self.samples[idx]
        # Load the audio file and preprocess
        waveform, _ = load(audio_path)
        waveform = self._preprocess_audio(waveform)
        return waveform, class_index

    def _preprocess_audio(self, waveform):
        waveform = waveform.numpy()[0]  # Convert tensor to numpy array
        max_len = 64600
        if waveform.shape[0] >= max_len:
            return waveform[:max_len]
        else:
            num_repeats = int(max_len / waveform.shape[0]) + 1
            padded_waveform = np.tile(waveform, (1, num_repeats))[:, :max_len][0]
            return padded_waveform


In [6]:
# Define the root directory where your data is stored
root = "/teamspace/studios/this_studio/SpeechAssign3/for-2seconds"  # Replace this with the path to your data folder

# Create datasets
train_dataset = AudioDataset(root=os.path.join(root, "training"))
test_dataset = AudioDataset(root=os.path.join(root, "testing"))
validation_dataset = AudioDataset(root=os.path.join(root, "validation"))

In [7]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=6)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4)
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False, num_workers=6)

In [8]:
# Iterate through the train_loader to inspect the data with tqdm progress bar
for batch_idx, (data, target) in enumerate(tqdm(train_loader, desc="Training")):
    print("Batch:", batch_idx)
    print("Data shape:", data.shape)  # Shape of the input data tensor
    print("Target shape:", target.shape)  # Shape of the target tensor
    print("Data sample:", data[0])  # Print the first data sample in the batch
    print("Target sample:", target[0])  # Print the target label of the first sample
    break  # Break after printing the first batch to keep the output concise

Training:   0%|          | 0/873 [00:00<?, ?it/s]

Batch: 0
Data shape: torch.Size([16, 64600])
Target shape: torch.Size([16])
Data sample: tensor([ 0.2978,  0.3043,  0.2757,  ..., -0.1729, -0.2001, -0.2222])
Target sample: tensor(1)





In [9]:
#GPU device
device = 'cuda' if torch.cuda.is_available() else 'cpu'                  
print('Device: {}'.format(device))

# Load the SSL W2V model trained for LA and DF tracks
from model import Model

model = Model(None, device=device)
model = nn.DataParallel(model).to(device)
model.load_state_dict(torch.load('/teamspace/studios/this_studio/SpeechAssign3/Best_LA_model_for_DF.pth'))

model.eval()

Device: cuda




DataParallel(
  (module): Model(
    (ssl_model): SSLModel(
      (model): Wav2Vec2Model(
        (feature_extractor): ConvFeatureExtractionModel(
          (conv_layers): ModuleList(
            (0): Sequential(
              (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU(approximate='none')
            )
            (1-4): 4 x Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU(approximate='none')
            )
     

In [10]:
torch.cuda.empty_cache()

In [11]:
def train_epoch(train_loader, model, lr, optimizer, device):
    running_loss = 0.0
    num_total = 0.0
    num_batches = len(train_loader)
    
    model.train()

    # Set objective (Loss) functions
    weight = torch.FloatTensor([0.1, 0.9]).to(device)
    criterion = nn.CrossEntropyLoss(weight=weight)
    
    # Initialize tqdm with the length of the train_loader
    progress_bar = tqdm(train_loader, desc='Training', leave=False)
    
    for batch_x, batch_y in progress_bar:
        batch_size = batch_x.size(0)
        num_total += batch_size
        
        batch_x = batch_x.to(device)
        batch_y = batch_y.view(-1).type(torch.int64).to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        batch_out = model(batch_x)
        
        # Compute loss
        batch_loss = criterion(batch_out, batch_y)
        
        running_loss += batch_loss.item() * batch_size
        
        # Backward pass and optimization
        batch_loss.backward()
        optimizer.step()
        
        # Update progress bar
        progress_bar.set_postfix(loss=running_loss / num_batches)
    
    progress_bar.close()
    
    # Compute average loss
    running_loss /= len(train_loader.dataset)

    wandb.log({"train_loss": running_loss})
    
    return running_loss


In [12]:
def validate(val_loader, model, device):
    running_loss = 0.0
    num_total = 0.0
    num_batches = len(val_loader)
    
    model.eval()

    # Set objective (Loss) functions
    weight = torch.FloatTensor([0.1, 0.9]).to(device)
    criterion = nn.CrossEntropyLoss(weight=weight)
    
    # Initialize tqdm with the length of the val_loader
    progress_bar = tqdm(val_loader, desc='Validation', leave=False)
    
    with torch.no_grad():
        for batch_x, batch_y in progress_bar:
            batch_size = batch_x.size(0)
            num_total += batch_size
            
            batch_x = batch_x.to(device)
            batch_y = batch_y.view(-1).type(torch.int64).to(device)
            
            # Forward pass
            batch_out = model(batch_x)
            
            # Compute loss
            batch_loss = criterion(batch_out, batch_y)
            
            running_loss += batch_loss.item() * batch_size
            
            # Update progress bar
            progress_bar.set_postfix(loss=running_loss / num_batches)
    
    progress_bar.close()
    
    # Compute average loss
    running_loss /= len(val_loader.dataset)
    
    # Log validation loss to wandb
    wandb.log({"val_loss": running_loss})
    
    return running_loss

In [13]:
lr = 5e-5

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)

# Number of epochs
num_epochs = 10

# Training loop
for epoch in range(num_epochs):
    # Run training epoch
    train_loss = train_epoch(train_loader, model, lr, optimizer, device)
    
    # Run validation epoch
    val_loss = validate(validation_loader, model, device)
    
    # Print training and validation losses for each epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Training:   0%|          | 0/873 [00:00<?, ?it/s]

                                                                          

Epoch [1/10], Train Loss: 0.0421, Val Loss: 0.0059


                                                                          

Epoch [2/10], Train Loss: 0.0228, Val Loss: 0.0047


                                                                          

Epoch [3/10], Train Loss: 0.0094, Val Loss: 0.0049


                                                                          

Epoch [4/10], Train Loss: 0.0081, Val Loss: 0.0039


                                                                          

Epoch [5/10], Train Loss: 0.0093, Val Loss: 0.0030


Training:   8%|▊         | 66/873 [01:15<15:48,  1.18s/it, loss=0.0235]  wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
                                                                          

Epoch [6/10], Train Loss: 0.0068, Val Loss: 0.0026


                                                                          

Epoch [7/10], Train Loss: 0.0055, Val Loss: 0.0040


                                                                          

Epoch [8/10], Train Loss: 0.0111, Val Loss: 0.0037


                                                                          

Epoch [9/10], Train Loss: 0.0003, Val Loss: 0.0039


                                                                          

Epoch [10/10], Train Loss: 0.0054, Val Loss: 0.0063




In [14]:
# Save the final model
torch.save(model.state_dict(), "final_model.pth")

In [15]:
import os

# Assuming you have a trained model named 'model'

# Define the directory path where you want to save the model
save_dir = "/teamspace/studios/this_studio/SpeechAssign3"

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Save the model
torch.save(model.state_dict(), os.path.join(save_dir, "model.pth"))

In [17]:
import os

# Assuming you have a trained model named 'model'

# Define the directory path where you want to save the model
save_dir = "/teamspace/studios/this_studio/SpeechAssign3/SpeechAssign/SSL_Anti-spoofing"

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Save the model
torch.save(model.state_dict(), os.path.join(save_dir, "final_model.pth"))

In [16]:
from sklearn.metrics import roc_auc_score
from scipy.optimize import brentq
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
import numpy as np

# Assuming you have a trained model named 'model' and you want to evaluate it on the test dataset using the provided test_loader

# Lists to store true labels and predicted scores
true_labels = []
predicted_scores = []

# Set the model to evaluation mode
model.eval()

# Iterate through the test_loader to get true labels and predicted scores
with torch.no_grad():
    for data, target in tqdm(test_loader, desc="Testing"):
        # Assuming your model outputs probabilities or scores
        output = model(data)
        predicted_scores.append(output.item())  # Assuming output is a scalar, adjust accordingly
        true_labels.append(target.item())  # Assuming target is a scalar, adjust accordingly

# Convert lists to numpy arrays
true_labels = np.array(true_labels)
predicted_scores = np.array(predicted_scores)

# Calculate AUC
auc_score = roc_auc_score(true_labels, predicted_scores)

# Calculate EER
fpr, tpr, thresholds = roc_curve(true_labels, predicted_scores, pos_label=1)
eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
threshold = interp1d(fpr, thresholds)(eer)

print("AUC:", auc_score)
print("EER:", eer)
print("Threshold at EER:", threshold)


Testing:   0%|          | 0/68 [00:00<?, ?it/s]


RuntimeError: a Tensor with 32 elements cannot be converted to Scalar