In [1]:
import os
import torch
import librosa
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchaudio import load
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class AudioDataset(Dataset):
    def __init__(self, root):
        self.root = root
        self.classes, self.class_to_idx = self._find_classes()
        self.samples = self._make_dataset()

    def _find_classes(self):
        classes = [d.name for d in os.scandir(self.root) if d.is_dir()]
        classes.sort()
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def _make_dataset(self):
        samples = []
        for target_class in self.classes:
            class_index = self.class_to_idx[target_class]
            target_dir = os.path.join(self.root, target_class)
            for root_dir, _, file_names in os.walk(target_dir):
                for file_name in file_names:
                    if file_name.endswith('.wav') or file_name.endswith('.mp3') or file_name.endswith('.ogg'):
                        file_path = os.path.join(root_dir, file_name)
                        samples.append((file_path, class_index))
        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        audio_path, class_index = self.samples[idx]
        # Load the audio file and preprocess
        waveform, _ = load(audio_path)
        waveform = self._preprocess_audio(waveform)
        return waveform, class_index

    def _preprocess_audio(self, waveform):
        waveform = waveform.numpy()[0]  # Convert tensor to numpy array
        max_len = 64600
        if waveform.shape[0] >= max_len:
            return waveform[:max_len]
        else:
            num_repeats = int(max_len / waveform.shape[0]) + 1
            padded_waveform = np.tile(waveform, (1, num_repeats))[:, :max_len][0]
            return padded_waveform


In [3]:
# Define the root directory where your data is stored
root = "./for-2seconds"  # Replace this with the path to your data folder

# Create datasets
train_dataset = AudioDataset(root=os.path.join(root, "training"))
test_dataset = AudioDataset(root=os.path.join(root, "testing"))
validation_dataset = AudioDataset(root=os.path.join(root, "validation"))

In [4]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=1, shuffle=False)

In [5]:
# Iterate through the train_loader to inspect the data with tqdm progress bar
for batch_idx, (data, target) in enumerate(tqdm(train_loader, desc="Training")):
    print("Batch:", batch_idx)
    print("Data shape:", data.shape)  # Shape of the input data tensor
    print("Target shape:", target.shape)  # Shape of the target tensor
    print("Data sample:", data[0])  # Print the first data sample in the batch
    print("Target sample:", target[0])  # Print the target label of the first sample
    break  # Break after printing the first batch to keep the output concise

Training:   0%|          | 0/13956 [00:00<?, ?it/s]

Batch: 0
Data shape: torch.Size([1, 64600])
Target shape: torch.Size([1])
Data sample: tensor([-0.0628, -0.0626, -0.0641,  ...,  0.1205,  0.1102,  0.0983])
Target sample: tensor(0)





In [6]:
#GPU device
device = 'cuda' if torch.cuda.is_available() else 'cpu'                  
print('Device: {}'.format(device))

# Load the SSL W2V model trained for LA and DF tracks
from model import Model

model = Model(None, device=device)
model = nn.DataParallel(model).to(device)
model.load_state_dict(torch.load('/mnt/c/Users/Manish/Desktop/SpeechAssign/SSL_Anti-spoofing/Best_LA_model_for_DF.pth'))

model.eval()

Device: cuda


DataParallel(
  (module): Model(
    (ssl_model): SSLModel(
      (model): Wav2Vec2Model(
        (feature_extractor): ConvFeatureExtractionModel(
          (conv_layers): ModuleList(
            (0): Sequential(
              (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (1): Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (2): Sequential(
              (0):

In [7]:
model(data.unsqueeze(2))

tensor([[ 4.7339, -3.6386]], device='cuda:0', grad_fn=<AddmmBackward>)

In [8]:
torch.cuda.empty_cache()

In [9]:
def train_epoch(train_loader, model, lr, optimizer, device):
    running_loss = 0.0
    num_total = 0.0
    num_batches = len(train_loader)
    
    model.train()

    # Set objective (Loss) functions
    weight = torch.FloatTensor([0.1, 0.9]).to(device)
    criterion = nn.CrossEntropyLoss(weight=weight)
    
    # Initialize tqdm with the length of the train_loader
    progress_bar = tqdm(train_loader, desc='Training', leave=False)
    
    for batch_x, batch_y in progress_bar:
        batch_size = batch_x.size(0)
        num_total += batch_size
        
        batch_x = batch_x.to(device)
        batch_y = batch_y.view(-1).type(torch.int64).to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        batch_out = model(batch_x)
        
        # Compute loss
        batch_loss = criterion(batch_out, batch_y)
        
        running_loss += batch_loss.item() * batch_size
        
        # Backward pass and optimization
        batch_loss.backward()
        optimizer.step()
        
        # Update progress bar
        progress_bar.set_postfix(loss=running_loss / num_batches)
    
    progress_bar.close()
    
    # Compute average loss
    running_loss /= len(train_loader.dataset)
    
    return running_loss


In [10]:
lr = 0.001

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)

# Number of epochs
num_epochs = 2

# Training loop
for epoch in range(num_epochs):
    # Run training epoch
    train_loss = train_epoch(train_loader, model, lr, optimizer, device)
    
    # Print training loss for each epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}")


Training:   0%|          | 0/13956 [00:00<?, ?it/s]

                                                   

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemmStridedBatched( handle, opa, opb, m, n, k, &alpha, a, lda, stridea, b, ldb, strideb, &beta, c, ldc, stridec, num_batches)`