In [1]:
# Install necessary libraries if not already installed
# !pip install librosa tqdm matplotlib torch torchvision

In [2]:
# Import the necessary libraries
import os
import numpy as np
import librosa
import torch
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
# Define input and output folders
input_folder = 'wavFiles'
output_folder = 'spectrograms'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Parameters for spectrogram generation and segmentation
sr = 44100  # Target sample rate
n_mels = 128  # Number of Mel bands
hop_length = 512  # Hop length for time resolution of the spectrogram
snippet_duration = 5  # Duration of each snippet in seconds
snippet_stride = 2.5  # Offset between the start of each snippet in seconds

# Define a resize transform to ensure spectrograms are 128x128
resize_transform = T.Compose([
    T.ToPILImage(),
    T.Resize((128, 128)),
    T.ToTensor()
])

# Loop through each WAV file, segment, convert to spectrogram, resize, and save
for filename in tqdm(os.listdir(input_folder)):
    if filename.endswith('.wav'):
        input_path = os.path.join(input_folder, filename)
        
        # Load the audio file with librosa
        try:
            y, _ = librosa.load(input_path, sr=sr)
        except Exception as e:
            print(f"Failed to load {filename}: {e}")
            continue
        
        # Calculate the number of samples per snippet and stride
        snippet_samples = int(snippet_duration * sr)
        stride_samples = int(snippet_stride * sr)
        
        # Process overlapping 5-second snippets
        for start_sample in range(0, len(y) - snippet_samples + 1, stride_samples):
            end_sample = start_sample + snippet_samples
            snippet = y[start_sample:end_sample]

            # Generate the Mel spectrogram for the snippet
            S = librosa.feature.melspectrogram(y=snippet, sr=sr, n_mels=n_mels, hop_length=hop_length)
            S_dB = librosa.power_to_db(S, ref=np.max)  # Convert to dB scale

            # Convert the spectrogram to a Torch tensor and resize
            spectrogram_tensor = torch.tensor(S_dB, dtype=torch.float32)
            spectrogram_tensor = resize_transform(spectrogram_tensor.unsqueeze(0)).squeeze(0)  # Resize and remove extra dimension

            # Save the resized spectrogram with a unique filename
            snippet_filename = f"{filename.replace('.wav', '')}_start{start_sample}.npy"
            output_path = os.path.join(output_folder, snippet_filename)
            np.save(output_path, spectrogram_tensor.numpy())

100%|███████████████████████████████████████████| 65/65 [01:00<00:00,  1.07it/s]


In [21]:
# Custom Dataset class to load precomputed spectrograms from .npy files and assign labels
class SpectrogramDataset(Dataset):
    def __init__(self, spectrogram_folder, transform=None):
        self.spectrogram_folder = spectrogram_folder
        self.spectrogram_files = [f for f in os.listdir(spectrogram_folder) if f.endswith('.npy')]
        self.transform = transform

    def __len__(self):
        return len(self.spectrogram_files)

    def __getitem__(self, idx):
        spectrogram_path = os.path.join(self.spectrogram_folder, self.spectrogram_files[idx])
        spectrogram = np.load(spectrogram_path)
        
        filename = self.spectrogram_files[idx]
        label = 0 if filename[0].lower() == 'c' else 1  # 0 for 'C' (clean), 1 for 'D' (distorted)
        
        spectrogram = torch.tensor(spectrogram, dtype=torch.float32).unsqueeze(0)  # Add channel dimension
        label = torch.tensor(label, dtype=torch.float32)  # Float for BCELoss compatibility

        if self.transform:
            spectrogram = self.transform(spectrogram)
        
        return spectrogram, label

In [23]:
# Define the folder containing the precomputed spectrograms
spectrogram_folder = 'spectrograms'

# Instantiate the dataset
dataset = SpectrogramDataset(spectrogram_folder=spectrogram_folder)

# Check class balance
clean_count = sum(1 for _, label in dataset if label == 0)
distorted_count = len(dataset) - clean_count
print(f"Clean samples: {clean_count}, Distorted samples: {distorted_count}")

# Split the dataset into training and testing sets
train_ratio = 0.8
train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders for training and testing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

Clean samples: 736, Distorted samples: 736


In [25]:
# Define an enhanced CNN model for binary classification
class EnhancedBinaryClassifier(torch.nn.Module):
    def __init__(self, input_height=128, input_width=128):
        super(EnhancedBinaryClassifier, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = torch.nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = torch.nn.MaxPool2d(2, 2)
        
        self.flattened_size = self._get_flattened_size(input_height, input_width)
        self.fc1 = torch.nn.Linear(self.flattened_size, 1)

    def _get_flattened_size(self, height, width):
        with torch.no_grad():
            x = torch.zeros(1, 1, height, width)
            x = self.pool(torch.relu(self.conv1(x)))
            x = self.pool(torch.relu(self.conv2(x)))
            return x.numel()

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = torch.sigmoid(self.fc1(x))
        return x

# Instantiate the model, optimizer, and loss function
model = EnhancedBinaryClassifier(input_height=128, input_width=128)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # Reduced learning rate
criterion = torch.nn.BCELoss()

In [27]:
# Define a learning rate scheduler to decay the learning rate over time
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Helper function to monitor gradients
def monitor_gradients():
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"{name}: Gradient norm = {param.grad.norm().item()}")

In [31]:
num_epochs = 5
model_save_path = 'trained_model.pth'  # Path to save the trained model

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for spectrograms, labels in train_loader:
        # Forward pass
        outputs = model(spectrograms).squeeze(1)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Print epoch information
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss/len(train_loader):.4f}")
    scheduler.step()  # Adjust the learning rate
    
    # Evaluate on test set
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for spectrograms, labels in test_loader:
            outputs = model(spectrograms).squeeze(1)
            test_loss += criterion(outputs, labels).item()
            
            # Calculate accuracy
            predicted = (outputs >= 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Testing Loss: {test_loss/len(test_loader):.4f}, Accuracy: {accuracy:.4f}")

# Save the trained model to a file
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Epoch [1/5], Training Loss: 0.1016
Epoch [1/5], Testing Loss: 0.0662, Accuracy: 0.9729
Epoch [2/5], Training Loss: 0.0911
Epoch [2/5], Testing Loss: 0.0957, Accuracy: 0.9661
Epoch [3/5], Training Loss: 0.0776
Epoch [3/5], Testing Loss: 0.0361, Accuracy: 0.9898
Epoch [4/5], Training Loss: 0.0710
Epoch [4/5], Testing Loss: 0.0304, Accuracy: 0.9932
Epoch [5/5], Training Loss: 0.0626
Epoch [5/5], Testing Loss: 0.0481, Accuracy: 0.9864
Model saved to trained_model.pth
