# 50.039 Deep Learning Project

Group Members:
- Lee Chang Zheng
- Lee Cheng Xin
- Jason Peng Jing Ming

## Imports and CUDA

In [84]:
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchaudio import transforms
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import os

In [85]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Converting Audio samples into Spectrograms

Before we can begin, we first have to convert the audio samples from .webm format into a standardised format. We will convert the files to .wav with single channel, a sample rate of 48000 Hz, and pad the audio files to 10 seconds long. 

In [86]:
# Convert webm and ogg files to wav with single channel, sample rate of 48000 Hz, padded to 10 seconds long
import subprocess
from pydub import AudioSegment

def convert_audio(in_path, out_path):
    if '.webm' in in_path:
        subprocess.run(["ffmpeg", "-i", in_path, "-y", "-ac", "1", "-ar", "48000", out_path.replace('.webm', '.wav')])
    elif '.ogg' in in_path:
        subprocess.run(["ffmpeg", "-i", in_path, "-y", "-ac", "1", "-ar", "48000", out_path.replace('.ogg', '.wav')])

def pad_trim_audio(in_path, out_path):
    audio = AudioSegment.from_wav(in_path)
    if len(audio) < 10000:
        padding = AudioSegment.silent(duration=10000 - len(audio))
        padded_audio = audio + padding
        padded_audio.export(out_path, format='wav')
    elif len(audio) > 10000:
        trimmed_audio = audio[:10000]
        trimmed_audio.export(out_path, format='wav')
    
# Note: These are commented out as the conversion has been done, it is simply for reference. You will need FFmpeg to run this.
# for filename in os.listdir('./Data/Covid'):
#     convert_audio(f'./Data/Covid/{filename}', f'./Converted/{filename}')
# for filename in os.listdir('./Data/Healthy'):
#     convert_audio(f'./Data/Healthy/{filename}', f'./Converted/{filename}')

# # Padding/trimming the audio to 10 seconds long
# for filename in os.listdir('./Converted'):
#     pad_trim_audio(f'./Converted/{filename}', f'./Converted/{filename}')

After standardizing the audio samples, we need to convert them into a Mel Spectrogram for the CNN model to process. 

In [87]:
# Converts the audio waveform into a spectrogram
def audio_to_spec(audio, sample_rate, n_mels=128, n_fft=400, win_length=None, hop_length=None, top_db=80):
    mel_spectrogram = transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        center=True,
        pad_mode="reflect",
        n_mels=n_mels,
    )
    
    melspec = mel_spectrogram(audio)
    
    transform = transforms.AmplitudeToDB(top_db=top_db)
    final_spec = transform(melspec)
    return final_spec

## Custom Dataset and Dataloader

In [106]:
class CovidCoughDataset(Dataset):
    def __init__(self, filename, datapath):
        self.df = pd.read_excel(filename)
        self.datapath = datapath
        self.max_spec_length = 2400

    def __getitem__(self, index):
        audio_file = self.datapath + self.df.iloc[index, 0] + '.wav'
        status = 1 if self.df.iloc[index, 1] == 'COVID' else 0        
        audio, sample_rate = torchaudio.load(audio_file)
        spec = audio_to_spec(audio=audio, sample_rate=sample_rate)
        
        # Note: There might be some minor differences in the length of the audio clips, resulting in spectrograms of different
        #       dimensions. We need to pad/trim the spectrograms to ensure consistency before we can feed into the model. 
        # Pad the shorter spectrograms to the maximum length
        if spec.shape[2] < self.max_spec_length:
            spec = F.pad(spec, (0, self.max_spec_length - spec.shape[2]), value=0)
        # Trim the longer spectrograms to the maximum length
        elif spec.shape[2] > self.max_spec_length:
            spec = spec[:, :, :self.max_spec_length]
        return spec, status
        
    def __len__(self):        
        return len(self.df)

In [107]:
# Load the dataset
batch_size = 16

dataset = CovidCoughDataset('./Data/Dataset.xlsx', './Converted/')
for batch in dataset:
    print(batch[0][0].shape)
train_dataset, valid_dataset, test_dataset = random_split(dataset, [0.8, 0.1, 0.1])
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size = batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle=True)



before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400])
before: torch.Size([1, 128, 2400])
torch.Size([128, 2400

In [90]:
class CovidClassifer(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=3)
        
        # Fully connected layers
        self.fc1 = nn.Linear(64 * (128//9) * (2400//9), 2)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = x.view(-1, 64 * (128//9) * (2400//9))
        x = self.fc1(x)
        
        return x

In [91]:
def train(model, train_dataloader, valid_dataloader, epochs = 10, lr = 0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []
    
    for epoch in range(epochs):
        
        model.train()
        
        # Initialize epoch loss and accuracy
        epoch_loss = 0.0
        correct = 0
        total = 0
        
        for batch_number, (inputs, labels) in enumerate(train_dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            outputs = model(inputs)
            _, pred = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            total += labels.size(0)
            correct += (pred == labels).sum().item()
            
            print(f'Epoch {epoch+1}/{epochs}, Batch number: {batch_number}, Cumulated accuracy: {correct/total}')
        
        # Calculate epoch loss and accuracy
        epoch_loss /= len(train_dataloader)
        epoch_acc = correct/total
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_acc)
        print(f'--- Epoch {epoch+1}/{epochs}: Train loss: {epoch_loss:.4f}, Train accuracy: {epoch_acc:.4f}')
    
    return train_losses, train_accuracies

In [92]:
model = CovidClassifer().to(device)

train_losses, train_accuracies = train(model, train_dataloader, valid_dataloader, epochs = 3, lr = 1e-3)

Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2401])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])
Spectrogram dimensions: torch.Size([1, 128, 2400])


RuntimeError: stack expects each tensor to be equal size, but got [1, 128, 2400] at entry 0 and [1, 128, 2401] at entry 5