# Model lab

## Imports

In [1]:
from torch import no_grad, optim, max
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import torch.nn as nn
import pandas as pd
import librosa
import numpy as np
import torch.nn.functional as F
import torch
from tqdm import tqdm

## Data Acquisition & Preprocessing
- Load the dataset
- Prepare labels
- Prepare data
- Create loaders

In [2]:
# Read the clip index (columns include `path` and `sentence`) and preview it.
data_csv = '../data/data.csv'
df = pd.read_csv(data_csv)
df.head()

Unnamed: 0,path,sentence
0,common_voice_fr_22098482.wav,trois
1,common_voice_fr_21955578.wav,quatre
2,common_voice_fr_22500710.wav,un
3,common_voice_fr_21964070.wav,non
4,common_voice_fr_22357111.wav,trois


In [3]:
# Encode each spoken word as an integer class id.
sentence = {'oui': 0, 'non': 1, 'un': 2, 'deux': 3, 'trois': 4, 'quatre': 5}
encoded_labels = df['sentence'].map(sentence)

# Series.map() yields NaN for every value that is not a key of the dict. That
# happens for unexpected words in the CSV, and also when this cell is re-run
# (the column then already holds ints, none of which are keys). Fail loudly
# instead of letting NaN labels flow into training.
unmapped = df.loc[encoded_labels.isna(), 'sentence'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Labels absent from the `sentence` mapping: {list(unmapped)}"
    )

df['sentence'] = encoded_labels.astype('int64')
df.head()

Unnamed: 0,path,sentence
0,common_voice_fr_22098482.wav,4
1,common_voice_fr_21955578.wav,5
2,common_voice_fr_22500710.wav,2
3,common_voice_fr_21964070.wav,1
4,common_voice_fr_22357111.wav,4


* Sample Rate : 44100, 22050, 16000
* Taille de la fenêtre de transformation de Fourier : n_fft
* Décalage : hop_length

In [4]:
def audio_to_spectrogram(audio_file, n_fft=2048, hop_length=512, sr=22050):
    """Convert an audio file into a log-scaled (dB) mel spectrogram.

    Args:
        audio_file: Path of the audio file, handed to ``librosa.load``.
        n_fft: FFT window size used for the short-time Fourier transform.
        hop_length: Number of samples between successive STFT frames.
        sr: Sample rate the audio is resampled to on load. The default
            (22050) is the rate ``librosa.load`` uses when none is given,
            so existing calls behave as before.

    Returns:
        2-D ``numpy.ndarray`` (mel bands x frames) in decibels relative to
        the loudest bin of the clip, so its maximum value is 0 dB.
    """
    y, sr = librosa.load(audio_file, sr=sr)

    # Power spectrogram: squared magnitude of the STFT.
    power = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)) ** 2

    # Project onto the mel scale. The waveform is not passed again: when a
    # precomputed spectrogram `S` is supplied, librosa maps it directly onto
    # the mel basis.
    mel = librosa.feature.melspectrogram(S=power, sr=sr)

    # `mel` is a *power* quantity, so it must be converted with power_to_db
    # (10 * log10). amplitude_to_db squares its input first and would
    # therefore double every dB value.
    return librosa.power_to_db(mel, ref=np.max)

# Turn the bare file names into paths under the clips directory, then compute
# one spectrogram per clip.
clips_dir = '../data/clips/'
df['path'] = df['path'].apply(lambda file_name: clips_dir + file_name)
df['spectrogram'] = df['path'].apply(audio_to_spectrogram)

### Padding

In [5]:
def pad_spectrogram(spectrogram, target_rows, target_cols):
    """Pad a 2-D spectrogram at the bottom/right up to the target shape.

    The added cells are filled with the spectrogram's own minimum, i.e. its
    quietest value, so the padding reads as silence. The original values keep
    their (row, column) positions.
    """
    pad_rows = target_rows - spectrogram.shape[0]
    pad_cols = target_cols - spectrogram.shape[1]
    return np.pad(
        spectrogram,
        ((0, pad_rows), (0, pad_cols)),
        mode='constant',
        constant_values=spectrogram.min(),
    )


# Largest extent along each axis over the whole dataset.
max_x = 0
max_y = 0

for spectrogram in df['spectrogram']:
    if spectrogram.shape[0] > max_x:
        max_x = spectrogram.shape[0]
    if spectrogram.shape[1] > max_y:
        max_y = spectrogram.shape[1]

# np.resize must not be used here: it re-flows the flattened data into the new
# shape and fills the remainder with repeated copies, which scrambles the
# row/column layout instead of padding it.
df['spectrogram'] = df['spectrogram'].apply(
    lambda x: pad_spectrogram(x, max_x, max_y)
)

In [6]:
# Convert every cell of both columns into a torch tensor, element by element.
for column in ('spectrogram', 'sentence'):
    df[column] = df[column].map(torch.tensor)

### Split

In [7]:
class CustomDataset(Dataset):
    """Dataset view over a DataFrame with `sentence` and `spectrogram` columns.

    Each item is a ``(label, data)`` pair -- label first -- taken from the row
    at the requested position.
    """

    def __init__(self, dataframe):
        # The frame is stored as-is (no copy).
        self.dataframe = dataframe

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Positional lookup, so the DataFrame's index labels are irrelevant.
        sample = self.dataframe.iloc[idx]
        return sample['sentence'], sample['spectrogram']

NameError: name 'Dataset' is not defined

In [None]:
# 80% of the rows go to training, the rest to testing; the fixed random_state
# keeps the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, train_size=0.8)

for caption, subset in (
    ("Taille de l'ensemble d'entraînement :", train_df),
    ("Taille de l'ensemble de test :", test_df),
):
    print(caption, len(subset))


In [None]:
# Wrap both splits as torch datasets and expose one DataLoader per split.
# Only the training loader shuffles its samples.
batch_size = 32
train_df, test_df = CustomDataset(train_df), CustomDataset(test_df)

loaders = dict(
    train=DataLoader(train_df, batch_size=batch_size, shuffle=True, num_workers=6),
    test=DataLoader(test_df, batch_size=batch_size, num_workers=6),
)

## Model Architecture
* CNN
  

In [None]:
class CNN(nn.Module):
    """Two-block convolutional classifier for single-channel spectrograms.

    Architecture: (3x3 convolution -> ReLU -> 2x2 max-pool) twice, followed by
    two fully connected layers that output one logit per class.

    Args:
        input_shape: ``(height, width)`` of the spectrograms fed to the
            network. It determines the input size of the first fully
            connected layer. The default reproduces the previously
            hard-coded size ``64 * 256 * 116``.
        num_classes: Number of output logits.
    """

    def __init__(self, input_shape=(1024, 464), num_classes=6):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # Both convolutions keep the spatial size (3x3 kernel, stride 1,
        # padding 1); each of the two poolings halves it, rounding down.
        height, width = input_shape
        self.flat_features = 64 * (height // 4) * (width // 4)

        self.fc1 = nn.Linear(self.flat_features, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for `x` of shape
        (batch, 1, height, width)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten per sample. Unlike view(-1, N), this can never fold several
        # samples together (silently changing the batch size) when the input
        # shape differs from the configured one; a mismatch surfaces as a
        # shape error in fc1 instead.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [None]:
def evaluate(model, loader=None):
    """Compute the classification accuracy of `model` over a data loader.

    Args:
        model: Network called on each input batch after a channel axis has
            been inserted at dim 1 and the batch has been cast to float.
        loader: Iterable of ``(labels, inputs)`` batches. Defaults to the
            notebook-level ``loaders['test']``.

    Returns:
        Fraction of correctly classified samples, in [0, 1]; 0.0 when the
        loader yields no samples.
    """
    if loader is None:
        loader = loaders['test']

    # `model.eval` without parentheses only references the method and never
    # switches the model to evaluation mode; it has to be called.
    model.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for labels, inputs in tqdm(loader):
            outputs = model(inputs.unsqueeze(1).float())
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader instead of raising ZeroDivisionError.
    return correct / total if total else 0.0

In [None]:
# Seed torch's RNG so weight initialisation and batch shuffling are repeatable
# from one run of the notebook to the next.
torch.manual_seed(42)

model = CNN()
accuracies = []  # test accuracy measured after each epoch
losses = []      # mean training loss per sample, one float per epoch

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 1

for epoch in range(num_epochs):
    # Re-enter training mode at the start of every epoch.
    model.train()
    running_loss = 0.0
    total = 0
    for labels, inputs in tqdm(loaders['train']):
        optimizer.zero_grad()
        outputs = model(inputs.unsqueeze(1).float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # CrossEntropyLoss averages over the batch by default, so weight each
        # batch by its size to obtain an exact per-sample mean for the epoch
        # even when the last batch is smaller.
        batch_size_seen = labels.size(0)
        running_loss += loss.item() * batch_size_seen
        total += batch_size_seen

    # Record the accumulated epoch loss as a plain float. Appending
    # `loss / total` stored only the last batch's loss, as a tensor that kept
    # its autograd graph alive.
    losses.append(running_loss / total if total else float('nan'))
    accuracies.append(evaluate(model))