In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


import torchvision.models as models
import random
import torchaudio
from torch.utils.data import DataLoader, Dataset

In [2]:
s_dir = 'speech-command-classification/'
audio_dir = s_dir+'audio_files/'

In [3]:
test_df = pd.read_csv(s_dir+'sample_submission.csv')
test_df['file_name'] = audio_dir + test_df['file_name'].astype(str)

In [4]:
train_df = pd.read_csv(s_dir+'train.csv')

train_df['file_name'] = audio_dir + train_df['file_name'].astype(str)

id = 0
class_to_id = {}
id_to_class = {}

for cl_name in train_df['target']:
    if cl_name not in class_to_id.keys():
        class_to_id[cl_name] = id
        id+=1


for cl_name, cl_id in class_to_id.items():
    if cl_id not in id_to_class.keys():
        id_to_class[cl_id] = cl_name
        

train_df['num_target'] = [class_to_id[cl_name] for cl_name in train_df['target']]


data = train_df

train_data, val_data = train_test_split(data,test_size=0.2, random_state=42)

In [5]:
class SoundDS(Dataset):
    def __init__(self, file_paths, train_mode=True):
        self.file_paths = file_paths
        self.duration = 1000
        self.sr = 16000
        self.n_fft = 1024
        self.hop_length = None
        self.n_mels = 64
        self.top_db = 80
        self.train_mode=train_mode
        
        
    def __len__(self):
        return len(self.file_paths)
    
    
    def __getitem__(self, idx):
        
        
        file_path = self.file_paths.iloc[[idx]].values[0][0]
       
        
        samples, sr = torchaudio.load(file_path)
        samples = self._pad_trunc(samples, self.sr)
        
        spect = torchaudio.transforms.MelSpectrogram(
            self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels
        )(samples)
        

        spect = torchaudio.transforms.AmplitudeToDB(top_db=self.top_db)(spect)

        spect = self.rechannel(spect, self.sr, 3)
        
#         print(self.train_mode)
        
        if self.train_mode:           
            class_id = self.file_paths.iloc[[idx]].values[0][2]
            return spect, class_id
        
        return spect



    def _pad_trunc(self, samples, sr):
        num_rows, signal_len = samples.shape
        max_len = sr // 1000 * self.duration

        if (signal_len > max_len):
            # Truncate the signal to the given length
            samples = samples[:, max_len]

        elif (signal_len < max_len):
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - signal_len)
            pad_end_len = max_len - signal_len - pad_begin_len

            # Pad with 0s
            pad_begin = torch.zeros((1, pad_begin_len))
            pad_end = torch.zeros((1, pad_end_len))

            samples = torch.cat((pad_begin, samples, pad_end), 1)

        return samples

    def rechannel(self, spect, sr, num_channel):
        if (spect.shape[0] == num_channel):
            # Nothing to do
            return spect

        if (num_channel == 1):
            # Convert from stereo to mono by selecting only the first channel
            spect = spect[:1, :]
        else:
            # Convert from mono to stereo by duplicating the first channel
            spect = torch.cat([spect, spect, spect])

        return spect

    def _time_shift(self, samples, sr, shift_limit):
        _, sig_len = samples.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return samples.roll(shift_amt)




In [6]:
train_dataset = SoundDS(file_paths=train_data)
valid_dataset = SoundDS(file_paths=val_data)
test_dataset = SoundDS(file_paths=test_df, train_mode=False)

In [7]:
batch_size = 128

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
device = 'cpu' 

model = models.resnet18()
n_inputs=model.fc.in_features
model.fc = nn.Sequential(
                      nn.Linear(n_inputs, 35), 
                      nn.ReLU(),
)

# model.load_state_dict(torch.load('models/resnet50.pt'))
# model.eval()

model.to(device)


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [9]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1, verbose=True) 

Adjusting learning rate of group 0 to 1.0000e-04.


In [10]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):        
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [11]:
def validation(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
    test_loss /= num_batches
    correct /= size
    print(f"Val Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

In [None]:
epochs = 30
for t in range(epochs):
    print(f"Epoch {t + 1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    validation(val_dataloader, model, loss_fn)
    scheduler.step()

print("Done!")

torch.save(model.state_dict(), 'models/resnet18_30e.pt')

Epoch 1
-------------------------------
loss: 3.637887  [    0/59264]
loss: 2.667962  [12800/59264]
loss: 2.089060  [25600/59264]
loss: 1.641281  [38400/59264]
loss: 1.241233  [51200/59264]
Val Error: 
 Accuracy: 68.3%, Avg loss: 1.236901 

Adjusting learning rate of group 0 to 1.0000e-04.
Epoch 2
-------------------------------
loss: 0.873310  [    0/59264]
loss: 1.188784  [12800/59264]
loss: 0.976807  [25600/59264]
loss: 1.403313  [38400/59264]
loss: 0.919188  [51200/59264]
Val Error: 
 Accuracy: 72.3%, Avg loss: 1.046289 

Adjusting learning rate of group 0 to 1.0000e-04.
Epoch 3
-------------------------------
loss: 0.795661  [    0/59264]
loss: 0.824849  [12800/59264]
loss: 1.130418  [25600/59264]
loss: 1.039290  [38400/59264]
loss: 1.006916  [51200/59264]
Val Error: 
 Accuracy: 74.0%, Avg loss: 0.973522 

Adjusting learning rate of group 0 to 1.0000e-04.
Epoch 4
-------------------------------
loss: 0.708033  [    0/59264]
loss: 0.646970  [12800/59264]
loss: 0.978067  [25600/5926

loss: 0.509539  [    0/59264]
loss: 0.588643  [12800/59264]
loss: 0.502320  [25600/59264]
loss: 0.567757  [38400/59264]
loss: 0.588521  [51200/59264]
Val Error: 
 Accuracy: 77.7%, Avg loss: 0.874132 

Adjusting learning rate of group 0 to 1.0000e-06.
Epoch 30
-------------------------------
loss: 0.738439  [    0/59264]
loss: 0.619769  [12800/59264]


In [14]:
# torch.save(model.state_dict(), 'models/resnet50.pt')

In [10]:

preds = []
# model.eval()
for batch, X in enumerate(test_dataloader):
    X= X.to(device)
    pred = model(X)
    for row in pred.detach().numpy():
        preds.append(np.argmax(row))


In [11]:
res_classes = []

for idx in preds:
    res_classes.append(id_to_class[idx])


In [12]:
res_df = pd.read_csv(s_dir+'sample_submission.csv')
res_df['target'] = res_classes
res_df.to_csv(r'submission5.csv', index=False, header=True)