# This notebook is meant to be run on Kaggle

https://www.kaggle.com/c/silero-audio-classifier/

In [1]:
import numpy as np  
import pandas as pd 
import os

import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.io.wavfile import read

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm as tqdm
import warnings

In [2]:
import torch
import torch.nn as nn

from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

In [3]:
# Silence every warning for the rest of the notebook. Note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.simplefilter('ignore')

## Use only a part of the data because the full dataset is too large

In [4]:
# Fraction of the full training CSV that is kept for this experiment
# (train and validation together); it is passed as `test_size` when sub-sampling.
TRAIN_PART = 0.05
# Fraction of that retained subset which is held out for validation.
VAL_PART = 0.2

In [5]:
# Fixed seed so the sub-sample and the train/validation split are identical on
# every "Restart Kernel -> Run All".
SPLIT_SEED = 42

full_train_df = pd.read_csv('/kaggle/input/silero-audio-classifier/train.csv')

# Keep only a stratified TRAIN_PART fraction of the data: the "test" side of
# this split is the small subset the rest of the notebook works with.
_, train_df = train_test_split(
    full_train_df,
    test_size = TRAIN_PART,
    stratify = full_train_df['label'].values,
    random_state = SPLIT_SEED,
)

# Carve a stratified validation set (VAL_PART of the subset) out of it.
train, val = train_test_split(
    train_df,
    test_size = VAL_PART,
    stratify = train_df['label'].values,
    random_state = SPLIT_SEED,
)

In [6]:
def to_onehot(label, num_class = 3):
    """Build a one-hot vector for `label`.

    Args:
        label: index of the position to switch on (anything accepted as a
            tensor index works).
        num_class: length of the returned vector.

    Returns:
        A 1-D tensor of `num_class` zeros with the entry selected by `label`
        set to 1.
    """
    encoded = torch.zeros(num_class)
    encoded[label] = 1.0
    return encoded

def read_audio(path):
    """Load a WAV file and check it is a 3-second, 16 kHz, one-dimensional clip.

    Args:
        path: location of the WAV file, handed to `scipy.io.wavfile.read`.

    Returns:
        The sample array exactly as returned by `scipy.io.wavfile.read`.

    Raises:
        AssertionError: if the sample rate is not 16000, the clip does not
            contain 16000 * 3 samples, or the array is not one-dimensional.
    """
    rate, samples = read(path)
    assert rate == 16000
    assert len(samples) == 48000
    assert samples.ndim == 1
    return samples
        
def read_audio_norm(path):
    """Load a clip with `read_audio` and peak-normalise it.

    Args:
        path: location of the WAV file, forwarded to `read_audio`.

    Returns:
        A float32 copy of the samples, scaled so the largest absolute value
        is 1. An all-zero clip is returned unscaled.

    Raises:
        AssertionError: propagated from `read_audio` when the clip fails its
            format checks.
    """
    # Cast BEFORE measuring the peak: on an integer array np.abs wraps the most
    # negative value (e.g. int16 -32768) back onto itself, which would
    # under-estimate the peak and leave samples outside [-1, 1].
    wav = read_audio(path).astype('float32')
    abs_max = np.abs(wav).max()
    if abs_max > 0:
        wav *= 1 / abs_max
    return wav

# STFT framing: the window and stride are given in seconds and converted to
# sample counts below.
sample_rate = 16000
window_size = 0.02
window_stride = 0.01

# The 1e-8 nudge presumably protects against float round-off landing just below
# an integer before int() truncates.
n_fft = int(sample_rate * (window_size + 1e-8))
hop_length = int(sample_rate * (window_stride + 1e-8))
win_length = n_fft

# Keyword arguments forwarded to librosa.stft by stft().
kwargs = dict(n_fft = n_fft, hop_length = hop_length, win_length = win_length)

def stft(wav):
    """Return the magnitude spectrogram of `wav`.

    Runs `librosa.stft` with the module-level `kwargs` and keeps only the
    magnitude component produced by `librosa.magphase`; the phase is dropped.
    """
    spectrum = librosa.stft(wav, **kwargs)
    magnitude, _ = librosa.magphase(spectrum)
    return magnitude

In [7]:
class SoundDataset(Dataset):
    """Dataset yielding STFT magnitude spectrograms for the rows of a dataframe.

    Each row must provide a `wav_path` column (joined onto `data_path`) and,
    unless `test` is set, an integer `target` column.

    Args:
        df: dataframe describing the clips.
        test: when True, items are the spectrogram only (no labels).
        data_path: directory that `wav_path` entries are relative to.
    """

    # Class-name -> id mapping kept for reference only. Labels are read from the
    # already-encoded `target` column, so this is not consulted at runtime.
    # NOTE(review): presumably `target` follows this mapping - confirm.
    label_dict = {'speech':0,
                  'music':1,
                  'noise':2}

    def __init__(self, df, test = False, data_path = '/kaggle/input/silero-audio-classifier/train'):
        super().__init__()
        self.data_path = data_path
        self.df = df
        self.test = test

    def __len__(self):
        """Number of clips, i.e. rows of the dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load clip `idx`.

        Returns:
            `mag` alone in test mode, otherwise the tuple
            `(mag, one_hot_label, label)` where `label` is an int tensor.
        """
        # Single positional row lookup, reused for both the path and the label.
        row = self.df.iloc[idx]

        wav = read_audio_norm(os.path.join(self.data_path, row.wav_path))
        # Swap the two spectrogram axes so the 161 frequency bins end up last,
        # which is the feature axis the batch_first GRU consumes.
        mag = torch.tensor(stft(wav), dtype = torch.float32).permute(1,0)

        if self.test:
            return mag

        label = row.target
        return mag, to_onehot(label), torch.tensor(label, dtype = torch.int)

## Build RNN-Model

In [8]:
class RNNModel(nn.Module):
    """GRU classifier over a sequence of spectrogram frames.

    The final hidden state of every layer and direction is concatenated, passed
    through a linear layer and turned into class probabilities with a softmax.

    Args:
        h_size: hidden size of each GRU layer.
        num_class: number of output classes.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        input_size: number of features per frame (161 frequency bins for the
            STFT settings used in this notebook).
    """

    def __init__(self, h_size = 256, num_class = 3, n_layers = 3, bidirectional = True, input_size = 161):
        super().__init__()

        self.rnn = nn.GRU(input_size = input_size, hidden_size = h_size, batch_first = True, num_layers = n_layers, bidirectional = bidirectional)
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(h_size*n_layers*n_directions, num_class)
        # Explicit class axis: a bare nn.Softmax() relies on a deprecated
        # implicit dimension choice and warns on every forward pass.
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        """Map a batch `x` of shape (batch, frames, input_size) to class probabilities.

        Returns:
            Tensor of shape (batch, num_class) whose rows sum to 1.
        """
        _, h = self.rnn(x)
        # h is (n_layers * n_directions, batch, h_size); flatten all final
        # states of each sample into one feature vector.
        n_states, batch_size, h_size = h.shape
        h = h.permute(1,0,2).reshape(batch_size, n_states*h_size)
        return self.softmax(self.fc(h))
        

In [9]:
# Wrap the two dataframes; only the training loader shuffles its batches.
train_dataset, val_dataset = SoundDataset(df = train), SoundDataset(df = val)

train_loader = DataLoader(dataset = train_dataset, shuffle = True, batch_size = 64)
val_loader = DataLoader(dataset = val_dataset, shuffle = False, batch_size = 64)

In [10]:
# Training hyper-parameters.
n_epoch = 15
lr = 3e-4

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = RNNModel()
optimizer = Adam(params = model.parameters(), lr = lr)
# mode='max': the learning rate is cut (x0.2) when the monitored metric stops
# increasing for more than `patience` epochs.
scheduler = ReduceLROnPlateau(optimizer = optimizer, mode = 'max', factor = 0.2, patience = 1, verbose = True)
# BCELoss consumes probabilities, which is why the model ends in a softmax and
# the targets are one-hot float vectors.
criterion = nn.BCELoss()

## Training

In [11]:


model.to(device)

best_val_acc = 0
# Number of consecutive non-improving epochs tolerated before early stopping.
max_patience = 3
patience = max_patience

for i in range(n_epoch):

    print(f"Training epoch {i}...")
    # Summed batch loss for the epoch; kept for inspection, not printed.
    epoch_train_loss = 0
    model.train()
    for x, y, _ in train_loader:
        x, y = x.to(device), y.to(device)

        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        epoch_train_loss += loss.item()

    print("Validation...")
    model.eval()
    preds = []
    labels = []
    # One no_grad scope for the whole pass. Only the integer labels are needed
    # here, so the one-hot targets are never moved to the device.
    with torch.no_grad():
        for x, _, l in val_loader:
            y_pred = model(x.to(device))
            # extend() grows the lists in place instead of rebuilding them on
            # every batch (the old `a = a + b` pattern is quadratic).
            preds.extend(y_pred.argmax(axis = 1).cpu().numpy())
            labels.extend(l.cpu().numpy())

    val_acc = accuracy_score(labels, preds)
    print(f"vall accuracy is {val_acc}")

    if val_acc > best_val_acc:
        # New best: reset the early-stopping counter and checkpoint the model.
        patience = max_patience
        best_val_acc = val_acc
        torch.save(model, 'model.pth')
        print(f"Model saved at {i} epoch")
    else:
        patience -= 1
        if patience == 0:
            print("Early stopping...")
            break

    # The scheduler monitors validation accuracy (it was created with mode='max').
    scheduler.step(val_acc)
          

Training epoch 0...
Validation...
vall accuracy is 0.8615384615384616
Model saved at 0 epoch
Training epoch 1...
Validation...
vall accuracy is 0.9168498168498168
Model saved at 1 epoch
Training epoch 2...
Validation...
vall accuracy is 0.9384615384615385
Model saved at 2 epoch
Training epoch 3...
Validation...
vall accuracy is 0.9351648351648352
Training epoch 4...
Validation...
vall accuracy is 0.945054945054945
Model saved at 4 epoch
Training epoch 5...
Validation...
vall accuracy is 0.9326007326007326
Training epoch 6...
Validation...
vall accuracy is 0.9545787545787546
Model saved at 6 epoch
Training epoch 7...
Validation...
vall accuracy is 0.9454212454212454
Training epoch 8...
Validation...
vall accuracy is 0.954945054945055
Model saved at 8 epoch
Training epoch 9...
Validation...
vall accuracy is 0.9498168498168498
Training epoch 10...
Validation...
vall accuracy is 0.9556776556776557
Model saved at 10 epoch
Training epoch 11...
Validation...
vall accuracy is 0.946886446886446

## Submit to Kaggle

In [12]:
# The sample submission lists the clips to score; they are read from the
# competition's `val` directory, in file order (no shuffling).
test_data_path = '/kaggle/input/silero-audio-classifier/val'
sub = pd.read_csv('/kaggle/input/silero-audio-classifier/sample_submission.csv')

test_dataset = SoundDataset(df = sub, test = True, data_path = test_data_path)
test_loader = DataLoader(dataset = test_dataset, shuffle = False, batch_size = 64)

In [13]:
model_path = 'model.pth'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# map_location lets a checkpoint that was saved on a GPU be restored on a
# CPU-only kernel as well.
# NOTE(review): 'model.pth' holds the whole pickled model, not a state_dict.
# Only load checkpoints you produced yourself, and confirm whether the installed
# PyTorch version needs `weights_only=False` for this kind of file.
model = torch.load(model_path, map_location = device)
model.to(device)
model.eval()

preds = []
tkloader = tqdm(test_loader, total = len(test_loader))
with torch.no_grad():
    for x in tkloader:
        # The model already ends in a softmax, so these are class probabilities.
        probs = model(x.to(device)).cpu()
        preds.append(probs)

# Convert to NumPy before handing the labels to pandas (the previous version
# computed `.numpy()` but discarded the result and stored a torch tensor).
labels = torch.cat(preds, axis = 0).argmax(axis = 1).numpy()

sub = pd.read_csv('/kaggle/input/silero-audio-classifier/sample_submission.csv')
sub['target'] = labels

sub.to_csv('submission.csv', index=False)
sub.head()

HBox(children=(FloatProgress(value=0.0, max=863.0), HTML(value='')))




Unnamed: 0,wav_path,target
0,val/e/b7cf2c4.wav,2
1,val/0/8f1489d.wav,2
2,val/f/14b7304.wav,0
3,val/2/3763132.wav,0
4,val/0/51c4271.wav,1
