In [1]:
import numpy as np  
import pandas as pd 
import os

import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.io.wavfile import read

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm as tqdm
import warnings

In [2]:
import torch
import torch.nn as nn

from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

In [3]:
warnings.simplefilter('ignore')

In [4]:
TRAIN_PART = 0.05 # part of full data used to train our model 
VAL_PART = 0.2 # part of train data for validation 

In [5]:
train_df = pd.read_csv('/kaggle/input/silero-audio-classifier/train.csv')
_, train_df, _, _ = train_test_split(train_df, train_df['label'].values, test_size = TRAIN_PART, stratify = train_df['label'].values)
train, val, _, _ = train_test_split(train_df, train_df['label'].values, test_size = VAL_PART, stratify = train_df['label'].values)

In [6]:
def to_onehot(label, num_class = 3):
    one_hot = torch.zeros(num_class)
    one_hot[label] = 1
    return one_hot

def read_audio(path):
            sr, wav = read(path)
            assert sr == 16000
            assert len(wav) == 16000 * 3
            assert len(wav.shape) == 1
            return wav
        
def read_audio_norm(path):
            wav = read_audio(path)
            abs_max = np.abs(wav).max()
            wav = wav.astype('float32')
            if abs_max > 0:
                wav *= 1 / abs_max
            return wav

window_size = 0.02
window_stride = 0.01
sample_rate = 16000

n_fft = int(sample_rate * (window_size + 1e-8))
win_length = n_fft
hop_length = int(sample_rate * (window_stride + 1e-8))

kwargs = {
    'n_fft': n_fft,
    'hop_length': hop_length,
    'win_length': n_fft
}

def stft(wav):
    D = librosa.stft(wav,
                     **kwargs)
    mag, phase = librosa.magphase(D)    
    return mag

In [7]:
class SoundDataset(Dataset):
    
    def __init__(self, df, test = False, data_path = '/kaggle/input/silero-audio-classifier/train'):
        super().__init__()
        self.data_path = data_path
        self.df = df
        self.test = test
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
                
        label_dict = {'speech':0,
                      'music':1,
                      'noise':2}
        
        wav = read_audio_norm(os.path.join(self.data_path, self.df.iloc[idx].wav_path))
        wav = torch.tensor(wav).unsqueeze(0)
        
        return wav

In [8]:
class CNNEncoder(nn.Module):
    
    def __init__(self, n_ch = 64, out_ch = 128):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(1,n_ch,10,5),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(), 
            
            nn.Conv1d(n_ch,n_ch,8,4),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(),
            
            nn.Conv1d(n_ch,n_ch,4,2),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(),
            
            nn.Conv1d(n_ch,n_ch,4,2),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(),
            
            nn.Conv1d(n_ch, out_ch,4,2,padding = 2),
            nn.GroupNorm(1, out_ch),
            nn.ReLU(),
        )
        
    def forward(self, x):
        return self.encoder(x)


In [9]:
class CNNDecoder(nn.Module):
    
    def __init__(self, n_ch = 64, out_ch = 128):
        super().__init__()
        self.encoder = nn.Sequential(
            
            nn.ConvTranspose1d(out_ch,n_ch,4,2, padding = 1, output_padding = 0),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(),
            
            nn.ConvTranspose1d(n_ch,n_ch,4,2, padding = 1, output_padding = 0),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(), 
            
            nn.ConvTranspose1d(n_ch,n_ch,4,2, padding = 1, output_padding = 0),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(), 
            
            nn.ConvTranspose1d(n_ch,n_ch,8,4, padding = 2, output_padding = 0),
            nn.GroupNorm(1, n_ch),
            nn.ReLU(),
            
            nn.ConvTranspose1d(n_ch,1,10,5, padding = 3, output_padding = 1),
            nn.GroupNorm(1, 1),
            nn.ReLU(), 
            
        )
        
    def forward(self, x):
        return self.encoder(x)
    


In [10]:
class RNNAutoEncoder(nn.Module):
    
    def __init__(self, in_size = 128, trg_len = 300, h_size = 128, n_layers = 2, bidirectional = True, device = 'cpu'):
        super().__init__()
        
        self.in_size = in_size
        self.trg_len = trg_len
        self.device = device
        n_directions = 2 if bidirectional else 1
        
        self.rnn_encoder = nn.GRU(input_size = in_size, hidden_size = h_size, batch_first = True, num_layers = n_layers, bidirectional = bidirectional)
        self.rnn_decoder = nn.GRU(input_size = in_size, hidden_size = h_size, batch_first = True, num_layers = n_layers, bidirectional = bidirectional)
        
        self.rnn_decoder_output = nn.Linear(h_size*n_directions, in_size)
        
        self.cnn_encoder = CNNEncoder()
        self.cnn_decoder = CNNDecoder()
        
    
    
    
    def encode(self, x):
        z = self.cnn_encoder(x).permute(0, 2, 1)
        _, h = self.rnn_encoder(z)
        return h
    
    
    def decode(self, h):
        
        batch_size = h.shape[1]
        outputs = torch.zeros(batch_size, 1, self.in_size).to(self.device)
        
        for t in range(1, self.trg_len + 1):
            output, h = self.rnn_decoder(outputs[:,-1,:].unsqueeze(1), h)
            output = self.rnn_decoder_output(output[:,-1,:]).unsqueeze(1)
            outputs = torch.cat([outputs, output], axis = 1)
        
        #print(outputs.shape)
        outputs = self.cnn_decoder(outputs[:,1:].permute(0,2,1))
            
        return outputs
    
    def forward(self, x):
        hidden = self.encode(x)
        outputs = self.decode(hidden)
        return outputs
        
        

In [11]:
train_dataset = SoundDataset(train)
val_dataset = SoundDataset(val)

train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle = False)

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [13]:
n_epoch = 15
lr = 3e-4

model = RNNAutoEncoder(device = device)
optimizer = Adam(model.parameters(), lr = lr)
scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min', patience=1, verbose=True, factor=0.2)
criterion = nn.MSELoss()



In [14]:


#tkepoch = tqdm(range(n_epoch), total = n_epoch)
model.to(device)
best_val_loss = np.float('inf')
patience = 3

for i in range(n_epoch):
     
    
    print(f"Training epoch {i}...")
    epoch_train_loss = []
    model.train()
    #tkloader = tqdm(train_loader, total = len(train_loader))
    for x in train_loader:
        
        x = x.to(device)
        x_pred = model(x)
        
        loss = criterion(x_pred, x)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        epoch_train_loss.append(loss.item())
    
    print(f"Epoch {i} train loss is {sum(epoch_train_loss)/len(epoch_train_loss)}\n")
    
    print("Validation...")
    model.eval()
    epoch_val_loss = []
    #tkval = tqdm(val_loader, total = len(val_loader))

    for x in val_loader:
        with torch.no_grad():
            x = x.to(device)
            x_pred= model(x)
            loss = criterion(x_pred, x)
            epoch_val_loss.append(loss.item())
        
    ep_val_loss = sum(epoch_val_loss)/len(epoch_val_loss)
    print(f"Epoch {i} val loss is {ep_val_loss}\n")
    
    if ep_val_loss < best_val_loss:
        patience = 3
        best_val_loss = ep_val_loss
        torch.save(model, 'model.pth')
        print(f"Model saved at {i} epoch")
        
    else:
        patience -= 1
        if patience == 0:
            print("Early stopping...")
            break
        
    
    scheduler.step(ep_val_loss)
    print("\n\n")
          

Training epoch 0...
Epoch 0 train loss is 0.05803881448350454

Validation...
Epoch 0 val loss is 0.04501417214267476

Model saved at 0 epoch



Training epoch 1...
Epoch 1 train loss is 0.043013304292240685

Validation...
Epoch 1 val loss is 0.04448712435226108

Model saved at 1 epoch



Training epoch 2...
Epoch 2 train loss is 0.04281011086000859

Validation...
Epoch 2 val loss is 0.044476193809058776

Model saved at 2 epoch



Training epoch 3...
Epoch 3 train loss is 0.04281146187139185

Validation...
Epoch 3 val loss is 0.044343429595925084

Model saved at 3 epoch



Training epoch 4...
Epoch 4 train loss is 0.04274219055033741

Validation...
Epoch 4 val loss is 0.04429002619500077

Model saved at 4 epoch



Training epoch 5...
Epoch 5 train loss is 0.042757562644866826

Validation...
Epoch 5 val loss is 0.04432337391081938




Training epoch 6...
Epoch 6 train loss is 0.042715867491144886

Validation...
Epoch 6 val loss is 0.04441318749775027

Epoch     7: reducing learning rate 