In [1]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Stacked ReLU RNN followed by a linear head on the last time step.

    The recurrent stack is created with ``batch_first=True``, so ``forward``
    expects input laid out as ``(batch, seq_len, input_size)``.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of every recurrent layer.
        layer_size: Number of stacked recurrent layers.
        output_size: Number of outputs produced by the linear head.
        device: Kept on ``self.device`` for callers that read it. The forward
            pass itself follows the device of its input tensor.
    """

    def __init__(self, input_size, hidden_size, layer_size, output_size, device):
        super().__init__()

        self.device = device
        self.hidden_size = hidden_size
        self.layer_size = layer_size

        self.rnn = nn.RNN(input_size, hidden_size, layer_size, batch_first=True, nonlinearity="relu")
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project the final time step.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Build the initial hidden state on the same device and with the same
        # dtype as the input. Relying on the device string captured at
        # construction time breaks as soon as the module is moved or cast
        # afterwards (e.g. ``model.to(...)`` / ``model.double()``).
        h0 = torch.zeros(
            self.layer_size,
            x.size(0),
            self.hidden_size,
            device=x.device,
            dtype=x.dtype,
        )
        out, _ = self.rnn(x, h0)
        # Only the output at the last time step feeds the linear head.
        return self.fc(out[:, -1, :])

In [2]:
# Network dimensions passed to the RNN constructor below.
input_size, hidden_size = 13, 200
layer_size, output_size = 10, 2

# Use the GPU when one is available, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Build the network, then move its parameters to the chosen device.
model = RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    layer_size=layer_size,
    output_size=output_size,
    device=device,
)
model = model.to(device)

In [3]:
import torchaudio
import pandas as pd
from torch.utils.data import Dataset

# Read both splits from disk and keep them as NumPy arrays; the later loops
# unpack every row into an (audio, language) pair.
train_data, test_data = (
    pd.read_csv(csv_path).to_numpy()
    for csv_path in ('Datasets/train.csv', 'Datasets/test.csv')
)

In [4]:
# Training schedule: number of passes over train_data and the Adam step size.
num_epochs, lr = 100, 1e-3

# Cross-entropy loss on the network output, optimised with Adam.
error = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# Filled in by the training loop and consumed by the loss plot.
loss_list, iteration_list = [], []

In [None]:
# Train the model one audio file at a time (batch size 1).
# Each row of train_data is unpacked as (audio path, language label).
# NOTE(review): the label tensor has a single entry, so this assumes mono
# audio (one channel -> batch of 1) — confirm against the dataset.
model.train()

# One MFCC transform per sample rate, built lazily instead of per sample.
mfcc_transforms = {}
# Running sample count across all epochs, so logged iterations keep increasing
# instead of restarting at every epoch (which would fold the loss plot back
# onto itself).
global_step = 0

for epoch in range(num_epochs):
    for i, (audio, language) in enumerate(train_data, start=1):
        global_step += 1

        waveform, sample_rate = torchaudio.load(audio)
        if sample_rate not in mfcc_transforms:
            # n_mfcc must equal the model's input_size; the torchaudio default
            # (40) does not match input_size and makes the forward pass fail.
            mfcc_transforms[sample_rate] = torchaudio.transforms.MFCC(
                sample_rate=sample_rate, n_mfcc=input_size
            )
        mfcc = mfcc_transforms[sample_rate](waveform)
        # MFCC returns (channel, n_mfcc, time); the batch_first RNN wants
        # (batch, time, features). The axes have to be swapped — a reshape
        # keeps the memory order and would scramble coefficients across frames.
        mfcc = mfcc.transpose(1, 2).contiguous().to(device)
        language = torch.tensor([language]).to(device)

        optimizer.zero_grad()
        output = model(mfcc)
        loss = error(output, language)

        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            # Store a plain Python float: it does not keep a (possibly CUDA)
            # tensor alive and can be plotted directly.
            loss_value = loss.item()
            loss_list.append(loss_value)
            iteration_list.append(global_step)
            print('Iteration: {}  Loss: {}'.format(global_step, loss_value))

In [None]:
import matplotlib.pyplot as plt

# The training loop may have stored the losses as torch tensors (possibly on
# the GPU), which matplotlib cannot convert; coerce everything to plain floats
# first. Values that already are floats pass through unchanged.
loss_values = [float(loss_value) for loss_value in loss_list]

plt.plot(iteration_list, loss_values)
plt.xlabel("Number of iteration")
plt.ylabel("Loss")
plt.title("RNN: Loss vs Number of iteration")
plt.show()

In [None]:
# Fresh accumulators for the evaluation pass: logged iteration numbers, the
# matching running accuracies, and the sample / hit counters.
iteration_list, accuracy_list = [], []
total = correct = 0

In [None]:
# Evaluate the trained model on test_data, one audio file at a time.
# Each row of test_data is unpacked as (audio path, language label).
# NOTE(review): argmax is taken over the whole output, so this assumes mono
# audio (a single row of class scores per file) — confirm against the dataset.
model.eval()

# One MFCC transform per sample rate, built lazily instead of per sample.
eval_mfcc_transforms = {}

# No gradients are needed for evaluation; skipping them saves memory and time.
with torch.no_grad():
    for i, (audio, language) in enumerate(test_data, start=1):
        waveform, sample_rate = torchaudio.load(audio)
        if sample_rate not in eval_mfcc_transforms:
            # n_mfcc must equal the model's input_size; the torchaudio default
            # (40) does not match input_size and makes the forward pass fail.
            eval_mfcc_transforms[sample_rate] = torchaudio.transforms.MFCC(
                sample_rate=sample_rate, n_mfcc=input_size
            )
        mfcc = eval_mfcc_transforms[sample_rate](waveform)
        # MFCC returns (channel, n_mfcc, time); swap the last two axes to get
        # (batch, time, features). A reshape would keep the memory order and
        # scramble coefficients across frames.
        mfcc = mfcc.transpose(1, 2).contiguous().to(device)

        output = model(mfcc)
        # Plain Python int, so the comparison below does not depend on tensor
        # device or on tensor truthiness.
        predicted = torch.argmax(output).item()

        total += 1
        correct += int(predicted == language)

        if total % 100 == 0:
            accuracy = 100 * correct / float(total)
            iteration_list.append(i)
            accuracy_list.append(accuracy)
            print("Iteration:", i, accuracy)

In [None]:
# Draw the running test accuracy against the logged iteration numbers, write
# the figure to disk, then display it.
fig, ax = plt.subplots()
ax.plot(iteration_list, accuracy_list, color="red")
ax.set_xlabel("Number of iteration")
ax.set_ylabel("Accuracy")
ax.set_title("RNN: Accuracy vs Number of iteration")
fig.savefig('graph.png')
plt.show()