In [0]:
import scipy.io.wavfile as wav
import librosa as lr
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
import os
import torch.nn.functional as F
from torch import optim
import torch.nn as nn
import torchvision.models as models
import torch

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
data_path='/content/drive/My Drive/'
os.listdir('/content/drive/My Drive/')

In [0]:
tr_mini=[data_path+'tr_mini/'+i for i in os.listdir(data_path+'tr_mini')]
br_mini=[data_path+'br_mini/'+i for i in os.listdir(data_path+'br_mini')]
mn_mini=[data_path+'mn_mini/'+i for i in os.listdir(data_path+'mn_mini')]
dv_mini=[data_path+'dv_mini/'+i for i in os.listdir(data_path+'dv_mini')]

In [0]:
data=pd.DataFrame({'fname':tr_mini,'label': [1 for i in range(len(tr_mini))]})
data=pd.concat([data,pd.DataFrame({'fname':br_mini,'label': [4 for i in range(len(br_mini))]})])
data=pd.concat([data,pd.DataFrame({'fname':dv_mini,'label': [3 for i in range(len(dv_mini))]})])
data=pd.concat([data,pd.DataFrame({'fname':mn_mini,'label': [2 for i in range(len(mn_mini))]})])

In [0]:
df = data.sample(frac=1).reset_index(drop=True)

In [0]:
def prepare_data_2(df,config):
    X = []
    input_length = config.max_len
    for i,fname in enumerate(df.fname):
        file_path =fname
        data, _ = lr.core.load(file_path, sr=config.sample_freq, res_type="kaiser_fast")
        data = lr.feature.mfcc(data,sr=config.sample_freq, n_mfcc=config.n_mfcc)
        data=torch.tensor(data)
        data=data.float()
        X.append(data)
    return X

In [0]:
rnn_data=prepare_data_2(df,config)

In [0]:
mx=0
for i in rnn_data:
  mx=max(mx,i.shape[1])
mx

912

In [0]:
rnn_data_3=[]
for i in range(len(rnn_data)):
  if(rnn_data[i].shape[1]<912):
    rnn_data_3.append(torch.cat((rnn_data[i],torch.zeros(20,mx-rnn_data[i].shape[1])),1))
  else:
    rnn_data_3.append(rnn_data[i])

In [0]:
x_train=torch.zeros(4000,20,912)
for i in range(4000):
  x_train[i]=rnn_data_3[i]
  
x_val=torch.zeros(len(rnn_data_3)-4000,20,912)
for i in range(4000,len(rnn_data_3)-4000):
  x_val[i]=rnn_data_3[i]
x_train.shape,x_val.shape

(torch.Size([4000, 20, 912]), torch.Size([1541, 20, 912]))

In [0]:
seq_lengths, perm_idx = x_train.sort(0, descending=True)

In [0]:
del seq_lengths

In [0]:
x_3=x_train[perm_idx]
y_3=y_train[perm_idx]

In [0]:
Y_train=np.asarray(df.label)
y=torch.tensor(Y_train)
y_train=y[:4000]
y_val=y[4000:]
y_train=y[:4000]
y_val=y[4000:]
y_train=y_train-1
y_val=y_val-1

In [0]:
x_train=x_train.permute(0,2,1)
x_val=x_val.permute(0,2,1)

In [0]:
def get_batches(x,y,bs):
  for n in range(0,len(x),bs):
    yield x[n:n+bs],y[n:n+bs]
    
batch=get_batches(x_train,y_train,128)

In [0]:
class LSTM(nn.Module):
  
  def __init__(self, input_dim, hidden_dim, batch_size, output_dim=4,
                    num_layers=2):
    super(LSTM, self).__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.batch_size = batch_size
    self.num_layers = num_layers

        # Define the LSTM layer
    self.lstm = nn.LSTM(input_size=self.input_dim,hidden_size=self.hidden_dim,num_layers=self.num_layers,batch_first=True)

    self.l1 = nn.Linear(self.hidden_dim, self.hidden_dim//2)
    self.l2 = nn.Linear(self.hidden_dim//2,output_dim)

  def init_hidden(self):
    # This is what we'll initialise our hidden state as
    return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
            torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

  def forward(self, input):
    # Forward pass through LSTM layer
    # shape of lstm_out: [input_size, batch_size, hidden_dim]
    # shape of self.hidden: (a, b), where a and b both 
    # have shape (num_layers, batch_size, hidden_dim).
    lstm_out, self.hidden = self.lstm(input)
        
    # Only take the output from the final timetep
    # Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction
    y_pred = self.l1(lstm_out[:,-1,:])
    y_pred=y_pred.clamp(min=0)
    y_pred=self.l2(y_pred)
    return y_pred
#         return y_pred

model = LSTM(20, 100, batch_size=128, output_dim=4, num_layers=1).to('cuda')

In [0]:
def accuracy(out,yb): return (torch.argmax(out,dim=1)==yb).float().mean()
loss_func=F.cross_entropy

In [0]:
n=len(x_train)
bs=128
def train_loop(epoch,lr):
  train_loss=[]
  val_loss=[]
  acc=[]
  opt=optim.Adam(model.parameters(),lr)
  for epoch in range(epoch):
    lol=0
    acc_tmp=0
    cnt=0
    for xb,yb in get_batches(x_train,y_train,bs):
      xb=xb.to('cuda')
      yb=yb.to('cuda') 
      pred = model(xb)
      loss = loss_func(pred, yb)
#       print(loss)
      lol+=(loss)
      loss.backward()
      opt.step()
      opt.zero_grad()
      cnt+=1
      acc_tmp+=accuracy(pred,yb)
#     val_loss.append(loss_func(model(x_val),y_val))
#     print("validation-{}".format(val_loss[epoch]))
    acc.append(acc_tmp/cnt)
    print("accuracy-{}".format(acc[-1]))
    train_loss.append(lol/cnt)
    print("train_loss-{}".format(train_loss[-1]))
  plt.plot(acc,label='accuracy')
  plt.plot(train_loss,label='train_loss')

In [0]:
train_loop(10,0.0001)

In [0]:
train_loss=[]
val_loss=[]
acc=[]
for epoch in range(1):
  lol=0
  acc_tmp=0
  cnt=0
  for xb,yb in get_batches(x_val,y_val,bs):
    xb=xb.to('cuda')
    yb=yb.to('cuda') 
    pred = model(xb)
    loss = loss_func(pred, yb)
    lol+=(loss)
    cnt+=1
    acc_tmp+=accuracy(pred,yb)
  print("accuracy-{}".format(acc_tmp/cnt))
  print("train_loss-{}".format(lol/cnt))

In [0]:
model_2 = LSTM(20, 100, batch_size=128, output_dim=4, num_layers=1).to('cuda')

In [0]:
opt=optim.Adam(model_2.parameters(),0.00003)
for i in range(10):
  xb=x_train[:128].to('cuda')
  yb=y_train[:128].to('cuda') 
  pred = model_2(xb)
  loss = loss_func(pred, y)
  loss.backward()
  opt.step()
  opt.zero_grad()
  acc_tmp=accuracy(pred,yb)
  print(loss)
  print(acc_tmp)

In [0]:
torch.cuda.is_available()

True

In [0]:
z=torch.zeros(128,912,20)

In [0]:
ya=torch.zeros(128)
for i in range(50,128):
  z[i-50]=x_train[i]
  ya[i-50]=y_train[i]

for i in range(50):
  z[78+i]=x_train[i]
  ya[78+i]=y_train[i]

In [0]:
ya[1]

tensor(2.)

In [0]:
#   z=z.to('cuda')
#   ya=ya.to('cuda') 
  pred = model_2(xb)
  loss = loss_func(pred, yb)
#   loss.backward()
#   opt.step()
#   opt.zero_grad()
  acc_tmp=accuracy(pred,yb)
  print(loss)
  print(acc_tmp)

In [0]:
data_path='/content/drive/My Drive/test_set/'
tr_mini=[data_path+'tr_test/'+i for i in os.listdir(data_path+'tr_test')]
br_mini=[data_path+'br_test/'+i for i in os.listdir(data_path+'br_test')]
mn_mini=[data_path+'mn_test/'+i for i in os.listdir(data_path+'mn_test')]
dv_mini=[data_path+'dv_test/'+i for i in os.listdir(data_path+'dv_test')]

In [0]:
data=pd.DataFrame({'fname':tr_mini,'label': [1 for i in range(len(tr_mini))]})
data=pd.concat([data,pd.DataFrame({'fname':br_mini,'label': [4 for i in range(len(br_mini))]})])
data=pd.concat([data,pd.DataFrame({'fname':dv_mini,'label': [3 for i in range(len(dv_mini))]})])
data=pd.concat([data,pd.DataFrame({'fname':mn_mini,'label': [2 for i in range(len(mn_mini))]})])

In [0]:
df = data.sample(frac=1).reset_index(drop=True)

In [0]:
class Config(object):
  
  def __init__(self,
                 lr,epochs,sample_freq=44000, audio_length=5, n_classes=4,n_mfcc=20):
    self.sample_freq=sample_freq
    self.n_classes=4
    self.n_mfcc=n_mfcc
    self.audio_length=audio_length;
    self.max_len=audio_length*sample_freq
    self.epochs=epochs
    self.lr=lr
    self.dim = (self.n_mfcc, 1 + int(np.floor(self.max_len/512)), 1)

config=Config(lr=0.0001,epochs=20) 

In [0]:
def prepare_data(df,config):
    X = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1], 1))
    input_length = config.max_len
    for i, fname in enumerate(df.fname):
        file_path =fname
        data, _ = lr.core.load(file_path, sr=config.sample_freq, res_type="kaiser_fast")

        # Random offset / Padding
        if len(data) > input_length:
            max_offset = len(data) - input_length
            offset = np.random.randint(max_offset)
            data = data[offset:(input_length+offset)]
        else:
            if input_length > len(data):
                max_offset = input_length - len(data)
                offset = np.random.randint(max_offset)
            else:
                offset = 0
            data = np.pad(data, (offset, input_length - len(data) - offset), "constant")

        data = lr.feature.mfcc(data,sr=config.sample_freq, n_mfcc=config.n_mfcc)
        data = np.expand_dims(data, axis=-1)
        X[i,] = data
    return X

In [0]:
X_train=prepare_data(df,config)

In [0]:
X_train.shape

(1930, 20, 430)

In [0]:
mean=-19.2144
std=113.9375

In [0]:
X_train=X_train.squeeze()
x=X_train.reshape(X_train.shape[0],-1)
x=torch.tensor(x)
x=x.float()

In [0]:
x=(x-mean)/std


In [0]:
Y_train=np.asarray(df.label)
y=torch.tensor(Y_train)
y=y-1

In [0]:
def accuracy(out,yb): return (torch.argmax(out,dim=1)==yb).float().mean()
loss_func=F.cross_entropy

In [0]:

model_2=torch.load('/content/drive/My Drive/error')

In [0]:
torch.cuda.is_available()

True

In [0]:
model=torch.load('finalized_2')

In [0]:
model=model.to('cuda')

In [0]:
x=x.to('cuda')
out1=model(x)
print(loss_func(out1,y.to('cuda')))
print(accuracy(out1,y.to('cuda')))

tensor(1.0119, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7430, device='cuda:0')


In [0]:
model_2=torch.load('finalized_2',map_location='cpu')
# model_2=model.to('cuda')

In [0]:
list(model_2.parameters())[-1]

Parameter containing:
tensor([ 0.0764,  0.0727,  0.0927, -0.0239], device='cuda:0',
       requires_grad=True)

In [0]:
list(model.parameters())[-1]

Parameter containing:
tensor([ 0.0764,  0.0727,  0.0927, -0.0239], device='cuda:0',
       requires_grad=True)

In [0]:
out2=model_2(x.to('cuda'))
print(loss_func(out2,y.to('cuda')))
print(accuracy(out2,y.to('cuda')))
print(out2[:3])
y[:3]

tensor(1.4120, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7311, device='cuda:0')
tensor([[-2.4097, -3.1650,  5.2534, -3.0029],
        [ 4.6440, -0.6217, -0.3443, -1.0942],
        [ 5.2288, -0.7557, -0.4082, -1.1286]], device='cuda:0',
       grad_fn=<SliceBackward>)


tensor([2, 0, 0])

In [0]:
def get_batches(y,bs):
  for n in range(0,len(y),bs):
    yield x[n:n+bs].to('cuda'),y[n:n+bs].to('cuda')

In [0]:
type(x),type(y)

(torch.Tensor, torch.Tensor)

In [0]:
batch=get_batches(y,64)

In [0]:
xb,yb=next(batch)

In [0]:
xb.shape,yb.shape



(torch.Size([64, 8600]), torch.Size([64]))

In [0]:
bs=64
def train_loop(epoch,lr):
  train_loss=[]
  val_loss=[]
  acc=[]
  opt=optim.Adam(model.parameters(),lr)
  batch=get_batches(y,bs)  
  cnt=0
  for epoch in range(epoch):
    lol=0
    
    for xb,yb in batch:
        cnt=cnt+1
        pred = model(xb)
        loss = loss_func(pred, yb)
#         print(loss)
        lol+=loss
        loss.backward()
        opt.step()
        opt.zero_grad()
#         for p in model.parameters():
#           p.register_hook(lambda grad: torch.clamp(grad, -1, 1))
#     ans=model(x_val.to('cuda'))
#     val_tmp=loss_func(ans,y_val)
#     if val_loss>val_tmp:
#       val_loss=val_tmp
#       torch.save('stop')
#     val_loss.append(val_tmp)
#     print("validation-{}".format(val_loss[epoch]))
    
#     acc.append(accuracy(ans,y_val))
#     print("accuracy-{}".format(acc[-1]))
    train_loss.append(lol/cnt)
    print("train_loss-{}".format(train_loss[-1]))
#   plt.plot(val_loss,label='val_loss')
#   plt.plot(train_loss,label='train_loss')
  plt.plot(acc,label='acc')
  plt.legend()

In [0]:
train_loop(3,0.0003)

In [0]:
fname='common_voice_mn_18579830.wav'
fname_2='common_voice_mn_18579831.wav'
fname_3='common_voice_dv_18588176.wav'

In [0]:
class Config(object):
  
  def __init__(self,sample_freq=44000, audio_length=5, n_classes=4,n_mfcc=20):
    self.sample_freq=sample_freq
    self.n_classes=4
    self.n_mfcc=n_mfcc
    self.audio_length=audio_length;
    self.max_len=audio_length*sample_freq
    self.dim = (self.n_mfcc, 1 + int(np.floor(self.max_len/512)), 1)

config=Config() 

In [0]:
file_path =fname_3 # path of file uploaded
input_length = config.max_len
data_2, _ = lr.core.load(file_path, sr=config.sample_freq, res_type="kaiser_fast")
if len(data_2) > input_length:
  max_offset = len(data_2) - input_length
  offset = np.random.randint(max_offset)
  data_2 = data_2[offset:(input_length+offset)]
else:
  if input_length > len(data_2):
    max_offset = input_length - len(data)
    offset = np.random.randint(max_offset)
  else:
    offset = 0
  data_2 = np.pad(data_2, (offset, input_length - len(data_2) - offset), "constant")

data_2 = lr.feature.mfcc(data_2,sr=config.sample_freq, n_mfcc=config.n_mfcc)
data_2 = np.expand_dims(data_2, axis=-1)

In [0]:
ans=['tr','mn','dv','bt']

In [0]:
data_2.shape

(20, 430, 1)

In [0]:
# data_2=data_2.squeeze()
data_2=data_2.reshape(1,-1)
data_2=torch.tensor(data_2)
data_2=data_2.float()

In [0]:
data_2.shape

torch.Size([1, 8600])

In [0]:
data_2=(data_2-mean)/std

In [0]:
fx=torch.cat((data_2,data_2),dim=0)

In [0]:
fx=torch.zeros(10,8600)

In [0]:
# data.shape
model.eval()
out=model_2(fx.to('cuda'))
print(out)
print(out.argmax(dim=1))
x=out.argmax(dim=1)[0]
print(x)
print(ans[x])

tensor([[ 0.0445, -0.0764, -0.0353, -0.5797],
        [ 0.0445, -0.0764, -0.0353, -0.5797]], device='cuda:0',
       grad_fn=<AddmmBackward>)
tensor([0, 0], device='cuda:0')
tensor(0, device='cuda:0')
tr
