<a href="https://colab.research.google.com/github/JordanFoss/STAT3007_Project/blob/main/Autoencoder_classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/JordanFoss/STAT3007_Project.git
%cd STAT3007_Project/

Cloning into 'STAT3007_Project'...
remote: Enumerating objects: 3888, done.[K
remote: Counting objects: 100% (166/166), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 3888 (delta 67), reused 110 (delta 32), pack-reused 3722[K
Receiving objects: 100% (3888/3888), 676.10 MiB | 16.85 MiB/s, done.
Resolving deltas: 100% (652/652), done.
Checking out files: 100% (2839/2839), done.
/content/STAT3007_Project


In [26]:
import librosa
from librosa import display
import torch
import numpy as np
from IPython.display import Audio
import glob
from pre_process import *
import os
import torch.optim as optim
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.transforms import ToTensor

import scipy
from torchsummary import summary

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np


# Denoising Autoencoder architecture

In [5]:
# Autoencoder with optimal hyper-parameters already put in
class Encoder(nn.Module):
  def __init__(self, filters = 8):
    super(Encoder, self).__init__()
    self.conv = nn.Sequential(nn.Conv2d(1, filters, kernel_size= (3,4), stride = 1),
                              nn.ReLU(),
                              nn.MaxPool2d(kernel_size = 2),
                              nn.ReLU(),
                              nn.Conv2d(filters,filters,kernel_size= (3,4), stride = 1),
                              nn.ReLU(),
                              nn.MaxPool2d(kernel_size = 2),
                              nn.Conv2d(filters,filters,kernel_size= (3,4), stride = 1),
                              nn.ReLU()
                              )
    
  def forward(self, x):
    x = self.conv(x)
    return x


class Decoder(nn.Module):
  def __init__(self, filters = 8):
    super(Decoder, self).__init__()
    (K, S) = (2, 1)
    self.conv = nn.Sequential(nn.ConvTranspose2d(filters,filters, kernel_size = (3,4)),
                              nn.ReLU(),
                              nn.Upsample(size = (61,27)),
                              nn.ConvTranspose2d(filters,filters, kernel_size = (3,4)),
                              nn.ReLU(),
                              nn.Upsample(size = (126,60)),
                              nn.ConvTranspose2d(filters,1, kernel_size = (3,4)),
                              )
    
  def forward(self, x):
    x = self.conv(x)
    return x

class Autoencoder(nn.Module):
  def __init__(self, filters = 8):
    super(Autoencoder,self).__init__()
    self.filters = filters
    self.encoder = Encoder(filters)
    self.decoder = Decoder(filters)
  def forward(self, x):
    x = self.encoder(x)
    x = self.decoder(x)
    return x

In [7]:
summary(Autoencoder().cuda(), (1,128,63))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 8, 126, 60]             104
              ReLU-2           [-1, 8, 126, 60]               0
         MaxPool2d-3            [-1, 8, 63, 30]               0
              ReLU-4            [-1, 8, 63, 30]               0
            Conv2d-5            [-1, 8, 61, 27]             776
              ReLU-6            [-1, 8, 61, 27]               0
         MaxPool2d-7            [-1, 8, 30, 13]               0
            Conv2d-8            [-1, 8, 28, 10]             776
              ReLU-9            [-1, 8, 28, 10]               0
          Encoder-10            [-1, 8, 28, 10]               0
  ConvTranspose2d-11            [-1, 8, 30, 13]             776
             ReLU-12            [-1, 8, 30, 13]               0
         Upsample-13            [-1, 8, 61, 27]               0
  ConvTranspose2d-14            [-1, 8,

# Classification Networks

## 1. CNN

In [13]:
# input: batch_sizex8x28x10
#output: batch_sizex5
class ConvClassification(nn.Module):
    def __init__(self, input_channel,contain_linear = False, filter_num = 6):
        super(ConvClassification, self).__init__()
        self.flatten = nn.Flatten()
        self.conv1 = nn.Conv2d(input_channel, filter_num, kernel_size = (2,3))
        self.conv2 = nn.Sequential(nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1,2), stride = 2),
            nn.Dropout(0.25),
            nn.Conv2d(filter_num, 4, kernel_size = (2,3)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1,2), stride = 2),
        )

        self.contain_linear = contain_linear

        if contain_linear:
          self.linear = nn.Sequential(
              nn.Linear(4*7, 10),
              nn.Linear(10, 5),
          )

    def forward(self, x, inspect_feature = False):

      first_layer = self.conv1(x)
      conv_x = self.conv2(first_layer)

      output_x = conv_x
      if self.contain_linear:
        conv_x_flat  = self.flatten(conv_x)
        output_x = self.linear(conv_x_flat)
      
      if inspect_feature:
        return first_layer,conv_x,output_x
      return output_x

In [14]:
summary(ConvClassification(8, contain_linear= True).cuda(),(8,28,10))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1             [-1, 6, 27, 8]             294
              ReLU-2             [-1, 6, 27, 8]               0
         MaxPool2d-3             [-1, 6, 14, 4]               0
           Dropout-4             [-1, 6, 14, 4]               0
            Conv2d-5             [-1, 4, 13, 2]             148
              ReLU-6             [-1, 4, 13, 2]               0
         MaxPool2d-7              [-1, 4, 7, 1]               0
           Flatten-8                   [-1, 28]               0
            Linear-9                   [-1, 10]             290
           Linear-10                    [-1, 5]              55
Total params: 787
Trainable params: 787
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.03
Params size (MB): 0.00
Estimated Total Siz

## 2. FC Linear network

In [24]:
class LinearClassification(nn.Module):
  def __init__(self):
    super(LinearClassification, self).__init__()

    self.flatten = nn.Flatten()
    self.linear = nn.Sequential(nn.Linear(8*28*10, 1024),
                                nn.Sigmoid(),
                                nn.Linear(1024,5))
    
  def forward(self,x):
    x_flat = self.flatten(x)

    pred = self.linear(x_flat)

    return pred
    

In [25]:
summary(LinearClassification().cuda(),(8,28,10))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                 [-1, 2240]               0
            Linear-2                 [-1, 1024]       2,294,784
           Sigmoid-3                 [-1, 1024]               0
            Linear-4                    [-1, 5]           5,125
Total params: 2,299,909
Trainable params: 2,299,909
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.03
Params size (MB): 8.77
Estimated Total Size (MB): 8.81
----------------------------------------------------------------


## 3. Logistic regression

In [None]:

# building a naive bayes classifier
def classification_model(X_train,y_train, X_test, y_test):
  NB = MultinomialNB()
  NB.fit(X_train,y_train)
  y_pred = NB.predict(X_test)

  # metrics
  # count correctly classified samples
  correct_count = np.count_nonzero(y_pred == y_test)
  accuracy = metrics.accuracy_score(y_test,y_pred)
  balanced_accuracy = metrics.balanced_accuracy_score(y_test,y_pred)

  print('total sample counts:',y_pred.shape[0])
  print('correct classification counts:',correct_count,'(',accuracy * 100,'%)')
  print('{0}: {1:.1%}'.format('balanced accuracy', balanced_accuracy))
  print('misclasification counts:',y_pred.shape[0] - correct_count,'(',(1 - accuracy)*100,'%)')

  return NB

# next step: training procedure

In [None]:
spectro_data = DatasetWrapper(X,y)
train_size = int(X.shape[0] * 0.7)
test_size = X.shape[0] - train_size

loss = nn.MSELoss()

data_test, data_train = random_split(spectro_data,[test_size,train_size], generator = torch.Generator().manual_seed(10))
nepoch = 100
batch_size = 30

torch.manual_seed(69)
AE = Autoencoder()

trained_net = train_model(data_train,AE, loss, nepoch = nepoch,lr = 0.01, batch_size = batch_size, use_cuda = True,print_output = True)

In [None]:
Xc = []
yc = []
target_map = {'02':0,'03':1,'04':2,'05':3,'08':4}

for file in glob.glob(os.getcwd() + '/sample-noisy-speech-actor-11/*.wav'):
  name = file.split('/')[-1]
  emotion = name.split('-')[0]

  samples, sampling_rate = librosa.load(file, sr = 16000)
  spec = data_gen(samples, sampling_rate, 2)

  Xc.append(spec)
  yc.append(target_map[emotion])

Xc = torch.tensor(Xc)
Xc = Xc.reshape(Xc.shape[0],1,Xc.shape[1],Xc.shape[2])

X_feature = feature_net.to(torch.device('cpu'))(Xc)

yc = torch.tensor(yc)
yc = yc.type(torch.LongTensor)

In [None]:
X_feature = torch.tensor(X_feature.detach().numpy())

In [None]:
classification_data = DatasetWrapper(X_feature,yc)
train_size = int(X_feature.shape[0] * 0.7)
test_size = X_feature.shape[0] - train_size

loss = nn.CrossEntropyLoss()

data_test, data_train = random_split(classification_data,[test_size,train_size], generator = torch.Generator().manual_seed(10))
nepoch = 100
batch_size = 30

torch.manual_seed(69)
CNN = ConvNet(input_channel = 16, contain_linear= True).cuda()



trained_net = train_model(data_train,CNN, loss, nepoch = nepoch,lr = 0.01, batch_size = batch_size, use_cuda = True,print_output = True, classification = True)

epoch: 0
loss: 6.794913929297763e-07
------------
epoch: 1
loss: 0.6743510961532593
------------
epoch: 2
loss: 0.0003673128376249224
------------
epoch: 3
loss: 0.0
------------
epoch: 4
loss: 0.0
------------
epoch: 5
loss: 0.0
------------
epoch: 6
loss: 12.359458923339844
------------
epoch: 7
loss: 0.0
------------
epoch: 8
loss: 16.698596954345703
------------
epoch: 9
loss: 0.0
------------
epoch: 10
loss: 72.20507049560547
------------
epoch: 11
loss: 0.0
------------
epoch: 12
loss: 0.0
------------
epoch: 13
loss: 0.0
------------
epoch: 14
loss: 0.0
------------
epoch: 15
loss: 0.0
------------
epoch: 16
loss: 0.0
------------
epoch: 17
loss: 0.0
------------
epoch: 18
loss: 0.0
------------
epoch: 19
loss: 0.0
------------
epoch: 20
loss: 0.0
------------
epoch: 21
loss: 0.0
------------
epoch: 22
loss: 0.0
------------
epoch: 23
loss: 0.0
------------
epoch: 24
loss: 0.0
------------
epoch: 25
loss: 0.0
------------
epoch: 26
loss: 0.0
------------
epoch: 27
loss: 0.0
----

In [None]:
X_feature.shape

torch.Size([790, 16, 28, 12])

In [None]:
X_feature_flat = torch.flatten(X_feature,start_dim = 1)
classification_data = DatasetWrapper(X_feature_flat,yc)
train_size = int(X_feature.shape[0] * 0.7)
test_size = X_feature.shape[0] - train_size

data_test, data_train = random_split(classification_data,[test_size,train_size], generator = torch.Generator().manual_seed(10))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np

# building a naive bayes classifier
def classification_model(X_train,y_train, X_test, y_test):
  NB = MultinomialNB()
  NB.fit(X_train,y_train)
  y_pred = NB.predict(X_test)

  # metrics
  # count correctly classified samples
  correct_count = np.count_nonzero(y_pred == y_test)
  accuracy = metrics.accuracy_score(y_test,y_pred)
  balanced_accuracy = metrics.balanced_accuracy_score(y_test,y_pred)

  print('total sample counts:',y_pred.shape[0])
  print('correct classification counts:',correct_count,'(',accuracy * 100,'%)')
  print('{0}: {1:.1%}'.format('balanced accuracy', balanced_accuracy))
  print('misclasification counts:',y_pred.shape[0] - correct_count,'(',(1 - accuracy)*100,'%)')

  return NB

In [None]:
training_pixel_feature,training_pixel_target = data_train.dataset.get_data()
test_pixel_feature,test_pixel_target = data_test.dataset.get_data()


NB_pixel = classification_model(training_pixel_feature.detach().numpy(),training_pixel_target.detach().numpy(),test_pixel_feature.detach().numpy(),test_pixel_target.detach().numpy())

total sample counts: 790
correct classification counts: 783 ( 99.1139240506329 %)
balanced accuracy: 99.1%
misclasification counts: 7 ( 0.8860759493670933 %)
