# Installs

In [None]:
!pip install kaggle



In [1]:
!pip install torch torchvision torchaudio

Collecting torchaudio
  Downloading torchaudio-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.2 MB/s 
  Downloading torchaudio-0.9.1-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 59.4 MB/s 
[?25h  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 49.8 MB/s 
[?25hInstalling collected packages: torchaudio
Successfully installed torchaudio-0.9.0


In [2]:
!pip install allosaurus

Collecting allosaurus
  Downloading allosaurus-1.0.2-py3-none-any.whl (52 kB)
[?25l[K     |██████▎                         | 10 kB 32.0 MB/s eta 0:00:01[K     |████████████▋                   | 20 kB 10.3 MB/s eta 0:00:01[K     |██████████████████▉             | 30 kB 8.8 MB/s eta 0:00:01[K     |█████████████████████████▏      | 40 kB 8.3 MB/s eta 0:00:01[K     |███████████████████████████████▌| 51 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████████████| 52 kB 709 kB/s 
Collecting panphon
  Downloading panphon-0.19-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 471 kB/s 
Collecting munkres
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Collecting unicodecsv
  Downloading unicodecsv-0.14.1.tar.gz (10 kB)
Building wheels for collected packages: unicodecsv
  Building wheel for unicodecsv (setup.py) ... [?25l[?25hdone
  Created wheel for unicodecsv: filename=unicodecsv-0.14.1-py3-none-any.whl size=10765 sha256=ed73a58b14eba40

In [None]:
!pip install tqdm



In [None]:
!ls -a

.   .config  iemocap_full_dataset.csv  Ses01F_script02_1_F000.wav
..  drive    sample_data


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [4]:
#These libraries help to interact with the operating system and the runtime environment respectively
import os
import sys

#Model/Training related libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

#Dataloader libraries
from torch.utils.data import DataLoader, Dataset

# Transforms and datasets
import torchvision.transforms as transforms
import torchvision.datasets as dset

import time
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from tqdm import tqdm

# Allosaurus
from allosaurus.audio import read_audio
from allosaurus.app import read_recognizer
from allosaurus.am.utils import *

In [5]:
torch.cuda.is_available()

True

In [6]:
torch.cuda.current_device()

0

# Process IEMOCAP dataset csv

In [7]:
df = pd.read_csv("iemocap_full_dataset.csv")
df

Unnamed: 0,session,method,gender,emotion,n_annotators,agreement,path
0,1,script,F,neu,3,3,Session1/sentences/wav/Ses01F_script02_1/Ses01...
1,1,script,F,fru,3,2,Session1/sentences/wav/Ses01F_script02_1/Ses01...
2,1,script,F,xxx,0,0,Session1/sentences/wav/Ses01F_script02_1/Ses01...
3,1,script,F,sur,3,2,Session1/sentences/wav/Ses01F_script02_1/Ses01...
4,1,script,F,neu,3,2,Session1/sentences/wav/Ses01F_script02_1/Ses01...
...,...,...,...,...,...,...,...
10034,5,impro,F,neu,3,2,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...
10035,5,impro,F,neu,3,2,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...
10036,5,impro,F,neu,3,2,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...
10037,5,impro,F,neu,3,2,Session5/sentences/wav/Ses05F_impro06/Ses05F_i...


In [8]:
df = df[df.emotion != 'xxx']  # only keep data that has emotion label
# only keep 'neu', 'hap', 'sad', 'ang' labels
df = df.drop(df[~ ((df.emotion == 'neu') | (df.emotion == 'hap') | (df.emotion == 'sad') | (df.emotion == 'ang'))].index)

df_unedit = df.copy()
df_unedit["path"] = df_unedit["path"].apply(lambda x : x.split('/')[-1])
all_files = list(df_unedit.path)
file_to_emotion = dict(zip(df_unedit.path, df_unedit.emotion))

all_full_files = list(df.path)
print(df)
print(df_unedit)
print(len(file_to_emotion))
print(file_to_emotion)
print(all_full_files)

       session  ...                                               path
0            1  ...  Session1/sentences/wav/Ses01F_script02_1/Ses01...
4            1  ...  Session1/sentences/wav/Ses01F_script02_1/Ses01...
6            1  ...  Session1/sentences/wav/Ses01F_script02_1/Ses01...
7            1  ...  Session1/sentences/wav/Ses01F_script02_1/Ses01...
8            1  ...  Session1/sentences/wav/Ses01F_script02_1/Ses01...
...        ...  ...                                                ...
10034        5  ...  Session5/sentences/wav/Ses05F_impro06/Ses05F_i...
10035        5  ...  Session5/sentences/wav/Ses05F_impro06/Ses05F_i...
10036        5  ...  Session5/sentences/wav/Ses05F_impro06/Ses05F_i...
10037        5  ...  Session5/sentences/wav/Ses05F_impro06/Ses05F_i...
10038        5  ...  Session5/sentences/wav/Ses05F_impro06/Ses05F_i...

[4490 rows x 7 columns]
       session  method  ... agreement                        path
0            1  script  ...         3  Ses01F_script02_1_

In [9]:
from collections import Counter

# get unique emotions
# emotion_to_label = {'neu': 0, 'fru': 1, 'sad': 2, 'sur': 3, 'ang': 4, 'hap': 5, 'exc': 6, 'fea': 7, 'dis': 8, 'oth': 9}
emotion_to_label = {'neu': 0, 'hap': 1, 'sad': 2, 'ang': 3}
label_to_emotion = {v: k for k, v in emotion_to_label.items()}
print(emotion_to_label)
print(label_to_emotion)

# counter number of class instances
emotion_instances_list = [v for v in file_to_emotion.values()]
counter = Counter(emotion_instances_list)
print(counter)

{'neu': 0, 'hap': 1, 'sad': 2, 'ang': 3}
{0: 'neu', 1: 'hap', 2: 'sad', 3: 'ang'}
Counter({'neu': 1708, 'ang': 1103, 'sad': 1084, 'hap': 595})


In [10]:
file_to_label = {k: emotion_to_label[v] for k, v in file_to_emotion.items()}
print(file_to_label)

{'Ses01F_script02_1_F000.wav': 0, 'Ses01F_script02_1_F004.wav': 0, 'Ses01F_script02_1_F006.wav': 3, 'Ses01F_script02_1_F007.wav': 3, 'Ses01F_script02_1_F008.wav': 0, 'Ses01F_script02_1_F009.wav': 1, 'Ses01F_script02_1_F010.wav': 2, 'Ses01F_script02_1_F015.wav': 0, 'Ses01F_script02_1_F024.wav': 0, 'Ses01F_script02_1_F025.wav': 0, 'Ses01F_script02_1_M007.wav': 0, 'Ses01F_script02_1_M008.wav': 0, 'Ses01F_script02_1_M009.wav': 0, 'Ses01F_script02_1_M010.wav': 0, 'Ses01F_script02_1_M011.wav': 0, 'Ses01F_script02_1_M013.wav': 0, 'Ses01F_script02_1_M014.wav': 0, 'Ses01F_script02_1_M015.wav': 0, 'Ses01F_script02_1_M020.wav': 0, 'Ses01F_script02_1_M021.wav': 0, 'Ses01F_script02_1_M023.wav': 0, 'Ses01F_script02_1_M024.wav': 0, 'Ses01F_script02_1_M025.wav': 0, 'Ses01F_script02_1_M026.wav': 0, 'Ses01F_script02_1_M029.wav': 0, 'Ses01F_script02_1_M031.wav': 0, 'Ses01F_script02_1_M032.wav': 0, 'Ses01F_script02_1_M033.wav': 0, 'Ses01F_script02_1_M034.wav': 0, 'Ses01F_script02_1_M036.wav': 1, 'Ses01F_s

# Data Processing and Dataset

In [None]:
!ls

drive  iemocap_full_dataset.csv  sample_data


In [11]:
data_dir = os.path.join("drive", "MyDrive", "18786 IDL", "IDL Project", "data", "IEMOCAP_full_release")
# data_dir = os.path.join("drive", "MyDrive", "IDL Project", "data", "IEMOCAP_full_release")
print(data_dir)

drive/MyDrive/18786 IDL/IDL Project/data/IEMOCAP_full_release


In [12]:
recognizer = read_recognizer()

downloading model  latest
from:  https://github.com/xinjli/allosaurus/releases/download/v1.0/latest.tar.gz
to:    /usr/local/lib/python3.7/dist-packages/allosaurus/pretrained
please wait...


In [13]:
class MyDataset(Dataset):
    def __init__(self, file_list, target_list):
        
        self.file_list = file_list
        self.target_list = target_list
        self.num_classes = len(list(set(target_list)))

        # self.recognizer = read_recognizer()

        # feats, feat_lens = [], []
        # for file in tqdm(file_list):
            
        #     feat = torch.tensor(recognizer.pm.compute(read_audio(file))) # batch, len, features
        #     feat_len = torch.tensor(np.array([feat.shape[0]], dtype=np.int32)) # 1D array
            
        #     feats.append(feat)
        #     feat_lens.append(feat_len)
            

        # feats = pad_sequence(feats,batch_first=True,padding_value=0) # batch,features,len
        # feat_lens = pad_sequence(feat_lens,batch_first=True,padding_value=0).squeeze()
        # idx = torch.argsort(feat_lens,descending=True) # sorting the input in descending order as required by the lstms in AM.
        # self.y = np.array(self.target_list)[idx].tolist()   # reorder
        # tensor_batch_feat, tensor_batch_feat_len = move_to_tensor([feats[idx], feat_lens[idx]], device_id=-1) # converting to the required tensors

        # # Features
        # output_tensor, input_lengths = recognizer.am(tensor_batch_feat, tensor_batch_feat_len, return_lstm=True) # output_shape: [len,batch,features]
        # assert(len(file_list) == output_tensor.shape[1])

        # self.x = output_tensor
        self.x = file_list
        self.y = target_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, index):
        # x = self.x[:, index, :]
        # y = self.y[index]
        # y = torch.Tensor([y])
        # print("inside get item")
        filepath = self.file_list[index]
        x = torch.tensor(recognizer.pm.compute(read_audio(filepath)))
        x = x.detach()
        x_len = torch.tensor(np.array([x.shape[0]], dtype=np.int32))
        x_len = x_len.detach()
        y = torch.Tensor([self.target_list[index]])
        return x, x_len, y

In [14]:
# collate function
def pad_collate(batch):
    # print("inside collate")
    # batch looks like [(x0, xlen0, y0), (x4, xlen4, y4), (x2, xlen2, y2)... ]
    feats = [sample[0] for sample in batch]
    feat_lens = [sample[1] for sample in batch]
    target_list = torch.Tensor([sample[2] for sample in batch])

    feats = pad_sequence(feats, batch_first=True, padding_value=0) # batch, features, len
    feat_lens = pad_sequence(feat_lens, batch_first=True, padding_value=0).squeeze()
    idx = torch.argsort(feat_lens, descending=True) # sorting the input in descending order as required by the lstms in AM.

    # reorder
    # tensor_batch_feat = feats[idx]
    # tensor_batch_feat_len = feat_lens[idx]
    targets = target_list[idx]
    tensor_batch_feat, tensor_batch_feat_len = move_to_tensor([feats[idx], feat_lens[idx]], device_id=-1) # converting to the required tensors

    # Features
    output_tensor, input_lengths = recognizer.am(tensor_batch_feat, tensor_batch_feat_len, return_lstm=True) # output_shape: [len,batch,features]
    output_tensor = output_tensor.detach()
    input_lengths = input_lengths.detach()
    
    return output_tensor, input_lengths, targets

In [None]:
def parse_data(data_dir):
    all_file_paths = []  # full file paths from drive to wav file

    for root, directories, filenames in os.walk(data_dir):
            for filename in filenames:
                if filename.endswith('.wav') and filename[0] != '.' and filename in all_files:
                    filei = os.path.join(root, filename)
                    all_file_paths.append(filei)

    return all_file_paths

# assert(len(all_file_paths) == len(all_files))

## random shuffle data order for train, val, test split

In [15]:
# all_file_paths = parse_data(data_dir)
all_file_paths = [os.path.join("drive", "MyDrive", "18786 IDL", "IDL Project", "data", "IEMOCAP_full_release", file_path) for file_path in all_full_files]
# all_file_paths = [os.path.join("drive", "MyDrive", "IDL Project", "data", "IEMOCAP_full_release", file_path) for file_path in all_full_files]
total_instances = len(all_file_paths)
print(total_instances)

4490


In [16]:
num_train = round(0.7 * total_instances)
num_test_all = total_instances - num_train
num_val = round(0.5 * num_test_all)
num_test = num_test_all - num_val

print("number training instances:", str(num_train))
print("number validation instances:", str(num_val))
print("number test instances:", str(num_test))
assert(num_train + num_val + num_test == total_instances)

number training instances: 3143
number validation instances: 674
number test instances: 673


In [17]:
# shuffle data
import random
random.seed(420)

shuffled_data_paths = random.sample(all_file_paths, k=total_instances)
train_list_paths = shuffled_data_paths[:num_train]
testall_list_paths = shuffled_data_paths[num_train:]
val_list_paths = testall_list_paths[:num_val]
test_list_paths = testall_list_paths[num_val:]

assert(len(train_list_paths) + len(val_list_paths) + len(test_list_paths) == total_instances)

# train, val, test variables:
# train_list_paths
# val_list_paths
# test_list_paths

In [18]:
# get corresponding labels for data
train_list_labels = [file_to_label[filepath.split('/')[-1]] for filepath in train_list_paths]
val_list_labels = [file_to_label[filepath.split('/')[-1]] for filepath in val_list_paths]
test_list_labels = [file_to_label[filepath.split('/')[-1]] for filepath in test_list_paths]

assert(len(train_list_labels) == len(train_list_paths))
assert(len(val_list_labels) == len(val_list_paths))
assert(len(test_list_labels) == len(test_list_paths))

## Create datasets and dataloaders

In [19]:
batch_size = 32

In [None]:
# train dataloader
train_dset = MyDataset(train_list_paths, train_list_labels)
train_args = dict(shuffle=True, batch_size=batch_size, num_workers=2, collate_fn=pad_collate, drop_last=True)  # change to num_workers=4 on diff platform
train_loader = DataLoader(train_dset, **train_args)

In [None]:
# val dataloader
val_dset = MyDataset(val_list_paths, val_list_labels)
val_args = dict(shuffle=False, batch_size=batch_size, num_workers=2, collate_fn=pad_collate, drop_last=True)
val_loader = DataLoader(val_dset, **val_args)

## Check a batch from dataloader

In [None]:
test_batch = next(iter(train_loader))

In [None]:
x, x_len, y = test_batch
print(x.shape)  # seq_len, batch_size, input_size
print(x_len)
print(y)

torch.Size([371, 32, 640])
tensor([371, 315, 284, 255, 241, 231, 202, 195, 194, 194, 179, 165, 164, 151,
        150, 149, 136, 133, 119, 117, 117, 113,  96,  90,  80,  76,  73,  73,
         64,  63,  62,  54], dtype=torch.int32)
tensor([0., 2., 3., 0., 2., 0., 0., 0., 1., 2., 2., 1., 0., 3., 0., 3., 0., 2.,
        2., 0., 2., 2., 1., 2., 2., 3., 0., 3., 1., 3., 0., 0.])


# Test Allosaurus

In [None]:
recognizer = read_recognizer()
wav_paths = ["Ses01F_script02_1_F000.wav", "Ses01F_script02_1_F001.wav"]

feats, feat_lens = [], []
for wav_path in wav_paths:
    
    feat = torch.tensor(recognizer.pm.compute(read_audio(wav_path))) # batch, len, features
    feat_len = torch.tensor(np.array([feat.shape[0]], dtype=np.int32)) # 1D array

    print(feat.shape)
    print(feat_len)
    
    feats.append(feat)
    feat_lens.append(feat_len)
    

feats = pad_sequence(feats,batch_first=True,padding_value=0) # batch,features,len
feat_lens = pad_sequence(feat_lens,batch_first=True,padding_value=0).squeeze()
idx = torch.argsort(feat_lens,descending=True) # sorting the input in descending order as required by the lstms in AM.
tensor_batch_feat, tensor_batch_feat_len = move_to_tensor([feats[idx], feat_lens[idx]], recognizer.config.device_id) # converting to the required tensors

# Features
output_tensor, input_lengths = recognizer.am(tensor_batch_feat, tensor_batch_feat_len, return_lstm=True) # output_shape: [len,batch,features]

torch.Size([69, 120])
tensor([69], dtype=torch.int32)
torch.Size([50, 120])
tensor([50], dtype=torch.int32)


In [None]:
print(output_tensor.shape)
print(input_lengths)

torch.Size([69, 2, 640])
tensor([69, 50], dtype=torch.int32)


## see how many test files produce 0 length phonetic transcriptions (don't run again)

In [None]:
from allosaurus.app import read_recognizer

# load your model
model = read_recognizer()

num_no_transcription = 0

# run inference -> æ l u s ɔ ɹ s
for file in tqdm(all_file_paths):
    output = model.recognize(file)
    if len(output) == 0:
        num_no_transcription += 1

print(f"number of data instances with no phonetic transcription: {num_no_transcription}")
print(f"total number of data instances: {len(all_file_paths)}")

100%|██████████| 7532/7532 [1:21:29<00:00,  1.54it/s]

number of data instances with no phonetic transcription: 53
total number of data instances: 7532





# Models

In [24]:
class BaseLSTM(nn.Module):
    def __init__(self, num_layers, num_classes, input_size, hidden_size, dropout, bidirectional=False):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        
        # self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=False)

        self.linear = nn.Linear(in_features=2 * hidden_size if bidirectional else hidden_size, 
                                out_features=num_classes)
        
    def forward(self, x, lengths=None):
        # input = self.embed(x)
        
        # batch_size = input.size(0)
        # input = input.transpose(1,2)    # (B,T,H) -> (B,H,T)

        # cnn_output = torch.cat([self.cnn(input), self.cnn2(input), self.cnn3(input)], dim=1)

        # input = F.relu(self.batchnorm(cnn_output))

        # input = input.transpose(1,2)

        _, (hn, cn) = self.lstm(x)

        # pack_tensor = nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=True)
        # _, (hn, cn) = self.lstm(pack_tensor)

        if self.bidirectional:
            h_n = hn.view(self.num_layers, 2, batch_size, self.hidden_size)
            h_n = torch.cat([ h_n[-1, 0,:], h_n[-1,1,:] ], dim = 1)
        else:
            h_n = hn[-1]
        
        logits = self.linear(h_n)

        return logits

In [None]:
class BaseLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(BaseLSTM, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [None]:
class BidirectionalLSTM(nn.Module):
    def __init__(self, hidden_size, nlayers, out_size=47, embed_size=40):
        super(BidirectionalLSTM, self).__init__()
        self.nlayers = nlayers
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.out_size = out_size
        self.cnns = torch.nn.Sequential(
            nn.Conv1d(self.embed_size, self.hidden_size, 3, padding=1, bias=False),
            nn.BatchNorm1d(self.hidden_size),
            nn.ReLU(inplace=True))
        self.rnns = nn.LSTM(input_size=self.hidden_size,
                            hidden_size=self.hidden_size,
                            num_layers=3,
                            bias=True,
                            batch_first=True,
                            dropout=0.2, # regularization
                            bidirectional=True)
        self.hidden2label = torch.nn.Sequential(
            nn.Linear(self.hidden_size*2, self.hidden_size),
            nn.Linear(self.hidden_size, self.out_size))
    def forward(self, x, xLens): # x dim (B, T_in, C_in=40)
        x_cnn_input = x.permute(0, 2, 1) # (B, C_in, T_in)
        x_post_cnn = self.cnns(x_cnn_input) # (B, C_out, T_out)
        x_rnn_in = x_post_cnn.permute(2, 0, 1) # (T, B, C_out)
        x_packed = pack_padded_sequence(x_rnn_in, xLens, enforce_sorted=False)
        out_packed, hidden = self.rnns(x_packed)
        out, out_lens = pad_packed_sequence(out_packed, batch_first=True) # (B, T, C)
        
        # Log softmax after output layer is required since nn.CTCLoss expect log prob
        out_prob = self.hidden2label(out).log_softmax(2) # (B, T, Classes=47)
        
        # Permute to fit for input format of CTCLoss
        out_prob = out_prob.permute(1, 0, 2) #torch.transpose(out_prob, 0, 1) # (T, B, C)
        
        # TODO: calculate new xLens
        return out_prob, xLens

In [None]:
class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)

        return outputs

# Instantiate Model 

In [None]:
run_num = 4
batch_size = 32
# lr = 0.01
lr = 0.001
weight_decay = 5e-5
# weight_decay = 0.0001
num_epochs = 40
# in_features = 3 # RGB channels
# momentum = 0.9

num_classes = 4  # 'neu', 'hap', 'sad', 'ang'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
model = BaseLSTM(num_layers=3, num_classes=num_classes, input_size=640, hidden_size=256, dropout=0.1, bidirectional=True)

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
print(model)

# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(network.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum, nesterov=False)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=2, threshold=0.04, threshold_mode='abs', verbose=True)  # used for up to run 8 (inclusive)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.2, patience=3, threshold=0.04, threshold_mode='abs', verbose=True)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.5, verbose=True)

BaseLSTM(
  (lstm): LSTM(640, 256, num_layers=3, dropout=0.1, bidirectional=True)
  (linear): Linear(in_features=512, out_features=4, bias=True)
)
Adjusting learning rate of group 0 to 1.0000e-03.


# Training

In [None]:
# Train!
for epoch in range(num_epochs):
    
    # Train
    model.train()
    avg_loss = 0.0
    
    start_time = time.time()
    train_num_correct = 0
    
    for batch_num, (xbatch, xlen, ybatch) in enumerate(train_loader, 0):
        assert(xbatch.shape[2] == 640)
        optimizer.zero_grad()
        
        xbatch, ybatch = xbatch.to(device), ybatch.to(device)

#         outputs, _ = network(x)  # returns output, embeddings_out
        logits = model(xbatch)  # returns output, embeddings_out_norelu, embeddings_out_relu
#         print("outputs:", outputs.shape)
#         print("argmax of output:", torch.argmax(outputs, axis=1).shape)
#         print("y long:", y.long().shape)
        train_num_correct += (torch.argmax(logits, axis=1) == ybatch).sum().item()

        loss = criterion(logits, ybatch.long())
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        training_loss = avg_loss

        # if batch_num % 5 == 0:
        #     print("5 batches have passed")

        if batch_num % 10 == 9:
            print('Epoch: {}\tBatch: {}\tAvg-Loss: {:.4f}'.format(epoch, batch_num + 1, avg_loss / 10))
            training_loss = avg_loss / 10
            avg_loss = 0.0
            
    stop_time = time.time()
    print(f"Training Time {stop_time - start_time} seconds")
    
    train_acc = train_num_correct / len(train_dset)
    print(f"Training Acc: {train_acc}")
    
    # Validate
    model.eval()
    num_correct = 0
    
    start_time = time.time()
    
    # Validation
    for batch_num, (x, xlen, y) in enumerate(val_loader, 0):
        x, y = x.to(device), y.to(device)
#         outputs, _ = network(x)  # returns output, embeddings
        logits = model(x)  # returns output, embeddings_out_norelu, embeddings_out_relu
        num_correct += (torch.argmax(logits, axis=1) == y).sum().item()
    
    val_acc = num_correct / len(val_dset)
        
    print('Epoch: {}, Validation Accuracy: {:.2f}'.format(epoch, val_acc))
    
    stop_time = time.time()
    print(f"Validation Time {stop_time - start_time} seconds")
    
    # scheduler
    # scheduler.step(val_acc)  # don't use with StepLR
    scheduler.step()
    
    # save model
    print("SAVING CHECKPOINT")
    save_path = os.path.join("drive", "MyDrive", "18786 IDL", "IDL Project", "saved_models", f"run{run_num}", f"epoch{epoch}_batchsize{batch_size}_lr{lr}.pth")
    # save_path = os.path.join("drive", "MyDrive", "IDL Project", "saved_models", f"run{run_num}", f"epoch{epoch + num_epochs}_batchsize{batch_size}_lr{lr}.pth")
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': training_loss,
            'train_acc': train_acc,
            'val_acc': val_acc,
            'scheduler_state_dict': scheduler.state_dict()
        }, save_path)

Epoch: 0	Batch: 10	Avg-Loss: 1.3536
Epoch: 0	Batch: 20	Avg-Loss: 1.2892
Epoch: 0	Batch: 30	Avg-Loss: 1.2581
Epoch: 0	Batch: 40	Avg-Loss: 1.2758
Epoch: 0	Batch: 50	Avg-Loss: 1.2511
Epoch: 0	Batch: 60	Avg-Loss: 1.2654
Epoch: 0	Batch: 70	Avg-Loss: 1.2016
Epoch: 0	Batch: 80	Avg-Loss: 1.1765
Epoch: 0	Batch: 90	Avg-Loss: 1.2031
Training Time 843.5502893924713 seconds
Training Acc: 0.4209354120267261
Epoch: 0, Validation Accuracy: 0.44
Validation Time 193.56710290908813 seconds
Adjusting learning rate of group 0 to 1.0000e-03.
SAVING CHECKPOINT
Epoch: 1	Batch: 10	Avg-Loss: 1.1872
Epoch: 1	Batch: 20	Avg-Loss: 1.1651
Epoch: 1	Batch: 30	Avg-Loss: 1.1624
Epoch: 1	Batch: 40	Avg-Loss: 1.1815
Epoch: 1	Batch: 50	Avg-Loss: 1.1696
Epoch: 1	Batch: 60	Avg-Loss: 1.1581
Epoch: 1	Batch: 70	Avg-Loss: 1.2127
Epoch: 1	Batch: 80	Avg-Loss: 1.0790
Epoch: 1	Batch: 90	Avg-Loss: 1.2226
Training Time 512.1933341026306 seconds
Training Acc: 0.47120585427935097
Epoch: 1, Validation Accuracy: 0.49
Validation Time 114.30

## Resume Training if needed

In [None]:
resume_net_pth = os.path.join("drive", "MyDrive", "18786 IDL", "IDL Project", "saved_models", "run1", "epoch2_batchsize64_lr0.001.pth")
# resume_net_pth = os.path.join("drive", "MyDrive", "IDL Project", "saved_models", "run1", "epoch2_batchsize64_lr0.001.pth")
checkpoint = torch.load(resume_net_pth)
model = model.to(device)

model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
scheduler.load_state_dict(checkpoint["scheduler_state_dict"])

add_epochs = 40

In [None]:
for epoch in range(add_epochs):
    
    # Train
    model.train()
    avg_loss = 0.0
    
    start_time = time.time()
    train_num_correct = 0
    
    for batch_num, (xbatch, xlen, ybatch) in enumerate(train_loader, 0):
        assert(xbatch.shape[2] == 640)
        optimizer.zero_grad()
        
        xbatch, ybatch = xbatch.to(device), ybatch.to(device)

#         outputs, _ = network(x)  # returns output, embeddings_out
        logits = model(xbatch)  # returns output, embeddings_out_norelu, embeddings_out_relu
#         print("outputs:", outputs.shape)
#         print("argmax of output:", torch.argmax(outputs, axis=1).shape)
#         print("y long:", y.long().shape)
        train_num_correct += (torch.argmax(logits, axis=1) == ybatch).sum().item()

        loss = criterion(logits, ybatch.long())
        loss.backward()
        optimizer.step()

        avg_loss += loss.item()
        training_loss = avg_loss

        if batch_num % 5 == 0:
            print("5 batches have passed")

        if batch_num % 10 == 9:
            print('Epoch: {}\tBatch: {}\tAvg-Loss: {:.4f}'.format(epoch, batch_num + 1, avg_loss / 10))
            training_loss = avg_loss / 10
            avg_loss = 0.0
            
    stop_time = time.time()
    print(f"Training Time {stop_time - start_time} seconds")
    
    train_acc = train_num_correct / len(train_dset)
    print(f"Training Acc: {train_acc}")
    
    # Validate
    model.eval()
    num_correct = 0
    
    start_time = time.time()
    
    # Validation
    for batch_num, (x, xlen, y) in enumerate(val_loader, 0):
        x, y = x.to(device), y.to(device)
#         outputs, _ = network(x)  # returns output, embeddings
        logits = model(x)  # returns output, embeddings_out_norelu, embeddings_out_relu
        num_correct += (torch.argmax(logits, axis=1) == y).sum().item()
    
    val_acc = num_correct / len(val_dset)
        
    print('Epoch: {}, Validation Accuracy: {:.2f}'.format(epoch + num_epochs, val_acc))
    
    stop_time = time.time()
    print(f"Validation Time {stop_time - start_time} seconds")
    
    # scheduler
    scheduler.step(val_acc)
    
    # save model
    print("SAVING CHECKPOINT")
    save_path = os.path.join("drive", "MyDrive", "18786 IDL", "IDL Project", "saved_models", f"run{run_num}", f"epoch{epoch + num_epochs}_batchsize{batch_size}_lr{lr}.pth")
    # save_path = os.path.join("drive", "MyDrive", "IDL Project", "saved_models", f"run{run_num}", f"epoch{epoch + num_epochs}_batchsize{batch_size}_lr{lr}.pth")
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': training_loss,
            'train_acc': train_acc,
            'val_acc': val_acc,
            'scheduler_state_dict': scheduler.state_dict()
        }, save_path)

5 batches have passed
5 batches have passed
Epoch: 0	Batch: 10	Avg-Loss: 1.6448
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 20	Avg-Loss: 1.6760
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 30	Avg-Loss: 1.5691
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 40	Avg-Loss: 1.6351
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 50	Avg-Loss: 1.7053
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 60	Avg-Loss: 1.6685
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 70	Avg-Loss: 1.6528
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 80	Avg-Loss: 1.7092
5 batches have passed
5 batches have passed
Epoch: 0	Batch: 90	Avg-Loss: 1.6550
5 batches have passed
Training Time 2337.0674788951874 seconds
Training Acc: 0.3176236309326253
Epoch: 40, Validation Accuracy: 0.33
Validation Time 296.99290442466736 seconds
SAVING CHECKPOINT
5 batches have passed
5 batches have passed
Epoch: 1	Batch: 10	Avg-Loss: 1.6275
5 batc

# Inference on Test Set

## test loader

In [20]:
# test loader
test_dset = MyDataset(test_list_paths, test_list_labels)
test_args = dict(shuffle=False, batch_size=32, num_workers=2, collate_fn=pad_collate, drop_last=True)
test_loader = DataLoader(test_dset, **test_args)

## load model for inference

In [35]:
load_pth = "/content/drive/MyDrive/18786 IDL/IDL Project/saved_models/run3/epoch10_batchsize32_lr0.001.pth"

In [36]:
checkpoint = torch.load(load_pth)
print(checkpoint["val_acc"])

0.4896142433234421


In [37]:
model = BaseLSTM(num_layers=3, num_classes=4, input_size=640, hidden_size=512, dropout=0.1, bidirectional=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BaseLSTM(
  (lstm): LSTM(640, 512, num_layers=3, dropout=0.1, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=4, bias=True)
)

## get test accuracy

In [38]:
def get_test_acc(model_pth, test_loader):
    checkpoint = torch.load(model_pth)
    model.load_state_dict(checkpoint["model_state_dict"])

    model.eval()
    test_num_correct = 0
    total = 0
    for batch_num, (x, lengths, y) in enumerate(test_loader):
        x = x.to(device)
        y = y.long().to(device)

        logits = model(x, lengths)
        test_num_correct += (torch.argmax(logits, axis=1) == y).sum().item()
        total += len(y)

    test_acc = test_num_correct / total
    return test_acc

In [None]:
test_acc = get_test_acc(load_pth, test_loader)
print(test_acc)