In [31]:
# Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import pandas as pd
import re
import seaborn as sns
import contractions
import operator
import matplotlib.pyplot as plt
from itertools import islice
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix

from seqeval.metrics import f1_score
from seqeval.scheme import IOB2

from tqdm import tqdm

import spacy
nlp = spacy.load("en_core_web_sm")
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Set device = CUDA if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: ', device)

Device:  cpu


In [32]:
# Reading CSV File into Pandas DataFrame

# Reading Train Dataset into DataFrame
dataF = pd.read_csv('Data/hw2_train.csv', index_col = 0)
dataF.columns = ['texts', 'tags']
print('Train Set Shape:', dataF.shape)
# print(dataF.tail())

# Removing unequal texts and tags from train dataset
list_of_dict = []
for idx, row in dataF.iterrows():
    temp_dict = {}
    len_text = len(row['texts'].split())
    len_tag = len(row['tags'].split())
    if(len_text == len_tag):
        temp_dict['texts'] = row['texts']
        temp_dict['tags'] = row['tags']
        list_of_dict.append(temp_dict)
df = pd.DataFrame.from_dict(list_of_dict)
print('Train Set with unequal rows removed Shape: ', df.shape)
# print(df.tail())

# Reading Test Dataset into DataFrame
test_data = pd.read_csv('Data/hw2_test.csv', index_col = 0)
test_data.columns = ['texts']
print('Test Set Shape:', test_data.shape)
# print(test_df.tail())

Train Set Shape: (2312, 2)
Train Set with unequal rows removed Shape:  (2253, 2)
Test Set Shape: (981, 1)


In [33]:
# Splitting the Training dataset into the Training set and Validation set
train_data, val_data = train_test_split(df,
                                        random_state = 0, 
                                        test_size = 0.25, 
                                        shuffle = True)
print('Train Data Shape: ', train_data.shape)
print('Test Data Shape: ', val_data.shape)

Train Data Shape:  (1689, 2)
Test Data Shape:  (564, 2)


In [34]:
# Pre-Trained GloVe Word Embeddings
glove = pd.read_csv('Data/gloVe/glove.840B.300d.txt', sep = ' ', quoting = 3, header = None, index_col = 0)
glove_embedding = {key: val.values for key, val in glove.T.items()}
print(len(glove_embedding))

2196009


In [35]:
# Create NumPy Array for Vocab and Emeddings
vocab = list(glove_embedding.keys())
embeddings = list(glove_embedding.values())

vocab = np.array(vocab)
embeddings = np.array(embeddings)
print('Vocab Shape: ', vocab.shape)
print('Embeddings Shape: ', embeddings.shape)

Vocab Shape:  (2196009,)
Embeddings Shape:  (2196009, 300)


In [36]:
# Insert '<pad>' and '<unk>' tokens at start of vocab
vocab = np.insert(vocab, 0, '<pad>')
vocab = np.insert(vocab, 1, '<unk>')
print(vocab.shape)

# Insert embeddings for pad and unk tokens at top of embeddings
embedding_pad = np.zeros((1, 
                          embeddings.shape[1]))
embedding_unk = np.mean(embeddings, 
                        axis = 0, 
                        keepdims = True)
embeddings = np.vstack((embedding_pad, 
                        embedding_unk, 
                        embeddings))
print(embeddings.shape)

(2196011,)
(2196011, 300)


In [37]:
# Dictionary for words
word2idx = { term: idx for idx, term in enumerate(vocab) }
word2idx['<pad>'] = 0
word2idx['<unk>'] = 1
idx2word = { idx: word for word,idx in word2idx.items() }
# print(len(word2idx))

# for k,v in sorted(word2idx.items(), key = operator.itemgetter(1))[:50]:
#     print(k,v)

# Dicitionary for tags
tag_set = set()
tag_count_dict = {}
for index, row in df.iterrows():
    temp = row['tags'].split()
    for item in temp:
        tag_set.add(item)
        if item not in tag_count_dict:
            tag_count_dict[item] = 1
        else:
            tag_count_dict[item] = tag_count_dict[item] + 1

tag2idx = { tag: idx for idx, tag in enumerate(tag_set) }
idx2tag = { idx: word for word, idx in tag2idx.items() }
print(idx2tag)

TAG_COUNT = len(tag2idx)
print(TAG_COUNT)

{0: 'B_country', 1: 'B_person', 2: 'I_release_year', 3: 'B_cast', 4: 'B_subject', 5: 'B_char', 6: 'I_producer', 7: 'I_language', 8: 'B_movie', 9: 'B_release_year', 10: 'B_genre', 11: 'I_movie', 12: 'I_char', 13: 'B_language', 14: 'B_director', 15: 'I_genre', 16: 'I_country', 17: 'B_mpaa_rating', 18: 'I_mpaa_rating', 19: 'I_subject', 20: 'I-movie', 21: 'O', 22: 'I_cast', 23: 'I_director', 24: 'I_person', 25: 'B_location', 26: 'B_producer'}
27


In [38]:
# SlotTagger Class for DataLoader
class SlotTaggerDataset(Dataset):
    
    def __init__(self, 
                 data: pd.DataFrame):
        self.data = data
        self.texts = self.data['texts']
        if 'tags' in self.data.columns:
            self.tags = self.data['tags']
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, 
                    idx: int):
        # Enoding Text
        text = self.texts.iloc[idx]
        encoded_text = self.encode_text(text)
        
        # Encoding Tags if they exist
        if 'tags' in self.data.columns:
            tag = self.tags.iloc[idx]
            encoded_tag = self.encode_tag(tag)
            return encoded_text, encoded_tag 
        else:
            return encoded_text
        
    # Tokenize    
    def tokenize(self, 
                 text: str):
        return [i.text for i in tokenizer(text)]
    
    # Encode Text
    def encode_text(self, 
                    text):
        text_list = [word for word in self.tokenize(text)]
        text_vector = []
        for word in text_list:
            if word in word2idx:
                text_vector.append(word2idx[word])
            else:
                text_vector.append(1)
        return text_vector
    
    # Enocde Tags
    def encode_tag(self, 
                   tag):
        tag_list = [word for word in self.tokenize(tag)]
        tag_vector = [tag2idx[word] for word in tag_list]
        return tag_vector

In [40]:
# Slot Tagger Object for DataLoader
train_ds = SlotTaggerDataset(train_data)
# print(train_ds[0])
val_ds = SlotTaggerDataset(val_data)
# print(val_ds[0])
test_ds = SlotTaggerDataset(test_data)
# print(test_ds[0])

In [75]:
# PyTorch Data Loader
def custom_collate_fn(batch):
    if isinstance(batch[0], list):
        texts_tensor = [torch.tensor(text) for text in batch]

        lengths = [len(text) for text in batch]
        lengths = torch.tensor(lengths)

        texts_padded = pad_sequence(texts_tensor, batch_first = True, padding_value = 0)

        return texts_padded, lengths

    else:
        texts, tags = zip(*batch)

        texts_tensor = [torch.tensor(text) for text in texts]
        tags_tensor = [torch.tensor(tag) for tag in tags]

        lengths = [len(text) for text in texts]
        lengths = torch.tensor(lengths)

        texts_padded = pad_sequence(texts_tensor, batch_first = True, padding_value = 0)
        tags_padded = pad_sequence(tags_tensor, batch_first = True, padding_value = 0)

        return texts_padded, tags_padded, lengths

BATCH_SIZE = 32

train_loader = DataLoader(train_ds, 
                          batch_size = BATCH_SIZE, 
                          shuffle = True, 
                          collate_fn = custom_collate_fn)
val_loader = DataLoader(val_ds, 
                        batch_size = BATCH_SIZE, 
                        shuffle = True, 
                        collate_fn = custom_collate_fn)

test_loader = DataLoader(test_ds, 
                        batch_size = 1, 
                        shuffle = False, 
                        collate_fn = custom_collate_fn)

# print(list(islice(train_loader, 1)))

assert train_data.shape[0] == len(train_loader.dataset)
assert val_data.shape[0] == len(val_loader.dataset)
assert test_data.shape[0] == len(test_loader.dataset)

In [76]:
# RNN Model with 1 hidden layer
class LSTM(nn.Module):
    def __init__(self, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout):
        super().__init__()
        
        # Initialize Embedding Layer with Pre-Trained Embeddings (Vector Sequences)
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embeddings).float())
        # LSTM layer process the vector sequences 
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
#                             dropout = dropout,
                            batch_first = True)
#         # We use dropout before the final layer to improve with regularization
#         self.dropout = nn.Dropout(dropout)
        # Dense layer to predict 
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, 
                x, 
                x_lengths):
        embedded = self.embedding(x)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, 
                                                            x_lengths, 
                                                            batch_first = True, 
                                                            enforce_sorted = False) # Pack sequence
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, 
                                                                  batch_first = True) # Unpack sequence
#         output = self.dropout(output)
        output = self.fc(output)
        return output

In [81]:
torch.manual_seed(32)

EMBEDDING_DIM = embeddings.shape[1]
HIDDEN_DIM = 20
OUTPUT_DIM = TAG_COUNT
NUM_LAYERS = 2
BIDIRECTION = False
DROPOUT = 0.2

model = LSTM(EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            NUM_LAYERS, 
            BIDIRECTION, 
            DROPOUT).to(device)

print(model)

LSTM(
  (embedding): Embedding(2196011, 300)
  (lstm): LSTM(300, 20, num_layers=2, batch_first=True)
  (fc): Linear(in_features=20, out_features=27, bias=True)
)


In [82]:
def squeezed_list(my_list):
    temp_list = [int(element) for element in my_list]
    return temp_list  

def slice_list(my_list, slice_increment):
    return [my_list[i : i + slice_increment] for i in range(0, len(my_list), slice_increment)]

def convert_idx_to_tags(lol, isTensor):
    iob_list = []
    for list_element in lol:
        if (isTensor):
            list_element = list_element.numpy() 
        iob = [idx2tag[index] for index in list_element]
        iob_list.append(iob)
    return iob_list

In [83]:
# Model Train Function
def train(loader, 
          model, 
          optimizer, 
          loss_fn):
    model.train()
    losses = []
    pbar = tqdm(loader)
    for x, y, lengths in pbar:
        optimizer.zero_grad()
        
        y_pred = model(x, lengths)
        
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        y = torch.flatten(y)
        
        loss = loss_fn(y_pred, y)
        pbar.set_postfix({'Loss': loss.item()})
        losses.append(loss.item())
        
        # Calculate gradients for w/b
        loss.backward()  
        # Update weights according to optimizer rules
        optimizer.step()          
    return sum(losses) / len(losses)

# Model Evaluate Function
def evaluate(loader, 
             model, 
             loss_fn, 
             score_fn):
    model.eval()
    losses = []
    for x, y, lengths in tqdm(loader):
        y_pred = model(x, lengths)
        
        max_len = x.shape[1]
              
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        y = torch.flatten(y)
        

        loss = loss_fn(y_pred, y)
        losses.append(loss.item())
        
        tags_iob = convert_idx_to_tags(slice_list(y, max_len), True)

        max_preds = y_pred.argmax(dim = 1, keepdim = True) # Get the index of the max probability
        predictions_iob = convert_idx_to_tags(slice_list((squeezed_list(max_preds)), max_len), False)
    
    score = score_fn(tags_iob, predictions_iob, scheme = IOB2)
    return tags_iob, predictions_iob, sum(losses) / len(losses), score

In [84]:
# Model Training on Train dataset and Evaluation on Validation dataset
optimizer = torch.optim.Adam(model.parameters(),
                             lr = 0.01)
loss_fn = nn.CrossEntropyLoss().to(device)
score_fn = f1_score

train_loss_list = []
val_loss_list = []
f1_score_list = []
n_epochs = 50
best_acc = 0
PATH = f'best-model.pt'

for epoch in range(n_epochs):
    # Model Training
    train_loss = train(train_loader, 
                     model, 
                     optimizer, 
                     loss_fn)
    print('Train Loss: ', train_loss)
    train_loss_list.append(train_loss)
    
    # Model Evaluation
    tags, predictions, val_loss, accuracy = evaluate(val_loader, 
                                                       model, 
                                                       loss_fn, 
                                                       score_fn)
    print('Val Accuracy: ', accuracy)
    print('Val Loss: ', val_loss)
    val_loss_list.append(val_loss)
    f1_score_list.append(accuracy)
    
    # Save model if Validation F1_Score is greather than 75%
#     if accuracy > best_acc and accuracy > 0.75:
#         torch.save(model.state_dict(), PATH)
#         cm = multilabel_confusion_matrix(labels, predictions)

# print('Train Loss List: ', train_loss_list)  
# print('Val Loss List: ', val_loss_list)
# print('Acc List: ', f1_score_list)

torch.save(model.state_dict(), PATH)

100%|███████████████████████████████| 53/53 [00:00<00:00, 107.58it/s, Loss=1.63]


Train Loss:  2.1276060747650436


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 255.50it/s]


Val Accuracy:  0.8695652173913043
Val Loss:  1.606158905559116


100%|███████████████████████████████| 53/53 [00:00<00:00, 170.06it/s, Loss=1.13]


Train Loss:  1.3511050399744287


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 307.76it/s]


Val Accuracy:  0.8704663212435233
Val Loss:  1.1327585909101698


100%|██████████████████████████████| 53/53 [00:00<00:00, 163.58it/s, Loss=0.802]


Train Loss:  0.9240433663692115


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 314.62it/s]


Val Accuracy:  0.8669950738916256
Val Loss:  0.7596846785810258


100%|██████████████████████████████| 53/53 [00:00<00:00, 170.99it/s, Loss=0.598]


Train Loss:  0.6240231130483016


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 308.18it/s]


Val Accuracy:  0.8686131386861314
Val Loss:  0.5343639834059609


100%|██████████████████████████████| 53/53 [00:00<00:00, 168.57it/s, Loss=0.416]


Train Loss:  0.4382056436448727


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 294.87it/s]


Val Accuracy:  0.935064935064935
Val Loss:  0.40008341438240475


100%|██████████████████████████████| 53/53 [00:00<00:00, 169.88it/s, Loss=0.276]


Train Loss:  0.3272100068488211


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 316.32it/s]


Val Accuracy:  0.9591078066914499
Val Loss:  0.3221878740522597


100%|██████████████████████████████| 53/53 [00:00<00:00, 174.10it/s, Loss=0.227]


Train Loss:  0.2544457926502768


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 319.28it/s]


Val Accuracy:  0.9036144578313252
Val Loss:  0.27196597970194286


100%|██████████████████████████████| 53/53 [00:00<00:00, 167.57it/s, Loss=0.195]


Train Loss:  0.20670542683241502


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 307.84it/s]


Val Accuracy:  0.8929889298892989
Val Loss:  0.2327897689408726


100%|██████████████████████████████| 53/53 [00:00<00:00, 174.42it/s, Loss=0.155]


Train Loss:  0.1718649113515638


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 315.16it/s]


Val Accuracy:  0.9266409266409267
Val Loss:  0.21253516359461677


100%|██████████████████████████████| 53/53 [00:00<00:00, 175.40it/s, Loss=0.148]


Train Loss:  0.14796796455135885


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 317.47it/s]


Val Accuracy:  0.9274193548387097
Val Loss:  0.19231455110841328


100%|██████████████████████████████| 53/53 [00:00<00:00, 170.79it/s, Loss=0.114]


Train Loss:  0.12448634235364087


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 316.51it/s]


Val Accuracy:  0.9528535980148882
Val Loss:  0.18247787281870842


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.97it/s, Loss=0.0906]


Train Loss:  0.1082897274842802


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 313.57it/s]


Val Accuracy:  0.9733656174334141
Val Loss:  0.1844098638329241


100%|██████████████████████████████| 53/53 [00:00<00:00, 172.51it/s, Loss=0.088]


Train Loss:  0.09632760201984981


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 310.56it/s]


Val Accuracy:  0.9186602870813397
Val Loss:  0.16565537908011013


100%|██████████████████████████████| 53/53 [00:00<00:00, 170.92it/s, Loss=0.104]


Train Loss:  0.08766515862266973


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 310.82it/s]


Val Accuracy:  0.9724137931034483
Val Loss:  0.16417844427956474


100%|██████████████████████████████| 53/53 [00:00<00:00, 167.42it/s, Loss=0.118]


Train Loss:  0.07728591013067174


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 309.76it/s]


Val Accuracy:  0.9469696969696969
Val Loss:  0.168395244412952


100%|█████████████████████████████| 53/53 [00:00<00:00, 173.61it/s, Loss=0.0693]


Train Loss:  0.07240664410703587


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 285.28it/s]


Val Accuracy:  0.9393939393939394
Val Loss:  0.14812188512749142


100%|█████████████████████████████| 53/53 [00:00<00:00, 171.30it/s, Loss=0.0769]


Train Loss:  0.0690774326476286


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 326.36it/s]


Val Accuracy:  0.9281045751633986
Val Loss:  0.15577182256513172


100%|█████████████████████████████| 53/53 [00:00<00:00, 173.34it/s, Loss=0.0575]


Train Loss:  0.06072799608392535


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 309.97it/s]


Val Accuracy:  0.9491525423728814
Val Loss:  0.15426404236091507


100%|█████████████████████████████| 53/53 [00:00<00:00, 171.39it/s, Loss=0.0798]


Train Loss:  0.05445612292244749


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 300.49it/s]


Val Accuracy:  0.935064935064935
Val Loss:  0.15734942878286043


100%|█████████████████████████████| 53/53 [00:00<00:00, 173.51it/s, Loss=0.0327]


Train Loss:  0.05122112071598476


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 308.77it/s]


Val Accuracy:  0.9530685920577618
Val Loss:  0.15140573928753534


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.82it/s, Loss=0.0665]


Train Loss:  0.05032940177282073


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 307.89it/s]


Val Accuracy:  0.9466192170818506
Val Loss:  0.15486299494902292


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.63it/s, Loss=0.0282]


Train Loss:  0.04657049617677365


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 306.99it/s]


Val Accuracy:  0.9591836734693877
Val Loss:  0.16307804940475357


100%|█████████████████████████████| 53/53 [00:00<00:00, 174.51it/s, Loss=0.0447]


Train Loss:  0.04658604915833698


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 305.47it/s]


Val Accuracy:  0.962121212121212
Val Loss:  0.15937109022504753


100%|█████████████████████████████| 53/53 [00:00<00:00, 159.47it/s, Loss=0.0366]


Train Loss:  0.04259015921995325


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 306.43it/s]


Val Accuracy:  0.9344262295081968
Val Loss:  0.1551487934258249


100%|█████████████████████████████| 53/53 [00:00<00:00, 169.46it/s, Loss=0.0481]


Train Loss:  0.039607807396436635


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 316.19it/s]


Val Accuracy:  0.9724770642201834
Val Loss:  0.15716904691523975


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.93it/s, Loss=0.0285]


Train Loss:  0.03515229542862694


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 311.31it/s]


Val Accuracy:  0.9429657794676807
Val Loss:  0.15563724810878435


100%|█████████████████████████████| 53/53 [00:00<00:00, 173.89it/s, Loss=0.0288]


Train Loss:  0.03239620990067158


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 311.29it/s]


Val Accuracy:  0.9787234042553191
Val Loss:  0.14893117008937728


100%|█████████████████████████████| 53/53 [00:00<00:00, 169.67it/s, Loss=0.0122]


Train Loss:  0.032020561203782286


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 311.48it/s]


Val Accuracy:  0.9405940594059405
Val Loss:  0.15761304191417164


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.45it/s, Loss=0.0182]


Train Loss:  0.03167505047442216


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 306.98it/s]


Val Accuracy:  0.9382716049382717
Val Loss:  0.1606156943986813


100%|█████████████████████████████| 53/53 [00:00<00:00, 171.87it/s, Loss=0.0239]


Train Loss:  0.028507802617859165


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 315.94it/s]


Val Accuracy:  0.9737609329446064
Val Loss:  0.16018170966870254


100%|█████████████████████████████| 53/53 [00:00<00:00, 171.33it/s, Loss=0.0122]


Train Loss:  0.02669730005820967


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 312.70it/s]


Val Accuracy:  0.967741935483871
Val Loss:  0.1682089041504595


100%|█████████████████████████████| 53/53 [00:00<00:00, 169.69it/s, Loss=0.0422]


Train Loss:  0.028324304746007018


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 309.37it/s]


Val Accuracy:  0.9722222222222222
Val Loss:  0.16944634541869164


100%|█████████████████████████████| 53/53 [00:00<00:00, 173.30it/s, Loss=0.0418]


Train Loss:  0.025979030244755296


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 307.65it/s]


Val Accuracy:  0.948905109489051
Val Loss:  0.16865281419207653


100%|█████████████████████████████| 53/53 [00:00<00:00, 171.52it/s, Loss=0.0283]


Train Loss:  0.023585767977220833


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 311.81it/s]


Val Accuracy:  0.9683544303797469
Val Loss:  0.1629291098150942


100%|█████████████████████████████| 53/53 [00:00<00:00, 170.80it/s, Loss=0.0274]


Train Loss:  0.023197701710434455


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 312.83it/s]


Val Accuracy:  0.9574468085106383
Val Loss:  0.164430212850372


100%|█████████████████████████████| 53/53 [00:00<00:00, 170.25it/s, Loss=0.0203]


Train Loss:  0.021371376683127205


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 309.44it/s]


Val Accuracy:  0.9712918660287081
Val Loss:  0.15910406907399496


100%|██████████████████████████████| 53/53 [00:00<00:00, 169.47it/s, Loss=0.017]


Train Loss:  0.027741269876231562


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 317.04it/s]


Val Accuracy:  0.9651741293532338
Val Loss:  0.1735167863468329


100%|███████████████████████████████| 53/53 [00:00<00:00, 172.54it/s, Loss=0.03]


Train Loss:  0.02620627263666324


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 311.20it/s]


Val Accuracy:  0.9587628865979382
Val Loss:  0.16599324055843884


100%|█████████████████████████████| 53/53 [00:00<00:00, 164.44it/s, Loss=0.0327]


Train Loss:  0.030397434086310415


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 310.76it/s]


Val Accuracy:  0.90625
Val Loss:  0.1789996615714497


100%|█████████████████████████████| 53/53 [00:00<00:00, 169.39it/s, Loss=0.0175]


Train Loss:  0.035608425209263585


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 312.18it/s]


Val Accuracy:  0.9517684887459806
Val Loss:  0.1637874347054296


100%|█████████████████████████████| 53/53 [00:00<00:00, 158.03it/s, Loss=0.0134]


Train Loss:  0.029647873943003843


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 307.60it/s]


Val Accuracy:  0.967032967032967
Val Loss:  0.15521205382214653


100%|█████████████████████████████| 53/53 [00:00<00:00, 170.03it/s, Loss=0.0186]


Train Loss:  0.028388791682444653


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 325.13it/s]


Val Accuracy:  0.925764192139738
Val Loss:  0.18433673472868073


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.26it/s, Loss=0.0174]


Train Loss:  0.023480765978401562


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 307.19it/s]


Val Accuracy:  0.9464285714285714
Val Loss:  0.16488873875803417


100%|█████████████████████████████| 53/53 [00:00<00:00, 171.46it/s, Loss=0.0122]


Train Loss:  0.01980766900024324


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 309.17it/s]


Val Accuracy:  0.9750692520775623
Val Loss:  0.16663892360197174


100%|█████████████████████████████| 53/53 [00:00<00:00, 170.22it/s, Loss=0.0257]


Train Loss:  0.018145975006638834


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 316.33it/s]


Val Accuracy:  0.9459459459459459
Val Loss:  0.16964025174578032


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.51it/s, Loss=0.0112]


Train Loss:  0.017443956693036937


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 318.03it/s]


Val Accuracy:  0.9052631578947369
Val Loss:  0.16509786393079492


100%|████████████████████████████| 53/53 [00:00<00:00, 171.75it/s, Loss=0.00674]


Train Loss:  0.015904559323318163


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 287.92it/s]


Val Accuracy:  0.9535864978902953
Val Loss:  0.156859102141526


100%|█████████████████████████████| 53/53 [00:00<00:00, 172.13it/s, Loss=0.0103]


Train Loss:  0.015106180208331009


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 312.92it/s]


Val Accuracy:  0.923076923076923
Val Loss:  0.17001528520550993


100%|█████████████████████████████| 53/53 [00:00<00:00, 175.09it/s, Loss=0.0072]


Train Loss:  0.015433565713465214


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 310.80it/s]


Val Accuracy:  0.9623188405797102
Val Loss:  0.1846312607328097


100%|█████████████████████████████| 53/53 [00:00<00:00, 173.87it/s, Loss=0.0327]


Train Loss:  0.015799477459195087


100%|██████████████████████████████████████████| 18/18 [00:00<00:00, 310.38it/s]


Val Accuracy:  0.9465648854961831
Val Loss:  0.16569840970138708


In [85]:
# Save and Load the Model
saved_model = LSTM(EMBEDDING_DIM, 
                   HIDDEN_DIM, 
                   OUTPUT_DIM, 
                   NUM_LAYERS, 
                   BIDIRECTION, 
                   DROPOUT).to(device)

saved_model.load_state_dict(torch.load(PATH))
saved_model.eval()

LSTM(
  (embedding): Embedding(2196011, 300)
  (lstm): LSTM(300, 20, num_layers=2, batch_first=True)
  (fc): Linear(in_features=20, out_features=27, bias=True)
)

In [86]:
def predict(loader, 
            model):
    predictions = []
    for x, lengths in tqdm(loader):
        with torch.no_grad():
            y_pred = saved_model.forward(x, lengths)
            
            max_len = x.shape[1]

            y_pred = y_pred.view(-1, y_pred.shape[-1])

            max_preds = y_pred.argmax(dim = 1, keepdim = True) # Get the index of the max probability
            predictions_iob = convert_idx_to_tags(slice_list((squeezed_list(max_preds)), max_len), False)
            predictions.append(predictions_iob)
    
    return predictions

        
predictions = predict(test_loader, saved_model)
predictions = np.array(predictions)
predictions = predictions.squeeze().tolist()

100%|███████████████████████████████████████| 981/981 [00:00<00:00, 2059.89it/s]
  predictions = np.array(predictions)


In [87]:
# Creating Dictionary
list_of_dict = []
for i in range(0, len(predictions)):
    temp_dict = {}
    temp_dict["ID"] = i
    temp_dict["IOB Slot tags"] = ' '.join(predictions[i])
    list_of_dict.append(temp_dict)

# Converting Dictionary to CSV and compress it for submission to CodaLab
tags_df = pd.DataFrame.from_dict(list_of_dict)
tags_df.to_csv('submission.csv.zip', compression = 'zip', index = False)
print(tags_df)

      ID                                      IOB Slot tags
0      0                                        O O B_movie
1      1                          O O O O O B_movie I_movie
2      2                        O O O O O O B_movie I_movie
3      3                                      O O O B_movie
4      4                                      O O O B_movie
..   ...                                                ...
976  976        O O B_movie I_movie I_movie I_movie I_movie
977  977                            O B_movie I_movie O O O
978  978                O O O O O O B_producer I_producer O
979  979  O O B_person I_director O O B_movie O O B_movi...
980  980                   O O B_director O O O O B_country

[981 rows x 2 columns]
