In [1]:
# Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import re
import seaborn as sns
import scipy
import contractions
import operator
import matplotlib.pyplot as plt
from itertools import islice
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix

from seqeval.metrics import f1_score
from seqeval.scheme import IOB2

from tqdm import tqdm

import spacy
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Set device = CUDA if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: ', device)

Device:  cpu


In [2]:
# Reading CSV File into Pandas DataFrame

# Reading Train Dataset into DataFrame
dataF = pd.read_csv('Data/hw2_train.csv', index_col = 0)
dataF.columns = ['texts', 'tags']
print('Train Set Shape:', dataF.shape)
# print(dataF.tail())

# Removing unequal texts and tags from train dataset
list_of_dict = []
for idx, row in dataF.iterrows():
    temp_dict = {}
    len_text = len(row['texts'].split())
    len_tag = len(row['tags'].split())
    if(len_text == len_tag):
        temp_dict['texts'] = row['texts']
        temp_dict['tags'] = row['tags']
        list_of_dict.append(temp_dict)
df = pd.DataFrame.from_dict(list_of_dict)
print('Train Set with unequal rows removed Shape: ', df.shape)
# print(df.tail())

# Reading Test Dataset into DataFrame
test_df = pd.read_csv('Data/hw2_test.csv', index_col = 0)
test_df.columns = ['texts']
print('Test Set Shape:', test_df.shape)
# print(test_df.tail())

Train Set Shape: (2312, 2)
Train Set with unequal rows removed Shape:  (2253, 2)
Test Set Shape: (981, 1)


In [3]:
# Tokenization
def tokenize(text):
    return [i.text for i in tokenizer(text)]
    
# Find Max Text and Tag Length
def find_max_length(data):
    max_length = 0
    for row in data:
        tokenized_row = tokenize(row)
        if(len(tokenized_row) > max_length):
            max_length = len(tokenized_row)
    return max_length

MAX_LENGTH = find_max_length(df['texts'])
print(MAX_LENGTH)

21


In [4]:
# Splitting the Training dataset into the Training set and Validation set
train_data, val_data = train_test_split(df,
                                        random_state = 0, 
                                        test_size = 0.25, 
                                        shuffle = True)
print('Train Data Shape: ', train_data.shape)
print('Test Data Shape: ', val_data.shape)

Train Data Shape:  (1689, 2)
Test Data Shape:  (564, 2)


In [5]:
# Pre-Trained GloVe Word Embeddings
glove = pd.read_csv('Data/gloVe/glove.840B.300d.txt', sep = ' ', quoting = 3, header = None, index_col = 0)
glove_embedding = {key: val.values for key, val in glove.T.items()}
print(len(glove_embedding))

2196009


In [7]:
# Create NumPy Array for Vocab and Emeddings
vocab = list(glove_embedding.keys())
embeddings = list(glove_embedding.values())

vocab = np.array(vocab)
embeddings = np.array(embeddings)
print('Vocab Shape: ', vocab.shape)
print('Embeddings Shape: ', embeddings.shape)

Vocab Shape:  (2196009,)
Embeddings Shape:  (2196009, 300)


In [8]:
# Insert '<pad>' and '<unk>' tokens at start of vocab
vocab = np.insert(vocab, 0, '<pad>')
vocab = np.insert(vocab, 1, '<unk>')
print(vocab.shape)

# Insert embeddings for pad and unk tokens at top of embeddings
embedding_pad = np.zeros((1, 
                          embeddings.shape[1]))
embedding_unk = np.mean(embeddings, 
                        axis = 0, 
                        keepdims = True)
embeddings = np.vstack((embedding_pad, 
                        embedding_unk, 
                        embeddings))
print(embeddings.shape)

(2196011,)
(2196011, 300)


In [9]:
# Dictionary for words
word2idx = { term: idx + 2 for idx, term in enumerate(vocab) }
word2idx['<pad>'] = 0
word2idx['<unk>'] = 1
idx2word = { idx: word for word,idx in word2idx.items() }
print(len(word2idx))

for k,v in sorted(word2idx.items(), key = operator.itemgetter(1))[:10]:
    print(k,v)

# Dicitionary for tags
tag_set = set()
tag_count_dict = {}
for index, row in df.iterrows():
    temp = row['tags'].split()
    for item in temp:
        tag_set.add(item)
        if item not in tag_count_dict:
            tag_count_dict[item] = 1
        else:
            tag_count_dict[item] = tag_count_dict[item] + 1

tag2idx = { tag: idx + 2 for idx, tag in enumerate(tag_set) }
tag2idx['<pad>'] = 0
tag2idx['<unk>'] = 1
idx2tag = { idx: word for word, idx in tag2idx.items() }
print(idx2tag)

TAG_COUNT = len(tag2idx)
print(TAG_COUNT)

2196011
<pad> 0
<unk> 1
, 4
. 5
the 6
and 7
to 8
of 9
a 10
in 11
{2: 'I_mpaa_rating', 3: 'B_subject', 4: 'I_char', 5: 'I_movie', 6: 'I_language', 7: 'I_subject', 8: 'I_person', 9: 'B_genre', 10: 'I_director', 11: 'B_release_year', 12: 'I_genre', 13: 'B_cast', 14: 'B_producer', 15: 'I_country', 16: 'B_char', 17: 'O', 18: 'B_mpaa_rating', 19: 'B_movie', 20: 'B_language', 21: 'B_country', 22: 'B_location', 23: 'B_person', 24: 'I_producer', 25: 'I-movie', 26: 'I_release_year', 27: 'I_cast', 28: 'B_director', 0: '<pad>', 1: '<unk>'}
29


In [39]:
# SlotTagger Class for DataLoader
class SlotTaggerDataset(Dataset):
    
    def __init__(self, 
                 data: pd.DataFrame):
        self.data = data
        self.texts = self.data['texts']
        self.tags = self.data['tags']
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, 
                    idx: int):
        # Enoding Text
        text = self.texts.iloc[idx]
#         text = self.preprocess_text(self.texts.iloc[idx])
        text_tensor, len_text_vector = self.encode_text(text)
        
        # Encoding Tag
        tag = self.tags.iloc[idx]
        tag_tensor = self.encode_tag(tag)
        
        return text_tensor, tag_tensor, len_text_vector 
    
    # Encode Text
    def encode_text(self, 
                    text):
        text_list = [word for word in self.tokenize(text)]
        text_vector = []
        for word in text_list:
            if word in word2idx:
                text_vector.append(word2idx[word])
            else:
                text_vector.append(1)
        len_text_vector = len(text_vector)
        padded_text_vector = text_vector + [word2idx['<pad>']] * (MAX_LENGTH - len(text_vector))
        return torch.from_numpy(np.array(padded_text_vector)), len_text_vector
    
    # Enocde Label
    def encode_tag(self, 
                   tag):
        tag_list = [word for word in self.tokenize(tag)]
        tag_vector = [tag2idx[word] for word in tag_list]
        padded_tag_vector = tag_vector + [tag2idx['<pad>']] * (MAX_LENGTH - len(tag_vector))
        ohe_tag_vector = np.eye(TAG_COUNT)[padded_tag_vector]
        return torch.from_numpy(np.array(ohe_tag_vector))
    
    # Tokenization
    def tokenize(self, 
                 text: str):
        return tokenize(text)

    # Text Lemmatization
    def lemmatization(self, 
                      text):
        doc = nlp(text)
        temp_list = []
        for token in doc:
            temp_list.append(token)
        return ' '.join(map(str, temp_list))
    
    # Stop Word Removal
    def stopword_removal(self, 
                         text):
        temp_list = []
        for item in self.tokenize(text):
            if item not in stopwords:
                temp_list.append(item)
        return ' '.join(temp_list)
    
    def preprocess_text(self, 
                        text):
        # Removing all HTML Tags
        text = re.sub(r'<.*?>', '', text)
        # Removing links
        text = re.sub(r'http\S+', '', text)
        # Remove Text Contractions
        text = contractions.fix(text)
        # Removing special characters and numbers
        text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
        # Lemmatization
        text = self.lemmatization(text)
        # Stop word Removal
        text = self.stopword_removal(text)
        # Removing single characters
        text = re.sub(r's+[a-zA-Z]s+', '', text)
        # Replacing multi-spaces by a single space
        text = re.sub(r'\s+', ' ', text)
        return text

In [40]:
# Slot Tagger Object for DataLoader
train_ds = SlotTaggerDataset(train_data)
# print(train_ds[0])
val_ds = SlotTaggerDataset(val_data)
# print(val_ds[0])

In [48]:
# PyTorch Data Loader
BATCH_SIZE = 64
train_loader = DataLoader(train_ds, 
                          batch_size = BATCH_SIZE, 
                          shuffle = True)
val_loader = DataLoader(val_ds, 
                        batch_size = BATCH_SIZE, 
                        shuffle = True)

# print(list(islice(train_loader, 1)))
assert train_data.shape[0] == len(train_loader.dataset)
assert val_data.shape[0] == len(val_loader.dataset)

In [79]:
# RNN Model with 1 hidden layer
class LSTM(nn.Module):
    def __init__(self, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout):
        super().__init__()
        
        # Initialize Embedding Layer with Pre-Trained Embeddings (Vector Sequences)
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embeddings).float())
        # LSTM layer process the vector sequences 
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout,
                            batch_first = False
                           )
#         # We use dropout before the final layer to improve with regularization
#         self.dropout = nn.Dropout(dropout)
        # Dense layer to predict 
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, 
                x, 
                x_lengths):
        embedded = self.embedding(x)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, x_lengths, batch_first= True, enforce_sorted = False) # Pack sequence
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output) # Unpack sequence
#         output, (hidden, cell) = self.lstm(embedded)
#         output = self.dropout(output)
        output = self.fc(output)
        return output

In [80]:
torch.manual_seed(32)

EMBEDDING_DIM = embeddings.shape[1]
HIDDEN_DIM = 20
OUTPUT_DIM = TAG_COUNT
NUM_LAYERS = 1
BIDIRECTION = False
DROPOUT = 0.2

model = LSTM(EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            NUM_LAYERS, 
            BIDIRECTION, 
            DROPOUT).to(device)

print(model)

LSTM(
  (embedding): Embedding(2196011, 300)
  (lstm): LSTM(300, 20, dropout=0.2)
  (fc): Linear(in_features=20, out_features=29, bias=True)
)


In [81]:
def squeezed_list(my_list):
    temp_list = []
    for element in my_list:
        temp_list.append(int(element))
    return temp_list    

def slice_list(my_list):
    return [my_list[i : i + BATCH_SIZE] for i in range(0, len(my_list), BATCH_SIZE)]

def convert_idx_to_tags(list_of_lists):
    iob_list = []
    for element in list_of_lists:
        iob = []
        for index in element:
            iob.append(idx2tag[index])
        iob_list.append(iob)
    return iob_list

In [88]:
# Model Train Function
def train(loader, 
          model, 
          optimizer, 
          loss_fn):
    model.train()
    losses = []
    pbar = tqdm(loader)
    for x, y, lengths in pbar:
        optimizer.zero_grad()
        print(x.shape, y.shape, lengths.shape)
        
        y_pred = model(x, lengths)
        print(y_pred.shape)
        
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        y = y.view(-1, y.shape[-1])
        
        loss = loss_fn(y_pred, y)
        pbar.set_postfix({'Loss: ': loss.item()})
        losses.append(loss.item())
        
        # Calculate gradients for w/b
        loss.backward()  
        # Update weights according to optimizer rules
        optimizer.step()          
    return sum(losses) / len(losses)

# Model Evaluate Function
def evaluate(loader, 
             model, 
             loss_fn, 
             score_fn):
    model.eval()
    losses = []
    for x, y in tqdm(loader):
        y_pred = model(x)
              
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        y = y.view(-1, y.shape[-1])

        loss = loss_fn(y_pred, y)
        losses.append(loss.item())

        max_preds = y_pred.argmax(dim = 1, keepdim = True) # get the index of the max probability
        max_y = y.argmax(dim = 1, keepdim = True) # get the index of the max probability
        
        predictions_iob = convert_idx_to_tags(slice_list(squeezed_list(max_preds)))
        tags_iob = convert_idx_to_tags(slice_list(squeezed_list(max_y)))
    
    score = score_fn(tags_iob, predictions_iob, scheme = IOB2)
    return tags_iob, predictions_iob, sum(losses) / len(losses), score

In [89]:
# Model Training on Train dataset and Evaluation on Validation dataset
optimizer = torch.optim.Adam(model.parameters(),
                             lr = 0.001)
loss_fn = nn.CrossEntropyLoss().to(device)
score_fn = f1_score

train_loss_list = []
val_loss_list = []
f1_score_list = []
n_epochs = 10
best_acc = 0
PATH = f'best-model.pt'

for epoch in range(n_epochs):
    # Model Training
    train_loss = train(train_loader, 
                     model, 
                     optimizer, 
                     loss_fn)
    print('Train Loss: ', train_loss)
    train_loss_list.append(train_loss)
    
    # Model Evaluation
    labels, predictions, val_loss, accuracy = evaluate(val_loader, 
                                                       model, 
                                                       loss_fn, 
                                                       score_fn)
    print('Val Accuracy: ', accuracy)
    print('Val Loss: ', val_loss)
    val_loss_list.append(val_loss)
    f1_score_list.append(accuracy)
    
    # Save model if Validation F1_Score is greather than 75%
#     if accuracy > best_acc and accuracy > 0.75:
#         torch.save(model.state_dict(), PATH)
#         cm = multilabel_confusion_matrix(labels, predictions)

# print('Train Loss List: ', train_loss_list)  
# print('Val Loss List: ', val_loss_list)
# print('Acc List: ', f1_score_list)

torch.save(model.state_dict(), PATH)

  0%|                                                    | 0/27 [00:00<?, ?it/s]

torch.Size([64, 21]) torch.Size([64, 21, 29]) torch.Size([64])
torch.Size([13, 64, 29])





ValueError: Expected input batch_size (832) to match target batch_size (1344).

In [None]:
# Save and Load the Model
saved_model = LSTM(EMBEDDING_DIM, 
                   HIDDEN_DIM, 
                   OUTPUT_DIM, 
                   NUM_LAYERS, 
                   BIDIRECTION, 
                   DROPOUT).to(device)

saved_model.load_state_dict(torch.load(PATH))
saved_model.eval()

In [None]:
def vectorize_text(text):
    text_list = [word for word in tokenize(text.to_string())]
    text_vector = []
    for word in text_list:
        if word in word2idx:
            text_vector.append(word2idx[word])
        else:
            text_vector.append(1)
    padded_text_vector = text_vector + [word2idx['<pad>']] * (MAX_LENGTH - len(text_vector))
    return torch.from_numpy(np.array(padded_text_vector))
            
def predict(data):
    tag_list = []
    with torch.no_grad():
        for idx, row in test_df.iterrows():
            text_tensor = vectorize_text(row)
            # Using Forward Pass of LSTM for Predictions
            tag_tensor = saved_model.forward(text_tensor)
            tag_tensor = tag_tensor.view(-1, tag_tensor.shape[-1])
            max_tensor = tag_tensor.argmax(dim = 1, keepdim = True)
            predictions_iob = convert_idx_to_tags(slice_list(squeezed_list(max_tensor)))
            tag_list.append(predictions_iob)
    return tag_list
        
iob_preds_list = predict(test_df)
iob_preds_list = np.squeeze(np.array(iob_preds_list))
iob_preds_list = iob_preds_list.tolist()

def truncate_padding(my_list):
    trucated_list = []
    for element in my_list:
        truncated_el = []
        for tag in element:
            if(tag != '<pad>'):
                truncated_el.append(tag)
        trucated_list.append(truncated_el)
    return trucated_list

truncated_iob_list = truncate_padding(iob_preds_list)

# Creating Dictionary
list_of_dict = []
for i in range(0, len(truncated_iob_list)):
    temp_dict = {}
    temp_dict["ID"] = i
    temp_dict["IOB Slot tags"] = ' '.join(truncated_iob_list[i])
    list_of_dict.append(temp_dict)

# Converting Dictionary to CSV and compressing it for submission to CodaLab
tags_df = pd.DataFrame.from_dict(list_of_dict)
tags_df.to_csv('submission.csv.zip', compression = 'zip', index = False)
print(tags_df)