In [9]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

from tqdm import tqdm
from collections import Counter

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
stopwords = English.Defaults.stop_words
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

# Set device = CUDA if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [10]:
# Reading CSV File into Pandas DataFrame
df = pd.read_csv('Data/hw1_train-1.csv', index_col=0)
df.columns = ['text', 'labels']
df['labels'] = df['labels'].str.replace('none', '')
df['labels'] = df['labels'].fillna('')

In [14]:
# Splitting the dataset into the Training set and Validation set
train_data, val_data = train_test_split(df,
                                        random_state=0, 
                                        test_size=0.25, 
                                        shuffle=True)
print("Train Data: ")
print(train_data.head())
print("Val Data: ")
print(val_data.head())
print('Length -> Train Data: ' + str(len(train_data)))
print('Length -> Val Data: ' + str(len(val_data)))

Train Data: 
                                            text                        labels
ID                                                                            
909               can you show me movies spanish  movie.country movie.language
820                         in search for movies                              
1185  are there any g movies out there right now                  movie.rating
1971   show me more information about will smith                              
564                          who directed batman             movie.directed_by
Val Data: 
                                               text  \
ID                                                    
2154          show me the producer of the godfather   
563   can you tell me who is the director of batman   
789      get details of original finding nemo movie   
1325                             list french movies   
570    who directed the movie the thing called love   

                            labels  


In [15]:
# Creating a vocabulary
vocab_size = 2_000
all_tokens = []
for text in df['text']:
    tokens = tokenizer(text)
    all_tokens.extend([i.text for i in tokens])
# print(all_tokens)    

# Decoder
decoder = dict(enumerate(all_tokens))
# print(decoder)

# Encoder
encoder = {token: idx for idx, token in decoder.items()}
# print(len(encoder))
encoder['<unk>'] = len(encoder)
vocab = encoder
# print(vocab)

In [16]:
# Create a set of Labels
label_set = set()
for index, row in df.iterrows():
    temp = row['labels'].split()
    for item in temp:
        label_set.add(item)      

label_list = np.reshape(np.array(list(label_set)), 
                            (-1, 1))
# print(label_list)
# print(label_list.shape)    

In [17]:
# Fit MLB to Label List
mlb.fit(label_list)
# print(mlb.classes_)
# print(len(mlb.classes_))

# Enumerate classes for Labels        
id_to_label = {}
for idx, label in enumerate(mlb.classes_):
    id_to_label[idx] = label
print(id_to_label)  

{0: 'actor.gender', 1: 'gr.amount', 2: 'movie.country', 3: 'movie.directed_by', 4: 'movie.estimated_budget', 5: 'movie.genre', 6: 'movie.gross_revenue', 7: 'movie.initial_release_date', 8: 'movie.language', 9: 'movie.locations', 10: 'movie.music', 11: 'movie.produced_by', 12: 'movie.production_companies', 13: 'movie.rating', 14: 'movie.starring.actor', 15: 'movie.starring.character', 16: 'movie.subjects', 17: 'person.date_of_birth'}


In [18]:
# Tf-Idf
vectorizer = TfidfVectorizer()

# Learn vocabulary from training texts and vectorize training texts.
vectorizer.fit(train_data['text'])

In [19]:
# Relation Extraction Class for DataLoader
class RelationExtractionDataset(Dataset):
    
    def __init__(self, 
                 data: pd.DataFrame, 
                 vocab):
        self.data = data
        self.text = self.data['text']
        self.labels = self.data['labels']
        self.vocab = vocab
        self.default = self.vocab['<unk>']
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, 
                    idx: int):
        text = self.text.iloc[idx]
        text = self.preprocess_text(self.text.iloc[idx])
        label = self.labels.iloc[idx].split()
        return self.encode_text(text), self.encode_label(label)

    def encode_text(self, 
                    text: str):
        text_list = []
        text_list.append(text)
        # Return tensor of encoded text using TF-IDF
        encoded = vectorizer.transform(text_list)
        encoded_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(encoded)).float()
        return encoded_tensor
        
    def encode_label(self, 
                     label: str):
        # Convert label into NumPy matrix
        label_array = np.array(label)
        label_matrix = np.reshape(label_array, 
                                  (1, -1))
        # Return tensor of encoded label using Multi-Label Binarizer
        encoded = mlb.transform(label_matrix)
        encoded_label = torch.from_numpy(encoded)
        return encoded_label
         
    def tokenize(self, 
                 text: str):
        return [i.text for i in tokenizer(text)]

    def stopword_removal(self,
                         text: str):
        corpus = []
        for item in self.tokenize(text):
            if item not in stopwords:
                corpus.append(item)
        corpus = ' '.join(corpus)          
        return corpus

    def preprocess_text(self, 
                        text: str):
        # Convert text to lowercase
        text = text.lower()
        # Removing text contractions
        text = re.sub(r"there's", "there is", text)
        text = re.sub(r"that's", "that is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text)
        text = re.sub(r"who's", "who is", text)
        text = re.sub(r"you're", "you are", text)
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"i'd", "i would", text)
        text = re.sub(r"ain't", "am not", text)
        text = re.sub(r"don't", "donot", text)
        text = re.sub(r"can't", "cannot", text)
        text = re.sub(r"could't", "could not", text)
        text = re.sub(r"should'nt", "should not", text)
        text = re.sub(r"won't", "will not", text)
        # Removing all punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Removing links
        text = re.sub(r'http\S+', '', text)
        # Removing special characters and numbers
        text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
        # Removing single characters
        text = re.sub(r's+[a-zA-Z]s+', '', text)
        # Removing stopwords
        text = self.stopword_removal(text)
        # Replacing multi-spaces by a single space
        text = re.sub(r'\s+', ' ', text)
        return text

In [20]:
# Relation Extraction Object for DataLoader
train_ds = RelationExtractionDataset(train_data, vocab)
val_ds = RelationExtractionDataset(val_data, vocab)

In [26]:
# PyTorch Data Loader
train_loader = DataLoader(train_ds, batch_size=1, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1, shuffle=True)

In [27]:
# Multi-Layer Perceptron
class MLP(nn.Module):
    def __init__(
        self, 
        input_dim, 
        hidden_dim, 
        output_dim
    ):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = torch.nn.Sigmoid()
    
    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        output = self.sigmoid(output)
        return output

In [23]:
torch.manual_seed(32)
input_features_dim = train_ds[0][0].shape[1]
model = MLP(input_features_dim, 
            200, 
            18).to(device)
print(model)

MLP(
  (fc1): Linear(in_features=981, out_features=200, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=200, out_features=18, bias=True)
  (sigmoid): Sigmoid()
)


In [28]:
# Model Train Function
def train(loader, 
          model, 
          optimizer, 
          loss_fn):
    model.train()
    losses = []
    pbar = tqdm(loader)
    for x, y in pbar:
        optimizer.zero_grad()
        
        y_pred = model(x)
        
        y = y.squeeze()
        y = y.to(torch.float32)
        
        y_pred = y_pred.squeeze()
        y_pred = y_pred.to(torch.float32)

        loss = loss_fn(y_pred, y)
        
        # Calculate gradients for w/b
        loss.backward()  
        # Update weights according to optimizer rules
        optimizer.step()  
        losses.append(loss)
        
    return sum(losses) / len(losses)

# Model Evaluate Function
def evaluate(loader, 
             model, 
             loss_fn, 
             score_fn):
    model.eval()
    predictions = []
    labels = []
    for x, y in tqdm(loader):
        
        y_pred = model(x)
        
        y = y.squeeze()
        y = y.to(torch.float32)
        
        y_pred = y_pred.squeeze()
        y_pred = y_pred.to(torch.float32)
        y_pred = torch.round(y_pred)
        
        loss = loss_fn(y_pred, y)
        
        y = y.detach().numpy()
        y_pred = y_pred.detach().numpy()
        
        predictions.extend(y_pred)
        labels.extend(y)
    
#     print("Y: ")
#     print(y)
#     print("Y_PRED: ")
#     print(y_pred)
    
    score = score_fn(labels, predictions, average='weighted')
    return score

In [30]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
loss_fn = nn.BCELoss()
score_fn = f1_score

n_epochs = 10
best_acc = 0
PATH = f'best-model.pt'
for epoch in range(n_epochs):
    avg_loss = train(train_loader, 
                     model, 
                     optimizer, 
                     loss_fn)
    print('Train Loss: ', avg_loss)
    accuracy = evaluate(val_loader, 
                        model, 
                        loss_fn, 
                        score_fn)
    print('Val Accuracy: ', accuracy)
    if accuracy > best_acc and accuracy > 0.7:
        torch.save(model.state_dict(), PATH)

100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1324.01it/s]


Train Loss:  tensor(0.6672, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2877.71it/s]


Val Accuracy:  0.9145841911753578


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1359.39it/s]


Train Loss:  tensor(0.6650, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2949.03it/s]


Val Accuracy:  0.9187624746063294


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1358.10it/s]


Train Loss:  tensor(0.6627, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2937.45it/s]


Val Accuracy:  0.9210888695859241


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1354.23it/s]


Train Loss:  tensor(0.6605, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2952.55it/s]


Val Accuracy:  0.9209486085727029


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1352.87it/s]


Train Loss:  tensor(0.6583, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2918.42it/s]


Val Accuracy:  0.9209966335603219


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1332.79it/s]


Train Loss:  tensor(0.6561, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2795.98it/s]


Val Accuracy:  0.9209966335603219


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1361.30it/s]


Train Loss:  tensor(0.6538, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2962.96it/s]


Val Accuracy:  0.9209966335603219


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1367.00it/s]


Train Loss:  tensor(0.6516, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2913.26it/s]


Val Accuracy:  0.9209966335603219


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1360.67it/s]


Train Loss:  tensor(0.6494, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2952.01it/s]


Val Accuracy:  0.9209966335603219


100%|█████████████████████████████████████| 1734/1734 [00:01<00:00, 1354.73it/s]


Train Loss:  tensor(0.6472, grad_fn=<DivBackward0>)


100%|███████████████████████████████████████| 578/578 [00:00<00:00, 2945.57it/s]


Val Accuracy:  0.9209966335603219


In [43]:
# Save and Load the Model
saved_model = MLP(input_features_dim, 
            200, 
            18).to(device)
saved_model.load_state_dict(torch.load(PATH))
saved_model.eval()

MLP(
  (fc1): Linear(in_features=981, out_features=200, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=200, out_features=18, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
with torch.no_grad():
    print(new_model(mystery_iris))
    print()
    print(labels[new_model(mystery_iris).argmax()])