In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re, string
import emoji
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from collections import Counter
from gensim.models import Word2Vec
from sklearn.metrics import classification_report, confusion_matrix

seed = 8
sns.set_style("whitegrid")
sns.despine()
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

  plt.style.use("seaborn-whitegrid")


<Figure size 640x480 with 0 Axes>

In [107]:
test_df = pd.read_csv("./data/yahoo_answers_csv/test_cleaned.csv")
test_df = test_df[test_df['text_clean'].notnull()] 
test_df['target'] = test_df['target'] - 1

train_df = pd.read_csv("./data/yahoo_answers_csv/train_cleaned.csv")
train_df = train_df[train_df['text_clean'].notnull()] 
train_df['target'] = train_df['target'] - 1

In [134]:
def pre_process(df):
    text_lens = []
    for text in df.text_clean:
        text_len = len(text.split())
        text_lens.append(text_len)
    df['text_len'] = text_lens
    df = df[df['text_len'] > 3]
    df = df[df['text_len'] < 50]
    return df


def generate_feature(column, vocab_to_int):
    ##Tokenize the columns text using the vocabulary
    text_int = []
    for text in column:
        r = [vocab_to_int[word] for word in text.split() if word in vocab_to_int.keys()]
        text_int.append(r)
    ##Add padding to tokens
    features = np.zeros((len(text_int), seq_len), dtype = int)
    for i, review in enumerate(text_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[: seq_len]
        features[i, :] = np.array(new)
    return features
    
    
def Tokenize(column_train, column_test, seq_len):
    ##Create vocabulary of words from column
    column = column_train + column_test
    corpus = [word for text in column for word in text.split()]
    count_words = Counter(corpus)
    # sorted_words = count_words.most_common()
    # vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
    count_words_now = [(w, c) for (w,c) in count_words.items() if c < 30000 and c > 50]
    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(count_words_now)}

    features_train = generate_feature(column_train, vocab_to_int)
    features_test = generate_feature(column_test, vocab_to_int)

    return vocab_to_int, features_train, features_test


test_df = pre_process(test_df)
train_df = pre_process(train_df)
vocabulary_train, tokenized_column_train, tokenized_column_test = Tokenize(list(train_df["text_clean"]), list(test_df["text_clean"]), 69)

In [138]:
Word2vec_train_data = list(map(lambda x: x.split(), train_df["text_clean"]))
EMBEDDING_DIM = 200
word2vec_model = Word2Vec(Word2vec_train_data, vector_size=EMBEDDING_DIM)
VOCAB_SIZE = len(vocabulary_train) + 1
word2vec_model

<gensim.models.word2vec.Word2Vec at 0x7f793bda7d90>

In [139]:
#define empty embedding matrix
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    
#fill the embedding matrix with the pre trained values from word2vec
#    corresponding to word (string), token (number associated to the word)
for word, token in vocabulary_train.items():
    if word2vec_model.wv.__contains__(word):
        embedding_matrix[token] = word2vec_model.wv.__getitem__(word)

print("Embedding Matrix Shape:", embedding_matrix.shape)

Embedding Matrix Shape: (16570, 200)


In [140]:
train_data = TensorDataset(torch.from_numpy(tokenized_column_train), torch.from_numpy(train_df['target'].to_numpy()))
# test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
valid_data = TensorDataset(torch.from_numpy(tokenized_column_test), torch.from_numpy(test_df['target'].to_numpy()))
BATCH_SIZE = 1024
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True) 
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
# test_loader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

In [141]:
NUM_CLASSES = 10
HIDDEN_DIM = 100 
LSTM_LAYERS = 1

LR = 3e-4 
DROPOUT = 0.5
BIDIRECTIONAL = True 
EPOCHS = 40 

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [142]:
class BiLSTM_Sentiment_Classifier(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, lstm_layers, bidirectional,batch_size, dropout):
        super(BiLSTM_Sentiment_Classifier,self).__init__()
        
        self.lstm_layers = lstm_layers
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.batch_size = batch_size
        

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            dropout=dropout,
                            bidirectional=bidirectional,
                            batch_first=True)

        self.fc = nn.Linear(hidden_dim*self.num_directions, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, hidden):
        self.batch_size = x.size(0)
        embedded = self.embedding(x)
        out, hidden = self.lstm(embedded, hidden)
        out = out[:,-1,:]
        out = self.fc(out)
        out = self.softmax(out)

        return out, hidden

    def init_hidden(self, batch_size):
        #Initialization of the LSTM hidden and cell states
        h0 = torch.zeros((self.lstm_layers*self.num_directions, batch_size, self.hidden_dim)).detach().to(DEVICE)
        c0 = torch.zeros((self.lstm_layers*self.num_directions, batch_size, self.hidden_dim)).detach().to(DEVICE)
        hidden = (h0, c0)
        return hidden

In [143]:
model = BiLSTM_Sentiment_Classifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM,NUM_CLASSES, LSTM_LAYERS,BIDIRECTIONAL, BATCH_SIZE, DROPOUT)
model = model.to(DEVICE)
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
model.embedding.weight.requires_grad=True
criterion = nn.NLLLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay = 5e-6)
print(model)

BiLSTM_Sentiment_Classifier(
  (embedding): Embedding(16570, 200)
  (lstm): LSTM(200, 100, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=200, out_features=10, bias=True)
  (softmax): LogSoftmax(dim=1)
)




In [144]:
total_step = len(train_loader)
total_step_val = len(valid_loader)

early_stopping_patience = 4
early_stopping_counter = 0

valid_acc_max = 0 # Initialize best accuracy top 0

for e in range(EPOCHS):
    print("Start Epoch: " + str(e))
    #lists to host the train and validation losses of every batch for each epoch
    train_loss, valid_loss  = [], []
    #lists to host the train and validation accuracy of every batch for each epoch
    train_acc, valid_acc  = [], []

    #lists to host the train and validation predictions of every batch for each epoch
    y_train_list, y_val_list = [], []

    #initalize number of total and correctly classified texts during training and validation
    correct, correct_val = 0, 0
    total, total_val = 0, 0
    running_loss, running_loss_val = 0, 0


    ####TRAINING LOOP####

    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE) #load features and targets in device

        h = model.init_hidden(labels.size(0))

        model.zero_grad() #reset gradients 

        output, h = model(inputs,h) #get output and hidden states from LSTM network
        
        loss = criterion(output, labels)
        loss.backward()
        
        running_loss += loss.item()
        
        optimizer.step()

        y_pred_train = torch.argmax(output, dim=1) #get tensor of predicted values on the training set
        y_train_list.extend(y_pred_train.squeeze().tolist()) #transform tensor to list and the values to the list
        
        correct += torch.sum(y_pred_train==labels).item() #count correctly classified texts per batch
        total += labels.size(0) #count total texts per batch

    train_loss.append(running_loss / total_step)
    train_acc.append(100 * correct / total)

    ####VALIDATION LOOP####
    
    with torch.no_grad():
        
        model.eval()
        
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            val_h = model.init_hidden(labels.size(0))

            output, val_h = model(inputs, val_h)

            val_loss = criterion(output, labels)
            running_loss_val += val_loss.item()

            y_pred_val = torch.argmax(output, dim=1)
            y_val_list.extend(y_pred_val.squeeze().tolist())

            correct_val += torch.sum(y_pred_val==labels).item()
            total_val += labels.size(0)

        valid_loss.append(running_loss_val / total_step_val)
        valid_acc.append(100 * correct_val / total_val)

    #Save model if validation accuracy increases
    if np.mean(valid_acc) >= valid_acc_max:
        torch.save(model.state_dict(), './state_dict.pt')
        print(f'Epoch {e+1}:Validation accuracy increased ({valid_acc_max:.6f} --> {np.mean(valid_acc):.6f}).  Saving model ...')
        valid_acc_max = np.mean(valid_acc)
        early_stopping_counter=0 #reset counter if validation accuracy increases
    else:
        print(f'Epoch {e+1}:Validation accuracy did not increase')
        early_stopping_counter+=1 #increase counter if validation accuracy does not increase
        
    if early_stopping_counter > early_stopping_patience:
        print('Early stopped at epoch :', e+1)
        break
    
    print(f'\tTrain_loss : {np.mean(train_loss):.4f} Val_loss : {np.mean(valid_loss):.4f}')
    print(f'\tTrain_acc : {np.mean(train_acc):.3f}% Val_acc : {np.mean(valid_acc):.3f}%')

Start Epoch: 0
Epoch 1:Validation accuracy increased (0.000000 --> 67.460938).  Saving model ...
	Train_loss : 1.1713 Val_loss : 1.0182
	Train_acc : 62.472% Val_acc : 67.461%
Start Epoch: 1
Epoch 2:Validation accuracy increased (67.460938 --> 68.400879).  Saving model ...
	Train_loss : 0.9889 Val_loss : 0.9817
	Train_acc : 67.864% Val_acc : 68.401%
Start Epoch: 2
Epoch 3:Validation accuracy increased (68.400879 --> 68.984375).  Saving model ...
	Train_loss : 0.9554 Val_loss : 0.9663
	Train_acc : 68.846% Val_acc : 68.984%
Start Epoch: 3
Epoch 4:Validation accuracy increased (68.984375 --> 69.089355).  Saving model ...
	Train_loss : 0.9327 Val_loss : 0.9560
	Train_acc : 69.533% Val_acc : 69.089%
Start Epoch: 4
Epoch 5:Validation accuracy increased (69.089355 --> 69.333496).  Saving model ...
	Train_loss : 0.9147 Val_loss : 0.9527
	Train_acc : 70.062% Val_acc : 69.333%
Start Epoch: 5
Epoch 6:Validation accuracy increased (69.333496 --> 69.521484).  Saving model ...
	Train_loss : 0.8991 Va

In [None]:
model.load_state_dict(torch.load('./state_dict.pt'))
model.eval()
y_pred_list = []
y_test_list = []
for inputs, labels in test_loader:
    inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
    test_h = model.init_hidden(labels.size(0))

    output, val_h = model(inputs, test_h)
    y_pred_test = torch.argmax(output, dim=1)
    y_pred_list.extend(y_pred_test.squeeze().tolist())
    y_test_list.extend(labels.squeeze().tolist())

In [None]:
confusion_matrix(y_test_list,y_pred_list)