# Test Set Analysis

This notebook will run the tuned models on the test sets and compare performance.

In [46]:
# Importing libraries
import pandas as pd
import numpy as np
import torch
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
import tensorflow as tf
from tensorflow import keras
import pickle
import sys
sys.path.append('/Users/jinalshah/Jinal/Projects/nlp-disaster-tweets/src')

from preprocessing import Preprocessing
from transformer import Transformer

In [47]:
# Per-model result rows, collected separately for the train and test splits
training_metrics = []
testing_metrics = []

def get_metrics(truth,predictions):
    """Score predictions against the true labels.

    Parameters
    ----------
    truth : array-like
        Ground-truth labels.
    predictions : array-like
        Predicted labels, aligned with ``truth``.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)`` in that order.
    """
    scorers = (f1_score, precision_score, recall_score, accuracy_score)
    return tuple(scorer(truth,predictions) for scorer in scorers)

In [52]:
# Reading the bag-of-words train and test splits
training = pd.read_csv('../inputs/bag_of_words_training.csv')
testing = pd.read_csv('../inputs/bag_of_words_testing.csv')

# Every column except ID, Target and Keyword is a model input; Target is the label
non_feature_columns = ['ID','Target','Keyword']
train_x = training.drop(columns=non_feature_columns)
train_y = training['Target'].values
test_x = testing.drop(columns=non_feature_columns)
test_y = testing['Target'].values

## Logistic Regression

In [53]:
# Getting the model
# NOTE: unpickling can execute arbitrary code, so only load model files you trust.
# The context manager closes the file handle once the model has been read.
with open('../models/logistic-regression.sav','rb') as model_file:
    logReg = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [54]:
# Predicting labels with the loaded logistic regression model,
# first for the test features and then for the training features
test_predictions = logReg.predict(test_x)
train_predictions = logReg.predict(train_x)

In [55]:
# Calculating the scores for both splits and labelling each value.
# metric_names follows the order in which get_metrics returns its values.
metric_names = ('F1','Precision','Recall','Accuracy')
train_metrics = get_metrics(train_y,train_predictions)
train_metrics_df = {'Name':'Logistic Regression', **dict(zip(metric_names,train_metrics))}
test_metrics = get_metrics(test_y,test_predictions)
test_metrics_df = {'Name':'Logistic Regression', **dict(zip(metric_names,test_metrics))}

In [56]:
# Displaying the logistic regression scores computed on the training split
train_metrics_df

{'Name': 'Logistic Regression',
 'F1': 0.8074534161490682,
 'Precision': 0.8205128205128205,
 'Recall': 0.7948032097821933,
 'Accuracy': 0.8371100164203612}

In [57]:
# Displaying the logistic regression scores computed on the test split
test_metrics_df

{'Name': 'Logistic Regression',
 'F1': 0.7713625866050807,
 'Precision': 0.7767441860465116,
 'Recall': 0.7660550458715596,
 'Accuracy': 0.8049901510177282}

In [58]:
# Appending the logistic regression results to the running lists.
# NOTE: re-running this cell appends the same rows again.
training_metrics.append(train_metrics_df)
testing_metrics.append(test_metrics_df)

## Gradient Boosting

In [59]:
# Loading the model
# NOTE: unpickling can execute arbitrary code, so only load model files you trust.
# The context manager closes the file handle once the model has been read.
with open('../models/gradient-boosting.sav','rb') as model_file:
    catboost_clf = pickle.load(model_file)

In [60]:
# Predicting labels with the loaded gradient boosting model,
# first for the test features and then for the training features
test_predictions = catboost_clf.predict(test_x)
train_predictions = catboost_clf.predict(train_x)

In [61]:
# Calculating the scores for both splits and labelling each value.
# metric_names follows the order in which get_metrics returns its values.
metric_names = ('F1','Precision','Recall','Accuracy')
train_metrics = get_metrics(train_y,train_predictions)
train_metrics_df = {'Name':'Gradient Boosting', **dict(zip(metric_names,train_metrics))}
test_metrics = get_metrics(test_y,test_predictions)
test_metrics_df = {'Name':'Gradient Boosting', **dict(zip(metric_names,test_metrics))}

In [62]:
# Displaying the gradient boosting scores computed on the training split
train_metrics_df

{'Name': 'Gradient Boosting',
 'F1': 0.7834370139968897,
 'Precision': 0.7973882073605065,
 'Recall': 0.7699656094764998,
 'Accuracy': 0.8170771756978653}

In [63]:
# Displaying the gradient boosting scores computed on the test split
test_metrics_df

{'Name': 'Gradient Boosting',
 'F1': 0.7415384615384615,
 'Precision': 0.7461300309597523,
 'Recall': 0.7370030581039755,
 'Accuracy': 0.7793827971109653}

In [64]:
# Appending the gradient boosting results to the running lists.
# NOTE: re-running this cell appends the same rows again.
training_metrics.append(train_metrics_df)
testing_metrics.append(test_metrics_df)

## GRU

In [65]:
# Loading the raw tweets
raw_train = pd.read_csv('../inputs/train.csv')

# Stratified 80/20 split with a fixed random_state.
# NOTE: this rebinds training/testing/train_x/train_y/test_x/test_y, which the
# earlier cells used for the bag-of-words data.
training, testing = train_test_split(
    raw_train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=raw_train['target'],
)
training = training.reset_index(drop=True)
testing = testing.reset_index(drop=True)

# Separating the inputs from the labels
train_x = training.drop(columns=['target'])
train_y = training['target'].values
test_x = testing.drop(columns=['target'])
test_y = testing['target'].values

# Running the project's preprocessing on both splits
preprocessor = Preprocessing()
preprocessed_train_x = preprocessor.preprocess_data(train_x)
preprocessed_test_x = preprocessor.preprocess_data(test_x)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jinalshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
# Transforming each tweet into a fixed-length sequence of vocabulary indices,
# giving arrays of shape (number of examples, 57)
def encode_sentences(sentences, vocabulary, max_tokens=57, unknown_index=1000, padding_index=1001):
    """Map tokenised sentences to fixed-length rows of vocabulary indices.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Tokenised sentences, one iterable of words per sentence.
    vocabulary : dict
        Mapping from word to index.
    max_tokens : int
        Length of every encoded row; tokens beyond it are dropped.
    unknown_index : int
        Index written for words that are not in ``vocabulary``.
    padding_index : int
        Index that fills positions after the last token.

    Returns
    -------
    numpy.ndarray
        One row of ``max_tokens`` indices per sentence.
    """
    encoded = []
    for sentence in sentences:
        sentence_convert = [padding_index] * max_tokens
        # zip with range(max_tokens) stops at the row length, so a sentence with
        # more tokens is truncated rather than raising an IndexError
        for word_index, word in zip(range(max_tokens), sentence):
            sentence_convert[word_index] = vocabulary.get(word, unknown_index)
        encoded.append(sentence_convert)
    return np.array(encoded)

# Reading the word -> index mapping; the file is only needed while loading
with open('../mappers/word2index.json') as file:
    vocabulary = json.load(file)

testing_X = encode_sentences(preprocessed_test_x, vocabulary)
training_X = encode_sentences(preprocessed_train_x, vocabulary)

In [67]:
# Loading the saved GRU model from disk.
# Note: when loading keras models, make sure the keras version in this environment
# matches the version that was used to save the model.
gru_model = keras.models.load_model('../models/gru.keras',compile=True)

In [68]:
# Running the GRU on the index-encoded test and training sequences.
# The raw outputs are rounded to labels with np.rint in the scoring cell.
test_predictions = gru_model.predict(testing_X)
train_predictions = gru_model.predict(training_X)



In [69]:
# Calculating the scores for both splits; model outputs are rounded to the
# nearest integer before scoring.
# metric_names follows the order in which get_metrics returns its values.
metric_names = ('F1','Precision','Recall','Accuracy')
train_metrics = get_metrics(train_y,np.rint(train_predictions))
train_metrics_df = {'Name':'GRU', **dict(zip(metric_names,train_metrics))}
test_metrics = get_metrics(test_y,np.rint(test_predictions))
test_metrics_df = {'Name':'GRU', **dict(zip(metric_names,test_metrics))}

In [70]:
# Displaying the GRU scores computed on the training split
train_metrics_df

{'Name': 'GRU',
 'F1': 0.8098484848484848,
 'Precision': 0.8028539241457003,
 'Recall': 0.8169659915934276,
 'Accuracy': 0.8351395730706076}

In [71]:
# Displaying the GRU scores computed on the test split
test_metrics_df

{'Name': 'GRU',
 'F1': 0.7540983606557378,
 'Precision': 0.7354651162790697,
 'Recall': 0.7737003058103975,
 'Accuracy': 0.783322390019698}

In [72]:
# Appending the GRU results to the running lists.
# NOTE: re-running this cell appends the same rows again.
training_metrics.append(train_metrics_df)
testing_metrics.append(test_metrics_df)

## LSTM

In [73]:
# Loading the saved LSTM model from disk.
# Note: when loading keras models, make sure the keras version in this environment
# matches the version that was used to save the model.
lstm_model = keras.models.load_model('../models/lstm.keras',compile=True)

In [74]:
# Running the LSTM on the same index-encoded sequences that were built for the GRU.
# The raw outputs are rounded to labels with np.rint in the scoring cell.
test_predictions = lstm_model.predict(testing_X)
train_predictions = lstm_model.predict(training_X)



In [75]:
# Calculating the scores for both splits; model outputs are rounded to the
# nearest integer before scoring.
# metric_names follows the order in which get_metrics returns its values.
metric_names = ('F1','Precision','Recall','Accuracy')
train_metrics = get_metrics(train_y,np.rint(train_predictions))
train_metrics_df = {'Name':'LSTM', **dict(zip(metric_names,train_metrics))}
test_metrics = get_metrics(test_y,np.rint(test_predictions))
test_metrics_df = {'Name':'LSTM', **dict(zip(metric_names,test_metrics))}

In [76]:
# Displaying the LSTM scores computed on the training split
train_metrics_df

{'Name': 'LSTM',
 'F1': 0.8064516129032259,
 'Precision': 0.7918968692449355,
 'Recall': 0.8215513947267864,
 'Accuracy': 0.8305418719211822}

In [77]:
# Displaying the LSTM scores computed on the test split
test_metrics_df

{'Name': 'LSTM',
 'F1': 0.7559171597633136,
 'Precision': 0.7320916905444126,
 'Recall': 0.7813455657492355,
 'Accuracy': 0.783322390019698}

In [78]:
# Appending the LSTM results to the running lists.
# NOTE: re-running this cell appends the same rows again.
training_metrics.append(train_metrics_df)
testing_metrics.append(test_metrics_df)

## Transformer

In [79]:
# Loading the model: rebuild the architecture, then restore the saved weights.
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine;
# the later cells call .numpy() on the outputs, which needs CPU tensors.
transformer = Transformer(embed_dim=200,heads=4,dropout_rate=0.4)
transformer.load_state_dict(torch.load('../models/transformers200-4-0-4.pt',map_location='cpu'))
# Switching to evaluation mode; as the last expression this also displays the model
transformer.eval()

Transformer(
  (embedding): Embedding(1002, 200)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
  )
  (layernorm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
  (atten_output): Linear(in_features=200, out_features=200, bias=True)
  (final_atten_output): Linear(in_features=200, out_features=200, bias=True)
  (layernorm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
  (clf_output_one): Linear(in_features=200, out_features=50, bias=True)
  (output): Linear(in_features=50, out_features=2, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)

In [80]:
# Making the predictions.
# Inference only: torch.no_grad() skips building the autograd graph for these
# two full-dataset forward passes.
with torch.no_grad():
    test_predictions = transformer(torch.from_numpy(testing_X))
    train_predictions = transformer(torch.from_numpy(training_X))

In [81]:
# Calculating the scores for both splits; the predicted label is the index of
# the largest of the model's outputs for each example.
# metric_names follows the order in which get_metrics returns its values.
metric_names = ('F1','Precision','Recall','Accuracy')
train_labels = np.argmax(train_predictions.detach().numpy(),axis=1)
train_metrics = get_metrics(train_y,train_labels)
train_metrics_df = {'Name':'Transformer', **dict(zip(metric_names,train_metrics))}
test_labels = np.argmax(test_predictions.detach().numpy(),axis=1)
test_metrics = get_metrics(test_y,test_labels)
test_metrics_df = {'Name':'Transformer', **dict(zip(metric_names,test_metrics))}

In [82]:
# Displaying the transformer scores computed on the training split
train_metrics_df

{'Name': 'Transformer',
 'F1': 0.7769039735099339,
 'Precision': 0.8474040632054176,
 'Recall': 0.7172334734428735,
 'Accuracy': 0.8229885057471265}

In [83]:
# Displaying the transformer scores computed on the test split
test_metrics_df

{'Name': 'Transformer',
 'F1': 0.7323481116584564,
 'Precision': 0.7907801418439716,
 'Recall': 0.6819571865443425,
 'Accuracy': 0.7859487852921865}

In [84]:
# Appending the transformer results to the running lists.
# NOTE: re-running this cell appends the same rows again.
training_metrics.append(train_metrics_df)
testing_metrics.append(test_metrics_df)

In [85]:
# Collecting the per-model result rows into one table per split
training_metrics_df = pd.DataFrame(training_metrics)
testing_metrics_df = pd.DataFrame(testing_metrics)

# Printing each table under its heading
for heading, metrics_frame in (
    ('Training Metrics:', training_metrics_df),
    ('Testing Metrics:', testing_metrics_df),
):
    print(heading)
    print(metrics_frame)

Training Metrics:
                  Name        F1  Precision    Recall  Accuracy
0  Logistic Regression  0.807453   0.820513  0.794803  0.837110
1    Gradient Boosting  0.783437   0.797388  0.769966  0.817077
2                  GRU  0.809848   0.802854  0.816966  0.835140
3                 LSTM  0.806452   0.791897  0.821551  0.830542
4          Transformer  0.776904   0.847404  0.717233  0.822989
Testing Metrics:
                  Name        F1  Precision    Recall  Accuracy
0  Logistic Regression  0.771363   0.776744  0.766055  0.804990
1    Gradient Boosting  0.741538   0.746130  0.737003  0.779383
2                  GRU  0.754098   0.735465  0.773700  0.783322
3                 LSTM  0.755917   0.732092  0.781346  0.783322
4          Transformer  0.732348   0.790780  0.681957  0.785949


In [89]:
# Checking overfitting: train score minus test score for every metric,
# so a larger positive value means a bigger drop on the test split
train_scores = training_metrics_df.drop(columns='Name')
test_scores = testing_metrics_df.drop(columns='Name')
overfit = (train_scores - test_scores).assign(Name=training_metrics_df['Name'])
overfit

Unnamed: 0,F1,Precision,Recall,Accuracy,Name
0,0.036091,0.043769,0.028748,0.03212,Logistic Regression
1,0.041899,0.051258,0.032963,0.037694,Gradient Boosting
2,0.05575,0.067389,0.043266,0.051817,GRU
3,0.050534,0.059805,0.040206,0.047219,LSTM
4,0.044556,0.056624,0.035276,0.03704,Transformer


In [86]:
# Writing both metric tables to CSV (without the index) for future reference
metric_exports = (
    (training_metrics_df, '../performances/training_metrics.csv'),
    (testing_metrics_df, '../performances/testing_metrics.csv'),
)
for metrics_frame, csv_path in metric_exports:
    metrics_frame.to_csv(csv_path,index=False)

Seems like Logistic Regression is the best model in this case! I am going to submit predictions from all 5 models to see how they stack up.

Hypothesis: Model Performance will follow this trend:
1) Logistic Regression
2) Gradient Boosting
3) Transformer
4) LSTM
5) GRU

Actual Trend:
1. Transformer - 0.7768
2. Logistic Regression - 0.768
3. GRU - 0.76739
4. LSTM - 075758
5. Gradient Boosting - 0.75758

In [90]:
# Reading the submission data (the tweets that predictions will be submitted for)
sub_data = pd.read_csv('../inputs/test.csv')

In [93]:
# Running the project's preprocessing on the submission tweets.
# The later cells iterate over the result as one sequence of words per tweet.
preprocessor = Preprocessing()
cleaned_sub_tweets = preprocessor.preprocess_data(sub_data)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jinalshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [94]:
# Building one {word: count} dictionary per cleaned submission tweet
word_counts = []
for sentence in cleaned_sub_tweets:
    sentence_tokens = {}
    for word in sentence:
        # First occurrence starts from 0, later occurrences add to the running count
        sentence_tokens[word] = sentence_tokens.get(word, 0) + 1
    word_counts.append(sentence_tokens)

In [96]:
# Reading the vocabulary that defines the bag-of-words columns
with open('../mappers/word2index.json') as file:
    vocabulary = json.load(file)

# One row per tweet: every vocabulary word maps to its count in that tweet
# (0 when the word does not occur)
bag_of_words = [
    {word: sentence_counts.get(word, 0) for word in vocabulary}
    for sentence_counts in word_counts
]

sub_bow_df = pd.DataFrame(bag_of_words)
sub_bow_df

Unnamed: 0,+,10,11,11-year-old,12,12000,13,15,16,16yr,...,year,years,yes,yet,york,young,youth,youtube,zone,~
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Predicting the submission labels with logistic regression
pred_logreg = logReg.predict(sub_bow_df)

# Pairing every submission id with its predicted label
pred_logreg_df = sub_data[['id']].assign(target=pred_logreg)

# Writing the predictions to disk
pred_logreg_df.to_csv('../predictions/pred_logreg.csv',index=False)

In [101]:
# Predicting the submission labels with gradient boosting
pred_gb = catboost_clf.predict(sub_bow_df)

# Pairing every submission id with its predicted label
pred_gb_df = sub_data[['id']].assign(target=pred_gb)

# Writing the predictions to disk
pred_gb_df.to_csv('../predictions/pred_gb.csv',index=False)

In [102]:
# Transforming the submission tweets into fixed-length sequences of vocabulary
# indices, giving an array of shape (number of examples, 57)

# Reading the word -> index mapping; the file is only needed while loading
with open('../mappers/word2index.json') as file:
    vocabulary = json.load(file)

submission_input = []
for sentence in cleaned_sub_tweets:
    # Start from a row of padding (1001) and overwrite the leading positions
    sentence_convert = [1001] * 57
    # zip with range(57) stops at the row length, so a tweet with more than 57
    # tokens is truncated rather than raising an IndexError
    for word_index, word in zip(range(57), sentence):
        # Words missing from the vocabulary get the out-of-vocabulary index 1000
        sentence_convert[word_index] = vocabulary.get(word, 1000)
    submission_input.append(sentence_convert)
submission_input = np.array(submission_input)

In [118]:
# Running the GRU on the encoded submission tweets
pred_gru = gru_model.predict(submission_input)

# Pairing every submission id with the model output rounded to an integer label
pred_gru_df = sub_data[['id']].assign(target=np.rint(pred_gru))
pred_gru_df = pred_gru_df.astype({'target': int})

# Writing the predictions to disk
pred_gru_df.to_csv('../predictions/pred_gru.csv',index=False)



In [119]:
# Running the LSTM on the encoded submission tweets
pred_lstm = lstm_model.predict(submission_input)

# Pairing every submission id with the model output rounded to an integer label
pred_lstm_df = sub_data[['id']].assign(target=np.rint(pred_lstm))
pred_lstm_df = pred_lstm_df.astype({'target': int})

# Writing the predictions to disk
pred_lstm_df.to_csv('../predictions/pred_lstm.csv',index=False)





In [106]:
# Getting the transformer predictions.
# Inference only: torch.no_grad() skips building the autograd graph for the
# forward pass over the whole submission set.
with torch.no_grad():
    pred_transformer = transformer(torch.from_numpy(submission_input))

# The predicted label is the index of the largest model output for each tweet
pred_transformer_df = pd.DataFrame()
pred_transformer_df['id'] = sub_data['id']
pred_transformer_df['target'] = np.argmax(pred_transformer.detach().numpy(),axis=1)

# Saving the dataframe
pred_transformer_df.to_csv('../predictions/pred_transformer.csv',index=False)