# Model Tuning

This notebook will be dedicated to tuning the following models:
1. Logistic Regression
2. Gradient Boosting Trees
3. GRU
4. LSTM

This notebook was run on Google Colab to take advantage of its GPU support.

In [73]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, GRU, LSTM, Embedding, Dropout, BatchNormalization
from keras.models import Sequential
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pickle
from google.colab import drive
drive.mount('/content/drive')

%matplotlib inline

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Preprocessing class
class Preprocessing():
    """
    Preprocessing

    Helper that turns the text column of a dataframe into lists of cleaned,
    lower-cased, ASCII-only tokens.
    """
    # Constructor
    def __init__(self,strip_chars="#-'.;:)([]!?|/*@",replacements={"\n":" ", "\t": " ", "&gt":"", "&lt":"", "&amp":" and "}):
        """
        constructor

        Class constructor. Downloads the NLTK stop word corpus and stores the
        English stop words on the instance.

        inputs:
        - strip_chars: a string of characters to strip from both ends of each word
        - replacements: a dictionary where key is the set of characters and the value is the value to replace
        the characters with.

        outputs:
        - None
        """
        self.strip_chars = strip_chars
        # Copy the mapping so an instance never aliases the shared default dictionary
        self.replacements = dict(replacements)

        # Creating a list for the stop words
        nltk.download('stopwords')
        self.stop_words = stopwords.words('english')

    # Creating a function to drop the columns
    def drop_columns(self,data,columns=['location','id','keyword']):
        """
        drop_columns

        A function to drop the listed columns in the data

        inputs:
        - data: data is a Pandas Dataframe containing the data
        - columns: a list of columns to drop

        outputs:
        - The new dataframe with the dropped columns (the input is not modified)
        """
        return data.drop(columns,axis=1)

    # Creating a function to perform the preprocessing
    def preprocess_data(self,data):
        """
        preprocess_data

        A function to preprocess the data. The default columns of drop_columns
        are removed and the first remaining column is treated as the text.

        inputs:
        - data: a Pandas Dataframe containing the raw data

        outputs:
        - A list with one entry per row; each entry is the list of cleaned
        tokens of that row (an empty list when the text cell is not a string)
        """
        # Dropping the columns
        text_data = self.drop_columns(data)

        # Convert the text data to a numpy array
        text_data_arr = text_data.to_numpy()
        processed_sentences = []

        # Build the lookup once: set membership is constant time, whereas the
        # stop word list would be scanned for every single token
        stop_words = set(self.stop_words)

        # Getting the preprocessed text
        for sentence in text_data_arr:
            preprocessed_sentence = sentence[0]

            # A missing text cell (e.g. NaN) has no tokens; keep an empty entry so
            # the output stays aligned with the input rows
            if not isinstance(preprocessed_sentence,str):
                processed_sentences.append([])
                continue

            # Making the replacements
            for target, substitute in self.replacements.items():
                preprocessed_sentence = preprocessed_sentence.replace(target,substitute)

            final_preprocessed_sentence = [] # an array for the preprocessed sentence

            # Splitting the sentence by space
            for word in preprocessed_sentence.split(' '):
                processed_word = word.strip(self.strip_chars).lower()
                # Drop every non-ASCII character
                processed_word = processed_word.encode('ascii','ignore').decode()

                # Conditions to add it to the final preprocessed sentence
                if processed_word == "" or processed_word in stop_words:
                    continue
                # NOTE(review): with the default strip_chars a leading "@" has already
                # been stripped here, so "@user" is kept as "user"; only tokens with an
                # inner "@" are dropped. Confirm whether mentions should be removed.
                if "http" in processed_word or "@" in processed_word:
                    continue
                final_preprocessed_sentence.append(processed_word)

            processed_sentences.append(final_preprocessed_sentence)

        # returning the preprocessed sentences
        return processed_sentences

In [5]:
# Function for monitoring the model metrics
training_metrics = []
validation_metrics = []

# Creating a function that returns the metrics
def get_metrics(truth,predictions):
    """
    get_metrics

    Scores a set of predictions against the true labels.

    inputs:
    - truth: the true labels
    - predictions: the predicted labels

    outputs:
    - A tuple (f1, precision, recall, accuracy)
    """
    scorers = (f1_score, precision_score, recall_score, accuracy_score)
    return tuple(scorer(truth,predictions) for scorer in scorers)

## Bag-of-Words Models

For Logistic Regression and Gradient Boosting, I need to leverage the Bag-of-Words preprocessing approach.

### Logistic Regression

In [6]:
# Loading the bag-of-words feature tables
training = pd.read_csv('bag_of_words_training.csv')
testing = pd.read_csv('bag_of_words_testing.csv')

# Carving the held-out file into a validation part (80%) and a final testing part (20%)
validation, testing = train_test_split(testing,test_size=0.2,random_state=42,shuffle=True,stratify=testing['Target'])
validation = validation.reset_index(drop=True)

# Separating the features (X) from the labels (y)
train_x = training.drop(columns=['ID','Target','Keyword'])
train_y = training['Target'].values
valid_x = validation.drop(columns=['ID','Target','Keyword'])
valid_y = validation['Target'].values

In [7]:
# Performing a Randomized Search on the Logistic Regression Model
logReg = LogisticRegression(penalty='l2',random_state=42,max_iter=1500,tol=0.0001)

# The candidate C values come from a seeded generator so that the candidate
# grid (and therefore the selected model) is the same on every run; the global
# NumPy RNG used previously was never seeded.
candidate_rng = np.random.default_rng(42)
param_grid = {'C':candidate_rng.uniform(low=0.6,high=2.0,size=100),'class_weight':['balanced',None]}

# 100 C values x 2 class weights = 200 combinations, which matches n_iter
search = RandomizedSearchCV(logReg,param_grid,n_iter=200,scoring='f1',refit=True,cv=5,random_state=42)
search.fit(train_x,train_y)

In [8]:
# Looking at the best model
tuned_log_reg = search.best_estimator_
tuned_log_reg

In [9]:
# Scoring the tuned logistic regression on the training and validation splits
training_predictions = tuned_log_reg.predict(train_x)
validation_predictions = tuned_log_reg.predict(valid_x)
train_metrics = get_metrics(train_y,training_predictions)
valid_metrics = get_metrics(valid_y,validation_predictions)

# get_metrics returns (f1, precision, recall, accuracy); pair each value with its label
train_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('Logistic Regression',*train_metrics)))
valid_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('Logistic Regression',*valid_metrics)))

In [10]:
# Looking at the training results
train_metrics_df

{'Name': 'Logistic Regression',
 'F1': 0.8074534161490682,
 'Precision': 0.8205128205128205,
 'Recall': 0.7948032097821933,
 'Accuracy': 0.8371100164203612}

In [11]:
# Looking at the validation results
valid_metrics_df

{'Name': 'Logistic Regression',
 'F1': 0.776923076923077,
 'Precision': 0.781431334622824,
 'Recall': 0.7724665391969407,
 'Accuracy': 0.8095238095238095}

In [13]:
# Saving the model
with open('logistic-regression.sav','wb') as file:
    pickle.dump(tuned_log_reg,file)

### Gradient Boosting

In [14]:
# Building the gradient boosting model
catboost_clf = CatBoostClassifier(loss_function='Logloss',random_state=42,early_stopping_rounds=10,eval_metric='F1',task_type='GPU')

# The candidate values come from a seeded generator so the search space is the
# same on every run; the global NumPy RNG used previously was never seeded.
param_rng = np.random.default_rng(42)
param_grid = {'learning_rate':param_rng.uniform(0.0001,0.9999,500),
              # the upper bound is exclusive, so depth only ever takes the values 1 and 2
              'depth':param_rng.integers(1,3,size=7),
              'l2_leaf_reg':param_rng.uniform(2,5,300),'min_data_in_leaf':param_rng.integers(5,35,size=40),
              'iterations':param_rng.integers(5,200,size=200),'auto_class_weights':[None,'Balanced','SqrtBalanced']}
search = catboost_clf.randomized_search(param_grid,train_x,train_y,cv=5,n_iter=300,refit=True,shuffle=True,stratified=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
132:	loss: 0.4075807	best: 0.7035464 (45)	total: 2m 42s	remaining: 3m 23s
0:	learn: 0.0496750	test: 0.0613718	best: 0.0613718 (0)	total: 60.9ms	remaining: 4.75s
1:	learn: 0.1055215	test: 0.1121359	best: 0.1121359 (1)	total: 120ms	remaining: 4.63s
2:	learn: 0.1557288	test: 0.1539503	best: 0.1539503 (2)	total: 181ms	remaining: 4.58s
3:	learn: 0.1969071	test: 0.2001320	best: 0.2001320 (3)	total: 242ms	remaining: 4.54s
4:	learn: 0.2108138	test: 0.2120595	best: 0.2120595 (4)	total: 308ms	remaining: 4.56s
5:	learn: 0.2274385	test: 0.2469119	best: 0.2469119 (5)	total: 368ms	remaining: 4.48s
6:	learn: 0.2386493	test: 0.2582294	best: 0.2582294 (6)	total: 428ms	remaining: 4.4s
7:	learn: 0.3034815	test: 0.3106962	best: 0.3106962 (7)	total: 487ms	remaining: 4.32s
8:	learn: 0.3371183	test: 0.3436112	best: 0.3436112 (8)	total: 557ms	remaining: 4.33s
9:	learn: 0.3435517	test: 0.3436112	best: 0.3436112 (8)	total: 619ms	remaining: 4.27s
1

In [15]:
# Examining the parameters
catboost_clf.get_params()

{'loss_function': 'Logloss',
 'eval_metric': 'F1',
 'task_type': 'GPU',
 'random_state': 42,
 'early_stopping_rounds': 10,
 'min_data_in_leaf': 27,
 'depth': 2,
 'iterations': 149,
 'learning_rate': 0.9976412172247356,
 'l2_leaf_reg': 4.3525809955619685,
 'auto_class_weights': 'Balanced'}

In [16]:
# Scoring the tuned gradient boosting model on the training and validation splits
training_predictions = catboost_clf.predict(train_x)
validation_predictions = catboost_clf.predict(valid_x)
train_metrics = get_metrics(train_y,training_predictions)
valid_metrics = get_metrics(valid_y,validation_predictions)

# get_metrics returns (f1, precision, recall, accuracy); pair each value with its label
train_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('Gradient Boosting',*train_metrics)))
valid_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('Gradient Boosting',*valid_metrics)))

In [17]:
# Training metrics
train_metrics_df

{'Name': 'Gradient Boosting',
 'F1': 0.7834370139968897,
 'Precision': 0.7973882073605065,
 'Recall': 0.7699656094764998,
 'Accuracy': 0.8170771756978653}

In [18]:
# Validation metrics
valid_metrics_df

{'Name': 'Gradient Boosting',
 'F1': 0.7535680304471932,
 'Precision': 0.75,
 'Recall': 0.7571701720841301,
 'Accuracy': 0.7873563218390804}

In [20]:
# Saving the model
with open('gradient-boosting.sav','wb') as file:
    pickle.dump(catboost_clf,file)

## Deep Learning Models

Next, the data needs to be converted into a sequence format for the deep learning models.

In [21]:
# Loading the raw tweets
raw_train = pd.read_csv('train.csv')

# 80% of the rows go to training; the remaining 20% is split again into
# validation (80% of it) and testing (20% of it)
training, testing = train_test_split(raw_train,test_size=0.2,random_state=42,shuffle=True,stratify=raw_train['target'])
validation, testing = train_test_split(testing,test_size=0.2,random_state=42,shuffle=True,stratify=testing['target'])
training = training.reset_index(drop=True)
validation = validation.reset_index(drop=True)
testing = testing.reset_index(drop=True)

# Separating the features (X) from the labels (y)
train_x = training.drop(columns=['target'])
train_y = training['target'].values
valid_x = validation.drop(columns=['target'])
valid_y = validation['target'].values

# Tokenising and cleaning the text of both splits
preprocessor = Preprocessing()
preprocessed_train_x = preprocessor.preprocess_data(train_x)
preprocessed_valid_x = preprocessor.preprocess_data(valid_x)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [22]:
# Transforming the token lists into integer sequences of shape (number of examples, 57)
MAX_SEQUENCE_LENGTH = 57  # number of token slots per sentence
UNKNOWN_INDEX = 1000      # index written for words that are not in the vocabulary
PADDING_INDEX = 1001      # index that fills the unused slots of short sentences

def encode_sentences(sentences,vocabulary,max_length=MAX_SEQUENCE_LENGTH,unknown_index=UNKNOWN_INDEX,padding_index=PADDING_INDEX):
    """
    encode_sentences

    Converts tokenised sentences into fixed-length rows of vocabulary indices.

    inputs:
    - sentences: a list where each entry is the list of tokens of one sentence
    - vocabulary: a dictionary mapping a word to its index
    - max_length: the number of slots per row; longer sentences are truncated
    - unknown_index: the index used for words missing from the vocabulary
    - padding_index: the index used to fill the slots after the last word

    outputs:
    - A numpy array with one row of max_length indices per sentence
    """
    encoded = []
    for sentence in sentences:
        # Truncate instead of indexing past the end of the row, which raised an
        # IndexError for any sentence longer than max_length
        indices = [vocabulary.get(word,unknown_index) for word in sentence[:max_length]]
        indices.extend([padding_index] * (max_length - len(indices)))
        encoded.append(indices)
    return np.array(encoded)

# Opening the vocabulary; the file is only needed while it is being parsed
with open('word2index.json') as file:
    vocabulary = json.load(file)

training_X = encode_sentences(preprocessed_train_x,vocabulary)
valid_X = encode_sentences(preprocessed_valid_x,vocabulary)

### GRU

In [37]:
# Defining a function to build the GRU model
def build_gru(embedding_dim=50,hidden_units=50,dropout_rate=0.2,lr=0.001,regularization_constant=0.01,vocab_size=1002,sequence_length=None):
    """
    build_gru

    Builds and compiles a GRU binary classifier:
    Embedding -> BatchNormalization -> Dropout -> GRU -> BatchNormalization -> Dropout -> Dense(1, sigmoid)

    inputs:
    - embedding_dim: size of each embedding vector
    - hidden_units: number of units in the GRU layer
    - dropout_rate: rate used by both Dropout layers
    - lr: learning rate of the Adam optimizer
    - regularization_constant: L2 factor applied to the GRU kernel
    - vocab_size: input_dim of the Embedding layer; it has to be larger than the
    largest token index fed to the model
    - sequence_length: number of tokens per example; when None it is read from
    the global training_X (the previous behaviour)

    outputs:
    - The compiled Keras model (binary cross-entropy loss, F1 score metric)
    """
    # Fall back to the notebook-level training matrix only when no length is given,
    # so the function can also be used without that global being defined
    if sequence_length is None:
        sequence_length = training_X.shape[1]

    gru_clf = Sequential()
    gru_clf.add(Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=sequence_length))
    gru_clf.add(BatchNormalization())
    gru_clf.add(Dropout(rate=dropout_rate))
    gru_clf.add(GRU(units=hidden_units,activation='tanh',recurrent_activation='sigmoid',bias_initializer='ones',return_sequences=False,
                    kernel_regularizer=keras.regularizers.L2(l2=regularization_constant)))
    gru_clf.add(BatchNormalization())
    gru_clf.add(Dropout(rate=dropout_rate))
    gru_clf.add(Dense(1,activation='sigmoid',bias_initializer='ones'))

    # Compiling the model
    loss_function = 'binary_crossentropy'
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    gru_clf.compile(optimizer=optimizer,loss=loss_function,metrics=[keras.metrics.F1Score(threshold=0.5)])

    # returning the model
    return gru_clf

In [55]:
# Tuning the GRU
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout and
# batch shuffling start from the same state on every run
keras.utils.set_random_seed(42)

# Stop once the validation F1 has not improved for 10 epochs and keep the best weights
early_stopping = keras.callbacks.EarlyStopping(monitor='val_f1_score',patience=10,restore_best_weights=True,mode='max')
gru_model = build_gru(embedding_dim=15,hidden_units=12,dropout_rate=0.3,lr=0.001,regularization_constant=0.75)
# callbacks is documented as a list of callbacks, so the single callback is wrapped in one
history = gru_model.fit(training_X,tf.cast(train_y,float),epochs=50,batch_size=32,validation_data=(valid_X,tf.cast(valid_y,float)),callbacks=[early_stopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


In [56]:
# Scoring the GRU on the training and validation splits
training_predictions = gru_model.predict(training_X)
validation_predictions = gru_model.predict(valid_X)

# The model outputs probabilities; round them to 0/1 labels before scoring
train_metrics = get_metrics(train_y,np.rint(training_predictions))
valid_metrics = get_metrics(valid_y,np.rint(validation_predictions))

# get_metrics returns (f1, precision, recall, accuracy); pair each value with its label
train_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('GRU',*train_metrics)))
valid_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('GRU',*valid_metrics)))



In [57]:
# Viewing the training metrics
train_metrics_df

{'Name': 'GRU',
 'F1': 0.8098484848484848,
 'Precision': 0.8028539241457003,
 'Recall': 0.8169659915934276,
 'Accuracy': 0.8351395730706076}

In [None]:
# Viewing the validation metrics
valid_metrics_df

In [61]:
# Saving the model
gru_model.save('gru.keras')

### LSTM

In [65]:
# Defining a function to build the LSTM model
def build_lstm(embedding_dim=50,hidden_units=50,dropout_rate=0.2,lr=0.0001,vocab_size=1002,sequence_length=None):
    """
    build_lstm

    Builds and compiles an LSTM binary classifier:
    Embedding -> BatchNormalization -> Dropout -> LSTM -> BatchNormalization -> Dropout -> Dense(1, sigmoid)

    inputs:
    - embedding_dim: size of each embedding vector
    - hidden_units: number of units in the LSTM layer
    - dropout_rate: rate used by both Dropout layers
    - lr: learning rate of the Adam optimizer
    - vocab_size: input_dim of the Embedding layer; it has to be larger than the
    largest token index fed to the model
    - sequence_length: number of tokens per example; when None it is read from
    the global training_X (the previous behaviour)

    outputs:
    - The compiled Keras model (binary cross-entropy loss, F1 score metric)
    """
    # Fall back to the notebook-level training matrix only when no length is given,
    # so the function can also be used without that global being defined
    if sequence_length is None:
        sequence_length = training_X.shape[1]

    lstm_clf = Sequential()
    lstm_clf.add(Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=sequence_length))
    lstm_clf.add(BatchNormalization())
    lstm_clf.add(Dropout(rate=dropout_rate))
    lstm_clf.add(LSTM(units=hidden_units,activation='tanh',recurrent_activation='sigmoid',return_sequences=False))
    lstm_clf.add(BatchNormalization())
    lstm_clf.add(Dropout(rate=dropout_rate))
    lstm_clf.add(Dense(1,activation='sigmoid',bias_initializer='ones'))

    # Compiling the model
    loss_function = 'binary_crossentropy'
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    lstm_clf.compile(optimizer=optimizer,loss=loss_function,metrics=[keras.metrics.F1Score(threshold=0.5)])

    # returning the model
    return lstm_clf

In [67]:
# Tuning the LSTM
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout and
# batch shuffling start from the same state on every run
keras.utils.set_random_seed(42)

# Stop once the validation F1 has not improved for 10 epochs and keep the best weights
early_stopping = keras.callbacks.EarlyStopping(monitor='val_f1_score',patience=10,restore_best_weights=True,mode='max')
lstm_model = build_lstm(embedding_dim=15,hidden_units=5,dropout_rate=0.2,lr=0.001)
# callbacks is documented as a list of callbacks, so the single callback is wrapped in one
history = lstm_model.fit(training_X,tf.cast(train_y,float),epochs=50,batch_size=32,validation_data=(valid_X,tf.cast(valid_y,float)),callbacks=[early_stopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


In [68]:
# Scoring the LSTM on the training and validation splits
training_predictions = lstm_model.predict(training_X)
validation_predictions = lstm_model.predict(valid_X)

# The model outputs probabilities; round them to 0/1 labels before scoring
train_metrics = get_metrics(train_y,np.rint(training_predictions))
valid_metrics = get_metrics(valid_y,np.rint(validation_predictions))

# get_metrics returns (f1, precision, recall, accuracy); pair each value with its label
train_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('LSTM',*train_metrics)))
valid_metrics_df = dict(zip(('Name','F1','Precision','Recall','Accuracy'),('LSTM',*valid_metrics)))



In [69]:
train_metrics_df

{'Name': 'LSTM',
 'F1': 0.8064516129032259,
 'Precision': 0.7918968692449355,
 'Recall': 0.8215513947267864,
 'Accuracy': 0.8305418719211822}

In [70]:
valid_metrics_df

{'Name': 'LSTM',
 'F1': 0.7617295308187673,
 'Precision': 0.7340425531914894,
 'Recall': 0.7915869980879541,
 'Accuracy': 0.7873563218390804}

In [72]:
# Saving the model
lstm_model.save('lstm.keras')