# Model Tuning

This notebook will be dedicated to tuning the following models:
1. Logistic Regression
2. Gradient Boosting Trees
3. GRU
4. LSTM
5. Transformers

Note: due to computational constraints, I won't be able to perform an exhaustive search. However, I will tune the parameters slightly to see if I can improve model performance and minimize overfitting.

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from preprocessing import Preprocessing
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, GRU, LSTM, Embedding, Dropout, BatchNormalization
from keras.models import Sequential
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformer import Transformer
from scikeras.wrappers import KerasClassifier

%matplotlib inline

In [None]:
# Shared result lists: each evaluated model contributes one row (dict) to each list
training_metrics = []
validation_metrics = []

# Scoring helper used by every evaluation cell
def get_metrics(truth,predictions):
    """Score a set of predictions against the ground-truth labels.

    Args:
        truth: True labels.
        predictions: Predicted labels, aligned with ``truth``.

    Returns:
        A 4-tuple ``(f1, precision, recall, accuracy)`` computed with the
        scikit-learn scorers of the same names, using their default settings.
    """
    scorers = (f1_score, precision_score, recall_score, accuracy_score)
    return tuple(scorer(truth, predictions) for scorer in scorers)

## Bag-of-Words Models

For Logistic Regression and Gradient Boosting, I need to leverage the Bag-of-Words preprocessing approach.

### Logistic Regression

In [None]:
# Loading the Bag-of-Words feature tables
training = pd.read_csv('data/bag_of_words_training.csv')
testing = pd.read_csv('data/bag_of_words_testing.csv')

# Carving a validation set out of the testing file (stratified on Target).
# With test_size=0.2, 80% of the rows go to `validation` and 20% stay in `testing`.
validation, testing = train_test_split(
    testing,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=testing['Target'],
)
validation = validation.reset_index(drop=True)

# Features are every column except the identifier, label and keyword columns
train_x = training.drop(columns=['ID','Target','Keyword'])
valid_x = validation.drop(columns=['ID','Target','Keyword'])

# Labels as plain NumPy arrays
train_y = training['Target'].to_numpy()
valid_y = validation['Target'].to_numpy()

In [None]:
# Performing a Randomized Search on the Logistic Regression Model
logReg = LogisticRegression(penalty='l2',random_state=42,max_iter=1500,tol=0.0001)

# The candidate C values are drawn from a seeded generator so that the same
# 100 candidates are produced on every Restart & Run All. random_state on
# RandomizedSearchCV only controls how candidates are sampled, not how this
# list is generated.
rng = np.random.default_rng(42)
param_grid = {'C':rng.uniform(low=0.01,high=2.0,size=100)}

# n_iter equals the number of candidates, so every value of C is evaluated
# with 5-fold CV and the best one (by F1) is refit on the full training set.
search = RandomizedSearchCV(logReg,param_grid,n_iter=100,scoring='f1',refit=True,cv=5,random_state=42)
search.fit(train_x,train_y)

In [None]:
# Looking at the best model
# `search` was created with refit=True, so best_estimator_ is the winning
# configuration already refit on the whole of train_x / train_y.
tuned_log_reg = search.best_estimator_
# Bare expression: rendered as the cell output
tuned_log_reg

In [None]:
# Scoring the tuned Logistic Regression on the training and validation splits
training_predictions = tuned_log_reg.predict(train_x)
train_metrics = get_metrics(train_y, training_predictions)

validation_predictions = tuned_log_reg.predict(valid_x)
valid_metrics = get_metrics(valid_y, validation_predictions)

# One result row per split; get_metrics returns (F1, Precision, Recall, Accuracy) in this order
train_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('Logistic Regression', *train_metrics),
))
valid_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('Logistic Regression', *valid_metrics),
))

# Adding the rows to the shared result lists
validation_metrics.append(valid_metrics_df)
training_metrics.append(train_metrics_df)

### Gradient Boosting

In [None]:
# Building the gradient boosting model
catboost_clf = CatBoostClassifier(iterations=1500,loss_function='Logloss',random_state=42,early_stopping_rounds=10,eval_metric='F1')

# Candidate values come from a seeded generator so the search space is the
# same on every Restart & Run All (the original drew them from the unseeded
# global NumPy state). They are converted to plain Python lists, which is the
# value type CatBoost documents for param_distributions.
rng = np.random.default_rng(42)
param_grid = {
    'learning_rate': rng.uniform(0.0001,0.5,100).tolist(),
    # integers() excludes the upper bound, so depths range over 1..15
    'depth': rng.integers(1,16,size=7).tolist(),
    'l2_leaf_reg': rng.uniform(0.5,5,100).tolist(),
    'min_data_in_leaf': rng.integers(5,50,size=10).tolist(),
}

# 15 sampled configurations; refit=True leaves catboost_clf trained with the
# best configuration found.
search = catboost_clf.randomized_search(param_grid,train_x,train_y,cv=3,n_iter=15,refit=True,shuffle=True,stratified=True)

In [None]:
# Examining the parameters
# Displayed as the cell output. NOTE(review): presumably these include the
# best-found values because randomized_search was run with refit=True — confirm.
catboost_clf.get_params()

In [None]:
# Scoring the tuned Gradient Boosting model on the training and validation splits
training_predictions = catboost_clf.predict(train_x)
train_metrics = get_metrics(train_y, training_predictions)

validation_predictions = catboost_clf.predict(valid_x)
valid_metrics = get_metrics(valid_y, validation_predictions)

# One result row per split; get_metrics returns (F1, Precision, Recall, Accuracy) in this order
train_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('Gradient Boosting', *train_metrics),
))
valid_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('Gradient Boosting', *valid_metrics),
))

# Adding the rows to the shared result lists
validation_metrics.append(valid_metrics_df)
training_metrics.append(train_metrics_df)

## Deep Learning Models

Now, I need to convert the data into a sequence format for the deep learning models.

In [None]:
# Loading the raw labelled data
raw_train = pd.read_csv('data/train.csv')

# Two stratified splits: 80% training / 20% held out, then the held-out part
# is divided again so that 80% of it becomes validation and 20% testing.
training, testing = train_test_split(
    raw_train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=raw_train['target'],
)
validation, testing = train_test_split(
    testing,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=testing['target'],
)

# Giving every split a fresh 0..n-1 index
training, validation, testing = (
    split.reset_index(drop=True) for split in (training, validation, testing)
)

# Separating the features from the label column
train_x = training.drop(columns=['target'])
valid_x = validation.drop(columns=['target'])
train_y = training['target'].to_numpy()
valid_y = validation['target'].to_numpy()

# Running the project's text preprocessing on both feature frames
preprocessor = Preprocessing()
preprocessed_train_x = preprocessor.preprocess_data(train_x)
preprocessed_valid_x = preprocessor.preprocess_data(valid_x)


In [None]:
# Transforming data to integer sequences of shape (number of examples, 57)
MAX_SEQUENCE_LENGTH = 57  # every sentence is padded / truncated to this many positions
UNKNOWN_TOKEN_ID = 1000   # id written for words that are not in the vocabulary
PADDING_TOKEN_ID = 1001   # id left in positions past the end of a sentence


def encode_sentences(sentences, vocabulary, max_len=MAX_SEQUENCE_LENGTH,
                     unknown_id=UNKNOWN_TOKEN_ID, padding_id=PADDING_TOKEN_ID):
    """Convert tokenised sentences into fixed-length lists of vocabulary ids.

    Args:
        sentences: Collection indexable by position 0..len-1, where each item
            is an indexable sequence of words.
        vocabulary: Mapping from word to integer id.
        max_len: Length of every encoded sentence.
        unknown_id: Id used for words missing from ``vocabulary``.
        padding_id: Id used for positions after the last word.

    Returns:
        A list with one list of ``max_len`` ids per sentence. Sentences longer
        than ``max_len`` are truncated; previously they raised IndexError.
    """
    encoded = []
    for sentence_index in range(len(sentences)):
        sentence = sentences[sentence_index]
        sentence_convert = [padding_id] * max_len
        # min() guards against sentences longer than the fixed length
        for word_index in range(min(len(sentence), max_len)):
            sentence_convert[word_index] = vocabulary.get(sentence[word_index], unknown_id)
        encoded.append(sentence_convert)
    return encoded


# Only the JSON load needs the file handle, so the encoding runs outside the `with`
with open('mappers/word2index.json') as file:
    vocabulary = json.load(file)

# The same encoder is applied to both splits so they cannot drift apart
training_X = np.array(encode_sentences(preprocessed_train_x, vocabulary))
valid_X = np.array(encode_sentences(preprocessed_valid_x, vocabulary))

### GRU

In [None]:
# Model factory for the GRU classifier (used by the SciKeras wrapper)
def build_gru(embedding_dim=50,hidden_units=50,dropout_rate=0.2,lr=0.001):
    """Build and compile the GRU binary classifier.

    Reads the sequence length from the module-level ``training_X`` array, so
    that array must exist before this function is called.

    Args:
        embedding_dim: Output size of the embedding layer.
        hidden_units: Number of units in the GRU layer.
        dropout_rate: Rate shared by both dropout layers.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, trained
        with binary cross-entropy.
    """
    sequence_length = training_X.shape[1]

    # Layers listed in the order they are stacked
    stack = [
        Embedding(input_dim=1002, output_dim=embedding_dim, input_length=sequence_length),
        Dropout(rate=dropout_rate),
        GRU(units=hidden_units, activation='tanh', recurrent_activation='sigmoid',
            bias_initializer='ones', return_sequences=False),
        BatchNormalization(),
        Dropout(rate=dropout_rate),
        Dense(1, activation='sigmoid', bias_initializer='ones'),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    # Adam optimizer + binary cross-entropy loss
    network.compile(keras.optimizers.Adam(learning_rate=lr), 'binary_crossentropy')
    return network

In [None]:
# Using Scikeras to wrap the model into a scikit-learn classifier
gru_model = KerasClassifier(build_gru,embedding_dim=50,hidden_units=50,dropout_rate=0.2,lr=0.001)

# Candidate values come from a seeded generator so the search space is the
# same on every Restart & Run All (the original drew them from the unseeded
# global NumPy state). integers() excludes the upper bound, like randint().
# NOTE: the network weight initialisation is not seeded anywhere in this cell,
# so scores can still vary between runs.
rng = np.random.default_rng(42)
param_grid = {'embedding_dim':rng.integers(50,500,size=100),'hidden_units':rng.integers(50,500,size=100),
              'dropout_rate':rng.uniform(0,0.5,size=50),'lr':rng.uniform(0.00001,0.03,size=100)}

# Performing randomized search on the model: 10 sampled configurations, 3-fold CV, scored by F1
clf = RandomizedSearchCV(gru_model,param_grid,n_iter=10,scoring='f1',refit=True,cv=3,random_state=42,verbose=3)
search = clf.fit(training_X,train_y)

In [None]:
# Looking at the best hyperparameters
# `search` is reused by every search cell in this notebook; this cell shows the
# GRU result only if it runs directly after the GRU search cell.
search.best_params_

In [None]:
# Getting the best model
# The search used refit=True, so best_estimator_ is already refit on all of
# training_X / train_y. Must run before `search` is overwritten by the LSTM search.
best_gru = search.best_estimator_

In [None]:
# Scoring the tuned GRU on the training and validation splits
training_predictions = best_gru.predict(training_X)
train_metrics = get_metrics(train_y, training_predictions)

validation_predictions = best_gru.predict(valid_X)
valid_metrics = get_metrics(valid_y, validation_predictions)

# One result row per split; get_metrics returns (F1, Precision, Recall, Accuracy) in this order
train_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('GRU', *train_metrics),
))
valid_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('GRU', *valid_metrics),
))

In [None]:
# Adding the GRU rows to the shared result lists.
# Re-running this cell appends the same rows again, producing duplicates.
training_metrics.append(train_metrics_df)
validation_metrics.append(valid_metrics_df)

### LSTM

In [None]:
# Model factory for the LSTM classifier (used by the SciKeras wrapper)
def build_lstm(embedding_dim=50,hidden_units=50,dropout_rate=0.2,lr=0.001):
    """Build and compile the LSTM binary classifier.

    Reads the sequence length from the module-level ``training_X`` array, so
    that array must exist before this function is called.

    Args:
        embedding_dim: Output size of the embedding layer.
        hidden_units: Number of units in the LSTM layer.
        dropout_rate: Rate of the single dropout layer after the embedding.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, trained
        with binary cross-entropy.
    """
    sequence_length = training_X.shape[1]

    # Layers listed in the order they are stacked
    stack = [
        Embedding(input_dim=1002, output_dim=embedding_dim, input_length=sequence_length),
        Dropout(rate=dropout_rate),
        LSTM(units=hidden_units, activation='tanh', recurrent_activation='sigmoid',
             return_sequences=False),
        BatchNormalization(),
        Dense(1, activation='sigmoid', bias_initializer='ones'),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    # Adam optimizer + binary cross-entropy loss
    network.compile(keras.optimizers.Adam(learning_rate=lr), 'binary_crossentropy')
    return network

In [None]:
# Using Scikeras to wrap the model into a scikit-learn classifier
lstm_model = KerasClassifier(build_lstm,embedding_dim=50,hidden_units=50,dropout_rate=0.2,lr=0.001)

# Candidate values come from a seeded generator so the search space is the
# same on every Restart & Run All (the original drew them from the unseeded
# global NumPy state). integers() excludes the upper bound, like randint().
# NOTE: the network weight initialisation is not seeded anywhere in this cell,
# so scores can still vary between runs.
rng = np.random.default_rng(42)
param_grid = {'embedding_dim':rng.integers(50,500,size=100),'hidden_units':rng.integers(50,500,size=100),
              'dropout_rate':rng.uniform(0,0.5,size=50),'lr':rng.uniform(0.00001,0.03,size=100)}

# Performing randomized search on the model: 10 sampled configurations, 3-fold CV, scored by F1
clf = RandomizedSearchCV(lstm_model,param_grid,n_iter=10,scoring='f1',refit=True,cv=3,random_state=42,verbose=3)
search = clf.fit(training_X,train_y)

In [None]:
# Looking at the best hyperparameters
# `search` is reused by every search cell in this notebook; this cell shows the
# LSTM result only if it runs directly after the LSTM search cell.
search.best_params_

In [None]:
# Getting the best model
# The search used refit=True, so best_estimator_ is already refit on all of
# training_X / train_y.
best_lstm = search.best_estimator_

In [None]:
# Scoring the tuned LSTM on the training and validation splits
training_predictions = best_lstm.predict(training_X)
train_metrics = get_metrics(train_y, training_predictions)

validation_predictions = best_lstm.predict(valid_X)
valid_metrics = get_metrics(valid_y, validation_predictions)

# One result row per split; get_metrics returns (F1, Precision, Recall, Accuracy) in this order
train_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('LSTM', *train_metrics),
))
valid_metrics_df = dict(zip(
    ('Name', 'F1', 'Precision', 'Recall', 'Accuracy'),
    ('LSTM', *valid_metrics),
))

In [None]:
# Adding the LSTM rows to the shared result lists.
# Re-running this cell appends the same rows again, producing duplicates.
training_metrics.append(train_metrics_df)
validation_metrics.append(valid_metrics_df)

### Transformer

I won't run a randomized search CV here since it could take a long time. Instead, I will just add some dropout and train the model to see how it performs.

In [None]:
# Setting up the dataset for PyTorch
# torch.as_tensor accepts NumPy arrays and existing tensors alike. This cell
# rebinds valid_X, train_y and valid_y to tensors, so with torch.from_numpy a
# second run of the cell raised a TypeError; with as_tensor it is re-runnable.
train_X = torch.as_tensor(np.array(training_X))
valid_X = torch.as_tensor(np.array(valid_X))
train_y = torch.as_tensor(train_y)
valid_y = torch.as_tensor(valid_y)

# Pairing every encoded sentence with its label
training_dataset = TensorDataset(train_X,train_y)
validation_dataset = TensorDataset(valid_X,valid_y)

# Storing the training and validation data in DataLoaders.
# Only the training batches are shuffled; validation order is kept fixed
# because shuffling has no benefit when the data is only evaluated.
training_loader = DataLoader(training_dataset,batch_size=32,shuffle=True)
validation_loader = DataLoader(validation_dataset,batch_size=32,shuffle=False)

In [None]:
# A function for the training loop
def training_loop(model,loss_fn,optimizer,training_data):
    """Run one pass over ``training_data``, updating ``model`` after every batch.

    Args:
        model: Module called as ``model(X)``; it is switched to training mode
            before the first batch.
        loss_fn: Callable invoked as ``loss_fn(pred, y)`` that returns a scalar
            tensor supporting ``backward()``.
        optimizer: Optimizer whose ``step()`` and ``zero_grad()`` are called
            once per batch.
        training_data: Iterable of ``(X, y)`` batches that exposes a sized
            ``dataset`` attribute (e.g. a ``DataLoader``).

    Returns:
        None. The loss and the share of samples processed are printed every
        20 batches.
    """
    size = len(training_data.dataset)
    model.train() # Setting the model to training mode

    # Running count of samples processed, so progress stays exact even when
    # batches differ in size (e.g. a short final batch)
    seen = 0

    # Iterating through the batches
    for batch, (X, y) in enumerate(training_data):
        # Forward pass and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation, parameter update, then clear the gradients so they
        # do not accumulate into the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        seen += len(X)

        # Printing out the progress for every 20 batches
        if batch % 20 == 0:
            # The ':.0%' format avoids the float artefacts that
            # round(x, 2) * 100 produced (e.g. 7.000000000000001%)
            print(f'loss :{loss.item()} {seen / size:.0%} Complete')

In [None]:
# Training configuration for the Transformer
epochs = 5
history = []  # receives one training-loss value per epoch

# Model, loss and optimizer
model = Transformer()
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training the model
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}\n-------------------------------')
    training_loop(model,loss_function,optimizer,training_loader)

    # Recording the end-of-epoch loss on the full training set.
    # training_loop leaves the model in training mode, so switch to eval mode
    # (disables train-only behaviour such as dropout) and turn off autograd so
    # no graph is built for the whole dataset. The next call to training_loop
    # puts the model back into training mode.
    model.eval()
    with torch.no_grad():
        training_pred = model(train_X)
        loss = loss_function(training_pred,train_y)
    history.append(loss.item())
    print()

In [None]:
# Making the predictions
# The model is left in training mode by training_loop, so predictions are made
# in eval mode (train-only layers such as dropout are disabled) and without
# autograd, which avoids building a graph over the full dataset.
model.eval()
with torch.no_grad():
    # argmax over axis 1 picks the highest-scoring class for every example
    training_predictions = np.argmax(model(train_X).detach().numpy(),axis=1)
    validation_predictions = np.argmax(model(valid_X).detach().numpy(),axis=1)

# argmax already yields integer class ids, so no rounding is needed
train_metrics = get_metrics(train_y,training_predictions)
valid_metrics = get_metrics(valid_y,validation_predictions)

# One result row per split; get_metrics returns (F1, Precision, Recall, Accuracy) in this order
train_metrics_df = {'Name':'Transformer','F1':train_metrics[0],'Precision':train_metrics[1],'Recall':train_metrics[2],'Accuracy':train_metrics[3]}
valid_metrics_df = {'Name':'Transformer','F1':valid_metrics[0],'Precision':valid_metrics[1],'Recall':valid_metrics[2],'Accuracy':valid_metrics[3]}

In [None]:
# Adding the Transformer rows to the shared result lists.
# Re-running this cell appends the same rows again, producing duplicates.
training_metrics.append(train_metrics_df)
validation_metrics.append(valid_metrics_df)

In [None]:
# Turning the collected result rows into one table per split
training_metrics_df = pd.DataFrame(training_metrics)
validation_metrics_df = pd.DataFrame(validation_metrics)

# Printing both tables, separated by a blank line
print(
    'Training Metrics:',
    training_metrics_df,
    '',
    'Validation Metrics:',
    validation_metrics_df,
    sep='\n',
)