# Ensemble Methods

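In this notebook, I combine the previously trained Random Forest, Gradient Boosting (CatBoost), and ANN models into simple averaging ensembles to see whether combining them "upgrades" performance (measured by ROC AUC).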

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import pickle
from catboost import CatBoostClassifier
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score

In [2]:
# Reading the noisy-grammar training split and the validation split
training, valid = map(
    pd.read_csv,
    ('../../data/train-noisy-grammar.csv', '../../data/validation.csv'),
)

In [3]:
# Splitting each frame into features (X) and the LLM_written label (y)
non_feature_columns = ['row_id','essay','LLM_written','prompt']
X_train = training.drop(columns=non_feature_columns)
X_valid = valid.drop(columns=non_feature_columns)
y_train, y_valid = (frame['LLM_written'].values for frame in (training, valid))

In [4]:
# Loading the previously pickled scaler
# NOTE: pickle.load can execute arbitrary code from the file - only load trusted artifacts
scaler_path = '../../models/custom-features/noisy-grammar-errors/scalar-noisy.pkl'
with open(scaler_path, mode='rb') as scaler_file:
    scalar = pickle.load(scaler_file)

In [5]:
# Columns the loaded scaler is applied to; every other feature column is left untouched
numerical = ['word_count','stop_word_count','stop_word_ratio','unique_word_count','unique_word_ratio',
             'count_question','count_exclamation','count_semi','count_colon','grammar_errors']
# Transform from the untouched source frames rather than from X_train / X_valid themselves.
# X_train / X_valid were built by dropping columns from these frames, so the first run gives
# the same values as before, and re-running this cell no longer scales already-scaled data.
X_train[numerical] = scalar.transform(training[numerical])
X_valid[numerical] = scalar.transform(valid[numerical])

In [6]:
# Loading the fine-tuned random forest
# NOTE: pickle.load can execute arbitrary code from the file - only load trusted artifacts
forest_path = '../../models/custom-features/noisy-grammar-errors/fine-tuned/forest-fine-noisy.pkl'
with open(forest_path, mode='rb') as forest_file:
    forest = pickle.load(forest_file)

In [7]:
# Loading the fine-tuned CatBoost (gradient boosting) model into a fresh classifier
catboost_clf = CatBoostClassifier()
catboost_clf.load_model(
    fname='../../models/custom-features/noisy-grammar-errors/fine-tuned/catboost-noisy-fine'
)

<catboost.core.CatBoostClassifier at 0x1292a06d0>

In [8]:
# Rebuilding the ANN architecture so the saved weights can be loaded into it
class ANN(nn.Module):
    """Fully connected network whose output is passed through a sigmoid.

    Args:
        num_features: Width of the input feature vector.
        model_layers: Output size of each hidden ``nn.Linear`` layer, in order.
        include_dropout: One flag per hidden layer; a truthy flag adds an
            ``nn.Dropout`` right after that layer's ReLU.
        dropout_rate: Drop probability shared by every dropout layer.
    """

    def __init__(self,num_features,model_layers,include_dropout,dropout_rate):
        super().__init__()

        # Collect the modules first, then wrap them in a single Sequential
        stack = []
        width = num_features
        for position, hidden_units in enumerate(model_layers):
            stack.append(nn.Linear(width,hidden_units,bias=True))
            stack.append(nn.ReLU())
            width = hidden_units

            # Optional dropout after this hidden layer
            if include_dropout[position]:
                stack.append(nn.Dropout(p=dropout_rate))

        # Single-unit output layer
        stack.append(nn.Linear(width,1))
        self.model = nn.Sequential(*stack)

    def forward(self,X):
        """Run ``X`` through the layer stack and return the sigmoid of the result."""
        logits = self.model(X)
        return torch.sigmoid(logits)
    
# These hyperparameters define the layer stack that the saved weights are loaded into,
# so they have to match the architecture the checkpoint was saved from
num_features = X_train.shape[1]
layers = [10,20,10]
include_dropout = [True] * 3
dropout_rate = 0.2

ann = ANN(num_features,layers,include_dropout,dropout_rate)
# map_location='cpu' lets the checkpoint load on a CPU-only machine even if it was saved
# from a GPU; the prediction cells call .numpy() on the outputs, which needs CPU tensors
ann.load_state_dict(
    torch.load('../../models/custom-features/noisy-grammar-errors/ann.pt',map_location='cpu')
)
ann

ANN(
  (model): Sequential(
    (0): Linear(in_features=515, out_features=10, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=10, out_features=20, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=20, out_features=10, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=10, out_features=1, bias=True)
  )
)

In [23]:
# Wrapping features and labels as tensors, pairing them into datasets, and batching them
X_train_tensor, X_valid_tensor = (torch.from_numpy(frame.values) for frame in (X_train, X_valid))
y_train_tensor, y_valid_tensor = (torch.from_numpy(labels) for labels in (y_train, y_valid))
training_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)
# shuffle=False keeps the batches in the row order of X_train / X_valid
training_loader, valid_loader = (
    DataLoader(dataset, batch_size=32, shuffle=False)
    for dataset in (training_dataset, valid_dataset)
)

## Making the Predictions

In [24]:
# Random forest: keep column 1 of predict_proba for the training and validation splits
predictions_rf_train, predictions_rf_valid = (
    forest.predict_proba(features.values)[:, 1] for features in (X_train, X_valid)
)

In [25]:
# CatBoost: keep column 1 of predict_proba for the training and validation splits
predictions_gb_train, predictions_gb_valid = (
    catboost_clf.predict_proba(features)[:, 1] for features in (X_train, X_valid)
)

In [26]:
# Making Predictions with ANN
def _predict_in_batches(network, loader):
    """Run ``network`` over every batch of ``loader`` and return the stacked outputs.

    Batch outputs are gathered in a list and concatenated once, instead of growing
    a NumPy array with ``np.append`` (which re-copies every earlier prediction on
    each batch). Returns ``None`` when the loader yields no batches, which is the
    value the accumulator previously kept in that case.
    """
    batch_outputs = [network(X.to(torch.float32)).detach().numpy() for X,_ in loader]
    if not batch_outputs:
        return None
    return np.concatenate(batch_outputs,axis=0)

with torch.no_grad():
    # eval() switches the dropout layers off for inference
    ann.eval()
    train_preds_ann = _predict_in_batches(ann,training_loader)
    val_preds_ann = _predict_in_batches(ann,valid_loader)

In [27]:
# Flattening the (n, 1) ANN outputs to 1-D so they can be averaged with the other models.
# reshape(-1) is used instead of squeeze(-1) so that re-running this cell is a no-op;
# squeeze(-1) raises a ValueError once the arrays are already 1-D.
train_preds_ann = train_preds_ann.reshape(-1)
val_preds_ann = val_preds_ann.reshape(-1)

In [30]:
# Empty results table: one list per column, filled in as each ensemble is scored
performances = {column: [] for column in ('model', 'Train ROC AUC', 'Valid ROC AUC')}

## Trial 1: Random Forest + Gradient Boosting

In [31]:
# Training split: unweighted average of the RF and GB probabilities
final_predictions_train = 0.5 * (predictions_rf_train + predictions_gb_train)

In [32]:
# Validation split: unweighted average of the RF and GB probabilities
final_predictions_valid = 0.5 * (predictions_rf_valid + predictions_gb_valid)

In [33]:
# Scoring the averaged probabilities on both splits
print('Predictions for Random Forest + Gradient Boosting')
train_score, valid_score = (
    roc_auc_score(labels, scores)
    for labels, scores in ((y_train, final_predictions_train), (y_valid, final_predictions_valid))
)
print(f'Training ROC AUC: {train_score}')
print(f'Validation ROC AUC: {valid_score}')

Predictions for Random Forest + Gradient Boosting
Training ROC AUC: 0.9998957122972557
Validation ROC AUC: 0.9766543982633817


In [34]:
# Recording this ensemble's scores as a new row of the results table
model = 'Random Forest + Gradient Boosting'
row = {'model': model, 'Train ROC AUC': train_score, 'Valid ROC AUC': valid_score}
for column, value in row.items():
    performances[column].append(value)

## Trial 2: Gradient Boosting + ANN

In [35]:
# Training split: unweighted average of the ANN and GB probabilities
final_predictions_train = 0.5 * (train_preds_ann + predictions_gb_train)

In [36]:
# Validation split: unweighted average of the ANN and GB probabilities
final_predictions_valid = 0.5 * (val_preds_ann + predictions_gb_valid)

In [37]:
# Scoring the averaged probabilities on both splits
print('Predictions for Gradient Boosting + ANN')
train_score, valid_score = (
    roc_auc_score(labels, scores)
    for labels, scores in ((y_train, final_predictions_train), (y_valid, final_predictions_valid))
)
print(f'Training ROC AUC: {train_score}')
print(f'Validation ROC AUC: {valid_score}')

Predictions for Gradient Boosting + ANN
Training ROC AUC: 0.9999759895955058
Validation ROC AUC: 0.977667316810433


In [38]:
# Recording this ensemble's scores as a new row of the results table
model = 'Gradient Boosting + ANN'
row = {'model': model, 'Train ROC AUC': train_score, 'Valid ROC AUC': valid_score}
for column, value in row.items():
    performances[column].append(value)

## Trial 3: Random Forest + ANN

In [39]:
# Training split: unweighted average of the ANN and RF probabilities
final_predictions_train = 0.5 * (train_preds_ann + predictions_rf_train)

In [40]:
# Validation split: unweighted average of the ANN and RF probabilities
final_predictions_valid = 0.5 * (val_preds_ann + predictions_rf_valid)

In [41]:
# Scoring the averaged probabilities on both splits
print('Predictions for Random Forest + ANN')
train_score, valid_score = (
    roc_auc_score(labels, scores)
    for labels, scores in ((y_train, final_predictions_train), (y_valid, final_predictions_valid))
)
print(f'Training ROC AUC: {train_score}')
print(f'Validation ROC AUC: {valid_score}')

Predictions for Random Forest + ANN
Training ROC AUC: 0.9995142066887411
Validation ROC AUC: 0.9809386552633081


In [42]:
# Recording this ensemble's scores as a new row of the results table
model = 'Random Forest + ANN'
row = {'model': model, 'Train ROC AUC': train_score, 'Valid ROC AUC': valid_score}
for column, value in row.items():
    performances[column].append(value)

## Trial 4: Random Forest + Gradient Boosting + ANN

In [43]:
# Training split: unweighted average of the ANN, RF and GB probabilities
final_predictions_train = np.divide(
    train_preds_ann + predictions_rf_train + predictions_gb_train,
    3,
)

In [44]:
# Validation split: unweighted average of the ANN, RF and GB probabilities
final_predictions_valid = np.divide(
    val_preds_ann + predictions_rf_valid + predictions_gb_valid,
    3,
)

In [45]:
# Scoring the averaged probabilities on both splits
print('Predictions for Random Forest + Gradient Boosting +  ANN')
train_score, valid_score = (
    roc_auc_score(labels, scores)
    for labels, scores in ((y_train, final_predictions_train), (y_valid, final_predictions_valid))
)
print(f'Training ROC AUC: {train_score}')
print(f'Validation ROC AUC: {valid_score}')

Predictions for Random Forest + Gradient Boosting +  ANN
Training ROC AUC: 0.9999479425402964
Validation ROC AUC: 0.9797768837167549


In [46]:
# Adding the metrics
# Label spelling fixed ('Boostin' -> 'Boosting'): this string is the row name that
# ends up in the metrics dataframe and the CSV written from it
model = 'Random Forest + Gradient Boosting + ANN'
performances['model'].append(model)
performances['Train ROC AUC'].append(train_score)
performances['Valid ROC AUC'].append(valid_score)

In [47]:
# Collecting the recorded scores into a dataframe (one row per ensemble) and displaying it.
# from_dict is a classmethod, so it is called on the class itself rather than on a
# throwaway empty DataFrame instance.
metrics_df = pd.DataFrame.from_dict(performances)
metrics_df

Unnamed: 0,model,Train ROC AUC,Valid ROC AUC
0,Random Forest + Gradient Boosting,0.999896,0.976654
1,Gradient Boosting + ANN,0.999976,0.977667
2,Random Forest + ANN,0.999514,0.980939
3,Random Forest + Gradient Boostin + ANN,0.999948,0.979777


In [48]:
# Writing the ensemble scores next to the saved models (row index omitted from the file)
metrics_df.to_csv(
    path_or_buf='../../models/custom-features/noisy-grammar-errors/ensemble-metrics.csv',
    index=False,
)

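All four ensembles do comparably well on the validation set (ROC AUC between roughly 0.977 and 0.981), with Random Forest + ANN scoring highest. It would be worth testing each of them on the test set.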