# Adding Lots of Noise

In earlier experiments I found that gradient boosting was the best-performing classical ML model for my features. Before moving on to deep learning, I want to see whether the model generalizes better when I add heavier noise to the grammar-error counts and also add noise to the word counts.

In [1]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
import pickle

In [2]:
# Read the pre-split training and validation tables from disk
train_csv = '../../data/train.csv'
valid_csv = '../../data/validation.csv'
training = pd.read_csv(train_csv)
valid = pd.read_csv(valid_csv)

In [3]:
# Split each frame into a feature matrix (X) and a target vector (y).
# The identifier, the raw text columns and the label itself are not features.
non_feature_cols = ['row_id','essay','LLM_written','prompt']

X_train = training.drop(columns=non_feature_cols)
y_train = training['LLM_written'].values

X_valid = valid.drop(columns=non_feature_cols)
y_valid = valid['LLM_written'].values

In [4]:
# Noise augmentation: add zero-mean Gaussian noise to the grammar-error and
# word counts of the TRAINING features only (X_valid is left untouched), then
# take the absolute value so the noisy counts stay non-negative.
NOISE_SEED = 42             # fixed seed so the saved scaler/model are reproducible
GRAMMAR_NOISE_STD = 20      # deliberately large: the features are made very noisy
WORD_COUNT_NOISE_STD = 500

noise_rng = np.random.default_rng(NOISE_SEED)
n_train = X_train.shape[0]
grammar_noise = noise_rng.normal(0, GRAMMAR_NOISE_STD, n_train)
word_count_noise = noise_rng.normal(0, WORD_COUNT_NOISE_STD, n_train)

# Read the clean counts from `training` (X_train was derived from it) instead
# of from X_train itself, so re-running this cell does not stack a second
# layer of noise on top of already-noised values.
X_train['grammar_errors'] = np.abs(training['grammar_errors'] + grammar_noise)
X_train['word_count'] = np.abs(training['word_count'] + word_count_noise)

In [5]:
# Scale the numerical features with a RobustScaler.
# The scaler is fitted on the (noised) training features only and then applied
# unchanged to the validation features.
numerical = [
    'word_count', 'stop_word_count', 'stop_word_ratio',
    'unique_word_count', 'unique_word_ratio',
    'count_question', 'count_exclamation', 'count_semi', 'count_colon',
    'grammar_errors',
]

scalar = RobustScaler()
scalar.fit(X_train[numerical])
X_train[numerical] = scalar.transform(X_train[numerical])
X_valid[numerical] = scalar.transform(X_valid[numerical])

In [7]:
# Persist the fitted scaler so the same transform can be reapplied later
scaler_path = '../../models/custom-features/scalar-grammar-word-noise.pkl'
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scalar, scaler_file)

In [28]:
# Model and search space for the randomized hyper-parameter search

# One weight per class label: n_rows / (2 * class_count), so the rarer class
# gets the larger weight. NOTE(review): despite the name, these are per-class
# weights (they are passed as class_weights), not per-sample weights.
n_rows = X_train.shape[0]
class_counts = np.bincount(y_train.astype(int))
sample_weights = n_rows / (2.0 * class_counts)

# Base classifier: Logloss objective, AUC as the evaluation metric
catboost_clf = CatBoostClassifier(
    iterations=1000,
    loss_function='Logloss',
    random_seed=42,
    early_stopping_rounds=10,
    eval_metric='AUC',
    class_weights=sample_weights,
)

# Candidate values sampled by the randomized search.
# NOTE(review): CatBoost documents min_data_in_leaf as applying only to the
# Lossguide/Depthwise grow policies; no grow_policy is set here, so confirm
# that this entry actually affects the models being compared.
param_grid = {
    'learning_rate': [0.01, 0.03, 0.3],
    'depth': [2, 3, 5],
    'l2_leaf_reg': [1, 3, 7, 15],
    'min_data_in_leaf': [5, 15, 25, 50],
}

In [29]:
# Randomized hyper-parameter search: 10 sampled configurations, with the
# classifier refit afterwards (refit=True).
# NOTE(review): search_by_train_test_split is left at its default, so confirm
# whether cv=3 drives candidate selection or only the post-search statistics.
search_results = catboost_clf.randomized_search(
    param_grid,
    X_train.values,
    y_train,
    cv=3,
    n_iter=10,
    refit=True,
    shuffle=True,
)

0:	test: 0.9381435	best: 0.9381435 (0)	total: 207ms	remaining: 3m 27s
1:	test: 0.9459080	best: 0.9459080 (1)	total: 354ms	remaining: 2m 56s
2:	test: 0.9484093	best: 0.9484093 (2)	total: 587ms	remaining: 3m 15s
3:	test: 0.9492942	best: 0.9492942 (3)	total: 794ms	remaining: 3m 17s
4:	test: 0.9533773	best: 0.9533773 (4)	total: 883ms	remaining: 2m 55s
5:	test: 0.9589191	best: 0.9589191 (5)	total: 982ms	remaining: 2m 42s
6:	test: 0.9590425	best: 0.9590425 (6)	total: 1.06s	remaining: 2m 30s
7:	test: 0.9599405	best: 0.9599405 (7)	total: 1.14s	remaining: 2m 21s
8:	test: 0.9660515	best: 0.9660515 (8)	total: 1.22s	remaining: 2m 14s
9:	test: 0.9659027	best: 0.9660515 (8)	total: 1.43s	remaining: 2m 21s
10:	test: 0.9675292	best: 0.9675292 (10)	total: 1.53s	remaining: 2m 17s
11:	test: 0.9677565	best: 0.9677565 (11)	total: 1.61s	remaining: 2m 12s
12:	test: 0.9679239	best: 0.9679239 (12)	total: 1.68s	remaining: 2m 7s
13:	test: 0.9678992	best: 0.9679239 (12)	total: 1.76s	remaining: 2m 3s
14:	test: 0.96

In [30]:
# Sanity check: confirm the classifier holds a trained model after the
# search above, which was called with refit=True
catboost_clf.is_fitted()

True

In [31]:
# Show the classifier's parameters: the constructor arguments plus the
# hyper-parameter values set by the randomized search
catboost_clf.get_params()

{'iterations': 1000,
 'loss_function': 'Logloss',
 'random_seed': 42,
 'class_weights': array([0.78561644, 1.37529976]),
 'eval_metric': 'AUC',
 'early_stopping_rounds': 10,
 'min_data_in_leaf': 15,
 'depth': 5,
 'l2_leaf_reg': 15,
 'learning_rate': 0.3}

In [32]:
# Evaluate the refit model with ROC AUC on both splits
print('Predictions for Gradient Boosting')

# Training split (features include the injected noise): P(label == 1)
train_preds = catboost_clf.predict_proba(X_train)[:, 1]
train_score = roc_auc_score(y_train, train_preds)

# Validation split (no noise added): P(label == 1)
valid_preds = catboost_clf.predict_proba(X_valid)[:, 1]
valid_score = roc_auc_score(y_valid, valid_preds)

print(f'Training ROC AUC: {train_score}')
print(f'Validation ROC AUC: {valid_score}')

Predictions for Gradient Boosting


Training ROC AUC: 0.9996786683484532
Validation ROC AUC: 0.9524199221459642


In [33]:
# Persist the trained classifier next to its scaler; no format argument is
# passed, so CatBoost's default model format is used
catboost_clf.save_model('../../models/custom-features/catboost-grammar-word-noise')