# Catboost TFIDF 3-5

This notebook will train a Catboost model using TFIDF for trigrams to 5grams. I found many notebooks to be using this range to be successful. I want to see if it works for me. I found in my EDA that the longer the ngram, the easier it is to distinguish between the essays.

In [None]:
import scipy
scipy.__version__

In [18]:
# Getting libraries
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import pickle
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)
        
tqdm.pandas()

In [3]:
# Getting the data
data = pd.read_csv('../input/prepared-data-llm-competition/prepared_training_set.csv')
essays = data['essay']
labels = data['LLM_written']

In [4]:
# Text cleaning 
def text_cleaning(essay:str) -> str:
    cleaned_text = essay.replace('\n',"")
    cleaned_text = essay.replace("\t","")
    
    return cleaned_text

In [6]:
# Cleaning the text
essays_cleaned = essays.progress_apply(text_cleaning)

100%|██████████| 49929/49929 [00:00<00:00, 129313.27it/s]


In [11]:
# Setting up the vectorizer
pattern = r'(?u)\b\w\w+\b|!|\?|\:|\;' # pattern for punctuation
vectorizer = TfidfVectorizer(token_pattern=pattern,ngram_range=(3,5),max_df=0.85,
                            min_df=100,max_features=1000,norm=None)

# Fitting it to the essays
X = vectorizer.fit_transform(essays_cleaned)

In [12]:
# Storing the data into a dataframe
transformed_data = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
transformed_data.head()

Unnamed: 0,able to attend,able to attend classes,able to do,able to get,able to have,able to use,abolish the electoral,abolish the electoral college,abolishing the electoral,abolishing the electoral college,...,you for your time,you get to,you have to,you need to,you should join,you to support,you want to,you will be,you will have,young people are
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.173458,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Saving the vectorizer for later use
with open('vectorizer-3-5.pk','wb') as file:
    pickle.dump(vectorizer,file)

In [22]:
# Building the model
catboost_clf = CatBoostClassifier(iterations=5000,learning_rate=0.03,loss_function='Logloss',random_seed=42)
catboost_clf.fit(transformed_data.values,labels.values,verbose=100,plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6760449	total: 74.8ms	remaining: 6m 14s
100:	learn: 0.3520213	total: 7.2s	remaining: 5m 49s
200:	learn: 0.2888020	total: 14.2s	remaining: 5m 38s
300:	learn: 0.2525742	total: 21.6s	remaining: 5m 37s
400:	learn: 0.2271988	total: 28.4s	remaining: 5m 25s
500:	learn: 0.2062442	total: 35.3s	remaining: 5m 16s
600:	learn: 0.1899008	total: 42.1s	remaining: 5m 8s
700:	learn: 0.1768828	total: 49.4s	remaining: 5m 3s
800:	learn: 0.1662198	total: 56.2s	remaining: 4m 54s
900:	learn: 0.1570388	total: 1m 3s	remaining: 4m 46s
1000:	learn: 0.1500119	total: 1m 9s	remaining: 4m 38s
1100:	learn: 0.1437529	total: 1m 16s	remaining: 4m 30s
1200:	learn: 0.1381591	total: 1m 23s	remaining: 4m 24s
1300:	learn: 0.1331351	total: 1m 30s	remaining: 4m 17s
1400:	learn: 0.1288060	total: 1m 36s	remaining: 4m 9s
1500:	learn: 0.1247377	total: 1m 44s	remaining: 4m 2s
1600:	learn: 0.1209073	total: 1m 50s	remaining: 3m 55s
1700:	learn: 0.1174217	total: 1m 57s	remaining: 3m 47s
1800:	learn: 0.1143346	total: 2m 4s	r

<catboost.core.CatBoostClassifier at 0x7d4579b490f0>

In [23]:
# Making predictions on training data and evaluating
print('ROC AUC on Training Set:')
predictions = catboost_clf.predict_proba(transformed_data.values)[:,1]
roc_auc_score(labels.values,predictions)

ROC AUC on Training Set:


0.9985804872965804

In [25]:
# Saving the model
catboost_clf.save_model('catboost_clf-3-5-tfidf')