# Detect AI Generated Text
In this challenge, the goal is to determine whether a piece of text is AI generated or not.

# Import the necessary libraries
Here, we import the libraries necessary for the challenge.

In [2]:
# Libraries
import os
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NLTK
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec


import xgboost as xgb

from sklearn.metrics import classification_report, roc_auc_score, make_scorer, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

from lightgbm import LGBMClassifier

from hyperopt import fmin, tpe, hp, Trials

# Download NLTK Resources
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Spectra\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Spectra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Spectra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Spectra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Spectra\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tag

True

# Obtain data
Here, we obtain the data to train.

In [5]:
# get training data
final_augmented_train = pd.read_csv("data/final_augmented_data.csv")
test_df = pd.read_csv("data/test_essays.csv")
new_data_one = pd.read_csv("data/train_drcat_01.csv")
new_data_two = pd.read_csv("data/train_drcat_02.csv")
new_data_three = pd.read_csv("data/train_drcat_03.csv")
new_data_four = pd.read_csv("data/train_drcat_04.csv")
augmented_test = pd.read_csv("data/archive (2).zip (Unzipped Files)/final_test.csv")

**Get train data ready**

In [7]:
# Rename Columns
new_data_four.rename(columns={'label': 'generated'}, inplace=True)
new_data_two.rename(columns={'label': 'generated'}, inplace=True)

In [9]:
# Concat the data
concat_train_df = pd.concat([final_augmented_train,
                             new_data_two[["text", "generated"]],
                             new_data_four[["text", "generated"]]], axis=0)

concat_train_df.head()

Unnamed: 0,text,generated
0,"Dear Mr. Senator, I have decided to express my...",0
1,Limiting car usage is advantageous for multip...,1
2,Limiting car usage is beneficial for a number...,1
3,"""The day that mankind realizes that their crea...",0
4,"Dear me. Senator, I am fed up with the elector...",0


In [10]:
# Drop duplicates
concat_train_df.drop_duplicates(subset=["text"])

Unnamed: 0,text,generated
0,"Dear Mr. Senator, I have decided to express my...",0
1,Limiting car usage is advantageous for multip...,1
2,Limiting car usage is beneficial for a number...,1
3,"""The day that mankind realizes that their crea...",0
4,"Dear me. Senator, I am fed up with the elector...",0
...,...,...
44131,Looking for some fun activities to do at the ...,1
44142,The school administration has recently announ...,1
44144,\nSeeking advice from more than one person whe...,1
44160,While the Facial Action Coding System technolo...,1


In [11]:
sampled_train_df, _ = train_test_split(concat_train_df,
                                       test_size=(len(concat_train_df) - 100000) / len(concat_train_df),
                                       stratify=concat_train_df["generated"],
                                       random_state=42)

**Sample Test Data**

In [16]:
# Rename columns
new_data_three.rename(columns={'label': 'generated'}, inplace=True)
new_data_one.rename(columns={'label': 'generated'}, inplace=True)
augmented_test.rename(columns={'label': 'generated'}, inplace=True)

In [17]:
# Concat the data
concat_eval_df = pd.concat([augmented_test,
                             new_data_one[["text", "generated"]],
                             new_data_three[["text", "generated"]]], axis=0)



Unnamed: 0,text,generated
0,The Face on Mars is nothing but a natural occu...,0
1,Students have a higher chance of catching a vi...,0
2,Driverless cars have good and bad things that ...,0
3,Some people might think that traveling in a gr...,1
4,How many of us students want to be forced to d...,0
...,...,...
42201,"""Oh man I didn't make the soccer team!"", yelle...",0
42202,I believe that using this technology could be ...,0
42203,The Face on Mars is a fascinating phenomenon t...,1
42204,Texting & Driving\n\nUsing your phone while dr...,0


In [18]:
# Drop duplicates
concat_eval_df.drop_duplicates(subset=["text"])

Unnamed: 0,text,generated
0,The Face on Mars is nothing but a natural occu...,0
1,Students have a higher chance of catching a vi...,0
2,Driverless cars have good and bad things that ...,0
3,Some people might think that traveling in a gr...,1
4,How many of us students want to be forced to d...,0
...,...,...
42190,I think our principal's idea of making us do e...,1
42191,I think it's a good idea for schools to have o...,1
42196,Students often debate whether inactivity or s...,1
42199,Advantages of Limiting Car Usage\n\nLimiting c...,1


In [19]:
sampled_eval_df, _ = train_test_split(concat_eval_df,
                                       test_size=(len(concat_eval_df) - 20000) / len(concat_eval_df),
                                       stratify=concat_eval_df["generated"],
                                       random_state=42)

In [20]:
sampled_eval_df.reset_index(drop=True, inplace=True)

In [21]:
val_df, test_df = train_test_split(sampled_eval_df,
                                test_size=0.5,
                                stratify=sampled_eval_df["generated"],
                                random_state=42)


# Preprocess text

In [21]:
train_sentpiece_df = pd.concat([test_df["text"], concat_train_df["text"]])
train_sentpiece_df.reset_index(drop=True, inplace=True)

In [22]:
# Train sentence piece model

import sentencepiece as spm
 
# Write text data to a text file for SentencePiece training
with open('sentpiece_train.txt', 'w', encoding='utf-8') as file:
    for text in train_sentpiece_df:
        file.write(text + '\n')

# Train SentencePiece model
spm.SentencePieceTrainer.train(input='data/sentpiece_train.txt', model_prefix='data/sentpiece_model', vocab_size=30522)


In [22]:
def get_tokenized_text(df, tok_model_path):
  sp = spm.SentencePieceProcessor()
  sp.Load(tok_model_path)
  # Tokenize text using SentencePiece
  df['tokens'] = df['text'].apply(lambda x: sp.EncodeAsPieces(x.lower()))
  # Convert SentencePiece tokens to text for XGBoost
  df['text_spm'] = df['tokens'].apply(lambda x: ' '.join(x))
  return df


In [23]:
#tokenized_train_one_df = get_tokenized_text(train_set_one, "sentpiece_model.model")
#tokenized_train_two_df = get_tokenized_text(train_set_two, "sentpiece_model.model")
#tokenized_train_three_df = get_tokenized_text(train_set_three, "sentpiece_model.model")
tokenized_eval_df = get_tokenized_text(val_df, "data/sentpiece_model.model")
tokenized_test_df = get_tokenized_text(test_df, "data/sentpiece_model.model")
tokenized_main_train_df = get_tokenized_text(sampled_train_df, "data/sentpiece_model.model")

In [24]:
# X_train, X_val, y_train, y_val = train_test_split(df['text_spm'], df['generated'], test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(ngram_range=(3, 5), 
                             sublinear_tf=True,
                             lowercase=False,
                             max_features=5000)

#X_train_one = vectorizer.fit_transform(tokenized_train_one_df["text_spm"])
#X_train_two = vectorizer.transform(tokenized_train_two_df["text_spm"])
#X_train_three = vectorizer.transform(tokenized_train_three_df["text_spm"])
X_train_main = vectorizer.fit_transform(tokenized_main_train_df["text_spm"])
y_train_main = tokenized_main_train_df["generated"]
X_val = vectorizer.transform(tokenized_eval_df["text_spm"])
y_val = tokenized_eval_df["generated"]
X_test = vectorizer.transform(tokenized_eval_df["text_spm"])
y_test = tokenized_test_df["generated"]
#y_train_one = tokenized_train_one_df["generated"]
#y_train_two = tokenized_train_two_df["generated"]
#y_train_three = tokenized_train_three_df["generated"]


In [None]:
# Save the vectorizer
joblib.dump(vectorizer, "data/vectorizer.pkl")

# Modelling

## Modelling with LGBM

**Hyperparameter tune LGBM**

In [34]:
X_train_hyper, _, y_train_hyper, _ = train_test_split(X_train_main, y_train_main, train_size=0.1, random_state=42)

def custom_metric(y_true, y_pred):
    # Your custom metric, for example, a combination of ROC-AUC and recall for class 1
    roc_auc = roc_auc_score(y_true, y_pred)
    recall_class_1 = recall_score(y_true, (y_pred > 0.5).astype(int), pos_label=1)

    # You can adjust the weights based on your preference
    custom_metric_value = 0.5 * roc_auc + 0.5 * recall_class_1
    
    return custom_metric_value

def custom_metric_for_lgbm(y_true, y_pred):
    roc_auc = roc_auc_score(y_true, y_pred)
    recall_class_1 = recall_score(y_true, (y_pred > 0.5).astype(int), pos_label=1)
    custom_metric_value = 0.5 * roc_auc + 0.5 * recall_class_1
    return 'custom', custom_metric_value, True

custom_scorer = make_scorer(custom_metric, needs_proba=True)

def objective(params):
    params['objective'] = 'cross_entropy'
    params['metric'] = 'custom'
    params['n_iter'] = 2500
    params['verbose'] = -1
    params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
    params['max_depth'] = int(params['max_depth'])
    params['max_bin'] = int(params['max_bin'])
    


    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    model = LGBMClassifier(**params)

    scores = cross_val_score(estimator=model, X=X_train_hyper, y=y_train_hyper, cv=cv, scoring=custom_scorer, n_jobs=-1)

    # Take the mean of the custom metric across folds
    mean_score = np.mean(scores)

    return -mean_score
# Define the search space
space = {
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.7, 1.0),
    'lambda_l1': hp.uniform('lambda_l1', 0.1, 10),
    'lambda_l2': hp.uniform('lambda_l2', 0.1, 10),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 50, 150, 1),
    'max_depth': hp.quniform('max_depth', 10, 30, 1),
    'max_bin': hp.quniform('max_bin', 500, 1000, 1),
}

# Run Hyperopt optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials, verbose=1)

# Print the best hyperparameters
print("Best Hyperparameters:", best)

100%|██████████| 50/50 [56:54<00:00, 68.29s/trial, best loss: -0.959269926304359]  
Best Hyperparameters: {'colsample_bynode': 0.891492853612568, 'colsample_bytree': 0.8652279279632024, 'lambda_l1': 0.1497981206936176, 'lambda_l2': 2.5379309743954743, 'learning_rate': 0.04665576114608101, 'max_bin': 984.0, 'max_depth': 14.0, 'min_data_in_leaf': 50.0}


In [39]:
params = {'colsample_bynode': 0.891492853612568,
          'colsample_bytree': 0.8652279279632024,
          'lambda_l1': 0.1497981206936176,
          'lambda_l2': 2.5379309743954743,
          'learning_rate': 0.04665576114608101,
          'max_bin': 984,
          'max_depth': 14, 
          'min_data_in_leaf': 50,
          'objective' : 'cross_entropy',
          'metric' : 'auc',
          'n_iter' : 2500,
          'verbose' : -1}

In [40]:
lgbm = LGBMClassifier(**params)

In [41]:
lgbm.fit(X_train_main, y_train_main)
y_pred_proba = lgbm.predict_proba(X_val)[:, 1]
y_pred = lgbm.predict(X_val)
roc = roc_auc_score(y_val, y_pred_proba)

print("The ROC:", roc)
print(classification_report(y_val, y_pred))



The ROC: 0.998834454466469
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6888
           1       0.99      0.98      0.98      3112

    accuracy                           0.99     10000
   macro avg       0.99      0.99      0.99     10000
weighted avg       0.99      0.99      0.99     10000



In [42]:
lgbm.booster_.save_model('lgbm_model.txt')

<lightgbm.basic.Booster at 0x167e2dd04d0>