# LLM Competition Submission

This notebook is my submission to the [LLM - Detect AI Generated Text](https://www.kaggle.com/competitions/llm-detect-ai-generated-text/overview) competition on Kaggle. Because it is a code competition, the notebook was written and run on Kaggle itself, which is why it may be inconsistent in places with the other code files and notebooks.

In [1]:
!pip install -q language-tool-python --no-index --find-links ../input/daigt-misc/
!mkdir -p /root/.cache/language_tool_python/
!cp -r /kaggle/input/daigt-misc/lang57/LanguageTool-5.7 /root/.cache/language_tool_python/LanguageTool-5.7

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
from torchtext.data import get_tokenizer
import language_tool_python # Need to add the daigt-misc dataset for offline use
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier

# Getting tqdm for pandas 
tqdm.pandas()

device = "cuda:0" if torch.cuda.is_available() else "cpu"



## Gradient Boosting

In [3]:
# Preprocessor
class Preprocessing():
    """
    Preprocessing

    Computes the hand-crafted features for a dataframe of essays: stop-word and
    unique-word statistics, punctuation counts, grammar error counts, the
    prediction of the GPT-2 output detector, and one-hot emotion predictions.

    The dataframe handed to the constructor is stored by reference and is
    modified in place; it must contain an 'essay' column, and a 'word_count'
    column is required by the time preprocessing() is called.
    """

    # One-hot encoding (anger, surprise, sadness, fear) for the emotions kept as
    # features; every other predicted emotion is encoded as all zeros.
    _EMOTION_ONE_HOT = {
        'anger': (1, 0, 0, 0),
        'surprise': (0, 1, 0, 0),
        'sadness': (0, 0, 1, 0),
        'fear': (0, 0, 0, 1),
    }

    # Constructor
    def __init__(self,data_df:pd.DataFrame):
        """
        Loads the tokenizers, the grammar checker and both transformer models,
        then adds the two tokenized columns to the dataframe.

        inputs:
        - data_df: dataframe with an 'essay' column. It is not copied, so the
          columns added here and in preprocessing() appear on the caller's object.
        """
        self.data = data_df
        self.stop_words = stopwords.words('english')
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+') # tokenizer for words only
        self.overall_tokenizer = get_tokenizer('spacy',language='en_core_web_sm')
        self.grammar_checker = language_tool_python.LanguageTool('en-US',config={'cacheSize': 5000,'maxCheckThreads':20})
        self.detector_tokenizer = AutoTokenizer.from_pretrained("../input/roberta-base-openai-detector/roberta-base-openai-detector")
        self.detector = AutoModelForSequenceClassification.from_pretrained("../input/roberta-base-openai-detector/roberta-base-openai-detector")
        self.emotion_tokenizer = AutoTokenizer.from_pretrained("../input/emotion-detector/emotion-english-distilroberta-base")
        self.emotion_detector = AutoModelForSequenceClassification.from_pretrained("../input/emotion-detector/emotion-english-distilroberta-base")
        # Index i of this list is the emotion reported for class i of the emotion model
        self.emotions = ['anger','disgust','fear','joy','neutral','sadness','surprise']

        print('Tokenizing the essays into only words...')
        self.data['tokenized_essay_words'] = self.data['essay'].progress_apply(self.tokenize_essay_words)
        print()
        print('Tokenizing the essays into overall tokens with words and punctuation')
        self.data['tokenized_overall'] = self.data['essay'].progress_apply(self.tokenize_overall)
        print()

    # Function to tokenize each essay into words only
    def tokenize_essay_words(self,essay:str) -> list:
        """Returns the alphabetic word tokens of the essay."""
        return self.tokenizer.tokenize(essay)

    # Function to tokenize each essay into overall tokens
    def tokenize_overall(self,essay:str) -> list:
        """Returns the tokens of the essay produced by the overall (spaCy) tokenizer."""
        return self.overall_tokenizer(essay)

    # Getting the count of stop words in an essay
    def get_stop_word_count(self,text:list) -> int:
        """
        Counts how many tokens are stop words.

        inputs:
        - text: the tokenized essay (a list of word tokens).

        outputs:
        - the number of tokens found in self.stop_words. The comparison is
          case-sensitive, exactly as the tokens are given.
        """
        # Build the lookup set once per essay so each membership test is O(1)
        # instead of a scan over the whole stop-word list.
        stop_words = set(self.stop_words)
        return sum(1 for word in text if word in stop_words)

    # Getting the count of the unique words in an essay
    def get_unique_words(self,essay:list) -> int:
        """Returns the number of distinct tokens in the tokenized essay."""
        return len(set(essay))

    # Getting the counts of each punctuation (?, !, ;, :)
    def count_punc(self, essay:list) -> tuple[int,int,int,int]:
        """
        Counts the tokens that are exactly "?", "!", ";" or ":".

        inputs:
        - essay: the tokenized essay, punctuation included.

        outputs:
        - a tuple (question marks, exclamation marks, semicolons, colons).
        """
        tally = {"?": 0, "!": 0, ";": 0, ":": 0}

        # Iterating through the tokenized essay
        for token in essay:
            if token in tally:
                tally[token] += 1

        return tally["?"], tally["!"], tally[";"], tally[":"]

    # A function to get the number of grammatical errors
    def get_grammar_error_count(self,essay:str) -> int:
        """Returns the number of issues the grammar checker reports for the essay."""
        errors = self.grammar_checker.check(essay)
        return len(errors)

    # A function to get the detection from OpenAI's GPT-2 Detector
    def get_detect_pred(self,text:str) -> int:
        """
        Gets the class predicted by the detector model for an essay.

        inputs:
        - text: the raw essay text.

        outputs:
        - 1 if the detector predicts LLM written, 0 if it predicts student written.
        """
        # Tokenizing the input essay (truncation is enabled for long essays)
        inputs = self.detector_tokenizer(text,return_tensors='pt',truncation=True).to(device)

        # Getting the logits
        with torch.no_grad():
            logits = self.detector(**inputs).logits

        # Doing 1 - predicted class index because the model has "Real" = class 1 and "Fake" = class 0
        # My labels are the opposite, 1 = LLM Written and 0 = student written.
        # If the predicted class = 0 = Fake, 1-0 = 1 = LLM Written
        # If the predicted class = 1 = Real, 1-1 = 0 = student written
        predicted_class = 1 - logits.argmax().item()
        return predicted_class

    # A function to get the prediction from the Emotion Detector
    def emotion_detector_pred(self,essay:str) -> tuple[int,int,int,int]:
        """
        Gets the emotion predicted for an essay as a one-hot tuple.

        inputs:
        - essay: the raw essay text.

        outputs:
        - a tuple (anger, surprise, sadness, fear) with a 1 at the predicted
          emotion, or all zeros when any other emotion is predicted.
        """
        # Tokenizing the input essay (truncation is enabled for long essays)
        inputs = self.emotion_tokenizer(essay,return_tensors='pt',truncation=True).to(device)

        # Getting the logits
        with torch.no_grad():
            logits = self.emotion_detector(**inputs).logits

        # Getting the predicted emotion
        predicted_emotion = self.emotions[logits.argmax().item()]
        return self._EMOTION_ONE_HOT.get(predicted_emotion, (0, 0, 0, 0))

    # A function to knit all preprocessing together
    def preprocessing(self) -> pd.DataFrame:
        """
        Adds every feature column to the dataframe and drops the tokenized columns.

        The ratio features divide by the existing 'word_count' column, so that
        column must already be present.

        outputs:
        - the same dataframe that was passed to the constructor, with the
          feature columns added.
        """
        # Getting the stop word count and the stop word ratio
        print('Adding the stop word features...')
        self.data['stop_word_count'] = self.data['tokenized_essay_words'].progress_apply(self.get_stop_word_count)
        self.data['stop_word_ratio'] = self.data['stop_word_count'] / self.data['word_count']
        print()

        # Getting the unique word counts and the unique word ratio
        print('Adding the unique word features...')
        self.data['unique_word_count'] = self.data['tokenized_essay_words'].progress_apply(self.get_unique_words)
        self.data['unique_word_ratio'] = self.data['unique_word_count'] / self.data['word_count']
        print()

        # Adding the punctuation features
        print('Adding the punctuation features...')
        punc_counts = self.data['tokenized_overall'].progress_apply(self.count_punc)
        punc_columns = ('count_question','count_exclamation','count_semi','count_colon')
        # Column order matches the order of the tuple returned by count_punc
        for position, column in enumerate(punc_columns):
            self.data[column] = [row[position] for row in punc_counts]
        print()

        # Getting the number of grammatical errors
        print('Getting grammar error counts...')
        self.data['grammar_errors'] = self.data['essay'].progress_apply(self.get_grammar_error_count)
        print()

        # Adding the detector model prediction
        print('Getting Detector Prediction...')
        self.data['detector_pred'] = self.data['essay'].progress_apply(self.get_detect_pred)
        print()

        # Getting the emotion model prediction
        print('Getting Emotion Prediction...')
        emotion_rows = self.data['essay'].progress_apply(self.emotion_detector_pred)
        emotion_columns = ('anger_pred','surprise_pred','sadness_pred','fear_pred')
        # Column order matches the order of the tuple returned by emotion_detector_pred
        for position, column in enumerate(emotion_columns):
            self.data[column] = [row[position] for row in emotion_rows]
        print()

        # Dropping the tokenized parts
        self.data.drop(['tokenized_essay_words','tokenized_overall'],axis=1,inplace=True)

        # returning the preprocessed dataframe
        return self.data

In [4]:
# Getting the test data
test_data = pd.read_csv('../input/llm-detect-ai-generated-text/test_essays.csv')

In [5]:
# renaming columns
test_data.rename(columns={'text':'essay'},inplace=True)

In [6]:
# Adding a column for word count
def get_word_count(text:str) -> int:
    """
    get_word_count

    Counts the words in a piece of text, where a word is a maximal run of
    ASCII letters and/or underscores.

    inputs:
    - text: the string whose words should be counted.

    outputs:
    - an integer representing the word count
    """
    word_pattern = r'[a-zA-Z_]+'
    # Count the matches one by one instead of building the list of words
    return sum(1 for _ in re.finditer(word_pattern, text))

test_data['word_count'] = test_data['essay'].progress_apply(get_word_count)

100%|██████████| 3/3 [00:00<00:00, 4620.97it/s]


In [7]:
# Creating the preprocessing class
# (the constructor also tokenizes every essay and adds the two tokenized columns to test_data)
preprocessor = Preprocessing(test_data)

# Putting both transformer models on the selected device (GPU when available, otherwise CPU)
preprocessor.detector.to(device)
preprocessor.emotion_detector.to(device)

# Running data through preprocessing 
# preprocessing() returns the same dataframe object it was given, so test_data_prepared
# and test_data refer to the same data with the feature columns added
test_data_prepared = preprocessor.preprocessing()

Tokenizing the essays into only words...


100%|██████████| 3/3 [00:00<00:00, 3887.21it/s]



Tokenizing the essays into overall tokens with words and punctuation


100%|██████████| 3/3 [00:00<00:00, 2139.59it/s]







Adding the stop word features...


100%|██████████| 3/3 [00:00<00:00, 7512.19it/s]



Adding the unique word features...


100%|██████████| 3/3 [00:00<00:00, 10347.79it/s]



Adding the punctuation features...


100%|██████████| 3/3 [00:00<00:00, 9098.27it/s]



Getting grammar error counts...


100%|██████████| 3/3 [00:05<00:00,  1.98s/it]



Getting Detector Prediction...


100%|██████████| 3/3 [00:00<00:00,  3.84it/s]



Getting Emotion Prediction...


100%|██████████| 3/3 [00:00<00:00, 128.21it/s]







In [8]:
# Text cleaning 
def text_cleaning(essay:str) -> str:
    """
    text_cleaning

    Removes every newline and tab character from an essay.

    inputs:
    - essay: the raw essay text.

    outputs:
    - the essay with all "\\n" and "\\t" characters removed.
    """
    # The second replacement has to run on the result of the first one; running it on
    # the raw essay again (as before) threw the newline removal away.
    # NOTE(review): the TF-IDF vectorizer applied to this output was fitted elsewhere;
    # confirm its training text was cleaned the same way, and refit it if not.
    return essay.replace('\n',"").replace("\t","")

# Getting the test essays and cleaning them
essays_cleaned = test_data_prepared['essay'].progress_apply(text_cleaning)

100%|██████████| 3/3 [00:00<00:00, 7733.81it/s]


In [9]:
# Getting the tfidf vectorizer and running the test data through it.
# NOTE: pickle.load executes whatever the file contains, so only load artifacts you produced yourself.
with open('../input/llm-competition-models/tfidf-vectorizer-kaggle.pk/tfidf-vectorizer-kaggle.pk','rb') as file:
    vectorizer = pickle.load(file)

# Transforming the cleaned essays with the already-fitted vectorizer (no refitting on the test data)
vectorized = vectorizer.transform(essays_cleaned)
# One column per vectorizer feature name; toarray() turns the result into a dense array
# (presumably from a sparse matrix), so memory grows with essays x vocabulary size
transformed_data = pd.DataFrame(vectorized.toarray(),columns=vectorizer.get_feature_names_out())

In [10]:
# Combining the dataframes
combined_data = pd.concat([test_data_prepared,transformed_data],axis=1)

In [11]:
# Dropping the other columns
combined_data.drop(['id','prompt_id','essay'],axis=1,inplace=True)

In [12]:
# Getting the scalar and scaling
# ("scalar" is presumably the scaler object fitted during training - confirm against the training notebook)
# NOTE: pickle.load executes whatever the file contains, so only load artifacts you produced yourself.
with open('../input/llm-competition-models/scalar-noisy.pkl','rb') as file:
    scalar = pickle.load(file)

# Only these hand-crafted numerical features are scaled; the remaining columns are left untouched
numerical = ['word_count','stop_word_count','stop_word_ratio','unique_word_count','unique_word_ratio',
             'count_question','count_exclamation','count_semi','count_colon','grammar_errors']
# Overwrites the columns in place with their transformed values
combined_data[numerical] = scalar.transform(combined_data[numerical])

In [13]:
# Getting the model
catboost_model = CatBoostClassifier()
catboost_model.load_model('../input/llm-competition-models/catboost-noisy-fine')

<catboost.core.CatBoostClassifier at 0x7f595cab63b0>

In [14]:
predictions_catboost = catboost_model.predict_proba(combined_data)[:,1]

## DistilBERT

In [15]:
# Preprocessing
def preprocess(essay:str):
    """
    preprocess

    Normalizes an essay for the DistilBERT model: the text is lower-cased,
    newlines and tabs are removed, and non-breaking spaces (\\xa0) are
    replaced by regular spaces.

    inputs:
    - essay: the raw essay text.

    outputs:
    - the normalized essay as a string.
    """
    # A single translation table covers all three character rules in one pass:
    # None deletes the character, a string replaces it
    character_rules = str.maketrans({"\n": None, "\t": None, u'\xa0': u' '})

    return essay.lower().translate(character_rules)

In [16]:
# Running the test essays through preprocessing
preprocessed_essays = test_data['essay'].progress_apply(preprocess)

100%|██████████| 3/3 [00:00<00:00, 4888.47it/s]


In [17]:
# Getting the tokenizer and model, using DistilBERT
tokenizer = AutoTokenizer.from_pretrained('../input/llm-competition-models/distil-bert-tokenizer/distil-bert-tokenizer')
distil_model = AutoModelForSequenceClassification.from_pretrained('../input/llm-competition-models/fine-tuned-distillBert/fine-tuned-distillBert')
distil_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [18]:
# Defining a function for inference
def inference(essay:str) -> float:
    """
    inference

    Scores one essay with the fine-tuned DistilBERT model.

    inputs:
    - essay: the preprocessed essay text.

    outputs:
    - the sigmoid of the model's logit as a Python float. The model has to
      produce a single logit per essay, otherwise .item() raises.
    """
    # Tokenizing the input essay, padded / truncated to exactly 512 tokens
    encoded = tokenizer(
        essay,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    encoded = encoded.to(device)

    # No gradients are needed for inference
    with torch.no_grad():
        model_output = distil_model(**encoded)
        score = nn.functional.sigmoid(model_output.logits)

    return score.item()

In [19]:
# Running the examples through the model
distil_pred = preprocessed_essays.progress_apply(inference)

100%|██████████| 3/3 [00:00<00:00, 30.60it/s]


In [20]:
# Combining predictions with ids
submission = pd.DataFrame()
submission['id'] = test_data['id']
# Ensemble: unweighted mean of the CatBoost probability (column 1 of predict_proba)
# and the DistilBERT sigmoid score for each essay
submission['generated'] = (predictions_catboost + distil_pred) / 2

In [21]:
# Save the DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

## Checking Training & Validation Score

In [22]:
# from sklearn.metrics import roc_auc_score

# # Getting the data
# training_data = pd.read_csv('../input/training-llm-competition/train.csv')
# valid_data = pd.read_csv('../input/training-llm-competition/validation.csv')

# # Making copies
# train_data_copy = training_data.copy()
# valid_data_copy = valid_data.copy()

In [23]:
# # Text cleaning 
# def text_cleaning(essay:str) -> str:
#     cleaned_text = essay.replace('\n',"")
#     cleaned_text = essay.replace("\t","")
    
#     return cleaned_text

# train_data_copy['essay'] = train_data_copy['essay'].progress_apply(text_cleaning)
# valid_data_copy['essay'] = valid_data_copy['essay'].progress_apply(text_cleaning)

In [24]:
# # Getting the scalar and scaling
# with open('../input/llm-competition-models/scalar-noisy.pkl','rb') as file:
#     scalar = pickle.load(file)

# numerical = ['word_count','stop_word_count','stop_word_ratio','unique_word_count','unique_word_ratio',
#              'count_question','count_exclamation','count_semi','count_colon','grammar_errors']
# train_data_copy[numerical] = scalar.transform(train_data_copy[numerical])
# valid_data_copy[numerical] = scalar.transform(valid_data_copy[numerical])

In [25]:
# # Getting the model
# catboost_model = CatBoostClassifier()
# catboost_model.load_model('../input/llm-competition-models/catboost-noisy-fine')

# # Making predictions on training and validation
# predictions_catboost_train = catboost_model.predict_proba(train_data_copy.drop(['row_id','essay','LLM_written','prompt'],axis=1))[:,1]
# predictions_catboost_valid = catboost_model.predict_proba(valid_data_copy.drop(['row_id','essay','LLM_written','prompt'],axis=1))[:,1]

In [26]:
# # Preprocessing
# def preprocess(essay:str):
#     preprocessed_essay = essay.lower()
    
#     # Subbing out \n and \t
#     preprocessed_essay = re.sub("\n","",preprocessed_essay)
#     preprocessed_essay = re.sub("\t","",preprocessed_essay)

#     # Replacing /xa0 = non-breaking space in Latin1
#     preprocessed_essay = preprocessed_essay.replace(u'\xa0', u' ')
    
#     return preprocessed_essay
# train_data_copy['essay'] = training_data['essay'].progress_apply(preprocess)
# valid_data_copy['essay'] = valid_data['essay'].progress_apply(preprocess)

In [27]:
# # Getting the tokenizer and model, using DistilBERT
# tokenizer = AutoTokenizer.from_pretrained('../input/llm-competition-models/distil-bert-tokenizer/distil-bert-tokenizer')
# distil_model = AutoModelForSequenceClassification.from_pretrained('../input/llm-competition-models/fine-tuned-distillBert/fine-tuned-distillBert')
# distil_model.to(device)

In [28]:
# # Defining a function for inference
# def inference(essay:str) -> float:
#     # Tokenizing the input essay
#     inputs = tokenizer(essay,padding='max_length',truncation=True,max_length=512,return_tensors='pt').to(device)
    
#     # Getting the logits
#     with torch.no_grad():
#         logits = distil_model(**inputs).logits
#         probability = nn.functional.sigmoid(logits)
#     return probability.item()

In [29]:
# distil_pred_train = train_data_copy['essay'].progress_apply(inference)
# distil_pred_valid = valid_data_copy['essay'].progress_apply(inference)

In [30]:
# # Getting the final predictions
# final_pred_train = (distil_pred_train + predictions_catboost_train) / 2
# final_pred_valid = (distil_pred_valid + predictions_catboost_valid) / 2

In [31]:
# # Making predictions
# print('Predictions for Ensemble')
# train_score = roc_auc_score(training_data['LLM_written'],final_pred_train)
# valid_score = roc_auc_score(valid_data['LLM_written'],final_pred_valid)
# print(f'Training ROC AUC: {train_score}')
# print(f'Validation ROC AUC: {valid_score}')