# LLM - Detect AI Generated Text 
# DATA PRE-PROCESS

## import

In [1]:
import ast
import math
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

from gensim.models import fasttext
from gensim.models import doc2vec

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import roc_curve ,precision_recall_curve,auc,confusion_matrix,ConfusionMatrixDisplay

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import PreTrainedTokenizerFast

import keras


[nltk_data] Downloading package punkt to /home/mrtc101/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrtc101/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
2024-01-22 15:52:46.783119: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-22 15:52:46.783154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-22 15:52:46.815060: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-22 15:52:46.884467: I tensorflow/core/platfor

In [2]:
# Load the four CSV inputs. Paths are relative to the notebook's directory.
# initial_dataset is the base training set that the extra data is appended to.
initial_dataset = pd.read_csv("../data/train_essays.csv")
prompts_dataset = pd.read_csv("../data/train_prompts.csv")
# Additional essays that are merged into the training set further down.
custom_data = pd.read_csv("../data/custom_essays.csv")
downloaded_data_1 = pd.read_csv("../data/train_v4_drcat_01.csv")

## Adding new Data

Expressed as a ratio, the class imbalance of this dataset is about 1:500, i.e. the dataset is severely imbalanced. Previous approaches, which manually added only 20 new LLM-generated examples and applied random downsampling, did not reach a score higher than 0.56. 

Concluding that more data was needed, I downloaded data shared by other competitors.

In [3]:
# Translate prompt names into prompt ids. The two prompts named here get
# ids 0 and 1; every other prompt name falls back to id 21.
known_prompt_ids = {"Car-free cities": 0, "Does the electoral college work?": 1}
downloaded_data_1["prompt_id"] = downloaded_data_1["prompt_name"].apply(
    lambda name: known_prompt_ids.get(name, 21)
)
# Keep only the columns used downstream and rename the target column to
# "generated".
downloaded_data_1 = (
    downloaded_data_1[["prompt_id","text","label"]]
    .rename(columns={"label":"generated"})
)

In [4]:
# Stack both extra data sources and give every row a sequential integer id
# (0 .. n-1), overwriting any "id" column that is already present.
new_data = (
    pd.concat([custom_data, downloaded_data_1], axis=0, ignore_index=True)
    .assign(id=lambda frame: range(0, frame.shape[0]))
)

In [5]:
# Display the combined extra data as the cell output.
new_data

Unnamed: 0,id,prompt_id,text,generated
0,0,0,The Advantages of Limiting Car Usage in Suburb...,1
1,1,0,Paris' Driving Ban: A Temporary Solution to En...,1
2,2,0,Bogota's Car-Free Day: A Model for Sustainable...,1
3,3,0,Shifting Trends: The Decline of Car Culture in...,1
4,4,0,The End of Car Culture and the Rise of Sustain...,1
...,...,...,...,...
73590,73590,21,I am writing you today to disagree with your t...,1
73591,73591,21,"Dear Principal,\n\nIn conclusion, I would obse...",1
73592,73592,21,"Dear Mrs. Principal,\n\nin these kinds of cons...",1
73593,73593,21,I enjoyed Form five and excitedly ex claims ed...,1


In [6]:
# Append the extra essays below the initial training set; ignore_index
# renumbers the rows of the combined frame from 0.
target_data = pd.concat([initial_dataset,new_data],ignore_index=True,axis=0)
target_data

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
74968,73590,21,I am writing you today to disagree with your t...,1
74969,73591,21,"Dear Principal,\n\nIn conclusion, I would obse...",1
74970,73592,21,"Dear Mrs. Principal,\n\nin these kinds of cons...",1
74971,73593,21,I enjoyed Form five and excitedly ex claims ed...,1


### Data cleaning

In [7]:
# Report the shape before and after cleaning.
print(target_data.shape)
# Drop essays whose text is duplicated (the first occurrence is kept), then
# rows with any missing value, and finally renumber the index from 0.
target_data = (
    target_data
    .drop_duplicates(subset=["text"], keep="first")
    .dropna()
    .reset_index(drop=True)
)
print(target_data.shape)

(74973, 4)
(73595, 4)


## Data Feature Engineering

### Training a Tokenizer
Reading the competition discussions led me to this [Notebook](https://www.kaggle.com/code/datafan07/train-your-own-tokenizer), which suggests training a custom tokenizer so that words with typos are added to the vocabulary, for better performance.

1. normalization
2. pre-tokenization
3. model
4. post-processing

>ByteLevel: 
>
>Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties:
> - Since it maps on bytes, a tokenizer using this only requires 256 characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.
> - A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)
> - For non ascii characters, it gets completely unreadable, but it works nonetheless!

### Byte-Pair Encoding tokenizer

In [8]:
# Special tokens reserved in the vocabulary during training.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Byte-Pair Encoding model; anything it cannot encode maps to "[UNK]".
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalisation: Unicode NFC, lowercase, then newlines, carriage returns and
# tabs are each replaced by a single space.
whitespace_replacements = [
    normalizers.Replace(char, " ") for char in ("\n", "\r", "\t")
]
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC(), normalizers.Lowercase(), *whitespace_replacements]
)

# Pre-tokenisation: byte-level splitting.
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Trainer that learns the BPE merges; all other trainer options are left at
# the library defaults.
trainer = trainers.BpeTrainer(show_progress=True, special_tokens=special_tokens)

def data_iter(dataset, chunk_size=1000):
    """
    Yield the "text" column of ``dataset`` in consecutive chunks.

    Parameters
    ----------
    dataset : sliceable table with a "text" column
        Anything that supports ``len(dataset)`` and
        ``dataset[start:stop]["text"]`` (e.g. a pandas DataFrame).
    chunk_size : int, default 1000
        Maximum number of rows per yielded chunk; the last chunk may be
        shorter.

    Yields
    ------
    The "text" values of each chunk, in the original row order.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive. Being a generator, the check runs
        when iteration starts.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    for start in range(0, len(dataset), chunk_size):
        yield dataset[start : start + chunk_size]["text"]

# Learn the BPE vocabulary from the essay texts, fed in chunks.
text_chunks = data_iter(target_data)
raw_tokenizer.train_from_iterator(text_chunks, trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface and
# declare which special token plays which role.
special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    **special_token_roles,
)






In [17]:
# Persist the wrapped tokenizer; the call returns the paths of the files it
# wrote, which become the cell output.
tokenizer.save_pretrained("../data/lexp/byte_pair_tokenizer")

('../data/lexp/byte_pair_tokenizer/tokenizer_config.json',
 '../data/lexp/byte_pair_tokenizer/special_tokens_map.json',
 '../data/lexp/byte_pair_tokenizer/tokenizer.json')

In [10]:
# Sanity check: run only the normalisation step on a sample string.
raw_tokenizer.normalizer.normalize_str("Martín's bag")

"martín's bag"

In [11]:
# Sanity check: full tokenisation of the same sample string.
tokenizer.tokenize("Martín's bag")

['Ġmart', 'ÃŃ', 'n', "'s", 'Ġbag']

### Words Tokenizing

In [12]:
# Work on a copy so target_data keeps the raw essay text, then replace each
# essay by the list of tokens produced by the trained tokenizer.
tokenized_dataset = target_data.copy()
tokenized_dataset["text"] = target_data["text"].map(tokenizer.tokenize)
tokenized_dataset

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,"[Ġcars, ., Ġcars, Ġhave, Ġbeen, Ġaround, Ġsinc...",0
1,005db917,0,"[Ġtransportation, Ġis, Ġa, Ġlarge, Ġnecessity,...",0
2,008f63e3,0,"[Ġ"", america, 's, Ġlove, Ġaffair, Ġwith, Ġit, ...",0
3,00940276,0,"[Ġhow, Ġoften, Ġdo, Ġyou, Ġride, Ġin, Ġa, Ġcar...",0
4,00c39458,0,"[Ġcars, Ġare, Ġa, Ġwonderful, Ġthing, ., Ġthey...",0
...,...,...,...,...
73590,73590,21,"[Ġi, Ġam, Ġwriting, Ġyou, Ġtoday, Ġto, Ġdisagr...",1
73591,73591,21,"[Ġdear, Ġprincipal, ,, Ġ, Ġin, Ġconclusion, ,,...",1
73592,73592,21,"[Ġdear, Ġmrs, ., Ġprincipal, ,, Ġ, Ġin, Ġthese...",1
73593,73593,21,"[Ġi, Ġenjoyed, Ġform, Ġfive, Ġand, Ġexcitedly,...",1


#### Deleting stopwords

In [13]:
# Run every English stopword through the tokenizer and collect the resulting
# tokens in one flat list. A stopword that is split into several tokens
# contributes each of its pieces.
tokenized_stopwords = []
for stopword in stopwords.words('english'):
    tokenized_stopwords.extend(tokenizer.tokenize(stopword))

In [14]:
# Remove stopword tokens from every tokenised essay.
# The stopword tokens are put in a set first so each membership test is O(1)
# instead of a scan over the whole stopword list for every token.
stopword_tokens = set(tokenized_stopwords)
new_docs = [
    [word for word in doc if word not in stopword_tokens]
    for doc in tokenized_dataset["text"]
]
tokenized_dataset["text"]=new_docs

In [15]:
# Persist the tokenised, stopword-free essays without the index column.
# The token lists in "text" are written to the CSV in their string form.
tokenized_dataset.to_csv("../data/lexp/tokenized_data/tokenized.csv",index=False)

## Doc2Vec

In [18]:
# Wrap every token list in a TaggedDocument whose single tag is 'z' followed
# by the essay id, which is the input format used for Doc2Vec training below.
docs_dataset = tokenized_dataset.copy()
docs_dataset["text"] = [
    doc2vec.TaggedDocument(tokens, ['z' + str(essay_id)])
    for essay_id, tokens in zip(tokenized_dataset["id"], tokenized_dataset["text"])
]
docs_dataset["text"]

0        ([Ġcars, ., Ġcars, Ġaround, Ġsince, Ġbecame, Ġ...
1        ([Ġtransportation, Ġlarge, Ġnecessity, Ġcountr...
2        ([Ġ", america, Ġlove, Ġaffair, Ġvehicles, Ġsee...
3        ([Ġoften, Ġride, Ġcar, ?, Ġdrive, Ġone, Ġmotor...
4        ([Ġcars, Ġwonderful, Ġthing, ., Ġperhaps, Ġone...
                               ...                        
73590    ([Ġwriting, Ġtoday, Ġdisagree, Ġtaking, Ġactio...
73591    ([Ġdear, Ġprincipal, ,, Ġ, Ġconclusion, ,, Ġwo...
73592    ([Ġdear, Ġmrs, ., Ġprincipal, ,, Ġ, Ġkinds, Ġc...
73593    ([Ġenjoyed, Ġform, Ġfive, Ġexcitedly, Ġex, Ġcl...
73594    ([Ġdear, Ġteacher, _, name, ,, Ġ, Ġwell, Ġms, ...
Name: text, Length: 73595, dtype: object

In [20]:
# Train a Doc2Vec model on the tagged documents: 100-dimensional document
# vectors, 20 training epochs, all other hyper-parameters at library defaults.
doc_model = doc2vec.Doc2Vec(documents=docs_dataset["text"],vector_size=100,epochs=20)

In [21]:
# Persist the trained Doc2Vec model to disk.
doc_model.save("../data/lexp/embedding_model/docModel.bin")

## Inferred Features

In [None]:
# Reload the tokenised essays from the file written by the tokenisation step
# (same path as the to_csv call above).
# The ids are read as strings: the column mixes alphanumeric ids with purely
# numeric ones, and dtype inference could turn the numeric ones into integers,
# which would break the string concatenation used to build the tags below.
tokenized_dataset = pd.read_csv(
    "../data/lexp/tokenized_data/tokenized.csv",
    dtype={"id": str},
)
# The token lists were stored as Python list literals; literal_eval parses
# them without executing arbitrary code, unlike eval.
tokenized_dataset["text"] = tokenized_dataset["text"].apply(ast.literal_eval)
# Rebuild the tagged documents with the same 'z' + id tags used for training.
docs_dataset = tokenized_dataset.copy()
docs_dataset["text"] = [
    doc2vec.TaggedDocument(tokens, ['z' + str(essay_id)])
    for essay_id, tokens in zip(tokenized_dataset["id"], tokenized_dataset["text"])
]
docs_dataset["text"]

In [24]:
def _count_symbols(words):
    """Return (dots, punctuation, apostrophes, unknown) counts for one token list."""
    dot = punctuation = apostrofees = unk = 0
    for token in words:
        # The branches are mutually exclusive: a token is classified by the
        # first rule it matches.
        if token.endswith("."):
            dot += 1
            punctuation += 1
        elif token.endswith((",", "?", "!")):
            punctuation += 1
        elif "'" in token:
            apostrofees += token.count("'")
        elif token == "[UNK]":
            unk += 1
    return dot, punctuation, apostrofees, unk


def features(dataset):
    """
    Compute simple count features for every document in ``dataset["text"]``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have a "text" column whose values expose a ``words`` attribute
        holding the list of tokens (e.g. gensim ``TaggedDocument``).

    Returns
    -------
    pandas.DataFrame
        One row per document, indexed like ``dataset``, with the columns:
        ``token_num``    number of tokens in the document,
        ``sent_num``     tokens ending in ".",
        ``punct_sym``    tokens ending in ".", ",", "?" or "!",
        ``apostrof_sym`` apostrophes in tokens that do not end in punctuation,
        ``unk_num``      occurrences of the "[UNK]" token.
    """
    rows = []
    for doc in dataset["text"]:
        words = doc.words
        # Count the tokens themselves: len(doc) on a (words, tags) pair would
        # always be 2, regardless of the document length.
        rows.append((len(words), *_count_symbols(words)))
    return pd.DataFrame(
        rows,
        columns=["token_num","sent_num","punct_sym","apostrof_sym","unk_num"],
        index=dataset.index,
    )

## Normalize embeddings

In [26]:
# Load the trained Doc2Vec model and look up the learned vector of every
# document through its tag.
doc_model = doc2vec.Doc2Vec.load("../data/lexp/embedding_model/docModel.bin")
arr = [doc_model.dv[doc.tags] for doc in docs_dataset["text"]]
# One row per document; the width comes from the model instead of a literal.
doc_vectors = np.reshape(arr, (len(arr), doc_model.vector_size))

# Raw embeddings plus their L2 norm.
embeddings_dataset = pd.DataFrame(doc_vectors)
embeddings_dataset["norm"] = np.linalg.norm(doc_vectors, axis=1)

# Scale every embedding to unit L2 norm. The norm is taken over the embedding
# dimensions only: normalising the row with the appended "norm" column in it
# would shrink every vector by a constant factor of sqrt(2).
embedding_norms = embeddings_dataset["norm"].to_numpy()
# An all-zero vector is left as zeros instead of producing NaN.
safe_norms = np.where(embedding_norms > 0, embedding_norms, 1.0)
norm_embeddings_dataset = pd.DataFrame(doc_vectors / safe_norms[:, np.newaxis])

# Keep the original magnitude as one extra feature, min-max scaled to [0, 1].
norm_min = embeddings_dataset["norm"].min()
norm_range = embeddings_dataset["norm"].max() - norm_min
if norm_range > 0:
    norm_embeddings_dataset["normalized_norm"] = (embeddings_dataset["norm"] - norm_min) / norm_range
else:
    # All norms identical: the feature carries no information.
    norm_embeddings_dataset["normalized_norm"] = 0.0

# Put the metadata/documents and the normalised embeddings side by side.
doc2vec_data = pd.concat([docs_dataset, norm_embeddings_dataset], axis=1)

# Compute the additional count features with features().
doc_features = features(doc2vec_data)

# Scale every count feature by the L2 norm of its column; all-zero columns
# are left untouched to avoid dividing by zero.
for col in doc_features.columns:
    if doc_features[col].max() > 0:
        doc_features[col] = doc_features[col] / np.linalg.norm(doc_features[col])

# Append the count features and drop the "text" column, which is no longer
# needed in the feature table.
doc2vec_data = pd.concat([doc2vec_data, doc_features], axis=1).drop("text", axis=1)

In [41]:
# Split the feature table and save both parts as CSV files.
# train_test_split is imported in the import cell at the top of the notebook.
# NOTE: test_size=0.6 puts 60% of the rows in `test` and 40% in `train`.
# random_state makes the split reproducible across runs; stratify keeps the
# proportion of generated / non-generated essays equal in both parts.
train, test = train_test_split(
    doc2vec_data,
    test_size=0.6,
    random_state=42,
    stratify=doc2vec_data["generated"],
)
train.to_csv("../data/lexp/pre_processed/train.csv", index=False)
test.to_csv("../data/lexp/pre_processed/test.csv", index=False)