# LLM - Detect AI Generated Text 
# DATA PRE-PROCESS

## import

In [2]:
import math
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

from gensim.models import fasttext
from gensim.models import doc2vec

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.metrics import roc_curve ,precision_recall_curve,auc,confusion_matrix,ConfusionMatrixDisplay

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import PreTrainedTokenizerFast

import keras


[nltk_data] Downloading package punkt to /home/mrtc101/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrtc101/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
2024-01-10 16:59:04.737490: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-10 16:59:04.737526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-10 16:59:04.739454: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-10 16:59:04.748619: I tensorflow/core/platfor

In [None]:
# Read the four CSV inputs (competition essays, competition prompts, custom
# essays and the downloaded essay set) in one pass, in the same order as before.
initial_dataset, prompts_dataset, custom_data, downloaded_data_1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train_essays.csv",
        "../data/train_prompts.csv",
        "../data/custom_essays.csv",
        "../data/train_v4_drcat_01.csv",
    )
)

## Adding new Data

Expressed as a ratio, the class imbalance of this dataset is about 1:500, i.e. the dataset is severely imbalanced. Previous approaches, which manually added only 20 new LLM-generated examples and used random downsampling, did not reach a score higher than 0.56.

Concluding that more data was needed, I downloaded data shared by other competitors.

In [None]:
# Map prompt names onto numeric ids: the two names listed here get 0 and 1,
# every other name falls back to 21.
prompt_ids_by_name = {"Car-free cities": 0, "Does the electoral college work?": 1}
downloaded_data_1["prompt_id"] = downloaded_data_1["prompt_name"].apply(
    lambda name: prompt_ids_by_name.get(name, 21)
)
# Keep only the three columns used downstream and rename "label" to "generated".
downloaded_data_1 = downloaded_data_1[["prompt_id", "text", "label"]].rename(
    columns={"label": "generated"}
)

In [None]:
# Stack the custom and downloaded essays under a fresh 0..n-1 index, then
# set the "id" column to the running row number.
new_data = pd.concat([custom_data, downloaded_data_1], axis=0, ignore_index=True)
new_data = new_data.assign(id=range(len(new_data)))

In [None]:
# Display the combined extra-essay frame for a visual check.
new_data

In [None]:
# Append the extra essays to the competition essays, renumbering rows 0..n-1.
target_data = pd.concat((initial_dataset, new_data), axis=0, ignore_index=True)
# Leave the frame as the last expression so the notebook renders it.
target_data

### Data cleaning

In [None]:
# Shape before cleaning.
print(target_data.shape)
# Keep the first occurrence of every essay text, drop rows containing any
# missing value, and renumber the remaining rows from 0.
target_data = (
    target_data
    .drop_duplicates(subset=["text"], keep="first")
    .dropna()
    .reset_index(drop=True)
)
# Shape after cleaning.
print(target_data.shape)

## Data Feature Engineering

### Training a Tokenizer
Reading the competition discussions led me to this [Notebook](https://www.kaggle.com/code/datafan07/train-your-own-tokenizer), which suggests training a custom tokenizer so that words with typos are included in the vocabulary, for better performance.

1. normalization
2. pre-tokenization
3. model
4. post-processing

>ByteLevel: 
>
>Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties:
> - Since it maps on bytes, a tokenizer using this only requires 256 characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.
> - A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)
> - For non ascii characters, it gets completely unreadable, but it works nonetheless!

### Byte-Pair Encoding tokenizer

In [None]:
# Create a Byte-Pair Encoding tokenizer; "[UNK]" is the model's unknown token.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalization: Unicode NFC, lowercase, and newline / carriage return / tab
# each replaced by a single space.
raw_tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFC(),
        normalizers.Lowercase(),
        normalizers.Replace("\n", " "),
        normalizers.Replace("\r", " "),
        normalizers.Replace("\t", " "),
    ]
)

# Pre-tokenization: byte-level splitting.
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Trainer configuration.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    show_progress=True,
    special_tokens=special_tokens,
    # Seed the vocabulary with the full byte-level alphabet so that bytes never
    # seen in the training essays are still representable instead of
    # falling back to "[UNK]".
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

def data_iter(dataset, chunk_size=1000, column="text"):
    """
    Yield the values of one column of ``dataset`` in consecutive chunks.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the texts.
    chunk_size : int, default 1000
        Maximum number of rows per yielded chunk; must be positive.
    column : str, default "text"
        Name of the column to read.

    Yields
    ------
    list
        The column values for rows ``[start, start + chunk_size)``, in row
        order. The last chunk may be shorter; an empty frame yields nothing.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive (raised when iteration starts).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    texts = dataset[column]
    for start in range(0, len(texts), chunk_size):
        # .iloc keeps the slicing positional regardless of the index labels;
        # plain lists are the safest input for tokenizer training.
        yield texts.iloc[start : start + chunk_size].tolist()

# Fit the BPE vocabulary on the essay texts, streamed chunk by chunk.
raw_tokenizer.train_from_iterator(data_iter(target_data), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface and
# tell it which vocabulary entry plays each special-token role.
special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    **special_token_roles,
)

In [None]:
# Persist the wrapped tokenizer under ../data/byte_pair_tokenizer.
tokenizer.save_pretrained("../data/byte_pair_tokenizer")

In [None]:
# Sanity check: show what the normalizer does to a string containing an
# accented character and an apostrophe.
raw_tokenizer.normalizer.normalize_str("Martín's bag")

In [None]:
# Sanity check: show the tokens produced for the same sample string.
tokenizer.tokenize("Martín's bag")

### Words Tokenizing

In [None]:
# Work on a copy so target_data keeps the raw essay strings.
tokenized_dataset = target_data.copy()
# Replace every essay string with its list of tokens.
tokenized_dataset["text"] = target_data["text"].apply(tokenizer.tokenize)
tokenized_dataset

#### Deleting stopwords

In [None]:
# Run every NLTK English stopword through the trained tokenizer and collect
# all resulting tokens (a stopword may produce more than one token) in order.
tokenized_stopwords = []
for stopword in stopwords.words('english'):
    tokenized_stopwords.extend(tokenizer.tokenize(stopword))

In [None]:
# Remove stopword tokens from every document.
# A set makes each membership test O(1); the original scanned the whole
# stopword list once per token of every essay.
stopword_tokens = set(tokenized_stopwords)
new_docs = [
    [token for token in doc if token not in stopword_tokens]
    for doc in tokenized_dataset["text"]
]
tokenized_dataset["text"] = new_docs

In [None]:
# Write the tokenized frame to CSV without the index column.
# NOTE(review): the token lists are presumably serialized as their Python
# string representation, so they must be parsed again on reload - confirm.
tokenized_dataset.to_csv("../data/lexp/tokenized_data/tokenized.csv",index=False)

## Doc2Vec

In [5]:
# Turn every row into a gensim TaggedDocument: words come from column 2 and
# the tag is "z" + the value in column 0.
# NOTE(review): columns 0 and 2 are presumably "id" and "text" - confirm.
docs_dataset = tokenized_dataset.copy()
docs_dataset["text"] = [
    # str() is required because the ids assigned to the added essays are
    # integers, and 'z' + int raises TypeError.
    doc2vec.TaggedDocument(row[2], ['z' + str(row[0])])
    for row in tokenized_dataset.values
]
docs_dataset["text"]

0        ([Ġcars, ., Ġcars, Ġaround, Ġsince, Ġbecame, Ġ...
1        ([Ġtransportation, Ġlarge, Ġnecessity, Ġcountr...
2        ([Ġ", america, Ġlove, Ġaffair, Ġvehicles, Ġsee...
3        ([Ġoften, Ġride, Ġcar, ?, Ġdrive, Ġone, Ġmotor...
4        ([Ġcars, Ġwonderful, Ġthing, ., Ġperhaps, Ġone...
                               ...                        
65349    ([Ġ, Ġdear, Ġsenator, ,, Ġ, Ġwriting, Ġregardi...
65350    ([Ġ, Ġremember, Ġday, Ġdistinctively, ., Ġsitt...
65351    ([Ġ, Ġdear, Ġsenator, ,, ĠĠ, Ġwriting, Ġletter...
65352    ([Ġ, Ġdear, Ġsenator, ,, Ġ, Ġwriting, Ġurge, Ġ...
65353    ([Ġ, Ġtypical, Ġsummer, Ġafternoon, Ġhometown,...
Name: text, Length: 65354, dtype: object

In [6]:
# Train Doc2Vec on the tagged documents: 100-dimensional vectors, 10 epochs.
# NOTE(review): no `seed` is passed, so the learned vectors may differ between
# runs - confirm whether reproducibility is required here.
doc_model = doc2vec.Doc2Vec(documents=docs_dataset["text"],vector_size=100,epochs=10)

In [7]:
# Persist the trained Doc2Vec model so later cells can reload it.
doc_model.save("../data/lexp/embedding_model/docModel.bin")

## Inferred Features

In [9]:
def features(dataset):
    """
    Compute simple surface-level counts for every document in ``dataset["text"]``.

    Each entry of ``dataset["text"]`` must expose its token list as ``.words``
    (for example a gensim ``TaggedDocument``).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a "text" column of documents.

    Returns
    -------
    pandas.DataFrame
        One row per document, indexed like ``dataset``, with columns:

        - ``token_num``: number of tokens in the document
        - ``sent_num``: number of tokens ending in "." (sentence-count proxy)
        - ``punct_sym``: number of tokens ending in ".", ",", "?" or "!"
        - ``apostrof_sym``: total number of "'" characters over all tokens
        - ``unk_num``: number of tokens equal to "[UNK]"
    """
    columns = ["token_num", "sent_num", "punct_sym", "apostrof_sym", "unk_num"]
    rows = []
    for doc in dataset["text"]:
        words = doc.words
        dot = 0
        punctuation = 0
        apostrofees = 0
        unk = 0
        for token in words:
            if token.endswith("."):
                dot += 1
                punctuation += 1
            elif token.endswith((",", "?", "!")):
                punctuation += 1
            # Counted independently of the punctuation test so that
            # apostrophes inside a token ending in punctuation are not lost.
            apostrofees += token.count("'")
            if token == "[UNK]":
                unk += 1
        # len(words), not len(doc): a TaggedDocument is a (words, tags) pair,
        # so its own length is always 2.
        rows.append((len(words), dot, punctuation, apostrofees, unk))
    return pd.DataFrame(rows, columns=columns, index=dataset.index)

## Normalize embeddings

In [4]:
# Reload the tokenized essays from the location where this notebook writes
# them (../data/lexp/tokenized_data/tokenized.csv).
tokenized_dataset = pd.read_csv("../data/lexp/tokenized_data/tokenized.csv")
# The "text" column comes back as strings; parse each back into a token list.
# NOTE(review): eval() executes arbitrary code found in the CSV. Only run this
# on files produced by this notebook; ast.literal_eval would be a safer parser.
tokenized_dataset["text"] = tokenized_dataset["text"].apply(lambda x : eval(x))
# Rebuild the TaggedDocument column: words from column 2, tag "z" + column 0.
# NOTE(review): columns 0 and 2 are presumably "id" and "text" - confirm.
docs_dataset = tokenized_dataset.copy()
docs_dataset["text"] = [
    # str() keeps this working when pandas parses the ids as numbers.
    doc2vec.TaggedDocument(row[2], ['z' + str(row[0])])
    for row in tokenized_dataset.values
]
docs_dataset["text"]

0        ([Ġcars, ., Ġcars, Ġaround, Ġsince, Ġbecame, Ġ...
1        ([Ġtransportation, Ġlarge, Ġnecessity, Ġcountr...
2        ([Ġ", america, Ġlove, Ġaffair, Ġvehicles, Ġsee...
3        ([Ġoften, Ġride, Ġcar, ?, Ġdrive, Ġone, Ġmotor...
4        ([Ġcars, Ġwonderful, Ġthing, ., Ġperhaps, Ġone...
                               ...                        
65349    ([Ġ, Ġdear, Ġsenator, ,, Ġ, Ġwriting, Ġregardi...
65350    ([Ġ, Ġremember, Ġday, Ġdistinctively, ., Ġsitt...
65351    ([Ġ, Ġdear, Ġsenator, ,, ĠĠ, Ġwriting, Ġletter...
65352    ([Ġ, Ġdear, Ġsenator, ,, Ġ, Ġwriting, Ġurge, Ġ...
65353    ([Ġ, Ġtypical, Ġsummer, Ġafternoon, Ġhometown,...
Name: text, Length: 65354, dtype: object

In [10]:
# Load the Doc2Vec model from the location where this notebook saves it.
doc_model = doc2vec.Doc2Vec.load("../data/lexp/embedding_model/docModel.bin")

# Look up the learned vector of every document by its tag list and stack them
# into an (n_documents, vector_size) array; the width follows the model
# instead of being hard-coded.
arr = [doc_model.dv[doc.tags] for doc in docs_dataset["text"]]
vectors = np.vstack(arr)

# L2 norm of every embedding, computed on the embedding dimensions only.
embedding_norms = np.linalg.norm(vectors, axis=1)
embeddings_dataset = pd.DataFrame(vectors, index=docs_dataset.index)
embeddings_dataset["norm"] = embedding_norms

# Scale every embedding to unit length. The division uses the raw vectors, not
# the frame that already carries the "norm" column; including that column
# would shrink every vector by a factor of sqrt(2). Zero vectors stay zero.
safe_norms = np.where(embedding_norms > 0, embedding_norms, 1.0)
norm_embeddings_dataset = pd.DataFrame(
    vectors / safe_norms[:, None], index=docs_dataset.index
)

# Keep the magnitude information as one min-max scaled column in [0, 1].
norm_min = embedding_norms.min()
norm_max = embedding_norms.max()
if norm_max > norm_min:
    norm_embeddings_dataset["normalized_norm"] = (
        (embedding_norms - norm_min) / (norm_max - norm_min)
    )
else:
    # All norms are identical: avoid 0/0 and use a constant column.
    norm_embeddings_dataset["normalized_norm"] = 0.0

# Put the document columns and the normalized embeddings side by side.
doc2vec_data = pd.concat([docs_dataset, norm_embeddings_dataset], axis=1)

# Compute the additional surface features with features().
doc_features = features(doc2vec_data)

# Scale every non-zero feature column by its L2 norm over all documents.
for col in doc_features.columns:
    if doc_features[col].max() > 0:
        doc_features[col] = doc_features[col] / np.linalg.norm(doc_features[col])

# Append the extra features and drop the TaggedDocument column "text".
doc2vec_data = pd.concat([doc2vec_data, doc_features], axis=1).drop("text", axis=1)



In [None]:
# Split the pre-processed frame into train/test and save both as CSV.
# pandas DataFrames have no .split() method (the original call raised
# AttributeError), so take a seeded random 80% of the rows for training and
# use the remaining rows for testing.
# NOTE(review): the 80/20 ratio and the seed are assumptions - adjust as needed;
# the split is not stratified by class.
train = doc2vec_data.sample(frac=0.8, random_state=42)
test = doc2vec_data.drop(index=train.index)
train.to_csv("../data/lexp/pre_processed/train.csv", index=False)
test.to_csv("../data/lexp/pre_processed/test.csv", index=False)