# LLM - Detect AI Generated Text
## Import

In [1]:
import math
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models import doc2vec

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.metrics import roc_curve ,precision_recall_curve,auc,confusion_matrix,ConfusionMatrixDisplay

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import PreTrainedTokenizerFast

import keras

[nltk_data] Downloading package punkt to /home/mrtc101/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrtc101/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
2024-01-07 19:16:57.367252: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-07 19:16:57.367298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-07 19:16:57.368057: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-07 19:16:57.374383: I tensorflow/core/platfor

### Inferred Features

In [19]:
def features(dataset):
    """Derive per-document surface counts from the tokenised ``text`` column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must expose a ``"text"`` column whose entries are iterables of string
        tokens.

    Returns
    -------
    pandas.DataFrame
        One row per document, with the columns ``token_num`` (length of the
        entry), ``sent_num`` (tokens ending in "."), ``punct_sym`` (tokens
        ending in ".", ",", "?" or "!"), ``apostrof_sym`` (apostrophe
        characters) and ``unk_num`` (tokens equal to "[UNK]").
    """

    def _tally(tokens):
        # The branches are mutually exclusive and checked in this order, so a
        # token that ends with punctuation never contributes to the
        # apostrophe or "[UNK]" counters.
        sentences = marks = quotes = unknown = 0
        for tok in tokens:
            if tok.endswith("."):
                sentences += 1
                marks += 1
            elif tok.endswith((",", "?", "!")):
                marks += 1
            elif "'" in tok:
                quotes += tok.count("'")
            elif tok == "[UNK]":
                unknown += 1
        return sentences, marks, quotes, unknown

    documents = dataset["text"]
    tallies = [_tally(doc) for doc in documents]

    # The Series supplies the row index; the plain lists are matched by position.
    return pd.DataFrame(
        {
            "token_num": documents.apply(len),
            "sent_num": [t[0] for t in tallies],
            "punct_sym": [t[1] for t in tallies],
            "apostrof_sym": [t[2] for t in tallies],
            "unk_num": [t[3] for t in tallies],
        }
    )

In [3]:
# Saved artefacts used for inference: tokenizer, Doc2Vec model and Keras network.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "../data/weight/preTrainedTokenizer/"
)
doc_model = doc2vec.Doc2Vec.load(
    "../data/weight/EmbeddingModel.bin"
)
model = keras.models.load_model(
    "../data/weight/DenseNetwork.keras"
)

# Essays to score.
test_dataset = pd.read_csv("../data/test_essays.csv")

## Tokenizing

In [4]:
# Tokenise every essay with the tokenizer loaded earlier.
# `assign` returns a new frame, so `test_dataset` keeps its raw text. The
# previous `tokenized_test = test_dataset` was only an alias: the cell
# overwrote the raw essays in place and could not be re-run.
tokenized_test = test_dataset.assign(
    text=test_dataset["text"].apply(lambda x: tokenizer.tokenize(text=x))
)

## Embedding

In [13]:
# Infer one document vector per tokenised essay with the Doc2Vec model,
# then stack the vectors into a frame (one row per essay).
embeddings_matrix = list(map(doc_model.infer_vector, tokenized_test["text"]))
embeddings_test_dataset = pd.DataFrame(data=embeddings_matrix)
embeddings_test_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.088414,-0.051629,-0.32361,-0.037229,0.163105,-0.037558,-0.440097,-0.042547,0.196336,-0.107967,...,-0.301314,-0.129505,0.069851,-0.590607,-0.11276,0.328893,0.319068,0.055032,0.104145,-0.171187
1,-0.065463,0.063926,-0.318723,-0.335251,-0.071396,0.183771,-0.489209,-0.080673,0.322589,-0.158499,...,-0.300947,0.120153,0.059784,-0.504389,-0.058125,0.114964,0.002815,-0.139881,0.11185,-0.510255
2,-0.084854,-0.054968,-0.155465,-0.191102,-0.051919,0.088906,-0.284388,-0.023338,0.099484,-0.096832,...,-0.332409,0.11037,0.053438,-0.434887,-0.081258,0.117186,-0.052212,-0.131037,0.042335,-0.368967


#### Embedding normalization

In [20]:
# Scale each embedding row and append its min-max-scaled L2 norm as a feature.
#
# The norm is computed from the embedding columns only. Taking it over the
# whole frame (as before) folded a previously stored "norm" column into the
# new value whenever this cell was re-run, so the result depended on how many
# times the cell had been executed.
embedding_columns = embeddings_test_dataset.drop(columns=["norm"], errors="ignore")
embeddings_test_dataset["norm"] = np.linalg.norm(embedding_columns, axis=1)

# NOTE(review): each row is divided by the norm of all 101 columns (the 100
# embedding values plus "norm"), i.e. by sqrt(2) * ||embedding||, not by
# ||embedding||. This is kept unchanged so a fresh-kernel run yields the same
# numbers as before - confirm it matches the preprocessing used for training.
norm_embeddings_test_dataset = pd.DataFrame(
    np.apply_along_axis(lambda x: x / np.linalg.norm(x), axis=1, arr=embeddings_test_dataset)
)

# Min-max scale the raw norms across the essays in this dataset.
norm_embeddings_test_dataset["normalized_norm"] = (
    embeddings_test_dataset["norm"] - embeddings_test_dataset["norm"].min()
) / (embeddings_test_dataset["norm"].max() - embeddings_test_dataset["norm"].min())

# Positional column 100 is the scaled "norm" column; drop it.
norm_embeddings_test_dataset = norm_embeddings_test_dataset.drop([100], axis=1)
pre_processed_test_data = pd.concat([tokenized_test, norm_embeddings_test_dataset], axis=1)
pre_processed_test_data

Unnamed: 0,id,prompt_id,text,0,1,2,3,4,5,6,...,91,92,93,94,95,96,97,98,99,normalized_norm
0,0000aaaa,2,"[Ġaaa, Ġb, b, b, Ġc, cc, .]",-0.015424,-0.009007,-0.056456,-0.006495,0.028455,-0.006552,-0.076778,...,-0.022593,0.012186,-0.103035,-0.019672,0.057378,0.055664,0.009601,0.018169,-0.029865,0.286704
1,1111bbbb,3,"[Ġb, b, b, Ġc, cc, Ġd, dd, .]",-0.009799,0.009569,-0.047708,-0.050182,-0.010687,0.027508,-0.073228,...,0.017985,0.008949,-0.0755,-0.0087,0.017208,0.000421,-0.020938,0.016742,-0.076378,1.0
2,2222cccc,4,"[Ġc, cc, Ġd, dd, Ġe, ee, .]",-0.015858,-0.010273,-0.029054,-0.035715,-0.009703,0.016615,-0.053149,...,0.020627,0.009987,-0.081275,-0.015186,0.021901,-0.009758,-0.024489,0.007912,-0.068955,0.0


## Feature inferring

In [21]:
# Surface counts for every essay, computed from the tokenised text.
test_features = features(pre_processed_test_data)

# Divide each count column by its own L2 norm; columns whose maximum is not
# positive are left untouched.
for col in test_features.columns:
    if test_features[col].max() > 0:
        test_features[col] = test_features[col].div(np.linalg.norm(test_features[col]))

# Append the scaled counts and discard the token lists.
pre_processed_test_data = pd.concat(
    [pre_processed_test_data, test_features], axis=1
).drop(columns="text")

In [22]:
# Display the assembled frame: ids, scaled embeddings and the scaled count features.
pre_processed_test_data

Unnamed: 0,id,prompt_id,0,1,2,3,4,5,6,7,...,96,97,98,99,normalized_norm,token_num,sent_num,punct_sym,apostrof_sym,unk_num
0,0000aaaa,2,-0.015424,-0.009007,-0.056456,-0.006495,0.028455,-0.006552,-0.076778,-0.007423,...,0.055664,0.009601,0.018169,-0.029865,0.286704,0.549972,0.57735,0.57735,0,0
1,1111bbbb,3,-0.009799,0.009569,-0.047708,-0.050182,-0.010687,0.027508,-0.073228,-0.012076,...,0.000421,-0.020938,0.016742,-0.076378,1.0,0.628539,0.57735,0.57735,0,0
2,2222cccc,4,-0.015858,-0.010273,-0.029054,-0.035715,-0.009703,0.016615,-0.053149,-0.004361,...,-0.009758,-0.024489,0.007912,-0.068955,0.0,0.549972,0.57735,0.57735,0,0


## Model prediction

### Dense Neural Network

In [24]:
# Every column except the essay id is fed to the network loaded earlier.
X_test = pre_processed_test_data.drop(columns=["id"])
pred = model.predict(x=X_test)



In [29]:
# Pair each essay id with its prediction rounded to four decimals.
submition = pd.DataFrame({"id": pre_processed_test_data["id"].to_numpy()})
# Column assignment (not the dict constructor) is used for the predictions,
# exactly as before.
submition["generated"] = pred.round(4)
submition

Unnamed: 0,id,generated
0,0000aaaa,1.0
1,1111bbbb,1.0
2,2222cccc,1.0


In [31]:
# Write the submission file without the row index.
submition.to_csv(
    "../data/submission.csv",
    index=False,
)