# LLM - Detect AI Generated Text
## Import

In [None]:
import math
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models import doc2vec

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.metrics import roc_curve ,precision_recall_curve,auc,confusion_matrix,ConfusionMatrixDisplay

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import PreTrainedTokenizerFast

import keras

### Inferred Features

In [None]:
def features(dataset):
    """Derive per-document surface statistics from tokenized essays.

    Args:
        dataset: Table with a "text" column whose entries are sequences of
            string tokens.

    Returns:
        A DataFrame with one row per document and these columns:

        * ``token_num``    - number of tokens in the document.
        * ``sent_num``     - tokens ending in ".".
        * ``punct_sym``    - tokens ending in ".", ",", "?" or "!".
        * ``apostrof_sym`` - apostrophe characters, counted only in tokens
          that do not end in one of the punctuation marks above.
        * ``unk_num``      - tokens equal to "[UNK]".
    """
    documents = dataset["text"]
    lengths = documents.apply(len)

    # One list of per-document tallies for every counted column.
    tallies = {
        "sent_num": [],
        "punct_sym": [],
        "apostrof_sym": [],
        "unk_num": [],
    }
    for tokens in documents:
        full_stops = marks = quotes = unknowns = 0
        for tok in tokens:
            # The branches are mutually exclusive and checked in this order,
            # so e.g. "don't." only counts as a sentence end / punctuation.
            if tok.endswith("."):
                full_stops += 1
                marks += 1
            elif tok.endswith((",", "?", "!")):
                marks += 1
            elif "'" in tok:
                quotes += tok.count("'")
            elif tok == "[UNK]":
                unknowns += 1
        tallies["sent_num"].append(full_stops)
        tallies["punct_sym"].append(marks)
        tallies["apostrof_sym"].append(quotes)
        tallies["unk_num"].append(unknowns)

    stats = pd.DataFrame(
        columns=["token_num", "sent_num", "punct_sym", "apostrof_sym", "unk_num"]
    )
    # Assigning the Series first gives the frame the index of the input.
    stats["token_num"] = lengths
    for column, values in tallies.items():
        stats[column] = values
    return stats

In [None]:
# Load the inference inputs: the competition test essays plus the three
# previously saved artifacts (tokenizer, Doc2Vec model, Keras network).
test_dataset = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
# Tokenizer used to split each essay's "text" into tokens.
tokenizer = PreTrainedTokenizerFast.from_pretrained("/kaggle/input/models-wights/weight/preTrainedTokenizer/")
# Doc2Vec model used to infer one embedding vector per tokenized essay.
doc_model = doc2vec.Doc2Vec.load("/kaggle/input/models-wights/weight/EmbeddingModel.bin")
# Keras model that produces the final predictions.
model = keras.models.load_model("/kaggle/input/models-wights/DenseNetwork.h5")

## Tokenizing

In [None]:
# Tokenize every essay with the pre-trained tokenizer.
# Work on a copy: with a plain alias the "text" column of `test_dataset` was
# overwritten as well, so re-running this cell passed token lists (instead of
# raw strings) to the tokenizer.
tokenized_test = test_dataset.copy()
tokenized_test["text"] = test_dataset["text"].apply(
    lambda essay: tokenizer.tokenize(text=essay)
)

## Embedding

In [None]:
# Infer one Doc2Vec vector per tokenized essay (10 inference epochs each) and
# stack the vectors into a frame with one row per essay.
embeddings_matrix = [
    doc_model.infer_vector(essay_tokens, epochs=10)
    for essay_tokens in tokenized_test["text"]
]
embeddings_test_dataset = pd.DataFrame(data=embeddings_matrix)
embeddings_test_dataset

#### Embedding normalization

In [None]:
# Columns that hold the raw embedding. "norm" is excluded so that re-running
# this cell does not fold a previously stored norm into the new one.
embedding_columns = [col for col in embeddings_test_dataset.columns if col != "norm"]
embeddings_test_dataset["norm"] = np.linalg.norm(
    embeddings_test_dataset[embedding_columns], axis=1
)

# Scale every row to unit length.
# NOTE(review): the rows scaled here still contain the "norm" column, so each
# row is divided by the length of [embedding, norm] (sqrt(2) times the
# embedding's own length) and the embedding part ends up with length
# 1/sqrt(2), not 1. Kept as-is because the saved network presumably saw the
# same scaling during training - confirm before changing.
norm_embeddings_test_dataset = pd.DataFrame(
    np.apply_along_axis(
        lambda row: row / np.linalg.norm(row),
        axis=1,
        arr=embeddings_test_dataset,
    )
)

# Min-max scale the norms over the test set. When every norm is identical
# (e.g. a single-row test set) the range is 0 and the division would be 0/0,
# feeding NaN to the model; use 0.0 in that case.
norms = embeddings_test_dataset["norm"]
norm_range = norms.max() - norms.min()
if norm_range == 0:
    norm_embeddings_test_dataset["normalized_norm"] = 0.0
else:
    norm_embeddings_test_dataset["normalized_norm"] = (norms - norms.min()) / norm_range

# Drop the scaled copy of the "norm" column; it sits right after the embedding
# columns, so its positional label equals the embedding size (100 for the
# original model).
norm_embeddings_test_dataset = norm_embeddings_test_dataset.drop(
    [len(embedding_columns)], axis=1
)
pre_processed_test_data = pd.concat([tokenized_test, norm_embeddings_test_dataset], axis=1)
pre_processed_test_data

## Feature inference

In [None]:
# Per-document surface statistics, each column divided by its L2 norm over the
# test set; columns whose maximum is not positive are left untouched.
test_features = features(pre_processed_test_data)
for col in test_features.columns:
    if not test_features[col].max() > 0:
        continue
    test_features[col] = test_features[col] / np.linalg.norm(test_features[col])

# Append the scaled statistics and discard the token lists.
pre_processed_test_data = pd.concat(
    [pre_processed_test_data, test_features],
    axis=1,
).drop(columns="text")

In [None]:
pre_processed_test_data

## Model prediction

### Dense Neural Network

In [None]:
# Feed the network every remaining column except the essay identifier.
X_test = pre_processed_test_data.drop(columns=["id"])
pred = model.predict(X_test)

In [None]:
# Pair each essay id with its prediction rounded to four decimals.
submition = pd.DataFrame({"id": pre_processed_test_data["id"].to_numpy()})
submition["generated"] = pred.round(decimals=4)
submition

In [None]:
# Write the submission table to the Kaggle working directory without the
# DataFrame index column.
submition.to_csv("/kaggle/working/submission.csv",index=False)