# LLM - Detect AI Generated Text
# PREDICTION

## import

In [196]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from gensim.models import doc2vec
from transformers import PreTrainedTokenizerFast

import keras

### Inferred Features

In [197]:
def features(dataset):
    """Derive simple per-document count features from tokenized text.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a "text" column; each entry is a sized iterable of
        string tokens.

    Returns
    -------
    pandas.DataFrame
        One row per document (indexed like ``dataset["text"]``) with the
        columns "token_num", "sent_num", "punct_sym", "apostrof_sym" and
        "unk_num".
    """
    column_names = ["token_num", "sent_num", "punct_sym", "apostrof_sym", "unk_num"]
    texts = dataset["text"]

    per_doc_counts = []
    for tokens in texts:
        n_sent = n_punct = n_apos = n_unk = 0
        for tok in tokens:
            # Branches are mutually exclusive: a token contributes to at most
            # one category, checked in this order.
            if tok.endswith("."):
                # A trailing period counts as a sentence end and as punctuation.
                n_sent += 1
                n_punct += 1
            elif tok.endswith((",", "?", "!")):
                n_punct += 1
            elif "'" in tok:
                n_apos += tok.count("'")
            elif tok == "[UNK]":
                n_unk += 1
        per_doc_counts.append((n_sent, n_punct, n_apos, n_unk))

    df = pd.DataFrame(columns=column_names)
    # Assigning the Series first makes the result adopt the input's index.
    df["token_num"] = texts.apply(len)
    for position, name in enumerate(column_names[1:]):
        df[name] = [counts[position] for counts in per_doc_counts]
    return df

In [198]:
# Load the test essays and the saved artifacts used for inference.
test_dataset = pd.read_csv("../data/test_essays.csv")
tokenizer = PreTrainedTokenizerFast.from_pretrained("../data/byte_pair_tokenizer/")
# NOTE(review): doc_model is loaded again in the embedding cell from
# "../data/lexp/embedding_model/docModel.bin" — confirm which path is intended.
doc_model = doc2vec.Doc2Vec.load("../data/embedding_model/docModel.bin")
model = keras.models.load_model("../data/weights.h5")
# Destination path of the submission CSV written by the last cell.
subPath = "../data/CNN/submission.csv"

## Tokenizing

In [199]:
# Tokenize every essay with the loaded tokenizer.
# Work on a copy so test_dataset keeps the raw text; otherwise both names point
# at the same frame and re-running this cell would try to tokenize lists.
tokenized_test = test_dataset.copy()
tokenized_test["text"] = test_dataset["text"].apply(lambda x: tokenizer.tokenize(text=x))

## Embedding

In [200]:
# Create document embeddings and normalize them.
# NOTE(review): doc_model was already loaded earlier from
# "../data/embedding_model/docModel.bin"; this reload uses a different path —
# confirm which file is the intended one.
doc_model = doc2vec.Doc2Vec.load("../data/lexp/embedding_model/docModel.bin")
arr = [doc_model.infer_vector(doc) for doc in tokenized_test["text"]]
# Take the embedding width from the model instead of hard-coding 100.
embeddings_dataset = pd.DataFrame(np.reshape(arr, (len(arr), doc_model.vector_size)))
norma = np.linalg.norm(embeddings_dataset, axis=1)
# Scale every embedding row to unit length.
norm_embeddings_dataset = pd.DataFrame(np.apply_along_axis(lambda x: x / np.linalg.norm(x), axis=1, arr=embeddings_dataset))
# Min-max scale the original row norms into an extra feature. When all norms
# are equal (e.g. a single test row) the range is zero; fall back to 0.0
# instead of dividing by zero and feeding NaN to the model.
norm_range = norma.max() - norma.min()
norm_embeddings_dataset["normalized_norm"] = ((norma - norma.min()) / norm_range) if norm_range > 0 else 0.0
# Compute the additional count features with the 'features' function.
feature_data = features(tokenized_test)
# Normalize each additional feature column to unit length; a column with no
# positive value becomes all zeros. Returning an array (not a scalar) keeps the
# output shape consistent whichever column is processed first.
feature_data_arr = pd.DataFrame(feature_data.to_numpy())
norm_feature_data = pd.DataFrame(
    np.apply_along_axis(
        lambda x: x / np.linalg.norm(x) if x.max() > 0 else np.zeros(len(x)),
        axis=0,
        arr=feature_data_arr,
    ),
    columns=feature_data.columns,
)
# Assemble the model input: identifiers, count features, then embeddings.
test = pd.concat([tokenized_test[["id","prompt_id"]].reset_index(drop=True),norm_feature_data, norm_embeddings_dataset], axis=1)
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.579811,-0.023181,0.105455,0.964687,-1.137239,-0.284674,0.798672,1.683975,-1.45244,-1.469087,...,0.021805,-0.369931,0.773667,-0.063291,-0.743866,0.615434,0.412717,-0.246424,2.069485,-1.535682
1,-0.846651,-0.690581,-0.38333,0.381186,-0.338441,-0.33148,0.590804,0.894206,-1.246327,-0.346884,...,-0.351556,0.513875,0.779376,0.465845,-0.495945,0.578675,0.211442,-0.000358,2.048722,-1.40814
2,-1.336258,-0.404491,-0.713487,1.189348,-0.542389,-0.52011,0.331135,0.673923,-0.80354,-0.52482,...,-0.113739,0.120349,0.947649,0.414408,-0.583618,0.262747,-0.12789,-0.430329,1.464094,-0.798821


## Model prediction

### Dense Neural Network

In [None]:
# Everything except the essay identifier is fed to the model.
X_test = test.drop(columns=["id"])
pred = model.predict(X_test)
pred

In [205]:
# Pair each essay id with its prediction rounded to four decimals.
submition = pd.DataFrame({"id": test["id"].to_numpy()})
submition["generated"] = np.round(pred, 4)
submition

Unnamed: 0,id,generated
0,0000aaaa,1.0
1,1111bbbb,1.0
2,2222cccc,1.0


In [206]:
# Write the submission table to subPath, omitting the DataFrame index.
submition.to_csv(subPath,index=False)