# LLM - Detect AI Generated Text
# PREDICTION

## import

In [196]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from gensim.models import doc2vec
from transformers import PreTrainedTokenizerFast

import keras

### Inferred Features

In [197]:
def _tally_symbols(doc):
    """Count the symbol categories of a single document.

    Each token falls into at most one category, checked in this order:
    trailing period, trailing ``,``/``?``/``!``, contains an apostrophe,
    equals ``"[UNK]"``.

    Returns a tuple ``(sentence_ends, punctuation, apostrophes, unknowns)``.
    """
    sentence_ends = punctuation = apostrophes = unknowns = 0
    for token in doc:
        if token.endswith("."):
            # A trailing period is both a sentence end and a punctuation mark.
            sentence_ends += 1
            punctuation += 1
            continue
        if token.endswith((",", "?", "!")):
            punctuation += 1
            continue
        hits = token.count("'")
        if hits > 0:
            apostrophes += hits
            continue
        if token == "[UNK]":
            unknowns += 1
    return sentence_ends, punctuation, apostrophes, unknowns


def features(dataset):
    """Build per-document count features from ``dataset["text"]``.

    Every entry of the ``"text"`` column is iterated item by item (a list of
    string tokens, or a plain string iterated character by character).

    Returns a DataFrame with the columns ``token_num`` (length of the entry),
    ``sent_num`` (tokens ending in a period), ``punct_sym`` (tokens ending in
    ``.``, ``,``, ``?`` or ``!``), ``apostrof_sym`` (apostrophes in tokens not
    already counted as punctuation) and ``unk_num`` (``"[UNK]"`` tokens).
    """
    lengths = dataset["text"].apply(len)

    # Column name -> per-document values, in the same order as the tuple
    # returned by _tally_symbols.
    tallies = {"sent_num": [], "punct_sym": [], "apostrof_sym": [], "unk_num": []}
    for doc in dataset["text"]:
        for column, value in zip(tallies, _tally_symbols(doc)):
            tallies[column].append(value)

    frame = pd.DataFrame(
        columns=["token_num", "sent_num", "punct_sym", "apostrof_sym", "unk_num"]
    )
    # Assigning the Series first gives the frame the dataset's index; the
    # plain lists are then filled in positionally.
    frame["token_num"] = lengths
    for column, values in tallies.items():
        frame[column] = values
    return frame

In [198]:
# Test essays; the "text" column is tokenized and embedded by the cells below.
test_dataset = pd.read_csv("../data/test_essays.csv")
# Tokenizer loaded from a local directory.
tokenizer = PreTrainedTokenizerFast.from_pretrained("../data/byte_pair_tokenizer/")
# Saved gensim Doc2Vec model, used below to infer one vector per essay.
doc_model = doc2vec.Doc2Vec.load("../data/embedding_model/docModel.bin")
# Saved Keras model used for the final prediction.
model = keras.models.load_model("../data/CNN/weights.h5")
# Destination of the submission CSV written by the last cell.
subPath = "../data/CNN/submission.csv"

## Tokenizing

In [199]:
# Tokenize every essay. Work on a copy so that `test_dataset` keeps its raw
# text: a plain assignment would only alias the frame, overwrite the original
# column, and make this cell fail when re-run (the tokenizer would then
# receive token lists instead of strings).
tokenized_test = test_dataset.copy()
tokenized_test["text"] = test_dataset["text"].apply(lambda x: tokenizer.tokenize(text=x))

## Embedding

In [200]:
# Infer one Doc2Vec vector per tokenized essay and collect them into a frame
# with one row per essay.
embeddings_matrix = [
    doc_model.infer_vector(essay_tokens, epochs=400)
    for essay_tokens in tokenized_test["text"]
]
embeddings_test_dataset = pd.DataFrame(data=embeddings_matrix)
embeddings_test_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.579811,-0.023181,0.105455,0.964687,-1.137239,-0.284674,0.798672,1.683975,-1.45244,-1.469087,...,0.021805,-0.369931,0.773667,-0.063291,-0.743866,0.615434,0.412717,-0.246424,2.069485,-1.535682
1,-0.846651,-0.690581,-0.38333,0.381186,-0.338441,-0.33148,0.590804,0.894206,-1.246327,-0.346884,...,-0.351556,0.513875,0.779376,0.465845,-0.495945,0.578675,0.211442,-0.000358,2.048722,-1.40814
2,-1.336258,-0.404491,-0.713487,1.189348,-0.542389,-0.52011,0.331135,0.673923,-0.80354,-0.52482,...,-0.113739,0.120349,0.947649,0.414408,-0.583618,0.262747,-0.12789,-0.430329,1.464094,-0.798821


#### Embedding normalization

In [201]:
# Append the L2 norm of each row as an extra "norm" column.
embeddings_test_dataset["norm"]=np.linalg.norm(embeddings_test_dataset, axis=1)
# Scale every row to unit length.
# NOTE(review): the frame already contains the "norm" column here, so the
# divisor is the norm of the embedding values *plus* that column, not of the
# embedding alone. Confirm this matches the preprocessing used at training
# time before changing it.
norm_embeddings_test_dataset = pd.DataFrame(np.apply_along_axis(lambda x: x / np.linalg.norm(x), axis=1, arr=embeddings_test_dataset))
# Min-max scale the raw norms across the rows of this frame.
# NOTE(review): the result depends on which rows are present, and it is NaN
# when all norms are equal (e.g. a single row), because max == min.
norm_embeddings_test_dataset["normalized_norm"] = (embeddings_test_dataset['norm'] - embeddings_test_dataset['norm'].min()) / (embeddings_test_dataset['norm'].max() - embeddings_test_dataset['norm'].min())
# Drop positional column 100, i.e. the row-scaled copy of "norm" (assuming
# 100-dimensional embeddings, as the displayed columns 0-99 suggest), so only
# "normalized_norm" carries the magnitude information.
norm_embeddings_test_dataset = norm_embeddings_test_dataset.drop([100],axis=1)
# Place the tokenized frame and the normalized embeddings side by side.
pre_processed_test_data = pd.concat([tokenized_test,norm_embeddings_test_dataset],axis=1)
pre_processed_test_data

Unnamed: 0,id,prompt_id,text,0,1,2,3,4,5,6,...,91,92,93,94,95,96,97,98,99,normalized_norm
0,0000aaaa,2,"[Ġaaa, Ġb, b, b, Ġc, cc, .]",-0.040339,-0.001613,0.007337,0.067117,-0.079122,-0.019806,0.055566,...,-0.025737,0.053827,-0.004403,-0.051753,0.042818,0.028714,-0.017145,0.143981,-0.106843,1.0
1,1111bbbb,3,"[Ġb, b, b, Ġc, cc, Ġd, dd, .]",-0.087266,-0.071179,-0.03951,0.03929,-0.034884,-0.034166,0.060895,...,0.052966,0.080332,0.048015,-0.051118,0.059645,0.021794,-3.7e-05,0.211165,-0.145139,0.0
2,2222cccc,4,"[Ġc, cc, Ġd, dd, Ġe, ee, .]",-0.136463,-0.041308,-0.072864,0.12146,-0.05539,-0.053115,0.033817,...,0.01229,0.096777,0.042321,-0.059601,0.026833,-0.013061,-0.043947,0.149518,-0.081578,0.019292


## Feature inference

In [202]:
# Per-essay count features (see `features`). Each column with a positive
# maximum is scaled to unit L2 norm, then the counts replace the token lists.
test_features = features(pre_processed_test_data)
for col in test_features.columns:
    if not test_features[col].max() > 0:
        # Column left untouched; this also avoids dividing by a zero norm.
        continue
    test_features[col] = test_features[col] / np.linalg.norm(test_features[col])
pre_processed_test_data = pd.concat(
    [pre_processed_test_data, test_features], axis=1
).drop(columns="text")

In [203]:
# Display the assembled table (ids, embeddings and scaled count features).
pre_processed_test_data

Unnamed: 0,id,prompt_id,0,1,2,3,4,5,6,7,...,96,97,98,99,normalized_norm,token_num,sent_num,punct_sym,apostrof_sym,unk_num
0,0000aaaa,2,-0.040339,-0.001613,0.007337,0.067117,-0.079122,-0.019806,0.055566,0.11716,...,0.028714,-0.017145,0.143981,-0.106843,1.0,0.549972,0.57735,0.57735,0,0
1,1111bbbb,3,-0.087266,-0.071179,-0.03951,0.03929,-0.034884,-0.034166,0.060895,0.092167,...,0.021794,-3.7e-05,0.211165,-0.145139,0.0,0.628539,0.57735,0.57735,0,0
2,2222cccc,4,-0.136463,-0.041308,-0.072864,0.12146,-0.05539,-0.053115,0.033817,0.068823,...,-0.013061,-0.043947,0.149518,-0.081578,0.019292,0.549972,0.57735,0.57735,0,0


## Model prediction

### Dense Neural Network

In [204]:
# Every column except the essay identifier is passed to the model.
# NOTE(review): "prompt_id" stays among the inputs; confirm the model was
# trained with it.
X_test = pre_processed_test_data.drop(columns=["id"])
pred = model.predict(X_test)
pred



array([[1.],
       [1.],
       [1.]], dtype=float32)

In [205]:
# Two-column submission frame: essay id and the model output rounded to
# four decimals.
submition = pd.DataFrame({"id": pre_processed_test_data["id"].to_numpy()})
submition["generated"] = np.round(pred, 4)
submition

Unnamed: 0,id,generated
0,0000aaaa,1.0
1,1111bbbb,1.0
2,2222cccc,1.0


In [206]:
# Write the submission to `subPath` without the DataFrame index column.
submition.to_csv(subPath,index=False)