# LLM - Detect AI Generated Text 
# DATA PRE-PROCESS

## import

In [1]:
import math
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from gensim.models import doc2vec

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.metrics import roc_curve ,precision_recall_curve,auc,confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import PreTrainedTokenizerFast

import keras


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrtc101/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
2024-01-26 20:53:00.572833: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-26 20:53:00.572868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-26 20:53:00.606626: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-26 20:53:00.674806: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critic

In [2]:
initial_dataset = pd.read_csv("../data/train_essays.csv")
custom_data = pd.read_csv("../data/custom_essays.csv")
more_data = pd.read_csv("../data/train_v4_drcat_01.csv")

## Adding new Data

In [3]:
prompts = more_data.value_counts("prompt_name")
classes = [i for i in prompts.keys()]
map = {}
for i,cName in enumerate(classes):
    if (i > 1):
        map[cName] = i+6
    else:
        map[cName] = i
map

{'Car-free cities': 0,
 'Does the electoral college work?': 1,
 'Facial action coding system': 8,
 'Distance learning': 9,
 'Seeking multiple opinions': 10,
 'Driverless cars': 11,
 'Exploring Venus': 12,
 '"A Cowboy Who Rode the Waves"': 13,
 'The Face on Mars': 14,
 'Mandatory extracurricular activities': 15,
 'Summer projects': 16,
 'Cell phones at school': 17,
 'Grades for extracurricular activities': 18,
 'Community service': 19,
 'Phones and driving': 20}

In [4]:
more_data["prompt_id"] = more_data["prompt_name"].apply(lambda name : map[name])
more_data = more_data[["prompt_id","text","label"]].rename(columns={"label":"generated"})

In [5]:
new_data = pd.concat([custom_data,more_data],axis=0,ignore_index=True)
new_data["id"] = range(0,new_data.shape[0])

In [6]:
new_data

Unnamed: 0,id,prompt_id,text,generated
0,0,0,The Advantages of Limiting Car Usage in Suburb...,1
1,1,0,Paris' Driving Ban: A Temporary Solution to En...,1
2,2,0,Bogota's Car-Free Day: A Model for Sustainable...,1
3,3,0,Shifting Trends: The Decline of Car Culture in...,1
4,4,0,The End of Car Culture and the Rise of Sustain...,1
...,...,...,...,...
73590,73590,18,I am writing you today to disagree with your t...,1
73591,73591,18,"Dear Principal,\n\nIn conclusion, I would obse...",1
73592,73592,18,"Dear Mrs. Principal,\n\nin these kinds of cons...",1
73593,73593,18,I enjoyed Form five and excitedly ex claims ed...,1


In [7]:
target_data = pd.concat([initial_dataset,new_data],ignore_index=True,axis=0)
target_data

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
74968,73590,18,I am writing you today to disagree with your t...,1
74969,73591,18,"Dear Principal,\n\nIn conclusion, I would obse...",1
74970,73592,18,"Dear Mrs. Principal,\n\nin these kinds of cons...",1
74971,73593,18,I enjoyed Form five and excitedly ex claims ed...,1


### Data cleaning

In [8]:
print(target_data.shape)
target_data.drop_duplicates(subset=["text"],inplace=True,keep="first")
target_data = target_data.dropna()
target_data.reset_index(inplace=True,drop=True)
print(target_data.shape)

(74973, 4)
(73595, 4)


## Data Feature Engineering

### Training a Tokenizer
Reading the competition discussions lead me to this [Notebook](https://www.kaggle.com/code/datafan07/train-your-own-tokenizer), where is suggested to add words with typos into the vocabulary for better performance by training a tokenizer.

1. normalization
2. pre-tokenization
3. model
4. post-processing

>ByteLevel: 
>
>Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties:
> - Since it maps on bytes, a tokenizer using this only requires 256 characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.
> - A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)
> - For non ascii characters, it gets completely unreadable, but it works nonetheless!

### Byte-Pair Encoding tokenizer

In [9]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
#Cleaning
raw_tokenizer.normalizer =  normalizers.Sequence(
    [
        normalizers.NFC(),
        normalizers.Lowercase(),
        normalizers.Replace("\n"," "),
        normalizers.Replace("\r"," "),
        normalizers.Replace("\t"," ")
    ]    
    )
#First tokenization
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
#Training
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(show_progress=True,special_tokens=special_tokens)

def data_iter(dataset):
    """
    A generator function for iterating over a dataset in chunks.
    """    
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]

raw_tokenizer.train_from_iterator(data_iter(target_data),trainer)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)






In [10]:
tokenizer.save_pretrained("../data/lexp/byte_pair_tokenizer")

('../data/lexp/byte_pair_tokenizer/tokenizer_config.json',
 '../data/lexp/byte_pair_tokenizer/special_tokens_map.json',
 '../data/lexp/byte_pair_tokenizer/tokenizer.json')

In [11]:
raw_tokenizer.normalizer.normalize_str("Martín's bag")

"martín's bag"

In [12]:
tokenizer.tokenize("Martin's bag")

['Ġmartin', "'s", 'Ġbag']

### Words Tokenizing

In [13]:
tokenized_dataset = target_data.copy()
tokenized_dataset["text"] = target_data["text"].apply(lambda x : tokenizer.tokenize(x))
tokenized_dataset

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,"[Ġcars, ., Ġcars, Ġhave, Ġbeen, Ġaround, Ġsinc...",0
1,005db917,0,"[Ġtransportation, Ġis, Ġa, Ġlarge, Ġnecessity,...",0
2,008f63e3,0,"[Ġ"", america, 's, Ġlove, Ġaffair, Ġwith, Ġit, ...",0
3,00940276,0,"[Ġhow, Ġoften, Ġdo, Ġyou, Ġride, Ġin, Ġa, Ġcar...",0
4,00c39458,0,"[Ġcars, Ġare, Ġa, Ġwonderful, Ġthing, ., Ġthey...",0
...,...,...,...,...
73590,73590,18,"[Ġi, Ġam, Ġwriting, Ġyou, Ġtoday, Ġto, Ġdisagr...",1
73591,73591,18,"[Ġdear, Ġprincipal, ,, Ġ, Ġin, Ġconclusion, ,,...",1
73592,73592,18,"[Ġdear, Ġmrs, ., Ġprincipal, ,, Ġ, Ġin, Ġthese...",1
73593,73593,18,"[Ġi, Ġenjoyed, Ġform, Ġfive, Ġand, Ġexcitedly,...",1


#### Deleting stopwords

In [14]:
tokenized_stopwords=[token for word in stopwords.words('english') for token in tokenizer.tokenize(word)]

In [15]:
new_docs = []
for doc in tokenized_dataset["text"]:
    tokens = [word for word in doc if word not in tokenized_stopwords]  # Eliminar stopwords
    new_docs.append(tokens)
tokenized_dataset["text"]=new_docs

In [16]:
# Guardar el DataFrame resultante en un archivo CSV
train,test = train_test_split(tokenized_dataset, test_size=0.6)
train.reset_index(drop=True)
test.reset_index(drop=True)

Unnamed: 0,id,prompt_id,text,generated
0,71583,8,"[Ġfacial, Ġaction, Ġcoding, Ġsystem, Ġfascinat...",1
1,9975,8,"[Ġfacial, Ġexpressions, Ġclassmate, Ġgives, Ġt...",0
2,57645,12,"[Ġchallenge, Ġexploring, Ġvenus, Ġcompelling, ...",1
3,49412,14,"[Ġface, Ġmars, Ġ, Ġface, Ġmars, Ġlong, Ġsubjec...",1
4,69946,13,"[Ġseagoing, Ġcowboys, Ġprogram, Ġunique, Ġrewa...",1
...,...,...,...,...
44152,65793,20,"[Ġphones, Ġability, Ġoperate, Ġmotor, Ġvehicle...",1
44153,4469,16,"[Ġquestion, Ġteachers, Ġlike, Ġgive, Ġwork, Ġs...",0
44154,29617,16,"[Ġone, Ġway, Ġenhance, Ġstudent, Ġachievement,...",1
44155,17636,11,"[Ġdriverless, Ġcars, Ġcom, inng, Ġaccidents, Ġ...",0


## Doc2Vec

In [17]:
train["text"] = [doc2vec.TaggedDocument(row[2],['z'+str(row[0])]) for row in train.values]
train["text"]

42543    ([Ġconcept, Ġreducing, Ġcar, Ġusage, Ġbecome, ...
56704    ([Ġ(, introduction, ):, Ġever, Ġdreamed, Ġtrav...
49646    ([Ġcowboy, Ġrode, Ġwaves, Ġ, Ġoften, Ġsaid, Ġt...
65909    ([Ġdriverless, Ġcars, Ġpolarizing, Ġtopic, Ġsp...
38071    ([Ġ8, th, -, grade, Ġstudent, ,, Ġalways, Ġdre...
                               ...                        
35937    ([Ġsure, ,, Ġattempt, Ġwriting, Ġessay, Ġ8, th...
59384    ([Ġidea, Ġcar, Ġdrive, Ġseems, Ġfuturistic, Ġe...
40696    ([Ġschools, Ġcontrol, Ġnumber, Ġextracurricula...
18247    ([Ġautonomous, Ġcars, Ġseem, Ġlike, Ġwould, Ġn...
17432    ([Ġdear, Ġprincipal, ,, Ġ, Ġglad, Ġwant, Ġstud...
Name: text, Length: 29438, dtype: object

In [18]:
doc_model = doc2vec.Doc2Vec(documents=train["text"],vector_size=100,epochs=20)

In [19]:
doc_model.save("../data/lexp/embedding_model/docModel.bin")

## Inferred Features

In [20]:
"""
tokenized_dataset = pd.read_csv("../data/tokenized_data/tokenized.csv",)
tokenized_dataset["text"] = tokenized_dataset["text"].apply(lambda x : eval(x))
docs_dataset = tokenized_dataset.copy()
docs_dataset["text"] = [doc2vec.TaggedDocument(row[2],['z'+row[0]]) for row in tokenized_dataset.values]
docs_dataset["text"]
"""

'\ntokenized_dataset = pd.read_csv("../data/tokenized_data/tokenized.csv",)\ntokenized_dataset["text"] = tokenized_dataset["text"].apply(lambda x : eval(x))\ndocs_dataset = tokenized_dataset.copy()\ndocs_dataset["text"] = [doc2vec.TaggedDocument(row[2],[\'z\'+row[0]]) for row in tokenized_dataset.values]\ndocs_dataset["text"]\n'

In [40]:
def features(dataset):
    token_count=dataset["text"].apply(lambda x: len(x))
    sentence_count = []
    punctuation_count = []
    apostrofees_count = []
    unk_count = []
    for doc in dataset["text"]:
        unk = 0
        dot = 0
        punctuation = 0
        apostrofees = 0
        for token in doc:
            if(token.endswith(".")):
                dot+=1
                punctuation+=1
            elif(token.endswith(",") or token.endswith("?") or token.endswith("!")):
                punctuation+=1
            elif(token.count("'")>0):
                    apostrofees+=token.count("'")
            elif(token=="[UNK]"):
                unk+=1
        sentence_count.append(dot)
        punctuation_count.append(punctuation)
        apostrofees_count.append(apostrofees)
        unk_count.append(unk)
    df = pd.DataFrame(
        columns=["token_num","sent_num","punct_sym","apostrof_sym","unk_num"]
    )
    df["token_num"]=token_count
    df["sent_num"]=sentence_count
    df["punct_sym"]=punctuation_count
    df["apostrof_sym"]=apostrofees_count
    df["unk_num"]=unk_count
    return df

## Normalizar embedding

In [22]:
doc_model = doc2vec.Doc2Vec.load("../data/lexp/embedding_model/docModel.bin")
arr = [doc_model.dv[doc.tags] for doc in train["text"]]
embeddings_dataset = pd.DataFrame(np.reshape(arr,(len(arr), 100)))
norma = np.linalg.norm(embeddings_dataset, axis=1)
norma

array([5.8421884, 6.657906 , 6.7383027, ..., 7.909009 , 8.5561695,
       4.0770845], dtype=float32)

In [23]:
norm_embeddings_dataset = pd.DataFrame(np.apply_along_axis(lambda x: x / np.linalg.norm(x), axis=1, arr=embeddings_dataset))
norm_embeddings_dataset["normalized_norm"] = (norma - norma.min()) / (norma.max() - norma.min())
norm_embeddings_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,normalized_norm
0,0.024149,-0.012406,-0.102259,0.035312,-0.166295,-0.214346,0.064996,0.081732,0.050830,-0.042659,...,-0.059754,-0.013995,-0.003609,-0.054222,0.005852,-0.162204,0.129035,-0.113359,0.072855,0.250193
1,0.009624,0.016637,-0.084729,-0.064131,-0.071519,-0.079279,-0.040373,0.382153,-0.125943,-0.075180,...,0.148504,0.179979,0.162245,-0.031815,0.126953,0.080496,0.139554,0.151027,0.093111,0.304024
2,0.098738,0.126639,-0.171531,-0.056539,-0.023899,-0.216765,0.115909,0.219873,0.054761,0.022534,...,0.041842,-0.041917,-0.060383,-0.060455,0.021589,-0.032543,-0.070403,0.043904,-0.039627,0.309329
3,-0.047298,-0.145219,-0.124332,0.233109,-0.099511,-0.347072,0.011194,0.098677,-0.073594,-0.118093,...,-0.061865,0.113064,0.122951,-0.119200,0.019564,-0.112948,-0.080771,0.027291,0.060934,0.226686
4,-0.053892,0.051193,-0.060982,0.048062,-0.106206,-0.031200,-0.026929,0.194755,-0.037228,-0.074959,...,0.049328,0.019387,-0.052216,0.033253,0.078349,-0.076047,-0.020799,-0.052328,0.183286,0.381400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29433,0.009123,-0.063777,0.109404,0.011289,-0.142039,-0.112562,-0.055773,0.152568,-0.064115,-0.051446,...,0.123257,-0.081951,0.022218,0.047455,-0.058178,0.022473,0.102880,0.049880,-0.006577,0.417857
29434,-0.078960,-0.006486,-0.115504,0.114665,-0.090872,-0.299745,-0.006072,0.081878,-0.007784,-0.135264,...,-0.061180,0.120647,0.001908,0.035093,0.091502,0.048425,0.008579,-0.069383,0.006188,0.241051
29435,0.023342,0.130713,0.107501,0.063010,-0.019766,-0.139278,-0.088074,0.154818,-0.041467,-0.026429,...,0.033398,0.052869,0.122322,0.186538,-0.063021,0.013305,0.097663,-0.175404,0.084010,0.386586
29436,0.052244,-0.040034,-0.181365,0.047256,-0.184097,-0.171362,0.107267,0.072358,0.077126,0.015500,...,0.164946,-0.119586,0.091539,-0.003745,-0.027525,0.014295,-0.186965,0.019864,-0.132460,0.429293


In [24]:
# Calcular características adicionales con la función 'features' (no proporcionada en tu código)
train["text"] = [t[0] for t in train["text"]]
feature_data = features(train)
feature_data

Unnamed: 0,token_num,sent_num,punct_sym,apostrof_sym,unk_num
42543,195,11,31,0,0
56704,345,21,53,1,0
49646,263,27,51,0,0
65909,237,18,34,0,0
38071,258,20,41,0,0
...,...,...,...,...,...
35937,242,20,38,12,0
59384,253,18,40,0,0
40696,197,13,27,4,0
18247,377,47,66,0,0


In [25]:
# Normalizar las características adicionales
feature_data_arr = pd.DataFrame(np.reshape(feature_data,(len(feature_data), len(feature_data.columns))))
norm_feature_data = pd.DataFrame(np.apply_along_axis(lambda x: x / np.linalg.norm(x) if x.max()>0 else 0,axis=0,arr=feature_data_arr),columns=feature_data.columns)
norm_feature_data

Unnamed: 0,token_num,sent_num,punct_sym,apostrof_sym,unk_num
0,0.004023,0.003139,0.004303,0.000000,0.0
1,0.007118,0.005992,0.007358,0.003764,0.0
2,0.005426,0.007704,0.007080,0.000000,0.0
3,0.004890,0.005136,0.004720,0.000000,0.0
4,0.005323,0.005706,0.005692,0.000000,0.0
...,...,...,...,...,...
29433,0.004993,0.005706,0.005275,0.045164,0.0
29434,0.005220,0.005136,0.005553,0.000000,0.0
29435,0.004065,0.003709,0.003748,0.015055,0.0
29436,0.007778,0.013410,0.009162,0.000000,0.0


In [26]:
new = pd.concat([norm_feature_data, norm_embeddings_dataset], axis=1,ignore_index=True)
train = pd.concat([train[["id","prompt_id","generated"]].reset_index(drop=True),new],axis=1)
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,0.004023,0.003139,0.004303,0.000000,0.0,0.024149,-0.012406,-0.102259,0.035312,-0.166295,...,-0.059754,-0.013995,-0.003609,-0.054222,0.005852,-0.162204,0.129035,-0.113359,0.072855,0.250193
1,0.007118,0.005992,0.007358,0.003764,0.0,0.009624,0.016637,-0.084729,-0.064131,-0.071519,...,0.148504,0.179979,0.162245,-0.031815,0.126953,0.080496,0.139554,0.151027,0.093111,0.304024
2,0.005426,0.007704,0.007080,0.000000,0.0,0.098738,0.126639,-0.171531,-0.056539,-0.023899,...,0.041842,-0.041917,-0.060383,-0.060455,0.021589,-0.032543,-0.070403,0.043904,-0.039627,0.309329
3,0.004890,0.005136,0.004720,0.000000,0.0,-0.047298,-0.145219,-0.124332,0.233109,-0.099511,...,-0.061865,0.113064,0.122951,-0.119200,0.019564,-0.112948,-0.080771,0.027291,0.060934,0.226686
4,0.005323,0.005706,0.005692,0.000000,0.0,-0.053892,0.051193,-0.060982,0.048062,-0.106206,...,0.049328,0.019387,-0.052216,0.033253,0.078349,-0.076047,-0.020799,-0.052328,0.183286,0.381400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29433,0.004993,0.005706,0.005275,0.045164,0.0,0.009123,-0.063777,0.109404,0.011289,-0.142039,...,0.123257,-0.081951,0.022218,0.047455,-0.058178,0.022473,0.102880,0.049880,-0.006577,0.417857
29434,0.005220,0.005136,0.005553,0.000000,0.0,-0.078960,-0.006486,-0.115504,0.114665,-0.090872,...,-0.061180,0.120647,0.001908,0.035093,0.091502,0.048425,0.008579,-0.069383,0.006188,0.241051
29435,0.004065,0.003709,0.003748,0.015055,0.0,0.023342,0.130713,0.107501,0.063010,-0.019766,...,0.033398,0.052869,0.122322,0.186538,-0.063021,0.013305,0.097663,-0.175404,0.084010,0.386586
29436,0.007778,0.013410,0.009162,0.000000,0.0,0.052244,-0.040034,-0.181365,0.047256,-0.184097,...,0.164946,-0.119586,0.091539,-0.003745,-0.027525,0.014295,-0.186965,0.019864,-0.132460,0.429293


In [34]:
doc_model = doc2vec.Doc2Vec.load("../data/lexp/embedding_model/docModel.bin")
arr = [doc_model.infer_vector(doc) for doc in test["text"]]
embeddings_dataset = pd.DataFrame(np.reshape(arr,(len(arr), 100)))
norma = np.linalg.norm(embeddings_dataset, axis=1)
norma
norm_embeddings_dataset = pd.DataFrame(np.apply_along_axis(lambda x: x / np.linalg.norm(x), axis=1, arr=embeddings_dataset))
norm_embeddings_dataset["normalized_norm"] = (norma - norma.min()) / (norma.max() - norma.min())
norm_embeddings_dataset

AttributeError: 'list' object has no attribute 'words'

In [35]:
norm_embeddings_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,normalized_norm
0,0.036622,-0.038121,-0.105222,0.013056,0.051433,-0.061571,-0.018496,0.058857,-0.157071,-0.196250,...,0.086987,0.000420,-0.026344,0.024768,0.030659,-0.119696,-0.088376,0.029170,0.035785,0.274137
1,-0.008121,0.088199,0.104572,-0.117521,0.115067,0.050076,-0.055492,-0.181880,0.066261,0.044652,...,0.080455,0.033021,0.080602,0.114687,0.157715,0.038906,-0.056661,-0.038428,-0.087695,0.304560
2,-0.027336,0.092930,0.074559,-0.006871,-0.136344,-0.271265,0.050145,-0.010475,0.011442,-0.091727,...,0.030754,0.101818,-0.046264,-0.078960,0.182100,-0.145130,-0.030719,0.071164,0.036743,0.249861
3,-0.080974,-0.040131,-0.077742,-0.077227,-0.150863,-0.018527,0.144219,0.172715,-0.089951,-0.094392,...,-0.081414,0.000254,0.054607,-0.061284,-0.031539,-0.120210,0.169098,-0.085678,-0.081633,0.200334
4,0.059042,0.079199,-0.031547,-0.025354,-0.321285,-0.092259,-0.118979,0.156379,0.043297,-0.144024,...,0.082784,-0.005716,-0.047013,-0.026827,0.100590,0.148490,0.098928,0.055644,0.213073,0.241949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44152,-0.008800,0.038571,-0.051649,0.112227,0.090971,-0.009117,-0.051963,0.128642,0.166866,-0.131899,...,0.084024,-0.002498,-0.058832,0.212846,-0.009739,-0.046080,-0.127745,0.063745,-0.073253,0.316837
44153,-0.092060,-0.033295,0.016752,-0.081701,-0.003183,0.083824,-0.009096,0.179808,-0.004928,-0.161151,...,0.062056,-0.012075,-0.198098,0.017619,0.103528,0.004939,0.089680,-0.046918,0.030329,0.283916
44154,0.071130,0.005502,0.157914,-0.019164,-0.002814,-0.007179,-0.081465,0.066129,-0.051207,0.015411,...,-0.059153,0.130420,-0.075304,0.130708,0.010312,-0.137394,0.049831,-0.116521,-0.009227,0.254528
44155,-0.048292,0.021797,-0.112793,-0.052139,-0.136571,-0.088548,0.032064,0.006344,0.093800,-0.052833,...,-0.078910,0.026213,0.122429,0.145564,-0.152553,0.057191,-0.112378,0.095895,-0.035510,0.227081


In [41]:
# Calcular características adicionales con la función 'features' (no proporcionada en tu código)
feature_data = features(test)
feature_data
# Normalizar las características adicionales
feature_data_arr = pd.DataFrame(np.reshape(feature_data,(len(feature_data), len(feature_data.columns))))
norm_feature_data = pd.DataFrame(np.apply_along_axis(lambda x: x / np.linalg.norm(x) if x.max()>0 else 0,axis=0,arr=feature_data_arr),columns=feature_data.columns)
norm_feature_data

Unnamed: 0,token_num,sent_num,punct_sym,apostrof_sym,unk_num
0,0.003832,0.004436,0.004079,0.009160,0.0
1,0.003731,0.003268,0.002719,0.000000,0.0
2,0.003613,0.003035,0.002946,0.000000,0.0
3,0.003562,0.003735,0.003059,0.000000,0.0
4,0.004541,0.003735,0.004306,0.000000,0.0
...,...,...,...,...,...
44152,0.005250,0.004202,0.005892,0.006107,0.0
44153,0.004575,0.003969,0.004306,0.000000,0.0
44154,0.003208,0.002801,0.003173,0.000000,0.0
44155,0.002212,0.001868,0.002266,0.000000,0.0


In [44]:
test = pd.concat([test[["id","prompt_id","generated"]].reset_index(drop=True),norm_feature_data, norm_embeddings_dataset], axis=1)

In [45]:
train.to_csv("../data/lexp/pre_processed/train.csv", index=False)
test.to_csv("../data/lexp/pre_processed/test.csv", index=False)