# Desarrollo de DF único - Sample

## Preprocesamiento

In [1]:
import stylo_metrix as sm
import pandas as pd
from raid import run_detection, run_evaluation
from raid.utils import load_data

from processer import split_text_into_sentences
from sklearn.preprocessing import LabelEncoder

In [2]:
# Fetch the RAID training split, leaving out the adversarially attacked samples.
or_train_noadv_df = load_data(
    split="train",
    include_adversarial=False,
)
# The "test" and "extra" splits can be requested the same way
# (split="test" / split="extra"); they are not loaded in this notebook.

In [3]:
# Columns of interest kept for the rest of the notebook.
intCols = ['id', 'model', 'domain', 'title', 'prompt', 'generation']

# Select the columns first and copy only that slice: copying the whole raw
# frame before dropping the unused columns duplicates data that is thrown
# away immediately. The explicit .copy() keeps the result independent of
# or_train_noadv_df.
train_noadv_df = or_train_noadv_df[intCols].copy()

display(train_noadv_df.head(7))

Unnamed: 0,id,model,domain,title,prompt,generation
0,e5e058ce-be2b-459d-af36-32532aaba5ff,human,abstracts,FUTURE-AI: Guiding Principles and Consensus Re...,,The recent advancements in artificial intellig...
1,f95b107b-d176-4af5-90f7-4d0bb20caf93,human,abstracts,EdgeFlow: Achieving Practical Interactive Segm...,,High-quality training data play a key role in ...
2,856d8972-9e3d-4544-babc-0fe16f21e04d,human,abstracts,Semi-supervised Contrastive Learning for Label...,,The success of deep learning methods in medica...
3,fbc8a5ea-90fa-47b8-8fa7-73dd954f1524,human,abstracts,Combo Loss: Handling Input and Output Imbalanc...,,Simultaneous segmentation of multiple organs f...
4,72c41b8d-0069-4886-b734-a4000ffca286,human,abstracts,Attention-Based 3D Seismic Fault Segmentation ...,,Detection faults in seismic data is a crucial ...
5,72fe360b-cce6-4daf-b66a-1d778f5964f8,human,abstracts,Segmenter: Transformer for Semantic Segmentation,,Image segmentation is often ambiguous at the l...
6,df594cf4-9a0c-4488-bcb3-68f41e2d5a16,human,abstracts,Mining Contextual Information Beyond Image for...,,This paper studies the context aggregation pro...


## Versión 1 - Fusión original

In [4]:
# Draw a reproducible 32-document sample (seed 3) from every domain except 'recipes'.
filtered_by_domain = train_noadv_df.loc[train_noadv_df['domain'].ne('recipes')]

generation_sample = (
    filtered_by_domain
    .loc[:, ['id', 'model', 'domain', 'generation']]
    .sample(n=32, random_state=3)
)

display(generation_sample)

Unnamed: 0,id,model,domain,generation
110956,c4059838-c14e-4b84-b75c-12bc0bd2f34a,cohere,books,"The story centers on Charles, the husband of ..."
363000,c5001dc3-5baa-4abf-8a1b-2e0bf6d7c77d,gpt3,reddit,\n\nJust wondering where do I start?\n\nI've b...
172379,42bb8f1e-60ec-428d-800c-e9059a0efdab,gpt3,news,\n\nLasers are being used to create an ultra-f...
200619,6a2721a4-d15e-44a9-9f09-e48e60e17a4d,llama-chat,poetry,"With eager hands, we plant the seeds,\nIn fert..."
27573,4a64da9c-2df8-4f58-91ee-2f9192cc5667,mistral,abstracts,Medical image analysis has experienced an expl...
242879,16d088eb-1511-4711-b8ff-521f77e03ace,mpt-chat,poetry,"In the endless, boundless night,\nWhere darkne..."
437690,307c1c28-c58f-4f4d-8294-7a89a972268d,mistral-chat,wiki,Matagarup Refugee Camp is a refugee camp locat...
217036,07b7c311-9092-4f22-ac67-7a9dc377a82c,mistral,poetry,> Who is your best friend?\n> I'll tell you wh...
456322,11c47e1c-54d5-48e4-9835-95125e0c7b2b,gpt4,wiki,"Eldora, Colorado is a small unincorporated com..."
174334,069a165e-431d-4acd-8f82-f8b15ad244a4,chatgpt,news,Legendary filmmaker Spike Lee has shown his su...


In [5]:
def extract_features_from_dataset(df_original, sample_size=None):
    """
    Extract sentence-level stylometric features for every document in a frame.

    Each document's 'generation' text is split with
    `split_text_into_sentences`, the resulting sentences are passed to
    `StyloMetrix.transform`, and the document metadata is prepended to the
    rows that come back.

    Args:
        df_original: DataFrame providing the 'id', 'model', 'domain' and
            'generation' columns.
        sample_size: Optional number of documents to sample
            (random_state=42) before extraction. Falsy values (None, 0)
            disable sampling.

    Returns:
        DataFrame laid out as: id_original, model, domain, sentence_num,
        text, features... When there is nothing to process (empty input, or
        no document yields any sentence) an empty DataFrame with only the
        metadata columns is returned instead of raising.
    """
    # Layout of the metadata columns; also used for the empty result.
    meta_columns = ['id_original', 'model', 'domain', 'sentence_num', 'text']

    if sample_size:
        df_original = df_original.sample(n=sample_size, random_state=42)

    # Nothing to do: return early, before the extractor is initialised.
    if df_original.empty:
        return pd.DataFrame(columns=meta_columns)

    # debug=False: per the original author's note, this keeps StyloMetrix
    # from writing files to disk.
    stylo = sm.StyloMetrix('en', debug=False)

    all_results = []

    for _, row in df_original.iterrows():
        # Split the document into sentences (in memory).
        sentences = split_text_into_sentences(row['generation'])

        # A document without sentences contributes no rows; skip it rather
        # than handing an empty batch to the extractor.
        if len(sentences) == 0:
            continue

        # Features for all sentences of this document, one row per sentence.
        features_df = stylo.transform(sentences)

        # Prepend the metadata of the source document.
        features_df.insert(0, 'id_original', row['id'])
        features_df.insert(1, 'model', row['model'])
        features_df.insert(2, 'domain', row['domain'])
        features_df.insert(3, 'sentence_num', range(len(sentences)))
        # The 'text' column is already present (it comes from stylo.transform).

        all_results.append(features_df)

    # pd.concat raises ValueError on an empty list, so guard that case.
    if not all_results:
        return pd.DataFrame(columns=meta_columns)

    return pd.concat(all_results, ignore_index=True)

In [6]:
# Run the sentence-level extraction over the whole sample (no further subsampling).
features_df = extract_features_from_dataset(
    df_original=generation_sample,
    sample_size=None,
)



[OK] Total de oraciones: 23


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 23/23 [00:02<00:00,  8.74it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  8.23it/s]


[OK] Total de oraciones: 19


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 19/19 [00:02<00:00,  6.84it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  7.13it/s]


[OK] Total de oraciones: 10


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 10/10 [00:01<00:00,  9.88it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  8.96it/s]


[OK] Total de oraciones: 12


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 12/12 [00:01<00:00, 11.20it/s]


[OK] Total de oraciones: 1


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 1/1 [00:00<00:00,  2.31it/s]


[OK] Total de oraciones: 24


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 24/24 [00:03<00:00,  7.18it/s]


[OK] Total de oraciones: 14


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 14/14 [00:01<00:00,  7.85it/s]


[OK] Total de oraciones: 9


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 9/9 [00:00<00:00, 11.19it/s]


[OK] Total de oraciones: 1


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 1/1 [00:01<00:00,  1.00s/it]


[OK] Total de oraciones: 18


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 18/18 [00:02<00:00,  8.79it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:03<00:00,  2.96it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  5.70it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  5.55it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  8.48it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:01<00:00,  5.68it/s]


[OK] Total de oraciones: 17


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 17/17 [00:02<00:00,  7.62it/s]


[OK] Total de oraciones: 13


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 13/13 [00:01<00:00,  7.14it/s]


[OK] Total de oraciones: 18


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 18/18 [00:02<00:00,  6.36it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  4.35it/s]


[OK] Total de oraciones: 14


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 14/14 [00:02<00:00,  6.33it/s]


[OK] Total de oraciones: 9


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 9/9 [00:01<00:00,  7.40it/s]


[OK] Total de oraciones: 18


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 18/18 [00:02<00:00,  6.77it/s]


[OK] Total de oraciones: 10


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 10/10 [00:01<00:00,  6.10it/s]


[OK] Total de oraciones: 12


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 12/12 [00:01<00:00,  6.27it/s]


[OK] Total de oraciones: 19


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 19/19 [00:02<00:00,  6.59it/s]


[OK] Total de oraciones: 21


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 21/21 [00:02<00:00,  7.14it/s]


[OK] Total de oraciones: 8


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 8/8 [00:01<00:00,  5.95it/s]


[OK] Total de oraciones: 9


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 9/9 [00:01<00:00,  7.17it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  7.06it/s]


In [7]:
# Sanity check of the extracted feature table: dimensions, then the first five rows.
print(f"Shape: {features_df.shape}")
display(features_df.iloc[:5])

Shape: (352, 201)


Unnamed: 0,id_original,model,domain,sentence_num,text,POS_VERB,POS_NOUN,POS_ADJ,POS_ADV,POS_DET,...,RE,ASF,ASM,OM,RCI,DMC,OR,QAS,PA,PR
0,c4059838-c14e-4b84-b75c-12bc0bd2f34a,cohere,books,0,"The story centers on Charles, the husband of A...",0.2,0.2,0.066667,0.0,0.133333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0
1,c4059838-c14e-4b84-b75c-12bc0bd2f34a,cohere,books,1,He insists that she stay in the chateau while ...,0.2,0.133333,0.0,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.0
2,c4059838-c14e-4b84-b75c-12bc0bd2f34a,cohere,books,2,Charles abruptly ends his trip and returns hom...,0.3,0.1,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,c4059838-c14e-4b84-b75c-12bc0bd2f34a,cohere,books,3,"He announces that he has sold the estate, and ...",0.375,0.0625,0.0,0.0625,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0
4,c4059838-c14e-4b84-b75c-12bc0bd2f34a,cohere,books,4,Then he hands Alice a key he says he found on ...,0.235294,0.176471,0.0,0.058824,0.176471,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0


## Versión 2 - Tags codificados

### Codificación de etiquetas 'model' y 'domain'

In [8]:
# Summary of the raw RAID frame: shape, column names and the distinct labels.
print(
    "Información del dataset original:",
    f"Forma del dataset: {or_train_noadv_df.shape}",
    f"Columnas: {list(or_train_noadv_df.columns)}",
    f"Modelos unicos: {or_train_noadv_df['model'].unique()}",
    f"Dominios unicos: {or_train_noadv_df['domain'].unique()}",
    sep="\n",
)

Información del dataset original:
Forma del dataset: (467985, 11)
Columnas: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation']
Modelos unicos: ['human' 'llama-chat' 'mpt' 'mpt-chat' 'gpt2' 'mistral' 'mistral-chat'
 'gpt3' 'cohere' 'chatgpt' 'gpt4' 'cohere-chat']
Dominios unicos: ['abstracts' 'books' 'news' 'poetry' 'recipes' 'reddit' 'reviews' 'wiki']


In [9]:
# Same summary for the generated sentence-level feature frame.
print(
    "Información del dataset generado:",
    f"Forma del dataset: {features_df.shape}",
    f"Columnas: {list(features_df.columns)}",
    f"Modelos unicos: {features_df['model'].unique()}",
    f"Dominios unicos: {features_df['domain'].unique()}",
    sep="\n",
)

Información del dataset generado:
Forma del dataset: (352, 201)
Columnas: ['id_original', 'model', 'domain', 'sentence_num', 'text', 'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP', 'POS_PRO', 'L_REF', 'L_HASHTAG', 'L_MENTION', 'L_RT', 'L_LINKS', 'L_CONT_A', 'L_FUNC_A', 'L_CONT_T', 'L_FUNC_T', 'L_PLURAL_NOUNS', 'L_SINGULAR_NOUNS', 'L_PROPER_NAME', 'L_PERSONAL_NAME', 'L_NOUN_PHRASES', 'L_PUNCT', 'L_PUNCT_DOT', 'L_PUNCT_COM', 'L_PUNCT_SEMC', 'L_PUNCT_COL', 'L_PUNCT_DASH', 'L_POSSESSIVES', 'L_ADJ_POSITIVE', 'L_ADJ_COMPARATIVE', 'L_ADJ_SUPERLATIVE', 'L_ADV_POSITIVE', 'L_ADV_COMPARATIVE', 'L_ADV_SUPERLATIVE', 'PS_CONTRADICTION', 'PS_AGREEMENT', 'PS_EXAMPLES', 'PS_CONSEQUENCE', 'PS_CAUSE', 'PS_LOCATION', 'PS_TIME', 'PS_CONDITION', 'PS_MANNER', 'SY_QUESTION', 'SY_NARRATIVE', 'SY_NEGATIVE_QUESTIONS', 'SY_SPECIAL_QUESTIONS', 'SY_TAG_QUESTIONS', 'SY_GENERAL_QUESTIONS', 'SY_EXCLAMATION', 'SY_IMPERATIVE', 'SY_SUBORD_SENT', 'SY_SUBO

In [10]:
# Integer-encode the document id, the 'model' label and the 'domain' label.
# One encoder per column is kept so the class names can be looked up later.
id_encoder, model_encoder, domain_encoder = LabelEncoder(), LabelEncoder(), LabelEncoder()

for source_col, target_col, encoder in (
    ('id_original', 'id_encoded', id_encoder),
    ('model', 'model_encoded', model_encoder),
    ('domain', 'domain_encoded', domain_encoder),
):
    features_df[target_col] = encoder.fit_transform(features_df[source_col])

In [11]:
# List the integer code assigned to every model and domain label.
print("Etiquetas codificadas:")

print("Modelos:")
for code, label in enumerate(model_encoder.classes_):
    print(f"{code}: {label}")
print()

print("Dominios:")
for code, label in enumerate(domain_encoder.classes_):
    print(f"{code}: {label}")

Etiquetas codificadas:
Modelos:
0: chatgpt
1: cohere
2: cohere-chat
3: gpt2
4: gpt3
5: gpt4
6: human
7: llama-chat
8: mistral
9: mistral-chat
10: mpt
11: mpt-chat

Dominios:
0: abstracts
1: books
2: news
3: poetry
4: reddit
5: reviews
6: wiki


In [12]:
# Show the full text of 10 random sentences (seed 11) from the 'poetry' domain.
poetry_df = features_df[features_df['domain'] == 'poetry']

# option_context lifts the column-width and row limits for this display only
# and restores the previously active values afterwards, even if display()
# raises. The earlier set_option/reset_option pair leaked the settings on
# error and always fell back to the library defaults.
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    display(
        poetry_df[['id_original', 'sentence_num', 'model', 'domain', 'text']]
        .sample(n=10, random_state=11)
    )

Unnamed: 0,id_original,sentence_num,model,domain,text
266,b4fae87a-96f7-4fa0-ba5d-7b04034e3ff9,15,chatgpt,poetry,"A political romance, against the odds, An epitome of love's uncompromising gods."
278,e0a86252-57a1-49ad-a0b0-d16a9845bba7,9,human,poetry,"So fear the fear that i fear, And see that all this mess, Will not be sanitised with truth, When you beg me to confess."
272,e0a86252-57a1-49ad-a0b0-d16a9845bba7,3,human,poetry,"I cry tears of happiness, To pretend it isnt real, I divulge no information, On what is truely real."
270,e0a86252-57a1-49ad-a0b0-d16a9845bba7,1,human,poetry,"I close my eyes with acid, And dream while not asleep, To confuse any enemys, Anything i write i eat."
65,16d088eb-1511-4711-b8ff-521f77e03ace,3,mpt-chat,poetry,"So let us embrace this dusky hue, And find solace in its soothing arms, For even in the darkest night, There's a glimmer of hope to be found."
269,e0a86252-57a1-49ad-a0b0-d16a9845bba7,0,human,poetry,"I fear the fear i cannot stop, I fear it more and more, And when i grind my teeth with pain, The words come out demure."
164,d40dd382-0aab-40ff-9760-722299206249,4,mpt-chat,poetry,"So I will not waste in despair, But instead choose to believe, That better days are coming, And my dreams are not dead."
78,07b7c311-9092-4f22-ac67-7a9dc377a82c,0,mistral,poetry,"> Who is your best friend? > I'll tell you who mine are, > They're my wife and children; > And they don't care! > > If I am sick or sad, > Or if I have to go away, > My wife and children > Are always glad to see me come home that way. > > When I get back from work at night, > There's no one there but them; > But when I leave in the morning, > It seems like half the town has come. > > So, who is your best friend? > Mine are my wife and children; > And they don't care!"
258,b4fae87a-96f7-4fa0-ba5d-7b04034e3ff9,7,chatgpt,poetry,"Through heated debates and whispers of disdain, Their love remained, resilient, never waned."
277,e0a86252-57a1-49ad-a0b0-d16a9845bba7,8,human,poetry,"Seek the maze of happiness, And dont you chase you tail, Dead ends lace every divide, But no one can win or fail."


### Armado de DF

In [19]:
# Build the final training frame: encoded identifiers/labels first, followed by
# every stylometric feature column, sorted by document and sentence position.

# Leading key/label columns, in output order.
key_cols = ['id_encoded', 'sentence_num', 'model_encoded', 'domain_encoded']

# Raw metadata columns that must not end up among the numeric features.
raw_meta_cols = ['id_original', 'model', 'domain', 'text']

# Derive the feature columns from the frame itself instead of hard-coding
# ~200 names: a literal list raises KeyError as soon as the extractor adds,
# drops or renames a metric. Column order follows features_df.
excluded_cols = set(key_cols) | set(raw_meta_cols)
feature_cols = [col for col in features_df.columns if col not in excluded_cols]

trainCols = key_cols + feature_cols

# Column selection, rename and sort each return a new frame, so features_df
# is left untouched without an explicit full-frame copy.
train_df = (
    features_df[trainCols]
    .rename(columns={
        'id_encoded': 'id',
        'model_encoded': 'model_label',
        'domain_encoded': 'domain_label',
    })
    .sort_values(by=['id', 'sentence_num'])
    .reset_index(drop=True)
)

In [20]:
# Preview the first 30 rows of the sorted training frame.
display(train_df.iloc[:30])

Unnamed: 0,id,sentence_num,model_label,domain_label,POS_VERB,POS_NOUN,POS_ADJ,POS_ADV,POS_DET,POS_INTJ,...,RE,ASF,ASM,OM,RCI,DMC,OR,QAS,PA,PR
0,0,0,0,2,0.153846,0.230769,0.076923,0.0,0.076923,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0
1,0,1,0,2,0.28,0.32,0.08,0.0,0.08,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0
2,0,2,0,2,0.166667,0.208333,0.166667,0.0,0.083333,0.0,...,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.041667,0.041667,0.0
3,0,3,0,2,0.166667,0.277778,0.111111,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0
4,0,4,0,2,0.181818,0.30303,0.030303,0.030303,0.121212,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151515,0.0,0.0
5,0,5,0,2,0.227273,0.318182,0.090909,0.045455,0.045455,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,6,0,2,0.2,0.32,0.04,0.0,0.12,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0
7,0,7,0,2,0.1,0.3,0.2,0.0,0.1,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.1,0.0,0.0
8,0,8,0,2,0.234043,0.234043,0.042553,0.021277,0.148936,0.0,...,0.0,0.0,0.021277,0.021277,0.0,0.0,0.0,0.085106,0.0,0.0
9,0,9,0,2,0.086957,0.391304,0.0,0.0,0.086957,0.0,...,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.086957,0.0,0.0


In [21]:
# Persist the final training frame as CSV, without writing the positional index.
train_df.to_csv(
    path_or_buf='train_df.csv',
    index=False,
)