# Desarrollo de DF único - Sample

## Preprocesamiento

In [1]:
import stylo_metrix as sm
import pandas as pd
from raid import run_detection, run_evaluation
from raid.utils import load_data

from processer import split_text_into_sentences
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the RAID training split, excluding adversarially attacked generations.
# NOTE(review): presumably this downloads the data on first use — confirm whether
# load_data caches locally before relying on Restart & Run All being cheap.
or_train_noadv_df = load_data(split="train", include_adversarial=False)

In [3]:
# Sanity check: report the (rows, columns) shape of the loaded training split.
print(f"Original training data shape: {or_train_noadv_df.shape}")

Original training data shape: (467985, 11)


In [4]:
# Columns of interest kept from the raw RAID frame.
intCols = ['id', 'model', 'domain', 'title', 'prompt', 'generation']

# Select the columns first and copy only that slice. Copying the whole raw
# frame and then discarding columns duplicates data that is thrown away
# immediately; the resulting frame is the same either way, and the explicit
# .copy() keeps it independent of or_train_noadv_df.
train_noadv_df = or_train_noadv_df[intCols].copy()

## Versión 1 - Fusión original

In [5]:
# Keep every row whose domain is anything other than 'recipes'.
filtered_by_domain = train_noadv_df.loc[train_noadv_df['domain'].ne('recipes')]

# Reproducible 1000-document sample (fixed seed), restricted to the columns
# that the feature-extraction step reads.
generation_sample = (
    filtered_by_domain
    .loc[:, ['id', 'model', 'domain', 'generation']]
    .sample(n=1000, random_state=50)
)

display(generation_sample)

Unnamed: 0,id,model,domain,generation
416790,d474d928-a0b4-4cff-b4c6-0ac1ea7137c3,mistral,wiki,SF Studios is a Swedish film production and di...
218239,0915805f-1e28-499d-90ae-3a5ec1525fd5,mistral-chat,poetry,"On this Christmas Day, as the world wakes,\nA ..."
87832,6eddea46-27b0-4697-8667-82cf5444a48b,llama-chat,books,"In the small town of Willowdale, a beautiful a..."
171835,c39958f4-2074-4be6-9c11-9bd7316cabbc,gpt3,news,\n\nThe Scissor Sisters have been named best l...
169504,7e23796a-4fc1-4eb5-a3c6-d5e58a221d95,gpt4,news,Former England manager Kevin Keegan heaped pra...
...,...,...,...,...
391988,b1d446fa-69e5-4126-8396-a5e4b39a7531,mistral-chat,reviews,Taken 2 is a thrilling sequel to the 2008 acti...
54119,d2898250-a261-47e4-b241-61cea693826d,gpt4,abstracts,This academic paper presents an innovative met...
382920,44f6a282-1b8a-48e1-8ebb-feb51bcf5a68,mistral,reviews,"""Midnight in Paris"" is a movie that is so good..."
214533,1417761b-05b6-4238-ae73-8eab757f1552,mistral,poetry,"Sky at My Doorway, Sunrises and Distant Thunde..."


In [6]:
def extract_features_from_dataset(df_original, sample_size=None):
    """
    Extract sentence-level stylometric features for every document in a frame.

    Each document's ``generation`` text is split into sentences with
    ``split_text_into_sentences``; the sentences are passed to StyloMetrix and
    the document's metadata is prepended to the resulting per-sentence rows.

    Args:
        df_original: DataFrame with at least the columns ``id``, ``model``,
            ``domain`` and ``generation``.
        sample_size: Optional number of documents to sample (seed 42) before
            processing. ``None`` (or ``0``) processes every row.

    Returns:
        DataFrame with one row per sentence and the columns ``id_original``,
        ``model``, ``domain``, ``sentence_num``, followed by whatever
        ``stylo.transform`` returns (``text`` plus the feature columns).
        When there is nothing to process, an empty DataFrame with the columns
        ``id_original``, ``model``, ``domain``, ``sentence_num``, ``text``
        is returned instead of raising.
    """
    empty_columns = ['id_original', 'model', 'domain', 'sentence_num', 'text']

    if sample_size:
        df_original = df_original.sample(n=sample_size, random_state=42)

    # Nothing to do: return before loading the StyloMetrix pipeline, and
    # avoid pd.concat([]) raising "No objects to concatenate".
    if len(df_original) == 0:
        return pd.DataFrame(columns=empty_columns)

    # debug=False is intended to keep StyloMetrix from writing files to disk.
    stylo = sm.StyloMetrix('en', debug=False)

    all_results = []

    # Iterate over plain tuples of just the needed columns; unlike iterrows
    # this does not build a Series per row or coerce the row to one dtype.
    documents = df_original[['id', 'model', 'domain', 'generation']]
    for doc_id, model, domain, generation in documents.itertuples(index=False, name=None):
        # Split the document into sentences (in memory).
        sentences = split_text_into_sentences(generation)
        if len(sentences) == 0:
            # A document without sentences contributes no rows; skip it
            # rather than handing an empty batch to StyloMetrix.
            continue

        # One feature row per sentence of this document.
        features_df = stylo.transform(sentences)

        # Prepend the metadata of the source document. sentence_num is sized
        # from `sentences`, so a row-count mismatch with the transform output
        # fails loudly here instead of silently misnumbering sentences.
        features_df.insert(0, 'id_original', doc_id)
        features_df.insert(1, 'model', model)
        features_df.insert(2, 'domain', domain)
        features_df.insert(3, 'sentence_num', range(len(sentences)))
        # The 'text' column is already present (it comes from stylo.transform).

        all_results.append(features_df)

    if not all_results:
        return pd.DataFrame(columns=empty_columns)

    # Single concatenation at the end (not inside the loop).
    return pd.concat(all_results, ignore_index=True)

In [None]:
# Long-running step: StyloMetrix is invoked once per sampled document.
# Downstream cells read `features_df`, so this call must run to completion
# before they are executed.
features_df = extract_features_from_dataset(generation_sample)



[OK] Total de oraciones: 22


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 22/22 [00:02<00:00,  9.40it/s]


[OK] Total de oraciones: 12


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 12/12 [00:01<00:00,  7.57it/s]


[OK] Total de oraciones: 16


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 16/16 [00:02<00:00,  7.24it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  6.42it/s]


[OK] Total de oraciones: 17


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 17/17 [00:02<00:00,  7.97it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  5.64it/s]


[OK] Total de oraciones: 3


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 3/3 [00:00<00:00,  9.70it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:00<00:00,  9.48it/s]


[OK] Total de oraciones: 1


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 1/1 [00:00<00:00,  1.85it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:01<00:00,  9.39it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:00<00:00,  9.50it/s]


[OK] Total de oraciones: 13


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 13/13 [00:01<00:00,  7.35it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:00<00:00,  7.46it/s]


[OK] Total de oraciones: 12


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 12/12 [00:01<00:00,  8.03it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  7.84it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:01<00:00,  6.91it/s]


[OK] Total de oraciones: 8


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 8/8 [00:01<00:00,  6.59it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  6.83it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:01<00:00,  6.81it/s]


[OK] Total de oraciones: 25


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 25/25 [00:03<00:00,  8.10it/s]


[OK] Total de oraciones: 16


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 16/16 [00:01<00:00,  8.41it/s]


[OK] Total de oraciones: 18


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 18/18 [00:03<00:00,  5.79it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:03<00:00,  2.22it/s]


[OK] Total de oraciones: 13


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 13/13 [00:02<00:00,  4.36it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  4.21it/s]


[OK] Total de oraciones: 18


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 18/18 [00:03<00:00,  5.80it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:02<00:00,  5.23it/s]


float division by zero
 AT METRIC ST_HERDAN_TTR, TEXT: She...
[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


[OK] Total de oraciones: 16


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 16/16 [00:03<00:00,  5.23it/s]


[OK] Total de oraciones: 18


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 18/18 [00:02<00:00,  6.04it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  3.90it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  4.94it/s]


[OK] Total de oraciones: 14


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 14/14 [00:01<00:00,  8.15it/s]


[OK] Total de oraciones: 1


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 1/1 [00:01<00:00,  1.69s/it]


[OK] Total de oraciones: 13


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 13/13 [00:02<00:00,  5.62it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:01<00:00,  4.70it/s]


[OK] Total de oraciones: 15


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 15/15 [00:02<00:00,  5.08it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:02<00:00,  5.18it/s]


[OK] Total de oraciones: 1


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 1/1 [00:01<00:00,  1.55s/it]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  5.17it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  3.51it/s]


[OK] Total de oraciones: 10


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 10/10 [00:02<00:00,  3.63it/s]


[OK] Total de oraciones: 15


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 15/15 [00:04<00:00,  3.49it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:01<00:00,  4.07it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:01<00:00,  5.13it/s]


[OK] Total de oraciones: 16


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 16/16 [00:02<00:00,  6.09it/s]


[OK] Total de oraciones: 14


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 14/14 [00:01<00:00,  7.37it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:01<00:00,  3.14it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:01<00:00,  5.65it/s]


[OK] Total de oraciones: 9


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 9/9 [00:01<00:00,  7.42it/s]


[OK] Total de oraciones: 14


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 14/14 [00:01<00:00,  7.24it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  5.14it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:01<00:00, 10.03it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:02<00:00,  4.96it/s]


[OK] Total de oraciones: 23


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 23/23 [00:04<00:00,  4.85it/s]


[OK] Total de oraciones: 26


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 26/26 [00:05<00:00,  4.99it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:01<00:00,  4.73it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:02<00:00,  3.25it/s]


[OK] Total de oraciones: 10


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 10/10 [00:02<00:00,  3.86it/s]


[OK] Total de oraciones: 9


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 9/9 [00:02<00:00,  3.97it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:02<00:00,  2.23it/s]


[OK] Total de oraciones: 13


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 13/13 [00:02<00:00,  4.85it/s]


[OK] Total de oraciones: 13


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 13/13 [00:02<00:00,  4.58it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:02<00:00,  1.84it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:01<00:00,  3.01it/s]


[OK] Total de oraciones: 19


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 19/19 [00:03<00:00,  5.58it/s]


[OK] Total de oraciones: 9


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 9/9 [00:02<00:00,  3.56it/s]


[OK] Total de oraciones: 11


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 11/11 [00:01<00:00,  7.67it/s]


[OK] Total de oraciones: 8


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 8/8 [00:01<00:00,  5.87it/s]


[OK] Total de oraciones: 2


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 2/2 [00:00<00:00,  2.28it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  5.10it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  6.98it/s]


[OK] Total de oraciones: 3


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 3/3 [00:02<00:00,  1.27it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  4.88it/s]


[OK] Total de oraciones: 21


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 21/21 [00:03<00:00,  6.34it/s]


[OK] Total de oraciones: 8


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 8/8 [00:01<00:00,  7.91it/s]


[OK] Total de oraciones: 10


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 10/10 [00:01<00:00,  8.04it/s]


[OK] Total de oraciones: 25


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 25/25 [00:03<00:00,  7.75it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:00<00:00,  6.01it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  5.66it/s]


[OK] Total de oraciones: 20


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 20/20 [00:02<00:00,  8.21it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  4.80it/s]


[OK] Total de oraciones: 15


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 15/15 [00:02<00:00,  5.38it/s]


[OK] Total de oraciones: 8


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 8/8 [00:01<00:00,  7.30it/s]


[OK] Total de oraciones: 43


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 43/43 [00:04<00:00,  9.18it/s]


[OK] Total de oraciones: 31


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 31/31 [00:03<00:00,  7.90it/s]


[OK] Total de oraciones: 3


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 3/3 [00:00<00:00,  4.51it/s]


[OK] Total de oraciones: 12


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 12/12 [00:01<00:00,  7.45it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  7.50it/s]


[OK] Total de oraciones: 14


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 14/14 [00:01<00:00,  8.83it/s]


[OK] Total de oraciones: 20


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 20/20 [00:02<00:00,  9.59it/s]


[OK] Total de oraciones: 13


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 13/13 [00:02<00:00,  4.81it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:02<00:00,  1.56it/s]


[OK] Total de oraciones: 16


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 16/16 [00:02<00:00,  6.05it/s]


[OK] Total de oraciones: 1


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 1/1 [00:01<00:00,  1.86s/it]


[OK] Total de oraciones: 28


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 28/28 [00:03<00:00,  8.44it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  6.96it/s]


[OK] Total de oraciones: 1


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 1/1 [00:00<00:00,  1.13it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:01<00:00,  5.37it/s]


[OK] Total de oraciones: 2


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


[OK] Total de oraciones: 27


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 27/27 [00:05<00:00,  5.33it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  4.04it/s]


[OK] Total de oraciones: 5


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 5/5 [00:00<00:00,  5.27it/s]


[OK] Total de oraciones: 10


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 10/10 [00:01<00:00,  6.15it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:02<00:00,  1.89it/s]


[OK] Total de oraciones: 6


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 6/6 [00:01<00:00,  4.37it/s]


[OK] Total de oraciones: 16


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 16/16 [00:02<00:00,  6.81it/s]


[OK] Total de oraciones: 19


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 19/19 [00:03<00:00,  6.19it/s]


[OK] Total de oraciones: 17


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 17/17 [00:02<00:00,  6.19it/s]


[OK] Total de oraciones: 9


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 9/9 [00:01<00:00,  4.87it/s]


[OK] Total de oraciones: 27


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 27/27 [00:04<00:00,  6.59it/s]


[OK] Total de oraciones: 16


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 16/16 [00:04<00:00,  3.98it/s]


[OK] Total de oraciones: 7


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 7/7 [00:02<00:00,  3.36it/s]


[OK] Total de oraciones: 24


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 24/24 [00:04<00:00,  5.78it/s]


[OK] Total de oraciones: 4


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 4/4 [00:00<00:00,  6.70it/s]


[OK] Total de oraciones: 2


  with torch.cuda.amp.autocast(self._mixed_precision):
100%|██████████| 2/2 [00:00<00:00,  3.22it/s]


[OK] Total de oraciones: 3


  with torch.cuda.amp.autocast(self._mixed_precision):
 67%|██████▋   | 2/3 [00:00<00:00,  2.49it/s]

In [None]:
# Usage: inspect the shape and the first rows of the extracted feature frame.
print(f"Shape: {features_df.shape}")
display(features_df.head(5))

NameError: name 'features_df' is not defined

## Versión 2 - Tags codificados

### Codificación de etiquetas 'model' y 'domain'

In [None]:
# Summary of the raw RAID frame: shape, column names, and the distinct
# values of the 'model' and 'domain' label columns.
print("Información del dataset original:")
print(f"Forma del dataset: {or_train_noadv_df.shape}")
print(f"Columnas: {list(or_train_noadv_df.columns)}")
print(f"Modelos unicos: {or_train_noadv_df['model'].unique()}")
print(f"Dominios unicos: {or_train_noadv_df['domain'].unique()}")

Información del dataset original:
Forma del dataset: (467985, 11)
Columnas: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation']
Modelos unicos: ['human' 'llama-chat' 'mpt' 'mpt-chat' 'gpt2' 'mistral' 'mistral-chat'
 'gpt3' 'cohere' 'chatgpt' 'gpt4' 'cohere-chat']
Dominios unicos: ['abstracts' 'books' 'news' 'poetry' 'recipes' 'reddit' 'reviews' 'wiki']


In [None]:
# Same summary for the sentence-level feature frame, to compare its labels
# against the raw dataset's.
print("Información del dataset generado:")
print(f"Forma del dataset: {features_df.shape}")
print(f"Columnas: {list(features_df.columns)}")
print(f"Modelos unicos: {features_df['model'].unique()}")
print(f"Dominios unicos: {features_df['domain'].unique()}")

Información del dataset generado:
Forma del dataset: (10036, 201)
Columnas: ['id_original', 'model', 'domain', 'sentence_num', 'text', 'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP', 'POS_PRO', 'L_REF', 'L_HASHTAG', 'L_MENTION', 'L_RT', 'L_LINKS', 'L_CONT_A', 'L_FUNC_A', 'L_CONT_T', 'L_FUNC_T', 'L_PLURAL_NOUNS', 'L_SINGULAR_NOUNS', 'L_PROPER_NAME', 'L_PERSONAL_NAME', 'L_NOUN_PHRASES', 'L_PUNCT', 'L_PUNCT_DOT', 'L_PUNCT_COM', 'L_PUNCT_SEMC', 'L_PUNCT_COL', 'L_PUNCT_DASH', 'L_POSSESSIVES', 'L_ADJ_POSITIVE', 'L_ADJ_COMPARATIVE', 'L_ADJ_SUPERLATIVE', 'L_ADV_POSITIVE', 'L_ADV_COMPARATIVE', 'L_ADV_SUPERLATIVE', 'PS_CONTRADICTION', 'PS_AGREEMENT', 'PS_EXAMPLES', 'PS_CONSEQUENCE', 'PS_CAUSE', 'PS_LOCATION', 'PS_TIME', 'PS_CONDITION', 'PS_MANNER', 'SY_QUESTION', 'SY_NARRATIVE', 'SY_NEGATIVE_QUESTIONS', 'SY_SPECIAL_QUESTIONS', 'SY_TAG_QUESTIONS', 'SY_GENERAL_QUESTIONS', 'SY_EXCLAMATION', 'SY_IMPERATIVE', 'SY_SUBORD_SENT', 'SY_SU

In [None]:
# Integer-encode the document id and the 'model' / 'domain' labels.
# Each encoder is kept in its own variable so its classes_ can be inspected
# later; the encoded values are added to features_df as new columns.
id_encoder = LabelEncoder()
features_df['id_encoded'] = id_encoder.fit_transform(features_df['id_original'])

model_encoder = LabelEncoder()
features_df['model_encoded'] = model_encoder.fit_transform(features_df['model'])

domain_encoder = LabelEncoder()
features_df['domain_encoded'] = domain_encoder.fit_transform(features_df['domain'])

In [None]:
# Print the integer code assigned to each label by the fitted encoders
# (the code is the label's position in classes_).
print("Etiquetas codificadas:")

print("Modelos:")
for code, label in enumerate(model_encoder.classes_):
    print(f"{code}: {label}")
print()

print("Dominios:")
for code, label in enumerate(domain_encoder.classes_):
    print(f"{code}: {label}")

Etiquetas codificadas:
Modelos:
0: chatgpt
1: cohere
2: cohere-chat
3: gpt2
4: gpt3
5: gpt4
6: human
7: llama-chat
8: mistral
9: mistral-chat
10: mpt
11: mpt-chat

Dominios:
0: abstracts
1: books
2: news
3: poetry
4: reddit
5: reviews
6: wiki


In [None]:
# Show the full sentence text of 10 random rows from the 'poetry' domain.
poetry_df = features_df[features_df['domain'] == 'poetry']

# Lift the column-width and row limits only for this display. option_context
# restores the previous option values on exit, even if sample() or display()
# raises, so the notebook-wide display settings are never left modified.
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    display(
        poetry_df[['id_original', 'sentence_num', 'model', 'domain', 'text']]
        .sample(n=10, random_state=11)
    )

Unnamed: 0,id_original,sentence_num,model,domain,text
2782,1c7df1ce-03c0-4e51-8b78-453f3f3f065b,2,llama-chat,poetry,"He jokes and laughs, and makes us feel at ease, With him around, work is a breeze."
2375,fc475e41-678b-4334-a926-70206344ecea,0,llama-chat,poetry,"Once, I thought happiness was a destination, A place I could reach and settle in, A feeling that would stay and never fade, A constant state of being, a permanent shade."
7280,96915885-996d-407c-bab7-f5ccfdb669b7,0,llama-chat,poetry,"In the depths of a sunset's golden glow, Where the sky and earth meet, and the world slows, There lies a beauty, a truth profound, A moment that takes the breath, and grounds."
3241,f08102e8-63cb-40b1-bd7a-1b198565a2c0,4,human,poetry,He’s at work in the darkest night; for God is for you that guiding light.
184,1ef57226-f2b8-4346-8929-f516ac1d3feb,10,mpt,poetry,That it's like you get a feeling that's indescribable.
9695,8616fea2-d677-4dac-b964-e448018c0d64,1,mistral,poetry,It would be so dull!
809,cb9fbec9-b613-48a4-9a18-dc44cc374761,2,mpt-chat,poetry,"In this haven, the heart beats true, A symphony of love's sweet dew, The lips that once whispered sweet nothings, Now speak the language of love's offerings."
4993,fe958001-600c-4073-8f7e-c04381999db4,8,mistral,poetry,"Sis, it’s that for rightly it behoves you, That thou shouldst show me your hand."
7513,7c3e7f91-268e-445a-8c7c-e5e2e90d54bd,6,mpt-chat,poetry,Such quest demands more than fair visage graced By Cupid ’s arrow dipped within Venus ‘ embrace.
9801,f860be98-d71d-478c-9483-8a3fb2192c4f,0,llama-chat,poetry,Sure!


### Armado de DF

In [None]:
# Columns of the final training frame, in output order: the encoded document
# id, the sentence position, the textual 'model' / 'domain' labels, then the
# StyloMetrix feature columns.
# Variants previously kept as commented-out code:
#   * use 'model_encoded' / 'domain_encoded' in place of 'model' / 'domain'
#     (and rename them to 'model_label' / 'domain_label');
#   * a text-only frame: ['id_original', 'id_encoded', 'sentence_num',
#     'model_encoded', 'domain_encoded', 'text'].
trainCols = [
    'id_encoded', 'sentence_num', 'model', 'domain',
    # POS_*
    'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ',
    'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP', 'POS_PRO',
    # L_* (first group)
    'L_REF', 'L_HASHTAG', 'L_MENTION', 'L_RT', 'L_LINKS',
    'L_CONT_A', 'L_FUNC_A', 'L_CONT_T', 'L_FUNC_T',
    'L_PLURAL_NOUNS', 'L_SINGULAR_NOUNS', 'L_PROPER_NAME', 'L_PERSONAL_NAME',
    'L_NOUN_PHRASES',
    'L_PUNCT', 'L_PUNCT_DOT', 'L_PUNCT_COM', 'L_PUNCT_SEMC', 'L_PUNCT_COL',
    'L_PUNCT_DASH', 'L_POSSESSIVES',
    'L_ADJ_POSITIVE', 'L_ADJ_COMPARATIVE', 'L_ADJ_SUPERLATIVE',
    'L_ADV_POSITIVE', 'L_ADV_COMPARATIVE', 'L_ADV_SUPERLATIVE',
    # PS_*
    'PS_CONTRADICTION', 'PS_AGREEMENT', 'PS_EXAMPLES', 'PS_CONSEQUENCE',
    'PS_CAUSE', 'PS_LOCATION', 'PS_TIME', 'PS_CONDITION', 'PS_MANNER',
    # SY_*
    'SY_QUESTION', 'SY_NARRATIVE', 'SY_NEGATIVE_QUESTIONS',
    'SY_SPECIAL_QUESTIONS', 'SY_TAG_QUESTIONS', 'SY_GENERAL_QUESTIONS',
    'SY_EXCLAMATION', 'SY_IMPERATIVE',
    'SY_SUBORD_SENT', 'SY_SUBORD_SENT_PUNCT',
    'SY_COORD_SENT', 'SY_COORD_SENT_PUNCT',
    'SY_SIMPLE_SENT', 'SY_INVERSE_PATTERNS', 'SY_SIMILE', 'SY_FRONTING',
    'SY_IRRITATION', 'SY_INTENSIFIER', 'SY_QUOT',
    # VT_* (names are reproduced exactly as they appear in the frame,
    # including their original spellings)
    'VT_PRESENT_SIMPLE', 'VT_PRESENT_PROGRESSIVE', 'VT_PRESENT_PERFECT',
    'VT_PRESENT_PERFECT_PROGR', 'VT_PRESENT_SIMPLE_PASSIVE',
    'VT_PRESENT_PROGR_PASSIVE', 'VT_PRESENT_PERFECT_PASSIVE',
    'VT_PAST_SIMPLE', 'VT_PAST_SIMPLE_BE', 'VT_PAST_PROGR', 'VT_PAST_PERFECT',
    'VT_PAST_PERFECT_PROGR', 'VT_PAST_SIMPLE_PASSIVE', 'VT_PAST_POGR_PASSIVE',
    'VT_PAST_PERFECT_PASSIVE',
    'VT_FUTURE_SIMPLE', 'VT_FUTURE_PROGRESSIVE', 'VT_FUTURE_PERFECT',
    'VT_FUTURE_PERFECT_PROGR', 'VT_FUTURE_SIMPLE_PASSIVE',
    'VT_FUTURE_PROGR_PASSIVE', 'VT_FUTURE_PERFECT_PASSIVE',
    'VT_WOULD', 'VT_WOULD_PASSIVE', 'VT_WOULD_PROGRESSIVE', 'VT_WOULD_PERFECT',
    'VT_WOULD_PERFECT_PASSIVE',
    'VT_SHOULD', 'VT_SHOULD_PASSIVE', 'VT_SHALL', 'VT_SHALL_PASSIVE',
    'VT_SHOULD_PROGRESSIVE', 'VT_SHOULD_PERFECT', 'VT_SHOULD_PERFECT_PASSIVE',
    'VT_MUST', 'VT_MUST_PASSIVE', 'VT_MUST_PROGRESSIVE', 'VT_MUST_PERFECT',
    'VT_MST_PERFECT_PASSIVE',
    'VT_CAN', 'VT_CAN_PASSIVE', 'VT_COULD', 'VT_COULD_PASSIVE',
    'VT_CAN_PROGRESSIVE', 'VT_COULD_PROGRESSIVE', 'VT_COULD_PERFECT',
    'VT_COULD_PERFECT_PASSIVE',
    'VT_MAY', 'VT_MAY_PASSIVE', 'VT_MIGHT', 'VT_MIGHT_PASSIVE',
    'VT_MAY_PROGRESSIVE', 'VT_MIGTH_PERFECT', 'VT_MIGHT_PERFECT_PASSIVE',
    'VT_MAY_PERFECT_PASSIVE',
    # ST_*
    'ST_TYPE_TOKEN_RATIO_LEMMAS', 'ST_HERDAN_TTR', 'ST_MASS_TTR',
    'ST_SENT_WRDSPERSENT', 'ST_SENT_DIFFERENCE',
    'ST_REPETITIONS_WORDS', 'ST_REPETITIONS_SENT',
    'ST_SENT_D_VP', 'ST_SENT_D_NP', 'ST_SENT_D_PP', 'ST_SENT_D_ADJP',
    'ST_SENT_D_ADVP',
    # L_*_PRON
    'L_I_PRON', 'L_HE_PRON', 'L_SHE_PRON', 'L_IT_PRON', 'L_YOU_PRON',
    'L_WE_PRON', 'L_THEY_PRON', 'L_ME_PRON', 'L_YOU_OBJ_PRON', 'L_HIM_PRON',
    'L_HER_OBJECT_PRON', 'L_IT_OBJECT_PRON', 'L_US_PRON', 'L_THEM_PRON',
    'L_MY_PRON', 'L_YOUR_PRON', 'L_HIS_PRON', 'L_HER_PRON', 'L_ITS_PRON',
    'L_OUR_PRON', 'L_THEIR_PRON', 'L_YOURS_PRON', 'L_THEIRS_PRON',
    'L_HERS_PRON', 'L_OURS_PRON',
    'L_MYSELF_PRON', 'L_YOURSELF_PRON', 'L_HIMSELF_PRON', 'L_HERSELF_PRON',
    'L_ITSELF_PRON', 'L_OURSELVES_PRON', 'L_YOURSELVES_PRON',
    'L_THEMSELVES_PRON',
    'L_FIRST_PERSON_SING_PRON', 'L_SECOND_PERSON_PRON',
    'L_THIRD_PERSON_SING_PRON', 'L_THIRD_PERSON_PLURAL_PRON',
    # VF_* / G_*
    'VF_INFINITIVE',
    'G_PASSIVE', 'G_ACTIVE', 'G_PRESENT', 'G_PAST', 'G_FUTURE',
    'G_MODALS_SIMPLE', 'G_MODALS_CONT', 'G_MODALS_PERFECT',
    # Remaining short-named columns
    'AN', 'DDP', 'SVP', 'CDS', 'DDF', 'IS', 'PS', 'RE', 'ASF', 'ASM', 'OM',
    'RCI', 'DMC', 'OR', 'QAS', 'PA', 'PR',
]

# Build the final frame in one chain: select the columns, expose the encoded
# id as 'id', and order rows by document and then by sentence position.
train_df = (
    features_df
    .loc[:, trainCols]
    .rename(columns={'id_encoded': 'id'})
    .sort_values(by=['id', 'sentence_num'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the assembled training frame
# (alternative: display(train_df.sample(5, random_state=42)) for a random preview).
display(train_df.head())

Unnamed: 0,id,sentence_num,model,domain,POS_VERB,POS_NOUN,POS_ADJ,POS_ADV,POS_DET,POS_INTJ,...,RE,ASF,ASM,OM,RCI,DMC,OR,QAS,PA,PR
0,0,0,mistral,books,0.121212,0.212121,0.060606,0.090909,0.151515,0.0,...,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.121212,0.0,0.0
1,0,1,mistral,books,0.24,0.16,0.12,0.04,0.2,0.0,...,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.16,0.0,0.04
2,0,2,mistral,books,0.222222,0.111111,0.111111,0.0,0.222222,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111
3,0,3,mistral,books,0.285714,0.190476,0.047619,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,0.0
4,0,4,mistral,books,0.25,0.0625,0.0625,0.0,0.1875,0.0,...,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.125,0.0,0.0


In [None]:
# Count the distinct document ids per model, i.e. how many sampled documents
# each model contributes (rows are sentences, so a plain row count would not do).
unique_ids_per_model = train_df.groupby('model')['id'].nunique()
print("Conteo de IDs únicos por modelo:")
print(unique_ids_per_model)
print("Total IDs únicos:", train_df['id'].nunique())

Conteo de IDs únicos por modelo:
model
chatgpt          52
cohere           52
cohere-chat      55
gpt2            116
gpt3             59
gpt4             64
human            26
llama-chat      105
mistral         118
mistral-chat    127
mpt             120
mpt-chat        106
Name: id, dtype: int64
Total IDs únicos: 1000


In [None]:
# Save the final DataFrame to a CSV file in the current working directory
# (the row index is not written).
# NOTE(review): train_df already appears to be a DataFrame, so the
# pd.DataFrame(...) wrap looks redundant — confirm nothing depends on pdtrainDF
# being a separate object before simplifying.
pdtrainDF = pd.DataFrame(train_df)
pdtrainDF.to_csv('train_df.csv', index=False)