# Desarrollo de DF único - Segunda versión de limpieza

## Preprocesamiento

In [None]:
import stylo_metrix as sm
import pandas as pd
from raid.utils import load_data

from processer import split_text_into_sentences
from sklearn.preprocessing import LabelEncoder

In [2]:
# Fetch the RAID training split, leaving out the adversarially attacked samples
or_train_noadv_df = load_data(include_adversarial=False, split="train")

In [3]:
# Report the (rows, columns) size of the freshly loaded training split
print("Original training data shape:", or_train_noadv_df.shape)

Original training data shape: (467985, 11)


In [33]:
# Number of distinct document ids contributed by each generator in the raw split
unique_ids_per_model = or_train_noadv_df['id'].groupby(or_train_noadv_df['model']).nunique()

print("Conteo de IDs únicos por modelo:", unique_ids_per_model, sep="\n")
print(f"Total IDs únicos: {or_train_noadv_df['id'].nunique()}")

Conteo de IDs únicos por modelo:
model
chatgpt         26742
cohere          26742
cohere-chat     26742
gpt2            53484
gpt3            26742
gpt4            26742
human           13371
llama-chat      53484
mistral         53484
mistral-chat    53484
mpt             53484
mpt-chat        53484
Name: id, dtype: int64
Total IDs únicos: 467985


In [4]:
# Columns kept for the rest of the notebook
intCols = ['id','model', 'domain', 'title', 'prompt', 'generation']

# Independent copy restricted to those columns; the loaded frame is left untouched
train_noadv_df = or_train_noadv_df[intCols].copy()

## Versión 1 - Fusión original - 50% documentos humanos

In [29]:
# Optional domain filter (disabled). An earlier draft joined two `!=` tests
# with `|`, which is true for every row and therefore filters nothing; the
# working form excludes both domains with `isin`:
# filtered_by_domain = train_noadv_df[~train_noadv_df['domain'].isin(['recipes', 'poetry'])]

# Split human-written documents from machine-generated ones
human_docs = train_noadv_df[train_noadv_df['model'] == 'human']
ai_docs = train_noadv_df[train_noadv_df['model'] != 'human']

total_samples = 26743
# Half of the budget per class, capped by what each class can supply:
# DataFrame.sample(n=...) without replacement raises ValueError when n exceeds
# the number of rows available.
samples_per_class = min(total_samples // 2, len(human_docs), len(ai_docs))

# Draw the same number of documents from each class (fixed seed for reproducibility)
sample_cols = ['id', 'model', 'domain', 'generation']
human_sample = human_docs[sample_cols].sample(n=samples_per_class, random_state=50)
ai_sample = ai_docs[sample_cols].sample(n=samples_per_class, random_state=50)

# Stack both classes, then shuffle so they are interleaved
generation_sample = pd.concat([human_sample, ai_sample], ignore_index=True)
generation_sample = generation_sample.sample(frac=1, random_state=50).reset_index(drop=True)

# Class balance report
n_total = len(generation_sample)
n_human = (generation_sample['model'] == 'human').sum()
n_ai = (generation_sample['model'] != 'human').sum()

print(f"Total muestras: {n_total}")
print("Distribución por clase:")
print(generation_sample['model'].value_counts())
print(f"\nHumanos: {n_human} ({n_human / n_total * 100:.1f}%)")
print(f"IA: {n_ai} ({n_ai / n_total * 100:.1f}%)")

Total muestras: 26742
Distribución por clase:
model
human           13371
mistral          1626
mpt              1582
gpt2             1561
mistral-chat     1551
llama-chat       1550
mpt-chat         1534
chatgpt           854
gpt4              795
gpt3              780
cohere            769
cohere-chat       769
Name: count, dtype: int64

Humanos: 13371 (50.0%)
IA: 13371 (50.0%)


In [30]:
# Show the shuffled, class-balanced sample of documents
display(generation_sample)

Unnamed: 0,id,model,domain,generation
0,3b167591-8c07-4331-bcf1-cee6e2aecfe1,cohere,poetry,If tomorrow doesn’t come please remember\nThe...
1,5aee859c-8bf3-46ae-8617-3d700df9853f,human,books,Family of former fellow student and rival to ...
2,789f6e8b-d8d9-4adc-bf31-23c0621af322,llama-chat,books,When her fiancé dumps her just days before the...
3,cecb1eef-6024-4730-be1f-c557560d3495,human,poetry,"I’m sitting in class,\nMy eyes about to close,..."
4,ee326d72-692c-4a0c-a0f1-50e32dd16ec0,chatgpt,reviews,"""The Italian Job"" is a thrilling heist film th..."
...,...,...,...,...
26737,b2315c88-85be-4307-97c5-c0aaa4c56ff4,mistral-chat,recipes,Palatine Bacon and Onion Cake Recipe\n\nIngred...
26738,31870ab5-a091-455a-8683-7d8ce1828e0a,chatgpt,news,Former British Prime Minister Tony Blair has r...
26739,39837d44-f705-4543-9403-4dda9dc87c68,human,reviews,What can I say... This was by far the most ama...
26740,dcf6cf2f-291f-43d7-929c-25c1e1b1bc93,human,wiki,"""Papi"" is a song recorded by American singer J..."


In [31]:
def extract_features_from_dataset(df_original, sample_size=None):
    """
    Compute sentence-level StyloMetrix features for every document in a frame.

    Each document's ``generation`` text is split with
    ``split_text_into_sentences`` and the resulting sentences are passed to
    ``StyloMetrix.transform``; the per-sentence rows are then tagged with the
    metadata of the document they came from.

    Args:
        df_original: DataFrame with at least the columns ``id``, ``model``,
            ``domain`` and ``generation``.
        sample_size: If truthy, only this many randomly chosen documents
            (fixed ``random_state=42``) are processed. ``None`` (or ``0``)
            processes every row.

    Returns:
        DataFrame with one row per sentence. The leading columns are
        ``id_original``, ``model``, ``domain`` and ``sentence_num`` (0-based
        position of the sentence inside its document), followed by whatever
        ``StyloMetrix.transform`` returns (presumably a ``text`` column plus
        the metric columns — confirm against the installed StyloMetrix
        version). If no document yields any sentence, an empty frame with the
        metadata columns and ``text`` is returned.
    """
    if sample_size:
        df_original = df_original.sample(n=sample_size, random_state=42)

    # debug=False so StyloMetrix does not write debug files
    stylo = sm.StyloMetrix('en', debug=False)

    per_document = []

    for _, row in df_original.iterrows():
        # Sentence splitting happens in memory
        sentences = split_text_into_sentences(row['generation'])
        if not sentences:
            # Nothing to analyse for this document; it contributes no rows
            continue

        # One feature row per sentence of the document
        doc_features = stylo.transform(sentences)

        # Prepend the metadata of the source document
        doc_features.insert(0, 'id_original', row['id'])
        doc_features.insert(1, 'model', row['model'])
        doc_features.insert(2, 'domain', row['domain'])
        doc_features.insert(3, 'sentence_num', range(len(sentences)))

        per_document.append(doc_features)

    if not per_document:
        # pd.concat raises ValueError on an empty list; return an empty frame instead
        return pd.DataFrame(columns=['id_original', 'model', 'domain', 'sentence_num', 'text'])

    return pd.concat(per_document, ignore_index=True)

In [32]:
# Run sentence-level feature extraction over every document in generation_sample.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, and the
# cells below report 1000 documents rather than the 26742 sampled above, so
# features_df presumably comes from an earlier, smaller run — confirm and re-run
# before trusting the downstream outputs.
features_df = extract_features_from_dataset(generation_sample)



KeyboardInterrupt: 

In [10]:
# Sanity check of the extracted features: dimensions, then the first five rows
print("Shape:", features_df.shape)
display(features_df.head())

Shape: (12168, 201)


Unnamed: 0,id_original,model,domain,sentence_num,text,POS_VERB,POS_NOUN,POS_ADJ,POS_ADV,POS_DET,...,RE,ASF,ASM,OM,RCI,DMC,OR,QAS,PA,PR
0,3ef6e59b-f260-40eb-954d-9a8afb720bd3,human,abstracts,0,Image-to-image translation is a long-establish...,0.166667,0.5,0.083333,0.083333,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3ef6e59b-f260-40eb-954d-9a8afb720bd3,human,abstracts,1,In this paper we propose an adversarial based ...,0.166667,0.5,0.0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0
2,3ef6e59b-f260-40eb-954d-9a8afb720bd3,human,abstracts,2,The regular deep neural-network based methods ...,0.208333,0.5,0.125,0.0,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.083333,0.0,0.0
3,3ef6e59b-f260-40eb-954d-9a8afb720bd3,human,abstracts,3,Our generative adversarial network based model...,0.166667,0.333333,0.25,0.0,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0
4,3ef6e59b-f260-40eb-954d-9a8afb720bd3,human,abstracts,4,This approach makes the image translation inde...,0.0625,0.375,0.1875,0.0,0.1875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0


## Versión 2 - Tags codificados

### Codificación de etiquetas ('id'; 'model' y 'domain' se mantienen como texto)

In [11]:
# Overview of the raw RAID frame: size, schema and label vocabularies
print("Información del dataset original:")
print("Forma del dataset:", or_train_noadv_df.shape)
print("Columnas:", list(or_train_noadv_df.columns))
print("Modelos unicos:", or_train_noadv_df['model'].unique())
print("Dominios unicos:", or_train_noadv_df['domain'].unique())

Información del dataset original:
Forma del dataset: (467985, 11)
Columnas: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation']
Modelos unicos: ['human' 'llama-chat' 'mpt' 'mpt-chat' 'gpt2' 'mistral' 'mistral-chat'
 'gpt3' 'cohere' 'chatgpt' 'gpt4' 'cohere-chat']
Dominios unicos: ['abstracts' 'books' 'news' 'poetry' 'recipes' 'reddit' 'reviews' 'wiki']


In [12]:
# Same overview for the sentence-level feature frame
print("Información del dataset generado:")
print("Forma del dataset:", features_df.shape)
print("Columnas:", list(features_df.columns))
print("Modelos unicos:", features_df['model'].unique())
print("Dominios unicos:", features_df['domain'].unique())

Información del dataset generado:
Forma del dataset: (12168, 201)
Columnas: ['id_original', 'model', 'domain', 'sentence_num', 'text', 'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP', 'POS_PRO', 'L_REF', 'L_HASHTAG', 'L_MENTION', 'L_RT', 'L_LINKS', 'L_CONT_A', 'L_FUNC_A', 'L_CONT_T', 'L_FUNC_T', 'L_PLURAL_NOUNS', 'L_SINGULAR_NOUNS', 'L_PROPER_NAME', 'L_PERSONAL_NAME', 'L_NOUN_PHRASES', 'L_PUNCT', 'L_PUNCT_DOT', 'L_PUNCT_COM', 'L_PUNCT_SEMC', 'L_PUNCT_COL', 'L_PUNCT_DASH', 'L_POSSESSIVES', 'L_ADJ_POSITIVE', 'L_ADJ_COMPARATIVE', 'L_ADJ_SUPERLATIVE', 'L_ADV_POSITIVE', 'L_ADV_COMPARATIVE', 'L_ADV_SUPERLATIVE', 'PS_CONTRADICTION', 'PS_AGREEMENT', 'PS_EXAMPLES', 'PS_CONSEQUENCE', 'PS_CAUSE', 'PS_LOCATION', 'PS_TIME', 'PS_CONDITION', 'PS_MANNER', 'SY_QUESTION', 'SY_NARRATIVE', 'SY_NEGATIVE_QUESTIONS', 'SY_SPECIAL_QUESTIONS', 'SY_TAG_QUESTIONS', 'SY_GENERAL_QUESTIONS', 'SY_EXCLAMATION', 'SY_IMPERATIVE', 'SY_SUBORD_SENT', 'SY_SU

In [None]:
# Replace each document's string id with an integer code (stored as 'id_encoded')
id_encoder = LabelEncoder()
document_ids = features_df['id_original']
features_df['id_encoded'] = id_encoder.fit_transform(document_ids)

In [24]:
# Show the untruncated text of five randomly chosen sentences from the poetry domain.
poetry_df = features_df[features_df['domain'] == 'poetry']

# option_context lifts the column-width and row limits for this display only and
# restores the previous settings on exit, even if the cell fails part-way
# (set_option/reset_option would leave them changed after an error and would
# discard any values configured earlier in the session).
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    display(poetry_df[['id_original', 'sentence_num', 'model', 'domain', 'text']].sample(n=5, random_state=11))

Unnamed: 0,id_original,sentence_num,model,domain,text
5357,c8b310ff-8203-4f90-9bf6-5ac6c82b1819,3,human,poetry,"A dim desire, of pleasant places, And lush fields in the summer sun, And logs aflame, and walls, and faces, -- And wine, and old ambrosial talk, A golden ball in fountains dancing, And unforgotten hands. (Ah, God, I trod them down where I have trod, And they remain, and they remain, Etched in unutterable pain, Loved lips and faces now apart, That once were closer than my heart -- In agony, in agony, And horribly a part of me. . . ."
6148,d982e029-6184-489b-9251-37ad6ccd1a85,8,mpt,poetry,"This is the best interpretation of nature that man will ever find, And this is nature as it is for man had not yet set foot on nature's ground."
2619,ba673ef7-a429-4bfb-9d50-8486653e5a34,344,human,poetry,"Here, where the walls go down beneath our picks, These walls whose windows gap against the sky, Atom by atom of flesh and brain and marble Will build a glittering tower before we die . . ."
10364,2fb69dc2-6490-4ab2-a13a-d187b27efb95,8,gpt2,poetry,"So, I decided to start writing down some things that trigger me."
9844,23ca752c-298d-44d1-b933-a0f16690619f,3,human,poetry,We are the Little Folk--we!


#### Test: Visualización

In [15]:
# Slim frame for reading sentences in document order:
# document code, sentence position, labels and raw text
visCols = ['id_encoded', 'sentence_num', 'model', 'text', 'domain']

vis_df = (
    features_df[visCols]
    # only the id is renamed; 'model' and 'domain' have no encoded counterpart here
    .rename(columns={'id_encoded': 'id'})
    .sort_values(by=['id', 'sentence_num'])
    .reset_index(drop=True)
)
display(vis_df.head(10))

Unnamed: 0,id,sentence_num,model,text,domain
0,0,0,human,This of course is not a great movie but I just...,reviews
1,0,1,human,I have never laughed more in my life.,reviews
2,0,2,human,Even the jokes that aren't funny are funny.,reviews
3,0,3,human,This film's comedic build up is about the best...,reviews
4,0,4,human,"It doesn't even matter what the punch line is,...",reviews
5,0,5,human,Anytime you have Randy Quaid in a film you jus...,reviews
6,0,6,human,Kingpin couldn't have shown this any better as...,reviews
7,0,7,human,Woody Harrelson plays a has-been ex-profession...,reviews
8,0,8,human,Together they go on a ride across the country ...,reviews
9,0,9,human,Along the journey they are great supporting co...,reviews


In [36]:
# Find documents that ended up as a single sentence, i.e. whose only row has
# sentence_num == 0.

# List of sentence numbers for every document id
oraciones_por_id = vis_df.groupby('id')['sentence_num'].apply(lambda x: x.tolist())

# Keep the ids whose list is exactly [0]
ids_una_oracion = oraciones_por_id[oraciones_por_id.apply(lambda x: x == [0])].index

total_docs = vis_df['id'].nunique()
# Guard against an empty frame so the report does not divide by zero
porcentaje = len(ids_una_oracion) / total_docs * 100 if total_docs else 0.0

print(f"Número de documentos con solo una oración: {len(ids_una_oracion)}")
print(f"Total de documentos: {total_docs}")
print(f"Porcentaje: {porcentaje:.2f}%")

# Show the single-sentence documents with their full text.
# option_context lifts the display limits for this call only and restores the
# previous settings afterwards, even if the display fails.
df_una_oracion = vis_df[vis_df['id'].isin(ids_una_oracion)]
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    display(df_una_oracion)

Número de documentos con solo una oración: 36
Total de documentos: 1000
Porcentaje: 3.60%


Unnamed: 0,id,sentence_num,model,text,domain
615,49,0,human,"i have a nostril piercing i got done in late august which got infected because i was allergic to the nickel in the ring. i then got a hypoallergenic titanium but started to notice the same sort of issue and figured i just didn’t do well with metal, so i’ve tried using a plastic bioflex ring, but the same thing started happening again when i started using saltwater with my new septum piercing. what happens is that a ball of pus forms on the inside of my nose around the piercing, and it gets quite uncomfortable, but it doesn’t seem to be rejecting or anything. the place i got it pierced said to use antibacterial soap on it, but it doesn’t really seem to be helping. what should i do? should i switch back to the hypoallergenic ring and use a salt soak for both of them? i’m unsure of what to do at this point and i really don’t want to have to get rid of it.",reddit
688,59,0,mpt,On the beach at night alone I can hear the waves rolling in The moon is bright and the stars are shining I can see the lights of the city I can hear the seagulls crying I can feel the sand beneath my feet I can smell the salt in the air I can taste the ocean on my lips I can feel the wind in my hair I can hear the waves crashing on the shore I can feel the sand between my toes I can see the moon reflecting on the water I can hear the seagulls calling I can feel the wind in my hair I can smell the salt in the air I can taste the ocean on my lips I can feel the sand beneath my feet I can hear the waves rolling in I can see the lights of the city I can hear the seagulls crying I can feel the sand beneath my feet I can smell the salt in the air I can taste the ocean on my lips I can feel the wind in my hair I can hear the waves crashing on the shore I can feel the sand between my toes I can see the moon reflecting on the water I can hear the seagulls calling I can feel the wind in my hair I can smell the salt in the air I can taste the ocean on my lips I can feel the wind in my hair I can hear the waves crashing on the shore I can feel the sand between my toes I can see the moon reflecting on the water I can hear the seagulls calling I can feel the wind in my hair I can smell the salt in the air I can taste the ocean on my lips I can feel the wind in my hair I can hear the waves crashing on the shore I can feel the sand between my toes I can see the moon reflecting on the water I can hear the seagulls calling I can feel the wind in my hair I can smell the salt in the air I can taste the ocean on my lips I can feel the wind in my hair,poetry
976,87,0,human,"Yes, we get blasé and dismissive on this site – reading in the reams of Poemhunter posts ‘yet another poem of teenage angst’, we say, scanning the raw wounds too rough for poetry – but what else will ease the pain? ‘I thought we loved each other for ever – how could you do this to me? …’ we nod our older heads, remembering just slightly, as if it doesn’t really matter now, how it was for us… but then, who writes of happiness, when happiness seems eternal, hope untested, and life is to be lived? that first heartbreak … it seems there has to be a first one; a worst one; and only poetry can begin to salve the wound.. at least you’ve got your poetry.. yes, reams of poems, and all much the same – but this time, it’s from someone that I know, and that – that really hurts.. it’s almost worse than if it happened to me myself – in that there’s nothing I can say, that’s not been said a million times.. and a million times, she's said ‘You just don’t understand…’ and of course, she’s right – it’s the first time in the world, her world, that this has ever happened… I’ll say nothing more than what I’ve said; than what your parents and your sisters say; for, how can time heal, when there’s no time for time? (and how could I dare to voice to you the thought – your poetry will deepen when your heartbreak heals? ...)",poetry
1156,107,0,human,"Ladies and Gentlemen, this is my first poetry reading and I’m very nervous and nearly cancelled tonight but I spoke about this to a friend who’s a therapist and he said, this is what you do, you imagine the audience in front of you all in their underwear (as you probably are; but without the outer layer) with the men wearing the most idiotic flowered Hawaiian two sizes too large boxer shorts and the ladies in the frumpiest underwear you can imagine apparently it works so if I look at you, ladies and gentlemen, somewhat strangely from time to time or giggle uncontrollably or my eyes rest on you personally as if we’re sharing some huge x-ray joke, please forgive me and understand that’s it for your benefit as well as mine and that you will have put your outerwear back on in the interval before the next poem I hope this is OK with you and nobody’s too embarrassed in fact you could do the same trick with me if it’s any help and now I think I’ve said enough as I’m OK now how about you? (For maximum effect, pause noticeably at the end of each line) (with thanks to Ernestine for the idea)",poetry
1412,126,0,gpt2,A Token Of Sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy A token of sympathy,poetry
1413,127,0,mpt-chat,"The sun wields mercy, as it rises high In a world that's often grey and sky It pierces through the clouds with warmth and light A beacon that dispels the veil of night With gentle rays it caresses the earth And in its touch, it whispers a word Of comfort that removes fear and dearth And brings a sense of hope where once was a void It shines on the just as well as the unjust For in its eyes, there is no such distinction It spreads its light far and wide To all in need of its benediction Oh, how we forget to thank the sun For all it does each day until undone But it keeps shining with all its might A true symbol of the divine right So let us cherish this gift of the sun For in its light, hope is forever begun",poetry
2682,232,0,mpt,"Plot The story starts off with young John Fowles (then known as Jon) at Cambridge University, where he had studied English in 1939 and 1940 before enlisting his local infantry regiment - initially to take up arms against Germany on France's Maginot Line but instead posted back home due not only Britain being outflanked into direct conflict by both Hitler invading Poland alongside Mussolini ordering an invasion across Egypt priorly ordered all around that country which then resulted after this last sentence was written along side another one above it saying something about how many months ago since its release there were 3 or 4 movies filmed each week over several years during WW2 plus also noting down other facts like age 15th birthday parties taking place when asked if I wanted chocolate cake so decided go ahead purchase two books including Rough Copy even though originally thought would buy just 1 copy myself anyways hopefully having fun meeting new people while trying think who may turn better friend than others especially given past experience don't care should be true today too sometimes happens either way good news bad luck none seems worse worst however important realize certain things matter more often could tell whether positive negative depending point view always thinking differently regardless what else going perhaps never fully understand some truths unless personal experiences cause feelings change mind maybe right wrong opinions isn need explain anything no answer everything anyway most certainly doesn exist perfect world although know few folks feel themselves top dogs do make attempts seem pretty self satisfied nevertheless enjoy life best we can despite challenges present reality bring forward hope future might get simpler ways well least attempt keep spirits high strive improve quality lives ours try maintain sanity knowing won lose war ourselves fight daily basis because will happen must live within own capabilities stay strong 
through hard times love family friends still here tomorrow willing put away troubles until later day arrive cannot forget sadness sorrow suffering felt loss loved ones yet reason hold firm believe great days come again soon enough time allow us look brighter aspects our shared humanity rest assured happy moments fill these darkest hours sure find yourself laughing once tears dried smile earliest morn dawn break silence longest nights blackness fall light heartache happiness awaits bright sides continue moving forwards together peacefully united someday freedom found equal liberty justice prosperity among humankind brotherhood sister hood free shall give birth children promise renewed purpose boundless possibilities unknown knowledge endless endeavors share beauty wonder surrounding created divine source universe eternal soul God Father heaven Mother earth child spirit human potential nature itself let stand unified brothers sisters forevermore amen So far end paragraph comes below please",books
3252,285,0,human,"whenever she seems off or upset i ask her what’s wrong and 99% of the time she changes the subject or denies that anything is wrong. i assure her that i’m here for her and she gets annoyed or mad at me for asking and trying to help. every time i tell her we can talk about it and she says “no”. and now one time, i was busy. i wasn’t there for her. and i’m really upset at myself for that. because she needed me. at first when i tried talking to her when i wasn’t busy she said “it doesn’t even matter” but i could tell she was upset. i tried to get her to talk about it or to at least assure her that i do care about her and that im sorry. and she’s like “you’re only here for me when you want to be.” like literally wtf i try to ask her what’s wrong all the time and she refuses to tell me anything. and now she’s saying “fuck you” and “i hate you” and everything and like wtf. this what i get for opening up and falling in love with someone. fucking hate it",reddit
3563,314,0,llama-chat,"The sun shines bright on wooden benches Where couples sit, hands intertwined Children chase each other 'round the fountain Their laughter echoes, nevermind A gentle breeze rustles leaves above As ducks swim in the pond's embrace A few birds sing their sweetest love songs Filling the air with melodic grace In June at Woodruff, all is well The gardens bloom, colors tell Of nature's beauty, pure and true A peaceful haven for me and you The scent of roses fills the air As we stroll hand in hand, without care The world outside may be unkind But here, love and joy entwine",poetry
3714,326,0,human,"Mutlu olamıyorum bir türlü amk dünyasında yaşamak o kadar anlamsız ki hadi diyorum madem yaşamak bu kadar anlamsız intihar edeyim ölmek de çok anlamsız geliyor keşke hiç var olmasaydım diyorum,bir türlü yaşadığımı hissedemiyorum belki alkol kullansak yaşadığımı hissederim diyorum aile baskısı pahalı olması gibi nedenlerden kullanamıyorum, uyuşturucu desen şeriatı teklif edebilirsin ama uyuşturucuyu teklif edemezsin,sex desen siktiğimin toplumunun baskısı kızların burnun fazla kalkık olması olmuyor,ulan bir kızı sevdim biraz mutlu olurum dedim o da bakmadı,ders çalışayım desem kafamı toparlayamıyorum.Galiba en son liseyi kazanınca adamakıllı sevinmiştim.Ne mutlu olabiliyorum ne de yaşadığımı hissedebiliyorum 2 seçeneğim var ya intihar ya da Avrupa'da şansımı denicem.Avrupada da tutunabileceğimi düşünmüyorum belki ailem dışında birileri beni sevse mutlu olabilirim o da imkansız geliyor kafanızı siktim ama gerçekten canım baya sıkkın",reddit


### Armado de DF

In [32]:
# Ordenar DF final
train_df = features_df.copy()

trainCols = ['id_encoded', 'sentence_num', 'model', 'domain', 'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP', 'POS_PRO', 'L_REF', 'L_HASHTAG', 'L_MENTION', 'L_RT', 'L_LINKS', 'L_CONT_A', 'L_FUNC_A', 'L_CONT_T', 'L_FUNC_T', 'L_PLURAL_NOUNS', 'L_SINGULAR_NOUNS', 'L_PROPER_NAME', 'L_PERSONAL_NAME', 'L_NOUN_PHRASES', 'L_PUNCT', 'L_PUNCT_DOT', 'L_PUNCT_COM', 'L_PUNCT_SEMC', 'L_PUNCT_COL', 'L_PUNCT_DASH', 'L_POSSESSIVES', 'L_ADJ_POSITIVE', 'L_ADJ_COMPARATIVE', 'L_ADJ_SUPERLATIVE', 'L_ADV_POSITIVE', 'L_ADV_COMPARATIVE', 'L_ADV_SUPERLATIVE', 'PS_CONTRADICTION', 'PS_AGREEMENT', 'PS_EXAMPLES', 'PS_CONSEQUENCE', 'PS_CAUSE', 'PS_LOCATION', 'PS_TIME', 'PS_CONDITION', 'PS_MANNER', 'SY_QUESTION', 'SY_NARRATIVE', 'SY_NEGATIVE_QUESTIONS', 'SY_SPECIAL_QUESTIONS', 'SY_TAG_QUESTIONS', 'SY_GENERAL_QUESTIONS', 'SY_EXCLAMATION', 'SY_IMPERATIVE', 'SY_SUBORD_SENT', 'SY_SUBORD_SENT_PUNCT', 'SY_COORD_SENT', 'SY_COORD_SENT_PUNCT', 'SY_SIMPLE_SENT', 'SY_INVERSE_PATTERNS', 'SY_SIMILE', 'SY_FRONTING', 'SY_IRRITATION', 'SY_INTENSIFIER', 'SY_QUOT', 'VT_PRESENT_SIMPLE', 'VT_PRESENT_PROGRESSIVE', 'VT_PRESENT_PERFECT', 'VT_PRESENT_PERFECT_PROGR', 'VT_PRESENT_SIMPLE_PASSIVE', 'VT_PRESENT_PROGR_PASSIVE', 'VT_PRESENT_PERFECT_PASSIVE', 'VT_PAST_SIMPLE', 'VT_PAST_SIMPLE_BE', 'VT_PAST_PROGR', 'VT_PAST_PERFECT', 'VT_PAST_PERFECT_PROGR', 'VT_PAST_SIMPLE_PASSIVE', 'VT_PAST_POGR_PASSIVE', 'VT_PAST_PERFECT_PASSIVE', 'VT_FUTURE_SIMPLE', 'VT_FUTURE_PROGRESSIVE', 'VT_FUTURE_PERFECT', 'VT_FUTURE_PERFECT_PROGR', 'VT_FUTURE_SIMPLE_PASSIVE', 'VT_FUTURE_PROGR_PASSIVE', 'VT_FUTURE_PERFECT_PASSIVE', 'VT_WOULD', 'VT_WOULD_PASSIVE', 'VT_WOULD_PROGRESSIVE', 'VT_WOULD_PERFECT', 'VT_WOULD_PERFECT_PASSIVE', 'VT_SHOULD', 'VT_SHOULD_PASSIVE', 'VT_SHALL', 'VT_SHALL_PASSIVE', 'VT_SHOULD_PROGRESSIVE', 'VT_SHOULD_PERFECT', 'VT_SHOULD_PERFECT_PASSIVE', 'VT_MUST', 'VT_MUST_PASSIVE', 'VT_MUST_PROGRESSIVE', 'VT_MUST_PERFECT', 'VT_MST_PERFECT_PASSIVE', 'VT_CAN', 
'VT_CAN_PASSIVE', 'VT_COULD', 'VT_COULD_PASSIVE', 'VT_CAN_PROGRESSIVE', 'VT_COULD_PROGRESSIVE', 'VT_COULD_PERFECT', 'VT_COULD_PERFECT_PASSIVE', 'VT_MAY', 'VT_MAY_PASSIVE', 'VT_MIGHT', 'VT_MIGHT_PASSIVE', 'VT_MAY_PROGRESSIVE', 'VT_MIGTH_PERFECT', 'VT_MIGHT_PERFECT_PASSIVE', 'VT_MAY_PERFECT_PASSIVE', 'ST_TYPE_TOKEN_RATIO_LEMMAS', 'ST_HERDAN_TTR', 'ST_MASS_TTR', 'ST_SENT_WRDSPERSENT', 'ST_SENT_DIFFERENCE', 'ST_REPETITIONS_WORDS', 'ST_REPETITIONS_SENT', 'ST_SENT_D_VP', 'ST_SENT_D_NP', 'ST_SENT_D_PP', 'ST_SENT_D_ADJP', 'ST_SENT_D_ADVP', 'L_I_PRON', 'L_HE_PRON', 'L_SHE_PRON', 'L_IT_PRON', 'L_YOU_PRON', 'L_WE_PRON', 'L_THEY_PRON', 'L_ME_PRON', 'L_YOU_OBJ_PRON', 'L_HIM_PRON', 'L_HER_OBJECT_PRON', 'L_IT_OBJECT_PRON', 'L_US_PRON', 'L_THEM_PRON', 'L_MY_PRON', 'L_YOUR_PRON', 'L_HIS_PRON', 'L_HER_PRON', 'L_ITS_PRON', 'L_OUR_PRON', 'L_THEIR_PRON', 'L_YOURS_PRON', 'L_THEIRS_PRON', 'L_HERS_PRON', 'L_OURS_PRON', 'L_MYSELF_PRON', 'L_YOURSELF_PRON', 'L_HIMSELF_PRON', 'L_HERSELF_PRON', 'L_ITSELF_PRON', 'L_OURSELVES_PRON', 'L_YOURSELVES_PRON', 'L_THEMSELVES_PRON', 'L_FIRST_PERSON_SING_PRON', 'L_SECOND_PERSON_PRON', 'L_THIRD_PERSON_SING_PRON', 'L_THIRD_PERSON_PLURAL_PRON', 'VF_INFINITIVE', 'G_PASSIVE', 'G_ACTIVE', 'G_PRESENT', 'G_PAST', 'G_FUTURE', 'G_MODALS_SIMPLE', 'G_MODALS_CONT', 'G_MODALS_PERFECT', 'AN', 'DDP', 'SVP', 'CDS', 'DDF', 'IS', 'PS', 'RE', 'ASF', 'ASM', 'OM', 'RCI', 'DMC', 'OR', 'QAS', 'PA', 'PR']

# trainCols = ['id', 'sentence_num', 'model', 'domain', 'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP', 'POS_PRO', 'L_REF', 'L_HASHTAG', 'L_MENTION', 'L_RT', 'L_LINKS', 'L_CONT_A', 'L_FUNC_A', 'L_CONT_T', 'L_FUNC_T', 'L_PLURAL_NOUNS', 'L_SINGULAR_NOUNS', 'L_PROPER_NAME', 'L_PERSONAL_NAME', 'L_NOUN_PHRASES', 'L_PUNCT', 'L_PUNCT_DOT', 'L_PUNCT_COM', 'L_PUNCT_SEMC', 'L_PUNCT_COL', 'L_PUNCT_DASH', 'L_POSSESSIVES', 'L_ADJ_POSITIVE', 'L_ADJ_COMPARATIVE', 'L_ADJ_SUPERLATIVE', 'L_ADV_POSITIVE', 'L_ADV_COMPARATIVE', 'L_ADV_SUPERLATIVE', 'PS_CONTRADICTION', 'PS_AGREEMENT', 'PS_EXAMPLES', 'PS_CONSEQUENCE', 'PS_CAUSE', 'PS_LOCATION', 'PS_TIME', 'PS_CONDITION', 'PS_MANNER', 'SY_QUESTION', 'SY_NARRATIVE', 'SY_NEGATIVE_QUESTIONS', 'SY_SPECIAL_QUESTIONS', 'SY_TAG_QUESTIONS', 'SY_GENERAL_QUESTIONS', 'SY_EXCLAMATION', 'SY_IMPERATIVE', 'SY_SUBORD_SENT', 'SY_SUBORD_SENT_PUNCT', 'SY_COORD_SENT', 'SY_COORD_SENT_PUNCT', 'SY_SIMPLE_SENT', 'SY_INVERSE_PATTERNS', 'SY_SIMILE', 'SY_FRONTING', 'SY_IRRITATION', 'SY_INTENSIFIER', 'SY_QUOT', 'VT_PRESENT_SIMPLE', 'VT_PRESENT_PROGRESSIVE', 'VT_PRESENT_PERFECT', 'VT_PRESENT_PERFECT_PROGR', 'VT_PRESENT_SIMPLE_PASSIVE', 'VT_PRESENT_PROGR_PASSIVE', 'VT_PRESENT_PERFECT_PASSIVE', 'VT_PAST_SIMPLE', 'VT_PAST_SIMPLE_BE', 'VT_PAST_PROGR', 'VT_PAST_PERFECT', 'VT_PAST_PERFECT_PROGR', 'VT_PAST_SIMPLE_PASSIVE', 'VT_PAST_POGR_PASSIVE', 'VT_PAST_PERFECT_PASSIVE', 'VT_FUTURE_SIMPLE', 'VT_FUTURE_PROGRESSIVE', 'VT_FUTURE_PERFECT', 'VT_FUTURE_PERFECT_PROGR', 'VT_FUTURE_SIMPLE_PASSIVE', 'VT_FUTURE_PROGR_PASSIVE', 'VT_FUTURE_PERFECT_PASSIVE', 'VT_WOULD', 'VT_WOULD_PASSIVE', 'VT_WOULD_PROGRESSIVE', 'VT_WOULD_PERFECT', 'VT_WOULD_PERFECT_PASSIVE', 'VT_SHOULD', 'VT_SHOULD_PASSIVE', 'VT_SHALL', 'VT_SHALL_PASSIVE', 'VT_SHOULD_PROGRESSIVE', 'VT_SHOULD_PERFECT', 'VT_SHOULD_PERFECT_PASSIVE', 'VT_MUST', 'VT_MUST_PASSIVE', 'VT_MUST_PROGRESSIVE', 'VT_MUST_PERFECT', 'VT_MST_PERFECT_PASSIVE', 'VT_CAN', 
'VT_CAN_PASSIVE', 'VT_COULD', 'VT_COULD_PASSIVE', 'VT_CAN_PROGRESSIVE', 'VT_COULD_PROGRESSIVE', 'VT_COULD_PERFECT', 'VT_COULD_PERFECT_PASSIVE', 'VT_MAY', 'VT_MAY_PASSIVE', 'VT_MIGHT', 'VT_MIGHT_PASSIVE', 'VT_MAY_PROGRESSIVE', 'VT_MIGTH_PERFECT', 'VT_MIGHT_PERFECT_PASSIVE', 'VT_MAY_PERFECT_PASSIVE', 'ST_TYPE_TOKEN_RATIO_LEMMAS', 'ST_HERDAN_TTR', 'ST_MASS_TTR', 'ST_SENT_WRDSPERSENT', 'ST_SENT_DIFFERENCE', 'ST_REPETITIONS_WORDS', 'ST_REPETITIONS_SENT', 'ST_SENT_D_VP', 'ST_SENT_D_NP', 'ST_SENT_D_PP', 'ST_SENT_D_ADJP', 'ST_SENT_D_ADVP', 'L_I_PRON', 'L_HE_PRON', 'L_SHE_PRON', 'L_IT_PRON', 'L_YOU_PRON', 'L_WE_PRON', 'L_THEY_PRON', 'L_ME_PRON', 'L_YOU_OBJ_PRON', 'L_HIM_PRON', 'L_HER_OBJECT_PRON', 'L_IT_OBJECT_PRON', 'L_US_PRON', 'L_THEM_PRON', 'L_MY_PRON', 'L_YOUR_PRON', 'L_HIS_PRON', 'L_HER_PRON', 'L_ITS_PRON', 'L_OUR_PRON', 'L_THEIR_PRON', 'L_YOURS_PRON', 'L_THEIRS_PRON', 'L_HERS_PRON', 'L_OURS_PRON', 'L_MYSELF_PRON', 'L_YOURSELF_PRON', 'L_HIMSELF_PRON', 'L_HERSELF_PRON', 'L_ITSELF_PRON', 'L_OURSELVES_PRON', 'L_YOURSELVES_PRON', 'L_THEMSELVES_PRON', 'L_FIRST_PERSON_SING_PRON', 'L_SECOND_PERSON_PRON', 'L_THIRD_PERSON_SING_PRON', 'L_THIRD_PERSON_PLURAL_PRON', 'VF_INFINITIVE', 'G_PASSIVE', 'G_ACTIVE', 'G_PRESENT', 'G_PAST', 'G_FUTURE', 'G_MODALS_SIMPLE', 'G_MODALS_CONT', 'G_MODALS_PERFECT', 'AN', 'DDP', 'SVP', 'CDS', 'DDF', 'IS', 'PS', 'RE', 'ASF', 'ASM', 'OM', 'RCI', 'DMC', 'OR', 'QAS', 'PA', 'PR']

train_df = train_df[trainCols]

train_df = train_df.rename(columns={
    'id_encoded': 'id',
})

train_df = train_df.sort_values(by=['id', 'sentence_num']).reset_index(drop=True)

In [33]:
# Preview the first rows of the assembled training table.
# Alternative (random rows instead of the head):
# display(train_df.sample(5, random_state=42))
display(train_df.head())

Unnamed: 0,id,sentence_num,model,domain,POS_VERB,POS_NOUN,POS_ADJ,POS_ADV,POS_DET,POS_INTJ,...,RE,ASF,ASM,OM,RCI,DMC,OR,QAS,PA,PR
0,0,0,human,reviews,0.176471,0.117647,0.058824,0.058824,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,human,reviews,0.25,0.125,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,human,reviews,0.25,0.125,0.25,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0
3,0,3,human,reviews,0.230769,0.230769,0.153846,0.153846,0.153846,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0
4,0,4,human,reviews,0.333333,0.142857,0.047619,0.047619,0.095238,0.0,...,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.095238,0.0,0.0


In [34]:
# Number of distinct (encoded) document ids per generator in the training table
unique_ids_per_model = train_df['id'].groupby(train_df['model']).nunique()

print("Conteo de IDs únicos por modelo:", unique_ids_per_model, sep="\n")
print(f"Total IDs únicos: {train_df['id'].nunique()}")

Conteo de IDs únicos por modelo:
model
chatgpt          35
cohere           31
cohere-chat      27
gpt2             64
gpt3             34
gpt4             24
human           500
llama-chat       50
mistral          54
mistral-chat     62
mpt              58
mpt-chat         61
Name: id, dtype: int64
Total IDs únicos: 1000


In [35]:
# Persist the training table as CSV in the working directory (index column omitted)
pdtrainDF = pd.DataFrame(data=train_df)
pdtrainDF.to_csv(path_or_buf='train_df.csv', index=False)