# 3. Séparation en train / test / validation

## Constantes de configuration

In [8]:
# Input: parquet file holding the cleaned tweets read by this notebook.
TWEETS_PATH = 'data/cleaned_tweets.parquet'
# Outputs: one parquet file per split, written by the export cell.
TRAIN_OUTPUT_PATH = 'data/train_tweets.parquet'
TEST_OUTPUT_PATH = 'data/test_tweets.parquet'
VALIDATION_OUTPUT_PATH = 'data/validation_tweets.parquet'

# Fraction of the full dataset assigned to each split; the three values sum to 1.
TRAIN_RATIO = 0.7
VALIDATION_RATIO = 0.15
TEST_RATIO = 0.15

## Imports des dépendances

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Chargement des jeux de données

In [10]:
# Load the cleaned tweets dataset from the parquet file at TWEETS_PATH.
tweets_df = pd.read_parquet(TWEETS_PATH)

In [11]:
# Preview the first rows to check the columns that were loaded.
tweets_df.head()

Unnamed: 0,target,original_text,tokens_lemmatized,tokens_stemmed,bert_text,text_lemmatized,text_stemmed
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[awww, thats, bummer, shoulda, got, david, car...","[awww, that, bummer, shoulda, got, david, carr...","- awww, that's a bummer. you shoulda got david...",awww thats bummer shoulda got david carr third...,awww that bummer shoulda got david carr third ...
1,0,is upset that he can't update his Facebook by ...,"[upset, not, update, facebook, texting, might,...","[upset, not, updat, facebook, text, might, cri...",is upset that he cannot update his facebook by...,upset not update facebook texting might cry re...,upset not updat facebook text might cri result...
2,0,@Kenichan I dived many times for the ball. Man...,"[dived, many, time, ball, managed, save, rest,...","[dive, mani, time, ball, manag, save, rest, bo...",i dived many times for the ball. managed to sa...,dived many time ball managed save rest bound,dive mani time ball manag save rest bound
3,0,my whole body feels itchy and like its on fire,"[whole, body, feel, itchy, like, fire]","[whole, bodi, feel, itchi, like, fire]",my whole body feels itchy and like its on fire,whole body feel itchy like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....","[no, not, behaving, mad, not, see]","[no, not, behav, mad, not, see]","no, it is not behaving at all. i am mad. why a...",no not behaving mad not see,no not behav mad not see


## Séparation en train / test / validation

In [12]:
def split_dataset_stratified(df, target_col, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_state=42):
    """
    Split a dataframe into stratified train, validation and test sets.

    The split is done in two passes with ``train_test_split``: the first
    separates the train set from the remainder, the second divides that
    remainder between validation and test. Both passes stratify on
    ``target_col`` and reuse the same ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset to split.
    target_col : str
        Name of the column of ``df`` used for stratification.
    train_ratio, val_ratio, test_ratio : float
        Fractions of ``df`` assigned to each set. Each must lie strictly
        between 0 and 1, and together they must sum to 1.
    random_state : int
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test)``.

    Raises
    ------
    ValueError
        If the ratios do not sum to 1, or if any single ratio is outside
        the open interval (0, 1).
    """
    # 1. The ratios must sum to 1 (np.isclose absorbs floating-point error).
    if not np.isclose(train_ratio + val_ratio + test_ratio, 1.0):
        raise ValueError("La somme des ratios (train + val + test) doit être égale à 1.")

    # 2. Each ratio must be a usable fraction. Without this check a value such as
    # 0 or a negative ratio is only rejected inside train_test_split, possibly
    # after the first (expensive) split, with a message unrelated to our parameters.
    ratios = {'train_ratio': train_ratio, 'val_ratio': val_ratio, 'test_ratio': test_ratio}
    for name, value in ratios.items():
        if not 0 < value < 1:
            raise ValueError(f"{name} doit être strictement compris entre 0 et 1 (reçu : {value}).")

    # 3. First split: train versus the remainder (validation + test),
    # which holds (1 - train_ratio) of the rows.
    df_train, df_temp = train_test_split(
        df,
        train_size=train_ratio,
        stratify=df[target_col],
        random_state=random_state
    )

    # 4. The second split works on df_temp only, so the test share has to be
    # expressed relative to it: e.g. 15% of the total out of a 30% remainder
    # means taking 0.15 / (0.15 + 0.15) = 50% of df_temp.
    relative_test_ratio = test_ratio / (val_ratio + test_ratio)

    # 5. Second split: validation versus test.
    df_val, df_test = train_test_split(
        df_temp,
        test_size=relative_test_ratio,
        stratify=df_temp[target_col],
        random_state=random_state
    )

    # Report the resulting sizes so the split can be checked visually.
    total = len(df)
    print(f"Jeu complet : {total} lignes")
    print(f"Train set   : {len(df_train)} lignes ({len(df_train)/total:.1%})")
    print(f"Val set     : {len(df_val)} lignes ({len(df_val)/total:.1%})")
    print(f"Test set    : {len(df_test)} lignes ({len(df_test)/total:.1%})")

    return df_train, df_val, df_test

In [13]:
def check_balance(sets_dict, target_col='target'):
    """
    Print the distribution of ``target_col`` for several dataframes.

    Parameters
    ----------
    sets_dict : dict
        Mapping ``{'set name': dataframe}``.
    target_col : str
        Column whose distinct values are counted.

    For each entry a header line is printed, followed by a table giving the
    absolute count (``Total``) and the share in percent
    (``Pourcentage (%)``) of every distinct value, rounded to 2 decimals.
    """
    for set_name, frame in sets_dict.items():
        print(f"\n--- Distribution : {set_name} ---")

        labels = frame[target_col]
        # Absolute counts next to relative frequencies scaled to percent.
        summary = pd.DataFrame({
            'Total': labels.value_counts(),
            'Pourcentage (%)': labels.value_counts(normalize=True) * 100,
        })

        # Two decimals are enough for a visual check.
        print(summary.round(2))

In [14]:
# Split with the ratios from the configuration cell, stratifying on the 'target' column.
df_train, df_val, df_test = split_dataset_stratified(
    tweets_df,
    target_col='target',
    train_ratio=TRAIN_RATIO,
    val_ratio=VALIDATION_RATIO,
    test_ratio=TEST_RATIO,
)
# Print the class distribution of each split.
check_balance({'Train': df_train, 'Validation': df_val, 'Test': df_test})

Jeu complet : 1600000 lignes
Train set   : 1120000 lignes (70.0%)
Val set     : 240000 lignes (15.0%)
Test set    : 240000 lignes (15.0%)

--- Distribution : Train ---
         Total  Pourcentage (%)
target                         
1       560000             50.0
0       560000             50.0

--- Distribution : Validation ---
         Total  Pourcentage (%)
target                         
0       120000             50.0
1       120000             50.0

--- Distribution : Test ---
         Total  Pourcentage (%)
target                         
1       120000             50.0
0       120000             50.0


In [15]:
# Preview the first rows of the train split.
df_train.head()

Unnamed: 0,target,original_text,tokens_lemmatized,tokens_stemmed,bert_text,text_lemmatized,text_stemmed
1022641,1,@ErinGifford Please give me a swift kick in th...,"[please, give, swift, kick, arse, change, ever...","[pleas, give, swift, kick, ars, chang, ever, h...",please give me a swift kick in the arse if i c...,please give swift kick arse change ever hit big,pleas give swift kick ars chang ever hit big
1436643,1,@alexbach your too pretty and too awesome. He ...,"[pretty, awesome, not, handle]","[pretti, awesom, not, handl]",your too pretty and too awesome. he cannot han...,pretty awesome not handle,pretti awesom not handl
734718,0,my neck is red like a boiled lobster and is gi...,"[neck, red, like, boiled, lobster, giving, mas...","[neck, red, like, boil, lobster, give, massiv,...",my neck is red like a boiled lobster and is gi...,neck red like boiled lobster giving massive am...,neck red like boil lobster give massiv amount ...
864253,1,Heiï¿½e Ecke war toll,"[heiï¿½e, ecke, war, toll]","[heiï¿½, eck, war, toll]",heiï¿½e ecke war toll,heiï¿½e ecke war toll,heiï¿½ eck war toll
1263647,1,@Ashh_x3 YAY @KrystensAreject don't be a par...,"[yay, not, party, pooper]","[yay, not, parti, pooper]",yay do not be a party pooper!,yay not party pooper,yay not parti pooper


In [16]:
# Preview the first rows of the validation split.
df_val.head()

Unnamed: 0,target,original_text,tokens_lemmatized,tokens_stemmed,bert_text,text_lemmatized,text_stemmed
564885,0,Cummon people. Change the water. I'm thirsty,"[cummon, people, change, water, thirsty]","[cummon, peopl, chang, water, thirsti]",cummon people. change the water. i'm thirsty,cummon people change water thirsty,cummon peopl chang water thirsti
1552321,1,@BigBrotherBoo @ ur boo . I quite agree . Lif...,"[boo, quite, agree, life, living, not, watchin...","[boo, quit, agre, life, live, not, watch, pred...",@ ur boo . i quite agree . life is for living ...,boo quite agree life living not watching predi...,boo quit agre life live not watch predict stuf...
159754,0,Wtf is wrong with me..im not like this....i ne...,"[wtf, wrong, meim, not, like, thisi, need, sleep]","[wtf, wrong, meim, not, like, thisi, need, sleep]",wtf is wrong with me..im not like this....i ne...,wtf wrong meim not like thisi need sleep,wtf wrong meim not like thisi need sleep
1445479,1,http://bit.ly/VwV6H vote vote vote for Tom Fle...,"[vote, vote, vote, tom, fletcher, please, plea...","[vote, vote, vote, tom, fletcher, pleas, pleas...","vote vote vote for tom fletcher ()! please, pl...",vote vote vote tom fletcher please please plea...,vote vote vote tom fletcher pleas pleas pleas ...
1525177,1,@leeannekenny Sounds good. Enjoy.,"[sound, good, enjoy]","[sound, good, enjoy]",sounds good. enjoy.,sound good enjoy,sound good enjoy


In [17]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,target,original_text,tokens_lemmatized,tokens_stemmed,bert_text,text_lemmatized,text_stemmed
848658,1,@pauljchambers What's wrong with Hitman? ofco...,"[whats, wrong, hitman, ofcourse, not, oscar, m...","[what, wrong, hitman, ofcours, not, oscar, mat...",what's wrong with hitman? ofcourse it is not o...,whats wrong hitman ofcourse not oscar material...,what wrong hitman ofcours not oscar materi but...
1073168,1,@ZacharyZips theres cake at my grandma's house...,"[there, cake, grandma, house, birthday, week, ...","[there, cake, grandma, hous, birthday, week, ago]",theres cake at my grandma's house. her birthda...,there cake grandma house birthday week ago,there cake grandma hous birthday week ago
889379,1,just let it go...,[let],[let],just let it go...,let,let
1303917,1,the sun is boling today but friday its raining...,"[sun, boling, today, but, friday, raining, wei...","[sun, bole, today, but, friday, rain, weird, m...",the sun is boling today but friday its raining...,sun boling today but friday raining weird mayb...,sun bole today but friday rain weird maybee il...
925289,1,@mileycyrus http://twitpic.com/3pf7v - my dad ...,"[dad, take, picture, like, womens, hat, put, f...","[dad, take, pictur, like, women, hat, put, fac...",- my dad takes pictures like that or with wome...,dad take picture like womens hat put facebook ...,dad take pictur like women hat put facebook he...


## Exportation des données

In [18]:
# Write each split to its own parquet file; index=False leaves the row index out of the files.
for split_df, output_path in (
    (df_train, TRAIN_OUTPUT_PATH),
    (df_val, VALIDATION_OUTPUT_PATH),
    (df_test, TEST_OUTPUT_PATH),
):
    split_df.to_parquet(output_path, engine='pyarrow', index=False)

# Drop the loop variables so no extra reference to the last split lingers in the namespace.
del split_df, output_path

In [19]:
# Remove the notebook-level references to the dataframes now that the splits are exported.
del df_train, df_val, df_test, tweets_df