In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw Adult dataset from its project-relative location
raw_csv_path = '../data/raw/adult.csv'
df = pd.read_csv(raw_csv_path)

# Preview the first 5 rows of the dataset
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


## Chargement du dataset

Le dataset Adult Census Income (UCI / Kaggle) est chargé depuis le dossier `data/raw`.
Les chemins relatifs sont utilisés afin de garantir la portabilité du projet.

On affiche les 5 premières lignes avec `df.head()` afin d’avoir un aperçu du dataset sans afficher l’intégralité du fichier CSV.

In [None]:
df.info() # Print a summary of the dataset: columns, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [None]:
df.describe(include='all') # Descriptive statistics for every column (numeric and non-numeric alike)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


## Identification des valeurs manquantes

Dans ce dataset, certaines valeurs manquantes ne sont pas codées comme NaN mais par le caractère "?",
ce qui empêche leur détection automatique par pandas.

In [None]:
# Turn the "?" placeholders into NaN so that pandas treats them as missing values
df = df.replace(to_replace='?', value=np.nan)

# Number of missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

## Remplacement des valeurs manquantes

Les caractères "?" sont remplacés par NaN afin de permettre une gestion correcte
des valeurs manquantes lors des étapes suivantes.
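Aucune imputation n’est réalisée à ce stade : les valeurs de remplacement (médiane, mode) seront calculées uniquement sur le jeu d’entraînement, après la séparation train/test, afin d’éviter toute fuite de données.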

In [12]:
# Preview the first rows of df again (at this point "?" has been replaced by NaN)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
# Separate the predictors from the target column
target_col = "income"
X = df.drop(columns=target_col)  # every column except the target
y = df[target_col]  # the target variable

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible and stratifying on y keeps the class proportions in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Numeric feature columns
num_cols = X_train.select_dtypes(include=np.number).columns

# Medians are computed on the training set only and reused for the test set,
# so no information from the test rows is used for imputation.
train_medians = X_train[num_cols].median()

X_train[num_cols] = X_train[num_cols].fillna(train_medians)
X_test[num_cols] = X_test[num_cols].fillna(train_medians)

In [None]:
# Non-numeric (categorical) feature columns
cat_cols = X_train.select_dtypes(exclude=np.number).columns

# Most frequent value of each categorical column, taken from the training set only
train_modes = {name: X_train[name].mode()[0] for name in cat_cols}

# Fill the gaps of both splits with the training modes
for frame in (X_train, X_test):
    frame[cat_cols] = frame[cat_cols].fillna(train_modes)

In [None]:
# Descriptive statistics of the numeric training columns
X_train.loc[:, num_cols].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,26048.0,26048.0,26048.0,26048.0,26048.0,26048.0
mean,38.530098,190276.4,10.080505,1086.900683,88.575054,40.420531
std,13.628681,106143.8,2.577431,7413.533054,406.506165,12.325439
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117983.0,9.0,0.0,0.0,40.0
50%,37.0,178660.0,10.0,0.0,0.0,40.0
75%,47.0,238189.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
# One-hot encode the categorical columns of the training set; drop_first removes
# the first level of every categorical column, which becomes the implicit baseline.
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Encode the test set with ALL of its levels and then project it onto the training
# columns. Calling get_dummies(..., drop_first=True) on the test set independently
# would drop the first level *observed in the test set*: if that differs from the
# level dropped on the training side, the affected test rows end up encoded as the
# baseline category once the frames are aligned. Reindexing onto the training
# columns removes the training baseline and any level unseen during training, and
# adds an all-zero column for every training level absent from the test set.
X_test_encoded = pd.get_dummies(X_test).reindex(
    columns=X_train_encoded.columns,
    fill_value=0
)

In [None]:
# Give the test matrix exactly the training columns: the left join keeps the
# training columns, and columns missing on the test side are filled with 0.
aligned_pair = X_train_encoded.align(
    X_test_encoded, axis=1, join="left", fill_value=0
)
X_train_encoded, X_test_encoded = aligned_pair

In [None]:
# Encode the target as integers: "<=50K" -> 0, ">50K" -> 1
income_codes = {"<=50K": 0, ">50K": 1}
encoded_train = y_train.map(income_codes)
encoded_test = y_test.map(income_codes)

# Series.map turns every label missing from the mapping into NaN without raising
# (this also happens when this cell is run a second time on already-encoded
# targets), so validate before rebinding y_train / y_test.
if encoded_train.isna().any() or encoded_test.isna().any():
    raise ValueError(
        "Unexpected income label (or targets already encoded); "
        f"expected only {list(income_codes)}"
    )

y_train, y_test = encoded_train, encoded_test

In [None]:
# Columns to standardize
scale_cols = ["age", "hours.per.week", "capital.gain", "capital.loss", "fnlwgt"]

# Mean and standard deviation of each column, computed on the training set only
train_stats = {
    name: (X_train_encoded[name].mean(), X_train_encoded[name].std())
    for name in scale_cols
}

# Apply (x - mean) / std with the training statistics to both splits
for name, (center, spread) in train_stats.items():
    for frame in (X_train_encoded, X_test_encoded):
        frame[name] = (frame[name] - center) / spread

In [None]:
# Total number of missing values left in each encoded matrix, as (train, test)
tuple(frame.isna().sum().sum() for frame in (X_train_encoded, X_test_encoded))

(np.int64(0), np.int64(0))

In [None]:
# (rows, columns) of the encoded train and test matrices
tuple(frame.shape for frame in (X_train_encoded, X_test_encoded))

((26048, 96), (6513, 96))

## Vérifications finales

Cette étape permet de vérifier :
- l’absence de valeurs manquantes après le prétraitement,
- la cohérence des dimensions entre les jeux d’entraînement et de test.

Le dataset est désormais prêt pour une phase d’analyse ou de modélisation.

In [None]:
# to_csv does not create missing directories, so make sure the output folder
# exists (e.g. on a fresh checkout) before writing anything.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Same file as before: the full dataset with "?" replaced by NaN. Note that `df`
# is NOT imputed, encoded or scaled; those steps were applied to the splits only.
df.to_csv("../data/processed/adult_cleaned.csv", index=False)

# Also persist the model-ready splits (imputed, one-hot encoded, standardized)
# together with their encoded target; previously these results were discarded.
X_train_encoded.assign(income=y_train).to_csv(
    processed_dir / "adult_train_processed.csv", index=False
)
X_test_encoded.assign(income=y_test).to_csv(
    processed_dir / "adult_test_processed.csv", index=False
)