In [4]:
# --- CELLULE 1 : INITIALISATION ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Every later step operates on df_ml, an independent deep copy, so the
# original dataset `df` is never modified by the preprocessing below.
df_ml = df.copy(deep=True)

print("‚úÖ Initialisation termin√©e. Copie cr√©√©e.")

‚úÖ Initialisation termin√©e. Copie cr√©√©e.


In [5]:
# --- CELL 2: DROPPING COLUMNS ---

# Columns excluded from the modelling frame. The trailing notes are the
# notebook author's stated reason for each removal; a note applies to the
# un-annotated names that follow it.
cols_to_drop = [
    "Nacionality",                 # ~98% Portuguese (noise)
    "Application mode",            # redundant with age (simplification)
    "Previous qualification",      # redundant / complex (simplification)
    "Mother's qualification",      # misleading encoding
    "Father's qualification",
    "Mother's occupation",         # too many categories
    "Father's occupation",
    "Educational special needs",   # too rare (<1%)
    "International",
    "Curricular units 1st sem (without evaluations)",  # zero variance
    "Curricular units 2nd sem (without evaluations)",
    "Curricular units 1st sem (credited)",
    "Curricular units 2nd sem (credited)",
]

# Names that are not present in the frame are skipped rather than raising,
# so this cell can be re-run after the columns have already been removed.
df_ml = df_ml.drop(columns=[name for name in cols_to_drop if name in df_ml.columns])
print(f"‚úÖ Nettoyage termin√©. Il reste {len(df_ml.columns)} colonnes.")

‚úÖ Nettoyage termin√©. Il reste 24 colonnes.


In [6]:
# --- CELL 3: GROUPING RARE CATEGORIES AND FIXING 'Application order' ---

# Tunable values, named here instead of being buried in the logic below.
MIN_CATEGORY_COUNT = 20      # categories with fewer rows than this are merged
RARE_CATEGORY_CODE = 99      # code given to the merged "rare" bucket
APPLICATION_ORDER_MIN = 1    # lowest rank kept after clipping
APPLICATION_ORDER_MAX = 6    # highest rank kept after clipping

# 1. Merge small categories (< MIN_CATEGORY_COUNT rows) for Course AND Marital status,
#    so that a model does not learn from a handful of rows in a tiny category.
# NOTE(review): assumes RARE_CATEGORY_CODE is not already a genuine code in these
# columns; if it were, rare rows would be merged into that real category - confirm.
# NOTE(review): the counts are computed on the whole frame, i.e. before the
# train/test split performed later in the notebook.
cols_to_bin = ['Course', 'Marital status']

for col in cols_to_bin:
    if col in df_ml.columns:
        counts = df_ml[col].value_counts()
        valid_cats = counts[counts >= MIN_CATEGORY_COUNT].index
        # Vectorised replacement: keep values that belong to a frequent category,
        # overwrite everything else (missing values included) with the rare code.
        df_ml[col] = df_ml[col].where(df_ml[col].isin(valid_cats), RARE_CATEGORY_CODE)
        print(f"   -> Colonne '{col}' nettoy√©e.")

# 2. Fix 'Application order'
# It is treated as a rank scale (1 = first choice, 6 = last). Clipping maps 0 to 1
# and 9 to 6; note that it also caps any 7 or 8 at 6, not only the value 9.
if 'Application order' in df_ml.columns:
    df_ml['Application order'] = df_ml['Application order'].clip(
        lower=APPLICATION_ORDER_MIN, upper=APPLICATION_ORDER_MAX
    )
    print("   -> Colonne 'Application order' corrig√©e (0->1 et 9->6).")

print("‚úÖ √âtape 3 termin√©e.")

   -> Colonne 'Course' nettoy√©e.
   -> Colonne 'Marital status' nettoy√©e.
   -> Colonne 'Application order' corrig√©e (0->1 et 9->6).
‚úÖ √âtape 3 termin√©e.


In [7]:
# --- CELL 4: TARGET & ENCODING ---

# 1. Binary target: 'Dropout' -> 1, every other label (missing values included) -> 0.
# The dtype guard makes this cell idempotent: without it, running the cell a second
# time would compare the already-encoded 0/1 values with 'Dropout' and silently
# turn every label into 0.
if not pd.api.types.is_numeric_dtype(df_ml['Target']):
    df_ml['Target'] = df_ml['Target'].eq('Dropout').astype('int64')

# 2. TARGETED one-hot encoding
# Only these two columns are encoded: they hold numeric codes that are really
# nominal categories, not quantities.
cols_to_encode = ['Course', 'Marital status']

# Cast to text first so the codes are treated as labels by get_dummies
# (casting an already-text column again is harmless, so re-running is safe).
for col in cols_to_encode:
    if col in df_ml.columns:
        df_ml[col] = df_ml[col].astype(str)

# With columns=..., get_dummies only expands the listed columns; every other
# column (Gender, Application order, ...) is passed through unchanged.
# drop_first=True removes the first level of each encoded column.
df_encoded = pd.get_dummies(df_ml, columns=cols_to_encode, drop_first=True)

print(f"‚úÖ Encodage termin√©. Dimensions actuelles : {df_encoded.shape}")

‚úÖ Encodage termin√©. Dimensions actuelles : (4424, 42)


In [8]:
# --- CELL 5: SPLIT & SCALING ---

# 1. Separate the features from the label
X = df_encoded.drop(columns='Target')
y = df_encoded['Target']

# 2. Split (80% train, 20% test), stratified on the label and seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Scaling (only on the genuinely continuous / count columns)
scaler = StandardScaler()

# The columns to standardise are listed by hand.
# Note: 'Application order' is deliberately left out because it is a 1-6 rank,
#       not a continuous measurement.
# Note: 'Gender' and the one-hot (0/1) columns are deliberately left out as well.
# NOTE(review): any other numeric column still present in the frame but not listed
# here (for example a 'Previous qualification (grade)' or '(enrolled)' unit count,
# if the data has them) stays unscaled - confirm this list is complete.
cols_numeric_real = [
    'Admission grade',
    'Age at enrollment',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)'
]

# Safety net: only scale the columns that still exist, but report the ones that
# were requested and not found instead of dropping them silently (a typo in the
# list above would otherwise leave a column unscaled without any visible sign).
cols_to_scale = [c for c in cols_numeric_real if c in X_train.columns]
cols_missing = [c for c in cols_numeric_real if c not in X_train.columns]
if cols_missing:
    print(f"   -> Attention : colonnes introuvables pour le scaling : {cols_missing}")

# Work on copies so X_train / X_test keep their unscaled values.
X_train_final = X_train.copy()
X_test_final = X_test.copy()

# The scaler is fitted on the training rows only and then applied to the test rows.
# Skipped when nothing is left to scale: StandardScaler raises on a zero-column input.
if cols_to_scale:
    X_train_final[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
    X_test_final[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

print("\nüöÄ TOUT EST PR√äT !")
print(f"Final Train Shape : {X_train_final.shape}")
print(f"Nombre de colonnes : {X_train_final.shape[1]}")


üöÄ TOUT EST PR√äT !
Final Train Shape : (3539, 41)
Nombre de colonnes : 41
