In [97]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# Modeling 

## Chargement des features

In [98]:


X_path = '../data/processed/X_features.npy'
y_path = '../data/processed/y_labels.npy'
metadata_path = '../data/processed/features_metadata.json'

# All three artefacts are read below, so check every one of them up front
# instead of only X: a partially written output directory should produce the
# same actionable message, not a bare error from np.load / open.
missing_paths = [p for p in (X_path, y_path, metadata_path) if not os.path.exists(p)]
if missing_paths:
    raise FileNotFoundError(
        f"Les fichiers de features n'existent pas.\n"
        f"Fichiers manquants : {', '.join(missing_paths)}\n"
        f"Exécutez d'abord '03_feature_engineering.ipynb' jusqu'à la fin."
    )

# Load the feature matrix and the labels
X = np.load(X_path)
y = np.load(y_path)

# Load the metadata (JSON is UTF-8 by specification, so do not rely on the
# platform's default encoding)
with open(metadata_path, 'r', encoding='utf-8') as f:
    metadata = json.load(f)
column_names = np.array(metadata['column_names'])
encode_dict = metadata['encode_dict']

# Sanity checks: the three artefacts must describe the same data set, otherwise
# the model would silently train on misaligned rows or mislabelled columns.
if X.ndim != 2 or X.shape[0] != y.shape[0]:
    raise ValueError(
        f"X et y sont incohérents : X.shape={X.shape}, y.shape={y.shape}"
    )
if X.shape[1] != len(column_names):
    raise ValueError(
        f"X a {X.shape[1]} colonnes mais les métadonnées décrivent "
        f"{len(column_names)} features"
    )

print("Features chargées :")
print(f"   - X: {X.shape} (samples × features)")
print(f"   - y: {y.shape} (labels)")
print(f"   - Features: {len(column_names)}")
print("\n Distribution des classes :")
print(f"   - Classe 0 (Pass): {np.sum(y == 0)} ({100*np.mean(y == 0):.1f}%)")
print(f"   - Classe 1 (Fail): {np.sum(y == 1)} ({100*np.mean(y == 1):.1f}%)")

Features chargées :
   - X: (23743, 26) (samples × features)
   - y: (23743,) (labels)
   - Features: 26

 Distribution des classes :
   - Classe 0 (Pass): 8436 (35.5%)
   - Classe 1 (Fail): 15307 (64.5%)


Here we use a Random Forest because it is a strong baseline for classification on tabular data. It also works well out of the box, i.e. without much fussing over hyperparameters. Moreover, there is a straightforward implementation of permutation feature importance.

In [99]:
# Fixed seed shared by the forest and the fold shuffling so that the scores
# below (and the model saved later) are identical after Restart & Run All.
RANDOM_STATE = 0

# NOTE: despite its name, `reg` is a classifier; the name is kept because
# later cells refer to it.
reg = RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',
    min_samples_split=10,
    random_state=RANDOM_STATE,
)

# Shuffled 5-fold split; random_state pins the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def train(X, y, reg, kf):
    """Fit ``reg`` on the data, optionally printing one score per CV fold.

    Parameters
    ----------
    X, y :
        Features and labels. Both must support indexing with the index
        arrays produced by ``kf.split`` (``X[train_index]``).
    reg :
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place.
    kf :
        Splitter exposing ``split(X)`` that yields
        ``(train_index, test_index)`` pairs, or a falsy value (``False`` or
        ``None``) to skip cross-validation and fit once on all of ``X, y``.

    Returns
    -------
    None

    Notes
    -----
    In cross-validation mode ``reg`` is refitted on every fold, so afterwards
    it holds the model of the *last* fold only. Call again with ``kf=False``
    to obtain a model trained on the full data.
    """
    # `not kf` instead of `kf == False`: also accepts None, which previously
    # fell through to `None.split(X)` and raised AttributeError.
    if not kf:
        reg.fit(X, y)
        return

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        reg.fit(X_train, y_train)
        print(f'evaluate score: {reg.score(X_test,y_test)}')

In [100]:
# Cross-validate: fit on each training fold and print the held-out score.
train(X, y, reg, kf=kf)

evaluate score: 0.742682670035797
evaluate score: 0.7576331859338808
evaluate score: 0.7359444093493367
evaluate score: 0.7392586352148273
evaluate score: 0.7529486099410277


Our evaluation score is around 75%, against a baseline of 64.5% for always predicting the majority class (Fail). Not terribly impressive. Let's try again with more days.

In [101]:
# NOTE: This cell would need the data-preparation functions.
# For now we work with the features that were already built (X, y).
# To try other day thresholds, rerun notebooks 02 and 03 with the
# desired parameters.

print(
    "Pour tester différents seuils de jours (120, 150, 180, 210),",
    "modifiez les paramètres (score_deadline,click_deadline) dans le notebook 02_data_preparation.ipynb",
    "et réexécutez les notebooks 02 → 03 → 04",
    sep="\n",
)

Pour tester différents seuils de jours (120, 150, 180, 210),
modifiez les paramètres (score_deadline,click_deadline) dans le notebook 02_data_preparation.ipynb
et réexécutez les notebooks 02 → 03 → 04


As we can expect, more data means higher accuracy, though it also means later intervention if we decide to reach out to the students based on their performance. The sweet spot for our data seems to be 180 days, which would leave 60-80 days before the final exam. This might be enough time to help failing students, considering the courses are structured so that the majority of the grade depends on the final exam.

In [102]:
# feature importance

from sklearn.inspection import permutation_importance

# Refit on the whole data set (no cross-validation) so that `reg` is not left
# in whatever state the previous training call produced.
train(X, y, reg, kf=False)

# 10 permutation repeats per feature, evaluated on the same (X, y) the model
# was just fitted on.
# NOTE(review): importances measured on training data can differ from those
# on held-out data; consider evaluating on a validation split.
importance = permutation_importance(reg, X, y, n_repeats=10, random_state=0)
importance_mean = importance.importances_mean.round(3)
importance_table = pd.DataFrame(
    {'importance': importance_mean, 'column': column_names}
)

# Most influential features first; the last expression is the cell's output.
importance_table.sort_values('importance', ascending=False)

Unnamed: 0,importance,column
25,0.264,mean_score_day90
13,0.085,quiz
20,0.067,highest_education
5,0.064,forumng
17,0.054,url
7,0.054,homepage
15,0.05,resource
0,0.048,code_module
19,0.038,region
1,0.036,code_presentation


As expected, the most important feature is the average score for each student. 

## Sauvegarde du modèle entraîné

In [103]:
import json
import os
from datetime import datetime

import joblib

# Make sure the output directory for model artefacts exists
os.makedirs('../models', exist_ok=True)

# Persist the estimator currently held in `reg`
model_path = '../models/random_forest_model.pkl'
joblib.dump(reg, model_path)

print(f"Modèle sauvegardé : {model_path}")
print(f"Paramètres du modèle :")
print(f"   - n_estimators: {reg.n_estimators}")
print(f"   - max_features: {reg.max_features}")
print(f"   - min_samples_split: {reg.min_samples_split}")

# Describe the saved model in a JSON sidecar file. Keys are inserted in the
# order they should appear in the file.
model_metadata = {'model_type': 'RandomForestClassifier'}
model_metadata.update(
    (param, getattr(reg, param))
    for param in ('n_estimators', 'max_features', 'min_samples_split')
)
model_metadata['n_features'] = X.shape[1]
# column_names is normally a NumPy array; fall back to list() for any other
# iterable so the value stays JSON-serialisable.
model_metadata['feature_names'] = (
    column_names.tolist() if hasattr(column_names, 'tolist') else list(column_names)
)
model_metadata['training_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
model_metadata['training_samples'] = X.shape[0]

# NOTE: this rebinds `metadata_path`, which previously pointed at the features
# metadata file; the name is kept unchanged for any later cell relying on it.
metadata_path = '../models/model_metadata.json'
with open(metadata_path, 'w') as metadata_file:
    metadata_file.write(json.dumps(model_metadata, indent=2))

print(f"Métadonnées sauvegardées : {metadata_path}")

Modèle sauvegardé : ../models/random_forest_model.pkl
Paramètres du modèle :
   - n_estimators: 200
   - max_features: sqrt
   - min_samples_split: 10
Métadonnées sauvegardées : ../models/model_metadata.json
