# Feature Engineering - OULAD Dataset

## 1. Imports

In [77]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime

## 2. Chargement des données préparées

Chargement du DataFrame `final_df` créé dans le notebook `02_data_preparation.ipynb`.

In [78]:
# Charger le DataFrame préparé
data_path = '../data/processed/final_df.csv'

if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Le fichier {data_path} n'existe pas.\n"
        f"Veuillez d'abord exécuter le notebook '02_data_preparation.ipynb' "
        f"pour générer les données préparées."
    )

final_df = pd.read_csv(data_path)
print(f"Données chargées depuis : {data_path}")
print(f"Dimensions : {final_df.shape}")
print(f"Colonnes : {list(final_df.columns)}")
print(f"\n Aperçu des données :")
final_df.head()

Données chargées depuis : ../data/processed/final_df.csv
Dimensions : (23743, 32)
Colonnes : ['code_module', 'code_presentation', 'id_student', 'dataplus', 'dualpane', 'externalquiz', 'forumng', 'glossary', 'homepage', 'htmlactivity', 'oucollaborate', 'oucontent', 'ouelluminate', 'ouwiki', 'page', 'questionnaire', 'quiz', 'repeatactivity', 'resource', 'sharedsubpage', 'subpage', 'url', 'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits', 'disability', 'final_result', 'mean_score_day90']

 Aperçu des données :


Unnamed: 0,code_module,code_presentation,id_student,dataplus,dualpane,externalquiz,forumng,glossary,homepage,htmlactivity,...,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,mean_score_day90
0,AAA,2013J,11391,0.0,0.0,0.0,3.514286,0.0,3.478261,0.0,...,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,81.5
1,AAA,2013J,28400,0.0,0.0,0.0,2.988889,0.0,5.194444,0.0,...,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,69.0
2,AAA,2013J,31604,0.0,0.0,0.0,2.955882,1.0,4.34,0.0,...,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,71.5
3,AAA,2013J,32885,0.0,0.0,0.0,3.12766,2.0,3.6,0.0,...,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,49.5
4,AAA,2013J,38053,7.0,0.0,0.0,3.457364,0.0,4.438596,0.0,...,M,Wales,A Level or Equivalent,80-90%,35-55,0,60,N,Pass,74.0


## 3. Création des features

Finally we do some touch-ups to the dataframe before feeding it to our model. First we want to encode all the categorical variables. Here we do label encoding instead of one-hot encoding, simply because it works for tree-based algorithms, and it's easier to read if later we want to visually inspect individual trees. 

In [79]:
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

In [80]:
def create_Xy(final_df):

    X = final_df.drop(['final_result','id_student','imd_band'],axis = 1)
    column_names = X.columns
    y = final_df['final_result']

    le = LabelEncoder()
    encode_dict = {}
    Xcat_features = ['code_module', 'code_presentation',
                    'gender', 'region',
                    'highest_education',  
                    'age_band','disability',
                   ]

    for cat_feature in Xcat_features: 
        X[cat_feature] = le.fit_transform(X[cat_feature])
        encode_dict[cat_feature] = le.classes_

    y = le.fit_transform(y)
    encode_dict['final_result'] = le.classes_

    X = X.to_numpy()
    
    return X,y,column_names,encode_dict

In [81]:
X,y,column_names,encode_dict = create_Xy(final_df)

We also want to handle potential collinearity in the data. While collinearity does not impact Random Forest model per se, it might negatively affect the feature importance analysis we do after. 

In [82]:
corr = spearmanr(X).correlation
corr_linkage = hierarchy.ward(corr)
cluster_ids = hierarchy.fcluster(corr_linkage, 1, criterion='distance')
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)
selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]

X = X[:, selected_features]
column_names = column_names[selected_features]

## 3. Sauvegarde des features pour le modeling

In [83]:
# Créer le répertoire de sauvegarde s'il n'existe pas

os.makedirs('../data/processed', exist_ok=True)

# Sauvegarder X et y en format numpy
np.save('../data/processed/X_features.npy', X)
np.save('../data/processed/y_labels.npy', y)

# Sauvegarder les métadonnées
# Convertir les numpy arrays en listes pour JSON
encode_dict_serializable = {k: v.tolist() if hasattr(v, 'tolist') else v 
                            for k, v in encode_dict.items()}

metadata = {
    'column_names': column_names.tolist(),
    'encode_dict': encode_dict_serializable,
    'n_samples': int(X.shape[0]),
    'n_features': int(X.shape[1])
}

with open('../data/processed/features_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Features sauvegardées :")
print(f"   - X: {X.shape} → data/processed/X_features.npy")
print(f"   - y: {y.shape} → data/processed/y_labels.npy")
print(f"   - Metadata → data/processed/features_metadata.json")

Features sauvegardées :
   - X: (23743, 26) → data/processed/X_features.npy
   - y: (23743,) → data/processed/y_labels.npy
   - Metadata → data/processed/features_metadata.json
