**NOTES :** Test du modèle v1 de DIORES sur le jeu de données des bacheliers de l'année 2018-2019

# Libraries and functions definition

In [None]:
# # @title
# !pip uninstall scikit-learn -y
# !pip install -q scikit-learn>=1.4
# !pip list | grep scikit-learn

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
scikit-learn                       1.6.1


In [None]:
# @title
import os
import sys
import time

# Built-in imports
import warnings
from collections import Counter
import pickle

# Data manipulation
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import missingno

# Sklearn imports
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
    mean_absolute_error
)

# ML Models - Linear
from sklearn.linear_model import (
    LogisticRegression,
    Perceptron,
    SGDClassifier,
    Lasso,
    LassoCV
)

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Disable warnings
warnings.filterwarnings('ignore')

In [None]:
# @title
from math import sqrt
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import pickle

In [None]:
# @title
def preprocess_data(df):
    """
    Prepare a raw DataFrame for the evaluation pipeline.

    Missing values are replaced by 0, 'Série' and 'Sexe' are expanded into
    0/1 indicator columns, "Académie de l'Ets. Prov." and 'Résidence' are
    replaced by the mean 'Moy. Gle' of their group, and 'Mention' is dropped.

    Args:
        df (pd.DataFrame): Frame holding at least the columns 'Série', 'Sexe',
            "Académie de l'Ets. Prov.", 'Résidence', 'Moy. Gle' and 'Mention'.

    Returns:
        pd.DataFrame: A new, preprocessed frame. The argument is not modified.
    """
    # fillna returns a new frame, so the in-place edits below never reach
    # the object owned by the caller.
    data = df.fillna(0)

    def encode_series(frame):
        # One 0/1 column per series label, then the source column is removed.
        for label in ('S1', 'S2', 'S3'):
            frame[label] = frame['Série'].apply(lambda value: int(value == label))
        del frame['Série']
        return frame

    def encode_sexe(frame):
        # 'M' -> Homme, 'F' -> Femme; any other value yields 0 in both columns.
        sexe = frame['Sexe']
        frame['Homme'] = sexe.apply(lambda value: int(value == 'M'))
        frame['Femme'] = sexe.apply(lambda value: int(value == 'F'))
        del frame['Sexe']
        return frame

    def replace_by_group_mean(frame, source, target):
        # Swap a categorical column for the mean 'Moy. Gle' of each category.
        group_means = frame.groupby(source)['Moy. Gle'].mean().to_dict()
        frame[target] = frame[source].map(group_means)
        del frame[source]
        return frame

    def encode_academie_performance(frame):
        return replace_by_group_mean(frame, "Académie de l'Ets. Prov.", 'Academie perf.')

    def encode_residence_performance(frame):
        return replace_by_group_mean(frame, "Résidence", 'Residence perf.')

    def convert_non_numeric_columns(frame):
        # Currently not applied (see the end of the function): coerces every
        # object column to numeric, turning unparsable values into NaN.
        for name in frame.select_dtypes(include=['object']).columns:
            try:
                frame[name] = pd.to_numeric(frame[name], errors='coerce')
            except Exception:
                pass
        return frame

    # Run the encoding steps in order; each one returns the frame it edited.
    for step in (
        encode_series,
        encode_sexe,
        encode_academie_performance,
        encode_residence_performance,
    ):
        data = step(data)

    data.drop(columns=["Mention"], inplace=True)
    # convert_non_numeric_columns is deliberately left out of the pipeline.

    return data

In [None]:
# @title
class DataFrameProcessor:
    """Build the feature matrices fed to the decision trees and Lasso models.

    The constructor stores a private copy of the input frame and every
    ``preprocess_*`` method works on a further copy, so the methods can be
    called repeatedly and never modify the caller's data.
    """

    def __init__(self, df):
        self.df = df.copy()
        # Columns (and column order) of the DecisionTree input matrix.
        self.tree_features = [
            'Année BAC', 'Nbre Fois au BAC', 'Groupe Résultat', 'Moy. nde',
            'Moy. ère', 'Moy. S Term.', 'Moy. S Term..1', 'MATH', 'SCPH', 'FR',
            'PHILO', 'AN', 'Tot. Pts au Grp.', 'Moyenne au Grp.', 'Moy. Gle',
            'Moy. sur Mat.Fond.', 'Age en Décembre 2018', 'Sexe_F', 'Sexe_M',
            'Série_S1', 'Série_S2', 'Série_S3', 'Mention_ABien', 'Mention_Bien',
            'Mention_Pass', 'Résidence', 'Ets. de provenance', "Centre d'Ec.",
            "Académie de l'Ets. Prov.", 'REGION_DE_NAISSANCE', 'Academie perf.'
        ]

    @staticmethod
    def _select_numeric(df, columns, announce_missing=False):
        """Return ``df[columns]`` coerced to numeric, with absent columns and NaN set to 0."""
        for col in columns:
            if col not in df.columns:
                if announce_missing:
                    print(f"Création colonne manquante: {col}")
                df[col] = 0
        return df[list(columns)].apply(pd.to_numeric, errors='coerce').fillna(0)

    def preprocess_data_tree(self):
        """Build the DecisionTree feature matrix.

        Returns:
            pd.DataFrame: Numeric frame whose columns are exactly
            ``self.tree_features``, in that order.
        """
        df = self.df.copy()

        # One-hot encoding of the categorical columns that are still present.
        # NOTE(review): when 'Sexe', 'Série' or 'Mention' is absent (e.g. a
        # frame that already went through preprocess_data), the matching
        # indicator columns are zero-filled further down -- confirm this is
        # how the trees were trained.
        indicator_specs = {
            'Sexe': {'Sexe_F': 'F', 'Sexe_M': 'M'},
            'Série': {'Série_S1': 'S1', 'Série_S2': 'S2', 'Série_S3': 'S3'},
            'Mention': {
                'Mention_Pass': 'Passable',
                'Mention_ABien': 'Assez-Bien',
                'Mention_Bien': 'Bien',
            },
        }
        for source, targets in indicator_specs.items():
            if source in df.columns:
                for target, label in targets.items():
                    df[target] = (df[source] == label).astype(int)
                df = df.drop(columns=source)

        # Label encoding of the remaining categorical columns.
        # NOTE(review): the encoder is fitted on the frame passed in, so the
        # integer codes depend on the values present in that frame (a one-row
        # frame always encodes to 0) -- verify against the training pipeline.
        categorical_cols = [
            'Résidence', 'Ets. de provenance', "Centre d'Ec.",
            "Académie de l'Ets. Prov.", 'REGION_DE_NAISSANCE'
        ]
        for col in categorical_cols:
            if col in df.columns:
                df[col] = LabelEncoder().fit_transform(df[col].astype(str))

        # Mean 'Moy. Gle' per academy, computed on the rows of this frame.
        academie = "Académie de l'Ets. Prov."
        if academie in df.columns and 'Moy. Gle' in df.columns:
            academie_mean = df.groupby(academie)['Moy. Gle'].mean()
            df['Academie perf.'] = df[academie].map(academie_mean)

        return self._select_numeric(df, self.tree_features)

    def preprocess_data_lasso(self, features_needed):
        """Build the feature matrix of a Lasso model.

        Args:
            features_needed: Iterable of column names expected by the model;
                columns that cannot be derived are created and filled with 0
                (a message is printed for each of them).

        Returns:
            pd.DataFrame: Numeric frame with the columns of
            ``features_needed``, in that order.
        """
        df = self.df.copy()

        # 'S1' indicator, from the raw series or from an existing one-hot column.
        if 'Série' in df.columns:
            df['S1'] = (df['Série'] == 'S1').astype(int)
        elif 'Série_S1' in df.columns:
            df['S1'] = df['Série_S1']

        # Report (without failing) the subject columns that are absent.
        for col in ('MATH', 'SCPH', 'FR'):
            if col not in df.columns:
                print(f"Colonne manquante: {col}")

        # Group means need 'Moy. Gle'; without it the performance features
        # fall back to the zero-filled default below instead of raising KeyError.
        if 'Moy. Gle' in df.columns:
            performance_specs = (
                ("Académie de l'Ets. Prov.", 'Academie perf.'),
                ('Résidence', 'Residence perf.'),
            )
            for source, target in performance_specs:
                if source in df.columns:
                    group_mean = df.groupby(source)['Moy. Gle'].mean()
                    df[target] = df[source].map(group_mean)

        # Create the missing columns and return everything in the requested order.
        return self._select_numeric(df, features_needed, announce_missing=True)

In [None]:
# @title
def evaluate_predictions(df_result, rank_method='average'):
    """Compare the ranking induced by predicted scores with the real L1 ranking.

    Both 'Score L1' and 'Score_Predit' are converted to ranks in descending
    order (highest score gets rank 1) before being compared.

    Args:
        df_result (pd.DataFrame): Frame with the columns 'Score L1',
            'Score_Predit' and 'RESULTAT'. It is not modified.
        rank_method (str): Tie-breaking method forwarded to ``Series.rank``.

    Returns:
        tuple: ``(rmse, mrr_strict, mrr_open)`` where
            - ``rmse`` is the root mean squared error between the two rankings
              (NaN if a rank is missing),
            - ``mrr_strict`` is the sum of ``1 / predicted rank`` over the rows
              whose 'RESULTAT' is 'PASSE',
            - ``mrr_open`` is the same sum over the rows whose 'RESULTAT' is
              not 'NON ADMIS'.
    """
    ranks = pd.DataFrame({
        'Rang_L1_New': df_result['Score L1'].rank(ascending=False, method=rank_method),
        'Rang_Predit': df_result['Score_Predit'].rank(ascending=False, method=rank_method),
        'RESULTAT': df_result['RESULTAT'],
    })

    # The square root is taken exactly once: the previous code applied
    # np.sqrt on top of root_mean_squared_error, which already returns the
    # RMSE, and therefore reported the fourth root of the MSE.
    rank_error = ranks['Rang_L1_New'] - ranks['Rang_Predit']
    rmse = np.sqrt(np.mean(np.square(rank_error)))

    # Reciprocal ranks are computed once and filtered with boolean masks, so
    # no column is ever assigned on a slice of the frame.
    reciprocal_rank = 1 / ranks['Rang_Predit']
    mrr_strict = reciprocal_rank[ranks['RESULTAT'] == 'PASSE'].sum()
    mrr_open = reciprocal_rank[ranks['RESULTAT'] != 'NON ADMIS'].sum()

    return rmse, mrr_strict, mrr_open

In [None]:
# # @title
def evaluate_all_predictions(df_results, suffix, rank_method='average'):
    """Print the metrics of ``evaluate_predictions`` averaged over several frames.

    Args:
        df_results: Sequence of result frames, one per document.
        suffix (str): Label inserted in every printed line.
        rank_method (str): Tie-breaking method forwarded to ``evaluate_predictions``.
    """
    print(f"EVALUATIONS POUR {suffix}")

    # Running sum of (rmse, mrr_strict, mrr_open) over all documents.
    totals = (0, 0, 0)
    for df_result in df_results:
        totals = np.add(totals, evaluate_predictions(df_result, rank_method))

    rmse_avg, mrr_strict_avg, mrr_open_avg = totals / len(df_results)
    print(f"RMSE_{suffix}_{rank_method} : {rmse_avg}")
    print(f"MRR_{suffix}_{rank_method} strict : {mrr_strict_avg}")
    print(f"MRR_{suffix}_{rank_method} open : {mrr_open_avg}")



# def evaluate_all_predictions(df_results, suffix, rank_method='average'):
#     print(f"EVALUATIONS POUR {suffix}")

#     # Calculer les métriques pour chaque document séparément
#     for idx, df_result in enumerate(df_results):
#         doc_name = f"Doc{idx+1}"
#         rmse, mrr_strict, mrr_open = evaluate_predictions(df_result, rank_method)
#         print(f"\nMétriques pour {doc_name}:")
#         print(f"RMSE_{suffix}_{rank_method} : {rmse}")
#         print(f"MRR_{suffix}_{rank_method} strict : {mrr_strict}")
#         print(f"MRR_{suffix}_{rank_method} open : {mrr_open}")

#     # Calculer aussi les moyennes globales
#     qualities = (0, 0, 0)
#     for idx, df_result in enumerate(df_results):
#         rmse, mrr_strict, mrr_open = evaluate_predictions(df_result, rank_method)
#         qualities = np.add(qualities, (rmse, mrr_strict, mrr_open))

#     qualities = qualities / len(df_results)
#     print("\nMoyennes globales:")
#     print(f"RMSE_{suffix}_{rank_method} moyen: {qualities[0]}")
#     print(f"MRR_{suffix}_{rank_method} strict moyen: {qualities[1]}")
#     print(f"MRR_{suffix}_{rank_method} open moyen: {qualities[2]}")

In [None]:
# @title
def evaluate(docs, predictors, suffix='', rank_method='average'):
    """Run each predictor on its document and print the averaged metrics.

    ``predictors[i]`` is applied to a copy of ``docs[i]``; the resulting
    frames are handed to ``evaluate_all_predictions``.
    """
    results = [
        predictors[position].predict(doc.copy())
        for position, doc in enumerate(docs)
    ]
    evaluate_all_predictions(results, suffix, rank_method)

# Data loading

In [None]:
# https://www.geeksforgeeks.org/how-to-convert-categorical-string-data-into-numeric-in-python/
from sklearn.preprocessing import LabelEncoder

def oneHotEncoding(df, columnName):
    """Replace a column by the integer codes of a ``LabelEncoder``.

    Despite its name, this performs label encoding, not one-hot encoding.
    The frame is modified in place (the encoded column is re-inserted as the
    last column) and also returned.
    """
    codes = LabelEncoder().fit_transform(df[columnName])
    del df[columnName]
    df[columnName] = codes
    return df

In [None]:
def load_data(filePath):
    """Read the CSV file at ``filePath`` and return it after ``preprocess_data``."""
    raw_frame = pd.read_csv(filePath)
    return preprocess_data(raw_frame)

In [None]:
# Mount Google Drive under /content/drive/ (requires the Colab runtime).
from google.colab import drive
drive.mount("/content/drive/", force_remount=True)

MessageError: Error: credential propagation was unsuccessful

In [None]:
directory = "/content/drive/MyDrive/Memoire/DIORES/Datasets/L1MPI/"

# Preprocessed test split of each of the three documents.
doc1_df_test = load_data(os.path.join(directory, "doc1", "doc1_df_test.csv"))
doc2_df_test = load_data(os.path.join(directory, "doc2", "doc2_df_test.csv"))
doc3_df_test = load_data(os.path.join(directory, "doc3", "doc3_df_test.csv"))

docs = [doc1_df_test, doc2_df_test, doc3_df_test]

In [None]:
# Display the preprocessed test frame of the first document.
doc1_df_test

Unnamed: 0,REGION_DE_NAISSANCE,NIVEAU,SESSION,MENTION,MOYENNE ANNUELLE,RESULTAT,RESULTAT APP EVALUATION,Année BAC,Ets. de provenance,Centre d'Ec.,...,Rang DAP,Score L1,Rang L1,S1,S2,S3,Homme,Femme,Academie perf.,Residence perf.
0,Thiès,1,Deuxième Session,0,7.82,AUTORISE,1,2018,LYCEE DE FISSEL,LYCEE DE FISSEL,...,248,7.82,216,0,1,0,1,0,10.995000,11.675000
1,Matam,1,Deuxième Session,0,2.64,NON ADMIS,1,2018,LYCEE DE OUROSSOGUI,LYCEE DE OUROSSOGUI,...,330,2.64,339,0,1,0,0,1,11.175000,10.270000
2,Tambacounda,1,Deuxième Session,Passable,11.83,PASSE,1,2018,LYCEE DE KIDIRA,LYCEE DE KIDIRA,...,99,11.83,104,0,1,0,1,0,10.720000,10.650000
3,Kaffrine,1,Deuxième Session,0,5.28,NON ADMIS,1,2018,LYCEE DE MEDINA SABAKH,LYCEE DE MEDINA SABAKH,...,291,5.28,281,0,1,0,0,1,10.593333,10.000000
4,Fatick,1,Deuxième Session,0,2.91,NON ADMIS,1,2018,COURS PRIVES MBOUTOU SOW,COLLEGE PIE XII,...,278,2.91,332,0,1,0,0,1,10.593333,10.307500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Kolda,1,Deuxième Session,0,8.32,NON ADMIS,1,2018,"ECOLE PRIVEE LAIQUE ""LYCEE D'ELITE""",LYCEE MAME CHEIKH MBAYE,...,305,8.32,203,0,1,0,1,0,10.720000,10.737500
66,Kaolack,1,Deuxième Session,0,6.12,NON ADMIS,1,2018,LYCEE MABA DIAKHOU BA,LYCEE MABA DIAKHOU BA,...,281,6.12,263,0,1,0,1,0,10.593333,10.880000
67,Thiès,1,Deuxième Session,Passable,10.57,PASSE,1,2018,LYCEE DE MEDINA FALL,COLLEGE SAINT GABRIEL,...,211,10.57,130,0,1,0,1,0,10.995000,11.432500
68,Tambacounda,1,Deuxième Session,0,9.42,AUTORISE,1,2018,LYCEE MAME CHEIKH MBAYE,LYCEE MAME CHEIKH MBAYE,...,97,9.42,158,1,0,0,0,1,10.720000,10.737500


In [None]:
# @title
# Column dtypes of the first test frame.
docs[0].dtypes

Unnamed: 0,0
REGION_DE_NAISSANCE,object
NIVEAU,int64
SESSION,object
MENTION,object
MOYENNE ANNUELLE,float64
RESULTAT,object
RESULTAT APP EVALUATION,int64
Année BAC,int64
Ets. de provenance,object
Centre d'Ec.,object


# Testing models

Test et comparaison des trois modèles : DIORES, LassoGlobal et DAP

## Testing DIORES' model

## Définition du prédicteur LassoEnsembleV1

In [None]:
class DioresPredictorEnsemblisteLasso(BaseEstimator, ClassifierMixin):
    """Cascade of decision trees that routes each student to a Lasso regressor.

    Three decision trees ('admission', 'session', 'mention') are queried in
    turn; the first one answering 0 selects the Lasso model (with its scaler
    and feature list) used to compute the student's score.
    """

    def __init__(self):
        self.dt_paths = {
            'admission': '/content/drive/MyDrive/Memoire/DIORES/Models/V2/admi_non_admi_best_model_DecisionTree.pkl',
            'session': '/content/drive/MyDrive/Memoire/DIORES/Models/V2/session_best_model_DecisionTree.pkl',
            'mention': '/content/drive/MyDrive/Memoire/DIORES/Models/V2/mention_best_model_DecisionTree.pkl'
        }

        self.lasso_base_paths = {
            'non_admi': '/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session/NON_ADMI/non_admi/',
            'deuxieme_session': '/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session/DEUXIME_SESSION/deuxieme_session/',
            'passable': '/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session/PASSABLE/passable/',
            'mention': '/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session/MENTION/mention/'
        }

        self.dt_models = {}
        self.lasso_models = {}
        self.lasso_scalers = {}
        self.lasso_features = {}

        self.load_models()

    def load_models(self):
        """Load the pickled decision trees and Lasso bundles from disk.

        A decision tree that cannot be read raises. A Lasso bundle that cannot
        be read is only reported with ``print``; predicting through that
        branch then fails later on the missing dictionary key.
        """
        # NOTE: pickle.load can execute arbitrary code -- only load files
        # coming from a trusted location.
        for key, path in self.dt_paths.items():
            with open(path, 'rb') as f:
                self.dt_models[key] = pickle.load(f)

        for key, base_path in self.lasso_base_paths.items():
            try:
                # os.path.join avoids the doubled '/' of manual concatenation.
                with open(os.path.join(base_path, f"{key}_lasso_model.pkl"), 'rb') as f:
                    self.lasso_models[key] = pickle.load(f)
                with open(os.path.join(base_path, f"{key}_lasso_scaler.pkl"), 'rb') as f:
                    self.lasso_scalers[key] = pickle.load(f)
                with open(os.path.join(base_path, f"{key}_lasso_info.pkl"), 'rb') as f:
                    self.lasso_features[key] = pickle.load(f)['features']
            except Exception as e:
                print(f"Erreur chargement modèle {key}: {str(e)}")

    def _lasso_result(self, processor, key, status):
        """Score one student with the Lasso bundle ``key`` and wrap the outcome."""
        X_lasso = processor.preprocess_data_lasso(self.lasso_features[key])
        X_scaled = self.lasso_scalers[key].transform(X_lasso)
        score = self.lasso_models[key].predict(X_scaled)[0]
        return {'status': status, 'score': score, 'model': key}

    def predict_student(self, X):
        """Predict the status and score of a single student.

        Parameters
        ----------
        X : pandas.DataFrame
            One-row frame describing the student.

        Returns
        -------
        dict
            Keys 'status', 'score' and 'model' (name of the Lasso bundle used).

        Raises
        ------
        Exception
            Any error is printed and re-raised unchanged.
        """
        try:
            # Both feature matrices are derived from the same processor; its
            # methods work on copies, so reusing it is safe.
            processor = DataFrameProcessor(X)
            X_tree = processor.preprocess_data_tree()

            # Each tree answers 0 for the outcome named in the branch.
            if self.dt_models['admission'].predict(X_tree)[0] == 0:  # NON ADMIS
                return self._lasso_result(processor, 'non_admi', 'NON ADMIS')

            if self.dt_models['session'].predict(X_tree)[0] == 0:  # DEUXIÈME SESSION
                return self._lasso_result(processor, 'deuxieme_session', 'DEUXIÈME SESSION')

            if self.dt_models['mention'].predict(X_tree)[0] == 0:  # PASSABLE
                return self._lasso_result(processor, 'passable', 'PASSABLE')

            return self._lasso_result(processor, 'mention', 'MENTION SUPÉRIEURE')

        except Exception as e:
            print(f"Erreur prédiction: {str(e)}")
            raise

    def predict(self, X):
        """Predict every student of ``X``, one row at a time.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the students' data.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the columns 'Prediction_Status', 'Score_Predit'
            and 'Model_Utilise' added. A row whose prediction failed gets the
            status 'ERREUR' and ``None`` for the score and the model.
        """
        results_df = X.copy()

        predictions = []
        for _, student in X.iterrows():
            try:
                predictions.append(self.predict_student(pd.DataFrame([student])))
            except Exception as e:
                # Keep going so a single bad row does not abort the whole batch.
                print(f"Erreur de prédiction: {str(e)}")
                predictions.append({
                    'status': 'ERREUR',
                    'score': None,
                    'model': None
                })

        results_df['Prediction_Status'] = [p['status'] for p in predictions]
        results_df['Score_Predit'] = [p['score'] for p in predictions]
        results_df['Model_Utilise'] = [p['model'] for p in predictions]

        return results_df

## EVALUATIONS POUR LassoEnsembleV1

In [None]:
# The V1 predictor takes no document argument, so a single instance is
# shared by the three documents.
predictor = DioresPredictorEnsemblisteLasso()
predictors = [predictor] * 3

In [None]:
# Metrics with ties ranked by their average position.
evaluate(docs, predictors, suffix='DIORES', rank_method='average')

EVALUATIONS POUR DIORES
RMSE_DIORES_average : 4.656531868382516
MRR_DIORES_average strict : 3.4225587721342676
MRR_DIORES_average open : 4.004464394807058


In [None]:
# Same evaluation with rank_method='first'.
evaluate(docs, predictors, suffix='DIORES', rank_method='first')

EVALUATIONS POUR DIORES
RMSE_DIORES_first : 4.6571583640325125
MRR_DIORES_first strict : 3.4259658730413274
MRR_DIORES_first open : 4.005093717936339


In [None]:
# Same evaluation with rank_method='min'.
evaluate(docs, predictors, suffix='DIORES', rank_method='min')

EVALUATIONS POUR DIORES
RMSE_DIORES_min : 4.657017991388039
MRR_DIORES_min strict : 3.4262428802296583
MRR_DIORES_min open : 4.011323106077052


In [None]:
class DioresPredictorEnsemblisteLassoV2(BaseEstimator, ClassifierMixin):
    """Per-document variant of the tree-then-Lasso cascade.

    The decision trees are shared, while the Lasso bundles are read from the
    directory of the document given at construction time. After prediction, a
    fixed offset depending on the predicted status is added to the score.
    """

    def __init__(self, doc_id):
        self.doc_id = doc_id  # 'Doc1', 'Doc2' or 'Doc3'

        self.dt_paths = {
            'admission': '/content/drive/MyDrive/Memoire/DIORES/Models/V2/admi_non_admi_best_model_DecisionTree.pkl',
            'session': '/content/drive/MyDrive/Memoire/DIORES/Models/V2/session_best_model_DecisionTree.pkl',
            'mention': '/content/drive/MyDrive/Memoire/DIORES/Models/V2/mention_best_model_DecisionTree.pkl'
        }
        self.lasso_base_paths = {
            'non_admi': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V2/{self.doc_id}/NON_ADMI/',
            'deuxieme_session': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V2/{self.doc_id}/DEUXIME_SESSION/',
            'passable': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V2/{self.doc_id}/PASSABLE/',
            'mention': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V2/{self.doc_id}/MENTION/'
        }
        self.dt_models = {}
        self.lasso_models = {}
        self.lasso_scalers = {}
        self.lasso_features = {}
        self.load_models()

    def load_models(self):
        """Load the pickled decision trees and this document's Lasso bundles.

        A decision tree that cannot be read raises. A Lasso bundle that cannot
        be read is only reported with ``print``; predicting through that
        branch then fails later on the missing dictionary key.
        """
        # NOTE: pickle.load can execute arbitrary code -- only load files
        # coming from a trusted location.
        for key, path in self.dt_paths.items():
            with open(path, 'rb') as f:
                self.dt_models[key] = pickle.load(f)

        for key, base_path in self.lasso_base_paths.items():
            try:
                # os.path.join works whether or not base_path ends with '/'.
                with open(os.path.join(base_path, f"{key}_lasso_model.pkl"), 'rb') as f:
                    self.lasso_models[key] = pickle.load(f)
                with open(os.path.join(base_path, f"{key}_lasso_scaler.pkl"), 'rb') as f:
                    self.lasso_scalers[key] = pickle.load(f)
                with open(os.path.join(base_path, f"{key}_lasso_info.pkl"), 'rb') as f:
                    self.lasso_features[key] = pickle.load(f)['features']
            except Exception as e:
                print(f"Erreur chargement modèle {key} pour {self.doc_id}: {str(e)}")

    def _lasso_result(self, processor, key, status):
        """Score one student with the Lasso bundle ``key`` and wrap the outcome."""
        X_lasso = processor.preprocess_data_lasso(self.lasso_features[key])
        X_scaled = self.lasso_scalers[key].transform(X_lasso)
        score = self.lasso_models[key].predict(X_scaled)[0]
        return {'status': status, 'score': score, 'model': key}

    def predict_student(self, X):
        """Predict the status and (unadjusted) score of a single student.

        ``X`` is a one-row DataFrame. Returns a dict with the keys 'status',
        'score' and 'model'. Any error is printed and re-raised unchanged.
        """
        try:
            # Both feature matrices are derived from the same processor; its
            # methods work on copies, so reusing it is safe.
            processor = DataFrameProcessor(X)
            X_tree = processor.preprocess_data_tree()

            # Each tree answers 0 for the outcome named in the branch.
            if self.dt_models['admission'].predict(X_tree)[0] == 0:  # NON ADMIS
                return self._lasso_result(processor, 'non_admi', 'NON ADMIS')

            if self.dt_models['session'].predict(X_tree)[0] == 0:  # DEUXIÈME SESSION
                return self._lasso_result(processor, 'deuxieme_session', 'DEUXIÈME SESSION')

            if self.dt_models['mention'].predict(X_tree)[0] == 0:  # PASSABLE
                return self._lasso_result(processor, 'passable', 'PASSABLE')

            return self._lasso_result(processor, 'mention', 'MENTION SUPÉRIEURE')

        except Exception as e:
            print(f"Erreur prédiction: {str(e)}")
            raise

    def predict(self, X):
        """Predict every student of ``X`` and apply the status-based score offsets.

        Returns a copy of ``X`` with the columns 'Prediction_Status',
        'Score_Predit' and 'Model_Utilise' added. A row whose prediction
        failed gets the status 'ERREUR' and ``None`` for the score and model.
        """
        results_df = X.copy()

        predictions = []
        for _, student in X.iterrows():
            try:
                predictions.append(self.predict_student(pd.DataFrame([student])))
            except Exception as e:
                # Keep going so a single bad row does not abort the whole batch.
                print(f"Erreur de prédiction pour {self.doc_id}: {str(e)}")
                predictions.append({
                    'status': 'ERREUR',
                    'score': None,
                    'model': None
                })

        results_df['Prediction_Status'] = [p['status'] for p in predictions]
        results_df['Score_Predit'] = [p['score'] for p in predictions]
        results_df['Model_Utilise'] = [p['model'] for p in predictions]

        # Offset added to the score of each predicted status ('NON ADMIS' and
        # 'ERREUR' rows are left untouched).
        # NOTE(review): evaluate_predictions ranks 'Score_Predit' in descending
        # order, so these negative offsets move the better statuses towards
        # the bottom of the ranking -- confirm this direction is intended.
        score_adjustments = {
            'DEUXIÈME SESSION': -1000,
            'PASSABLE': -5000,
            'MENTION SUPÉRIEURE': -10000
        }

        for status, adjustment in score_adjustments.items():
            results_df.loc[results_df['Prediction_Status'] == status, 'Score_Predit'] += adjustment

        return results_df

## EVALUATIONS POUR LassoEnsembleV2

In [None]:
# Build one predictor per document (each loads that document's Lasso models)
predictors = []
for doc_id in ['Doc1', 'Doc2', 'Doc3']:
    predictor = DioresPredictorEnsemblisteLassoV2(doc_id)
    predictors.append(predictor)

# Test frames, in the same order as the predictors
docs = [doc1_df_test, doc2_df_test, doc3_df_test]

# Evaluate the models
evaluate(docs, predictors, suffix='LassoEnsembleV2', rank_method='average')

EVALUATIONS POUR LassoEnsembleV2
RMSE_LassoEnsembleV2_average : 5.446271031197647
MRR_LassoEnsembleV2_average strict : 2.7746995177261233
MRR_LassoEnsembleV2_average open : 3.126375524849997


In [None]:
# Same evaluation with rank_method='first'.
evaluate(docs, predictors, suffix='LassoEnsembleV2', rank_method='first')

EVALUATIONS POUR LassoEnsembleV2
RMSE_LassoEnsembleV2_first : 5.44719651878139
MRR_LassoEnsembleV2_first strict : 2.7737247284675814
MRR_LassoEnsembleV2_first open : 3.124491703154097


In [None]:
# Same evaluation with rank_method='min'.
evaluate(docs, predictors, suffix='LassoEnsembleV2', rank_method='min')

EVALUATIONS POUR LassoEnsembleV2
RMSE_LassoEnsembleV2_min : 5.448201724637315
MRR_LassoEnsembleV2_min strict : 2.7758614806043336
MRR_LassoEnsembleV2_min open : 3.1291874415188565


In [None]:
class DioresPredictorEnsemblisteLassoV3(BaseEstimator, ClassifierMixin):
    """Classifier cascade that routes each student to a status-specific Lasso model.

    Three pickled classifiers (the file names indicate decision trees) are
    queried in order: admission, then session, then mention. The first one
    that predicts ``0`` decides the status; the Lasso model, scaler and
    feature list registered for that status then produce the score.

    Args:
        doc_id: Document identifier ('Doc1', 'Doc2' or 'Doc3'). It selects the
            sub-folder from which the Lasso artefacts are loaded.

    Note:
        All models are loaded from disk inside ``__init__``.
    """

    # Offsets added to ``Score_Predit`` according to the predicted status.
    # 'NON ADMIS' and 'ERREUR' rows are left untouched.
    # NOTE(review): presumably these keep the status groups apart when the
    # scores are ranked — confirm the intended ordering.
    _SCORE_ADJUSTMENTS = {
        'DEUXIÈME SESSION': -1000,
        'PASSABLE': -5000,
        'MENTION SUPÉRIEURE': -10000
    }

    def __init__(self, doc_id):
        self.doc_id = doc_id  # 'Doc1', 'Doc2', or 'Doc3'

        # Classifier pickles (the earlier variant read them from Models/V2/).
        self.dt_paths = {
            'admission': '/content/drive/MyDrive/Memoire/DIORES/Models/ClassifieurV3/admi_non_admi_best_model_DecisionTree.pkl',
            'session': '/content/drive/MyDrive/Memoire/DIORES/Models/ClassifieurV3/session_best_model_DecisionTree.pkl',
            'mention': '/content/drive/MyDrive/Memoire/DIORES/Models/ClassifieurV3/mention_best_model_DecisionTree.pkl'
        }
        # One folder of Lasso artefacts per status, specific to this document.
        # NOTE(review): 'DEUXIME_SESSION' looks like a misspelling of
        # 'DEUXIEME_SESSION'; left as-is because it must match the folder
        # name on disk — verify.
        self.lasso_base_paths = {
            'non_admi': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V3/{self.doc_id}/NON_ADMI/',
            'deuxieme_session': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V3/{self.doc_id}/DEUXIME_SESSION/',
            'passable': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V3/{self.doc_id}/PASSABLE/',
            'mention': f'/content/drive/MyDrive/Memoire/DIORES/Models/Lasso_Admi_Session_V3/{self.doc_id}/MENTION/'
        }
        self.dt_models = {}
        self.lasso_models = {}
        self.lasso_scalers = {}
        self.lasso_features = {}
        self.load_models()

    def load_models(self):
        """Unpickle the classifiers and the per-status Lasso artefacts.

        A failure while loading a classifier propagates. A failure while
        loading a Lasso artefact is only printed, so the corresponding key
        may be missing from ``lasso_models`` / ``lasso_scalers`` /
        ``lasso_features`` afterwards.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load artefacts that come from a trusted location.
        for key, path in self.dt_paths.items():
            with open(path, 'rb') as f:
                self.dt_models[key] = pickle.load(f)

        # Each Lasso folder holds three files prefixed with the status key.
        for key, base_path in self.lasso_base_paths.items():
            try:
                with open(f"{base_path}{key}_lasso_model.pkl", 'rb') as f:
                    self.lasso_models[key] = pickle.load(f)
                with open(f"{base_path}{key}_lasso_scaler.pkl", 'rb') as f:
                    self.lasso_scalers[key] = pickle.load(f)
                with open(f"{base_path}{key}_lasso_info.pkl", 'rb') as f:
                    self.lasso_features[key] = pickle.load(f)['features']
            except Exception as e:
                print(f"Erreur chargement modèle {key} pour {self.doc_id}: {str(e)}")

    def _score_with_lasso(self, X, key, status):
        """Score ``X`` with the Lasso pipeline stored under ``key``.

        Args:
            X: One-row DataFrame describing the student.
            key: Key into ``lasso_features`` / ``lasso_scalers`` / ``lasso_models``.
            status: Status label reported in the result.

        Returns:
            dict with keys 'status', 'score' (first predicted value) and 'model'.
        """
        X_lasso = DataFrameProcessor(X).preprocess_data_lasso(self.lasso_features[key])
        X_scaled = self.lasso_scalers[key].transform(X_lasso)
        score = self.lasso_models[key].predict(X_scaled)[0]
        return {'status': status, 'score': score, 'model': key}

    def predict_student(self, X):
        """Predict status and score for a single student.

        Args:
            X: One-row DataFrame; only the first prediction of each model is used.

        Returns:
            dict with keys 'status', 'score' and 'model'.

        Raises:
            Exception: any error raised during preprocessing or prediction is
                printed and re-raised.
        """
        try:
            X_tree = DataFrameProcessor(X).preprocess_data_tree()

            # Cascade: each classifier is consulted only if the previous one
            # did not predict 0.
            if self.dt_models['admission'].predict(X_tree)[0] == 0:
                return self._score_with_lasso(X, 'non_admi', 'NON ADMIS')

            if self.dt_models['session'].predict(X_tree)[0] == 0:
                return self._score_with_lasso(X, 'deuxieme_session', 'DEUXIÈME SESSION')

            if self.dt_models['mention'].predict(X_tree)[0] == 0:
                return self._score_with_lasso(X, 'passable', 'PASSABLE')

            return self._score_with_lasso(X, 'mention', 'MENTION SUPÉRIEURE')

        except Exception as e:
            print(f"Erreur prédiction: {str(e)}")
            raise

    def predict(self, X):
        """Predict for every row of ``X``.

        Args:
            X: DataFrame with one student per row.

        Returns:
            A copy of ``X`` with three extra columns: 'Prediction_Status',
            'Score_Predit' (with the status offset applied) and
            'Model_Utilise'. Rows whose prediction failed get status
            'ERREUR' and no score or model.
        """
        # Work on a copy so the caller's DataFrame is not modified.
        results_df = X.copy()

        predictions = []
        for _, student in X.iterrows():
            try:
                predictions.append(self.predict_student(pd.DataFrame([student])))
            except Exception as e:
                # Best effort: one failing row must not abort the whole batch.
                print(f"Erreur de prédiction pour {self.doc_id}: {str(e)}")
                predictions.append({
                    'status': 'ERREUR',
                    'score': None,
                    'model': None
                })

        results_df['Prediction_Status'] = [p['status'] for p in predictions]
        results_df['Score_Predit'] = [p['score'] for p in predictions]
        results_df['Model_Utilise'] = [p['model'] for p in predictions]

        # Shift the scores of each status group by its fixed offset.
        for status, adjustment in self._SCORE_ADJUSTMENTS.items():
            results_df.loc[results_df['Prediction_Status'] == status, 'Score_Predit'] += adjustment

        return results_df

## EVALUATIONS POUR LassoEnsembleV3

In [None]:
# Build one LassoEnsembleV3 predictor per document (Doc1, Doc2, Doc3).
predictors = []
for doc_id in ['Doc1', 'Doc2', 'Doc3']:
    predictor = DioresPredictorEnsemblisteLassoV3(doc_id)
    predictors.append(predictor)

# Test sets, listed in the same Doc1..Doc3 order as `predictors`.
# NOTE(review): presumably evaluate() pairs docs[i] with predictors[i] — confirm.
docs = [doc1_df_test, doc2_df_test, doc3_df_test]

# Evaluate the predictors on the test sets with rank_method='average'.
evaluate(docs, predictors, suffix='LassoEnsembleV3', rank_method='average')

EVALUATIONS POUR LassoEnsembleV3
RMSE_LassoEnsembleV3_average : 5.625248727420844
MRR_LassoEnsembleV3_average strict : 2.1814746205478017
MRR_LassoEnsembleV3_average open : 2.7547706467460116


In [None]:
# Same evaluation, reusing the `docs` and `predictors` left in the kernel, with rank_method='first'.
evaluate(docs, predictors, suffix='LassoEnsembleV3', rank_method='first')

EVALUATIONS POUR LassoEnsembleV3
RMSE_LassoEnsembleV3_first : 5.6266054111058805
MRR_LassoEnsembleV3_first strict : 2.1815146671249686
MRR_LassoEnsembleV3_first open : 2.7544067921423707


In [None]:
# Same evaluation, reusing the `docs` and `predictors` left in the kernel, with rank_method='min'.
evaluate(docs, predictors, suffix='LassoEnsembleV3', rank_method='min')

EVALUATIONS POUR LassoEnsembleV3
RMSE_LassoEnsembleV3_min : 5.623075905120044
MRR_LassoEnsembleV3_min strict : 2.1820598534498945
MRR_LassoEnsembleV3_min open : 2.7567796344725846


## Testing Global LASSO

In [None]:
def load_model_and_scaler(model_path):
    """
    Unpickle the global Lasso model and its scaler from a folder.

    The folder must contain 'lasso_globale_model.pkl' and
    'lasso_globale_scaler.pkl'; the model file is read first.

    Args:
        model_path (str): Folder holding the two pickle files.

    Returns:
        tuple: (model, scaler) as unpickled from the two files.
    """
    artefacts = []
    for filename in ('lasso_globale_model.pkl', 'lasso_globale_scaler.pkl'):
        with open(os.path.join(model_path, filename), 'rb') as handle:
            artefacts.append(pickle.load(handle))
    model, scaler = artefacts
    return model, scaler

In [None]:
# Root folder with one sub-folder per document (Doc1..Doc3) for the global Lasso.
directory = "/content/drive/MyDrive/Memoire/DIORES/Models/LassoSimple/"
predictors = []
for i in range(1, 4):
    model_path = os.path.join(directory, f"Doc{i}")
    # NOTE(review): `predictor` here is the raw unpickled model and the scaler
    # is not stored with it; the cell that defines DioresPredictorLasso below
    # rebuilds `predictors` from scratch, so this list appears to be superseded.
    predictor, scaler = load_model_and_scaler(model_path)
    predictors.append(predictor)

In [None]:
class DioresPredictorLasso:
    """Wrap a fitted model, its scaler and its feature list behind a DataFrame-in / DataFrame-out ``predict``."""

    def __init__(self, model, scaler, features):
        self.model, self.scaler, self.features = model, scaler, features

    def predict(self, df):
        """
        Score every row of ``df`` and return a copy with a 'Score_Predit' column.

        Raises ValueError when a required feature column is absent. Any
        error raised while scaling or predicting is printed with
        diagnostic information and re-raised.
        """
        # Work on a copy so the caller's DataFrame stays untouched
        scored = df.copy()

        # Fail early when a required feature column is absent
        absent = set(self.features) - set(df.columns)
        if absent:
            print(f"Attention: Colonnes manquantes: {absent}")
            raise ValueError(f"Colonnes manquantes dans le DataFrame: {absent}")

        try:
            # Select the features, scale them, then predict
            scaled = self.scaler.transform(scored[list(self.features)])
            scored['Score_Predit'] = self.model.predict(scaled)
        except Exception as err:
            print(f"Erreur lors de la prédiction: {str(err)}")
            print(f"Features attendues: {self.features}")
            print(f"Colonnes disponibles: {scored.columns.tolist()}")
            raise

        return scored

# Load the per-document global Lasso artefacts and wrap them as predictors
directory = "/content/drive/MyDrive/Memoire/DIORES/Models/LassoSimple/"
predictors = []

for i in range(1, 4):
    model_path = os.path.join(directory, f"Doc{i}")
    # Load the model and the scaler
    model, scaler = load_model_and_scaler(model_path)

    # Load the feature information stored in the same folder
    with open(os.path.join(model_path, 'lasso_globale_info.pkl'), 'rb') as f:
        info = pickle.load(f)

    # Build the predictor with this document's feature list
    predictor = DioresPredictorLasso(model, scaler, info['features'])
    predictors.append(predictor)

In [None]:
# Test sets in Doc1..Doc3 order (doc*_df_test are defined outside this section).
docs = [doc1_df_test, doc2_df_test, doc3_df_test]

In [None]:
# Evaluate the global Lasso predictors on `docs` with rank_method='average'.
evaluate(docs, predictors, suffix='LassoGlobal', rank_method='average')

EVALUATIONS POUR LassoGlobal
RMSE_LassoGlobal_average : 4.215667030792903
MRR_LassoGlobal_average strict : 3.424976356220227
MRR_LassoGlobal_average open : 3.7140835480405294


In [None]:
# Same evaluation, reusing the `docs` and `predictors` left in the kernel, with rank_method='first'.
evaluate(docs, predictors, suffix='LassoGlobal', rank_method='first')

EVALUATIONS POUR LassoGlobal
RMSE_LassoGlobal_first : 4.219315867737798
MRR_LassoGlobal_first strict : 3.4819016450903093
MRR_LassoGlobal_first open : 3.771517483319504


In [None]:
# Same evaluation, reusing the `docs` and `predictors` left in the kernel, with rank_method='min'.
evaluate(docs, predictors, suffix='LassoGlobal', rank_method='min')

EVALUATIONS POUR LassoGlobal
RMSE_LassoGlobal_min : 4.220180557474298
MRR_LassoGlobal_min strict : 3.6894862190088236
MRR_LassoGlobal_min open : 3.980624013340426


## Testing DAP's model

In [None]:
class DAP_predictor:
  """Baseline predictor that reports the existing 'Score DAP' column as the predicted score."""

  def __init__(self):
    pass

  def predict(self, X):
    """Return a copy of ``X`` with 'Score_Predit' set to the values of 'Score DAP'."""
    return X.assign(Score_Predit=X['Score DAP'])

In [None]:
# DAP_predictor keeps no state, so a single instance is reused for all three documents.
predictor = DAP_predictor()
predictors = [predictor, predictor, predictor]

In [None]:
# Evaluate the DAP baseline on the `docs` list left in the kernel, with rank_method='average'.
evaluate(docs, predictors, suffix='DAP', rank_method='average')

EVALUATIONS POUR DAP
RMSE_DAP_average : 4.025313650680667
MRR_DAP_average strict : 3.6265349804462663
MRR_DAP_average open : 3.9386937592646496


In [None]:
# Same evaluation, reusing the `docs` and `predictors` left in the kernel, with rank_method='first'.
evaluate(docs, predictors, suffix='DAP', rank_method='first')

EVALUATIONS POUR DAP
RMSE_DAP_first : 4.02759345535162
MRR_DAP_first strict : 3.626363230502386
MRR_DAP_first open : 3.9403988559087835


In [None]:
# Same evaluation, reusing the `docs` and `predictors` left in the kernel, with rank_method='min'.
evaluate(docs, predictors, suffix='DAP', rank_method='min')

EVALUATIONS POUR DAP
RMSE_DAP_min : 4.026857762888797
MRR_DAP_min strict : 3.649204490913355
MRR_DAP_min open : 3.9634771955042
