In [45]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import des modèles et outils d'évaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

In [46]:
# 1. Chargement du dataset
df = pd.read_csv('final_depression_dataset_1.csv')

# # Optionnel : Nettoyer les noms de colonnes (retirer les espaces en début/fin)
# df.columns = [col.strip() for col in df.columns]

# Afficher la liste des colonnes pour vérification
print("Colonnes du dataset :", df.columns.tolist())

# # Vérifier que la colonne de segmentation existe bien
# if "Working Professional or Student" not in df.columns:
#     raise KeyError("La colonne 'Working Professional or Student' n'existe pas dans le dataset!")

Colonnes du dataset : ['Name', 'Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']


In [47]:
# Obtenir le nombre de lignes du dataset
row_count = df.shape[0]

# Obtenir le nombre de valeurs non nulles pour chaque colonne
column_values_count = df.count()

# Afficher les résultats
row_count, column_values_count

(2556,
 Name                                     2556
 Gender                                   2556
 Age                                      2556
 City                                     2556
 Working Professional or Student          2556
 Profession                               1883
 Academic Pressure                         502
 Work Pressure                            2054
 CGPA                                      502
 Study Satisfaction                        502
 Job Satisfaction                         2054
 Sleep Duration                           2556
 Dietary Habits                           2556
 Degree                                   2556
 Have you ever had suicidal thoughts ?    2556
 Work/Study Hours                         2556
 Financial Stress                         2556
 Family History of Mental Illness         2556
 Depression                               2556
 dtype: int64)

In [48]:
print(df["Degree"].value_counts())

Degree
Class 12    275
B.Com       115
B.Ed        112
MCA         108
BCA         103
MSc          95
MBA          95
BSc          94
BBA          92
BHM          90
B.Arch       89
BA           89
B.Pharm      88
M.Tech       85
M.Pharm      85
ME           84
LLM          84
BE           84
LLB          82
PhD          81
MHM          81
M.Ed         81
MA           79
MBBS         75
MD           74
B.Tech       71
M.Com        65
Name: count, dtype: int64


In [49]:
# Catégorisation de l'âge
bins_age = [0, 18, 25, 40, 60, 100]
labels_age = ["Adolescent", "Jeune Adulte", "Adulte", "Senior Actif", "Retraité"]
df["Age Category"] = pd.cut(df["Age"], bins=bins_age, labels=labels_age, right=False)

# Catégorisation du diplôme
degree_mapping = {
    "Class 12": "Secondaire",
    "B.Com": "Bac+3",
    "B.Ed": "Bac+3",
    "BCA": "Bac+3",
    "BSc": "Bac+3",
    "BBA": "Bac+3",
    "BHM": "Bac+3",
    "BA": "Bac+3",
    "B.Arch": "Bac+3",
    "B.Pharm": "Bac+3",
    "BE": "Bac+3",
    "B.Tech": "Bac+3",
    
    "MCA": "Bac+5 et plus",
    "MSc": "Bac+5 et plus",
    "MBA": "Bac+5 et plus",
    "M.Tech": "Bac+5 et plus",
    "M.Pharm": "Bac+5 et plus",
    "ME": "Bac+5 et plus",
    "M.Com": "Bac+5 et plus",
    "M.Ed": "Bac+5 et plus",
    "MHM": "Bac+5 et plus",
    "MA": "Bac+5 et plus",
    "LLM": "Bac+5 et plus",
    
    "PhD": "Doctorat et plus",
    "MBBS": "Doctorat et plus",
    "MD": "Doctorat et plus",
    "LLB": "Bac+5 et plus",  # LLB est un diplôme en droit souvent considéré comme Bac+5
}
df["Degree Category"] = df["Degree"].map(degree_mapping)

# Catégorisation des heures de travail/études
bins_hours = [-1, 0, 3, 6, 24]  # -1 pour inclure 0 correctement
labels_hours = ["Inactif", "Temps Partiel Léger", "Temps Partiel", "Temps Plein"]
df["Work/Study Hours Category"] = pd.cut(df["Work/Study Hours"], bins=bins_hours, labels=labels_hours, right=True)

# Vérification des valeurs uniques de City
unique_cities = df["City"].unique()

# Grandes villes (population > 5M)
large_cities = {"Mumbai", "Delhi", "Bangalore", "Hyderabad", "Chennai", "Kolkata", "Pune"}

# Villes moyennes (1M - 5M)
medium_cities = {"Ahmedabad", "Surat", "Jaipur", "Lucknow", "Kanpur", "Nagpur", "Patna", "Indore", "Thane", "Bhopal"}

# Petites villes (<1M)
small_cities = set(unique_cities) - large_cities - medium_cities

# Fonction de catégorisation des villes
def categorize_city(city):
    if city in large_cities:
        return "Grande Ville"
    elif city in medium_cities:
        return "Ville Moyenne"
    elif city in small_cities:
        return "Petite Ville"
    else:
        return "Inconnue"

df["City Category"] = df["City"].apply(categorize_city)


In [50]:
columns_to_drop = ["Name", "Age", "Degree", "Work/Study Hours", "City"]  # les colonnes à supprimer
df = df.drop(columns=columns_to_drop, errors='ignore')

In [51]:
# Affichage rapide du nombre de valeurs manquantes par colonne
print("Valeurs manquantes par colonne :")
print(df.isnull().sum())

Valeurs manquantes par colonne :
Gender                                      0
Working Professional or Student             0
Profession                                673
Academic Pressure                        2054
Work Pressure                             502
CGPA                                     2054
Study Satisfaction                       2054
Job Satisfaction                          502
Sleep Duration                              0
Dietary Habits                              0
Have you ever had suicidal thoughts ?       0
Financial Stress                            0
Family History of Mental Illness            0
Depression                                  0
Age Category                                0
Degree Category                             0
Work/Study Hours Category                   0
City Category                               0
dtype: int64


In [52]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype   
---  ------                                 --------------  -----   
 0   Gender                                 2556 non-null   object  
 1   Working Professional or Student        2556 non-null   object  
 2   Profession                             1883 non-null   object  
 3   Academic Pressure                      502 non-null    float64 
 4   Work Pressure                          2054 non-null   float64 
 5   CGPA                                   502 non-null    float64 
 6   Study Satisfaction                     502 non-null    float64 
 7   Job Satisfaction                       2054 non-null   float64 
 8   Sleep Duration                         2556 non-null   object  
 9   Dietary Habits                         2556 non-null   object  
 10  Have you ever had suicidal thoughts ?  2556 non-null   objec

In [53]:
print(df.head())

   Gender Working Professional or Student         Profession  \
0  Female            Working Professional            Teacher   
1    Male            Working Professional  Financial Analyst   
2  Female            Working Professional            Teacher   
3  Female            Working Professional            Teacher   
4    Male            Working Professional     UX/UI Designer   

   Academic Pressure  Work Pressure  CGPA  Study Satisfaction  \
0                NaN            2.0   NaN                 NaN   
1                NaN            4.0   NaN                 NaN   
2                NaN            2.0   NaN                 NaN   
3                NaN            3.0   NaN                 NaN   
4                NaN            4.0   NaN                 NaN   

   Job Satisfaction Sleep Duration Dietary Habits  \
0               4.0      7-8 hours       Moderate   
1               3.0      5-6 hours      Unhealthy   
2               3.0      5-6 hours       Moderate   
3           

In [54]:
# 2. Séparation en deux sous-datasets
# On crée un masque pour les étudiants en se basant sur une recherche du mot "student" (insensible à la casse)
mask_student = df["Working Professional or Student"].str.lower().str.contains("student")

# Création des sous-datasets pour les étudiants et les professionnels
df_students = df[mask_student].copy()
df_professionals = df[~mask_student].copy()

print("Nombre d'étudiants :", df_students.shape[0])
print("Nombre de professionnels :", df_professionals.shape[0])

Nombre d'étudiants : 502
Nombre de professionnels : 2054


In [55]:
# Vérifier les valeurs manquantes dans les colonnes "Academic Pressure", "CGPA", "Study Satisfaction"
missing_values_professionals = df_professionals[["Academic Pressure", "CGPA", "Study Satisfaction"]].isna().sum()

# Afficher les résultats
missing_values_professionals

Academic Pressure     2054
CGPA                  2054
Study Satisfaction    2054
dtype: int64

In [56]:
df_professionals = df_professionals.drop(columns=["Academic Pressure", "CGPA", "Study Satisfaction"], errors="ignore")


In [57]:
# Vérifier les valeurs manquantes dans les colonnes 
missing_values_students = df_students[["Work Pressure", "Job Satisfaction"]].isna().sum()

# Afficher les résultats
missing_values_students

Work Pressure       502
Job Satisfaction    502
dtype: int64

In [58]:
df_students = df_students.drop(columns=["Work Pressure", "Job Satisfaction"], errors="ignore")

In [59]:
print(df_students.columns)

Index(['Gender', 'Working Professional or Student', 'Profession',
       'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Have you ever had suicidal thoughts ?',
       'Financial Stress', 'Family History of Mental Illness', 'Depression',
       'Age Category', 'Degree Category', 'Work/Study Hours Category',
       'City Category'],
      dtype='object')


In [60]:
# Gestion de "Sleep Duration"
# On suppose que la colonne contient les catégories suivantes :
# "Less than 5 hours", "5-6 hours", "7-8 hours", "More than 8 hours"
sleep_mapping = {
    'Less than 5 hours': 4.5,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9.0
}
# Pour les étudiants
if "Sleep Duration" in df_students.columns:
    df_students["Sleep Duration"] = df_students["Sleep Duration"].map(sleep_mapping)
    # Imputation par la médiane en cas de valeurs non mappées
    median_sleep = df_students["Sleep Duration"].median()
    df_students["Sleep Duration"] = df_students["Sleep Duration"].fillna(median_sleep)
# Pour les professionnels
if "Sleep Duration" in df_professionals.columns:
    df_professionals["Sleep Duration"] = df_professionals["Sleep Duration"].map(sleep_mapping)
    median_sleep = df_professionals["Sleep Duration"].median()
    df_professionals["Sleep Duration"] = df_professionals["Sleep Duration"].fillna(median_sleep)


In [61]:
# Gestion des autres colonnes catégorielles communes
categorical_columns = [
    'Gender', 'Dietary Habits', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Age Category', 'Degree Category', 'Work/Study Hours Category', 'City Category']

# Pour la colonne "Profession" :
# Pour les étudiants, on met "Student" (puisque leur profession n'est pas renseignée)
if "Profession" in df_students.columns:
    df_students["Profession"] = "Student"
# Pour les professionnels, on impute les valeurs manquantes par "Unknown"
if "Profession" in df_professionals.columns:
    df_professionals["Profession"] = df_professionals["Profession"].fillna("Unknown")


In [62]:
print(df_professionals["Age Category"].value_counts())

Age Category
Senior Actif    1199
Adulte           607
Jeune Adulte     198
Retraité          50
Adolescent         0
Name: count, dtype: int64


In [63]:
# Supposons que df_students est votre DataFrame contenant uniquement les étudiants.
# Vous pouvez supprimer les colonnes redondantes comme suit :
df_students = df_students.drop(['Working Professional or Student', 'Profession'], axis=1)

# Par exemple, pour le sous-dataset des professionnels :
df_professionals = df_professionals.drop('Working Professional or Student', axis=1)


In [64]:
# 4. Encodage des variables catégorielles
# On va encoder à la fois nos colonnes catégorielles communes et la colonne "Working Professional or Student"
# (même si ici, elle est utilisée pour la séparation, on peut l'enlever ensuite)
encode_cols = categorical_columns + ["Profession"]

df_students = pd.get_dummies(df_students, columns=categorical_columns, drop_first=True)
df_professionals = pd.get_dummies(df_professionals, columns=encode_cols, drop_first=True)


In [65]:
print(df_students.value_counts())


Academic Pressure  CGPA   Study Satisfaction  Sleep Duration  Financial Stress  Depression  Gender_Male  Dietary Habits_Moderate  Dietary Habits_Unhealthy  Have you ever had suicidal thoughts ?_Yes  Family History of Mental Illness_Yes  Age Category_Jeune Adulte  Age Category_Adulte  Age Category_Senior Actif  Age Category_Retraité  Degree Category_Bac+5 et plus  Degree Category_Doctorat et plus  Degree Category_Secondaire  Work/Study Hours Category_Temps Partiel Léger  Work/Study Hours Category_Temps Partiel  Work/Study Hours Category_Temps Plein  City Category_Petite Ville  City Category_Ville Moyenne
1.0                5.10   3.0                 4.5             3                 No          False        False                    False                     True                                       False                                 True                       False                False                      False                  False                          False                  

In [66]:
print(df_professionals.value_counts())


Work Pressure  Job Satisfaction  Sleep Duration  Financial Stress  Depression  Gender_Male  Dietary Habits_Moderate  Dietary Habits_Unhealthy  Have you ever had suicidal thoughts ?_Yes  Family History of Mental Illness_Yes  Age Category_Jeune Adulte  Age Category_Adulte  Age Category_Senior Actif  Age Category_Retraité  Degree Category_Bac+5 et plus  Degree Category_Doctorat et plus  Degree Category_Secondaire  Work/Study Hours Category_Temps Partiel Léger  Work/Study Hours Category_Temps Partiel  Work/Study Hours Category_Temps Plein  City Category_Petite Ville  City Category_Ville Moyenne  Profession_Architect  Profession_Business Analyst  Profession_Chef  Profession_Chemist  Profession_Civil Engineer  Profession_Consultant  Profession_Content Writer  Profession_Customer Support  Profession_Data Scientist  Profession_Digital Marketer  Profession_Doctor  Profession_Educational Consultant  Profession_Electrician  Profession_Entrepreneur  Profession_Finanancial Analyst  Profession_Finan

In [67]:
# 5. Séparation des variables explicatives (X) et de la cible (y)
# On suppose que la variable cible est "Depression"
if "Depression" not in df.columns:
    raise KeyError("La colonne 'Depression' n'existe pas dans le dataset!")

if "Depression" not in df_students.columns or "Depression" not in df_professionals.columns:
    raise KeyError("La colonne 'Depression' manque dans l'un des sous-datasets!")

X_students = df_students.drop("Depression", axis=1)
y_students = df_students["Depression"]

X_professionals = df_professionals.drop("Depression", axis=1)
y_professionals = df_professionals["Depression"]

In [68]:
print(df_students.info())
print("/////////////")
print(df_professionals.info())

<class 'pandas.core.frame.DataFrame'>
Index: 502 entries, 17 to 2555
Data columns (total 23 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Academic Pressure                              502 non-null    float64
 1   CGPA                                           502 non-null    float64
 2   Study Satisfaction                             502 non-null    float64
 3   Sleep Duration                                 502 non-null    float64
 4   Financial Stress                               502 non-null    int64  
 5   Depression                                     502 non-null    object 
 6   Gender_Male                                    502 non-null    bool   
 7   Dietary Habits_Moderate                        502 non-null    bool   
 8   Dietary Habits_Unhealthy                       502 non-null    bool   
 9   Have you ever had suicidal thoughts ?_Yes      502 non-nu

In [69]:
# 6. Normalisation des colonnes numériques (optionnel mais souvent utile)
# On définit une liste de colonnes numériques potentielles
numeric_columns = ['Academic Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Work Pressure', 'Financial Stress']

# On récupère celles qui existent dans chaque sous-dataset
num_cols_students = [col for col in numeric_columns if col in X_students.columns]
num_cols_prof = [col for col in numeric_columns if col in X_professionals.columns]

scaler_students = StandardScaler()
X_students[num_cols_students] = scaler_students.fit_transform(X_students[num_cols_students])

scaler_prof = StandardScaler()
X_professionals[num_cols_prof] = scaler_prof.fit_transform(X_professionals[num_cols_prof])


In [70]:
# # 7. Division en ensembles d'entraînement et de test
# X_train_students, X_test_students, y_train_students, y_test_students = train_test_split(
#     X_students, y_students, test_size=0.25, random_state=20
# )
# X_train_prof, X_test_prof, y_train_prof, y_test_prof = train_test_split(
#     X_professionals, y_professionals, test_size=0.25, random_state=20
# )


In [71]:
# # Pour les étudiants
# model_students = LogisticRegression(max_iter=1000, random_state=42)
# model_students.fit(X_train_students, y_train_students)
# y_pred_students = model_students.predict(X_test_students)

# print("=== Résultats pour les étudiants ===")
# print("Matrice de confusion:")
# print(confusion_matrix(y_test_students, y_pred_students))
# print("\nRapport de classification:")
# print(classification_report(y_test_students, y_pred_students))

# # Pour les professionnels
# model_prof = LogisticRegression(max_iter=1000, random_state=42)
# model_prof.fit(X_train_prof, y_train_prof)
# y_pred_prof = model_prof.predict(X_test_prof)

# print("=== Résultats pour les professionnels ===")
# print("Matrice de confusion:")
# print(confusion_matrix(y_test_prof, y_pred_prof))
# print("\nRapport de classification:")
# print(classification_report(y_test_prof, y_pred_prof))

In [72]:
# Pour chaque sous-ensemble, on sépare en variables explicatives (X) et cible (y)
#    et on crée des ensembles d'entraînement et de test.
# =============================================================================
def preparer_data(df_subset):
    if "Depression" not in df_subset.columns:
        raise KeyError("La colonne 'Depression' n'existe pas dans le sous-dataset!")
    X = df_subset.drop("Depression", axis=1)
    y = df_subset["Depression"]
    return train_test_split(X, y, test_size=0.2, random_state=42)

X_train_students, X_test_students, y_train_students, y_test_students = preparer_data(df_students)
X_train_prof, X_test_prof, y_train_prof, y_test_prof = preparer_data(df_professionals)

In [73]:
# Conversion pour le sous-ensemble étudiants
y_train_students = y_train_students.map({'No': 0, 'Yes': 1})
y_test_students  = y_test_students.map({'No': 0, 'Yes': 1})

# Conversion pour le sous-ensemble professionnels
y_train_prof = y_train_prof.map({'No': 0, 'Yes': 1})
y_test_prof  = y_test_prof.map({'No': 0, 'Yes': 1})

In [74]:
# Ajouter ça pour avoir la meme longueur des colonnes mais ça réduit les performances 0.89
# # Trouver les colonnes communes et ajouter celles qui manquent avec des zéros
# common_cols = set(X_train_students.columns).intersection(set(X_train_prof.columns))

# # Ajouter les colonnes manquantes aux étudiants
# for col in common_cols - set(X_train_students.columns):
#     X_train_students[col] = 0
# for col in common_cols - set(X_test_students.columns):
#     X_test_students[col] = 0

# # Ajouter les colonnes manquantes aux professionnels
# for col in common_cols - set(X_train_prof.columns):
#     X_train_prof[col] = 0
# for col in common_cols - set(X_test_prof.columns):
#     X_test_prof[col] = 0

# # Réordonner les colonnes pour qu'elles aient exactement le même ordre
# X_train_students = X_train_students[sorted(common_cols)]
# X_test_students = X_test_students[sorted(common_cols)]
# X_train_prof = X_train_prof[sorted(common_cols)]
# X_test_prof = X_test_prof[sorted(common_cols)]

In [75]:
# Modèle de régression logistique déjà entraîné sur chaque sous-ensemble
# =============================================================================
model_lr_students = LogisticRegression(max_iter=1000, random_state=42)
model_lr_students.fit(X_train_students, y_train_students)
y_pred_lr_students = model_lr_students.predict(X_test_students)

model_lr_prof = LogisticRegression(max_iter=1000, random_state=42)
model_lr_prof.fit(X_train_prof, y_train_prof)
y_pred_lr_prof = model_lr_prof.predict(X_test_prof)

# Combinaison des prédictions et des vérités pour le modèle LR
y_test_combined_lr = np.concatenate([y_test_students, y_test_prof])
y_pred_combined_lr = np.concatenate([y_pred_lr_students, y_pred_lr_prof])

print("=== Résultats - Régression Logistique (combiné) ===")
print("Accuracy :", accuracy_score(y_test_combined_lr, y_pred_combined_lr))
print("Classification Report :")
print(classification_report(y_test_combined_lr, y_pred_combined_lr))
print("Matrice de Confusion :")
print(confusion_matrix(y_test_combined_lr, y_pred_combined_lr))

=== Résultats - Régression Logistique (combiné) ===
Accuracy : 0.953125
Classification Report :
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       416
           1       0.93      0.81      0.87        96

    accuracy                           0.95       512
   macro avg       0.94      0.90      0.92       512
weighted avg       0.95      0.95      0.95       512

Matrice de Confusion :
[[410   6]
 [ 18  78]]


In [76]:
# Entraîner 5 autres modèles performants sur chaque sous-ensemble
#    et combiner leurs prédictions pour comparer les performances.
# =============================================================================

# On définit un dictionnaire de modèles à entraîner
modeles = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
}

def entrainer_modeles(modeles, X_train, y_train, X_test, y_test):
    """Entraîne les modèles donnés, retourne un dictionnaire avec
       les performances, les prédictions et les modèles entraînés."""
    resultats = {}
    for nom, modele in modeles.items():
        modele.fit(X_train, y_train)
        y_pred = modele.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        resultats[nom] = {
            'model': modele,
            'accuracy': acc,
            'y_pred': y_pred,
            'report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        }
    return resultats

# Entraîner sur le sous-ensemble étudiants
resultats_students = entrainer_modeles(modeles, X_train_students, y_train_students, X_test_students, y_test_students)

# Entraîner sur le sous-ensemble professionnels
resultats_prof = entrainer_modeles(modeles, X_train_prof, y_train_prof, X_test_prof, y_test_prof)




In [77]:
# Combiner les prédictions pour chaque modèle et afficher les résultats globaux
# =============================================================================
def combiner_resultats(nom_modele, res_students, res_prof, y_test_students, y_test_prof):
    """Combine les prédictions et calcul l'accuracy globale pour un modèle donné."""
    y_pred_students = res_students[nom_modele]['y_pred']
    y_pred_prof = res_prof[nom_modele]['y_pred']
    y_test_combined = np.concatenate([y_test_students, y_test_prof])
    y_pred_combined = np.concatenate([y_pred_students, y_pred_prof])
    acc = accuracy_score(y_test_combined, y_pred_combined)
    return y_test_combined, y_pred_combined, acc

# Affichage des performances combinées pour chacun des modèles supplémentaires
for nom in modeles.keys():
    y_test_comb, y_pred_comb, acc = combiner_resultats(nom, resultats_students, resultats_prof,
                                                        y_test_students, y_test_prof)
    print(f"=== {nom} - Performance combinée ===")
    print("Accuracy :", acc)
    print(classification_report(y_test_comb, y_pred_comb))
    print("Matrice de confusion :\n", confusion_matrix(y_test_comb, y_pred_comb))
    print("\n" + "="*50 + "\n")

=== RandomForest - Performance combinée ===
Accuracy : 0.935546875
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       416
           1       0.92      0.72      0.81        96

    accuracy                           0.94       512
   macro avg       0.93      0.85      0.88       512
weighted avg       0.93      0.94      0.93       512

Matrice de confusion :
 [[410   6]
 [ 27  69]]


=== GradientBoosting - Performance combinée ===
Accuracy : 0.955078125
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       416
           1       0.95      0.80      0.87        96

    accuracy                           0.96       512
   macro avg       0.95      0.90      0.92       512
weighted avg       0.95      0.96      0.95       512

Matrice de confusion :
 [[412   4]
 [ 19  77]]


=== AdaBoost - Performance combinée ===
Accuracy : 0.95703125
              precision    recall  f1-score   su

In [78]:
print(resultats_students['RandomForest']['model'])
print(resultats_prof['RandomForest']['model'])


RandomForestClassifier(random_state=42)
RandomForestClassifier(random_state=42)


In [79]:
# # Affichage des Features Importantes (CORRECTION INCLUSE)
# # =============================================================================
# def afficher_feature_importances(modele, X, titre="Feature Importances"):
#     """Affiche un barplot des features importantes pour un modèle RandomForest."""
#     if modele is None or not hasattr(modele, 'feature_importances_'):
#         print(f"Erreur : {titre} - Le modèle ne fournit pas de feature_importances_.")
#         return

#     importances = modele.feature_importances_
#     features = X.columns

#     # Vérifier si les longueurs correspondent
#     if len(importances) != len(features):
#         print(f"⚠ Erreur : {titre} - Taille mismatch entre importances ({len(importances)}) et features ({len(features)}).")
#         return

#     df_importances = pd.DataFrame({'feature': features, 'importance': importances}).sort_values(by='importance', ascending=False)

#     plt.figure(figsize=(18, 8))
#     sns.barplot(data=df_importances, x='importance', y='feature', palette="viridis")
#     plt.title(titre)
#     plt.tight_layout()
#     plt.show()


In [80]:
# # Affichage pour RandomForest sur les étudiants et les professionnels
# if 'RandomForest' in resultats_students:
#     afficher_feature_importances(resultats_students['RandomForest']['model'], X_train_students,
#                                  titre="RandomForest - Feature Importances (Étudiants)")

# if 'RandomForest' in resultats_prof:
#     afficher_feature_importances(resultats_prof['RandomForest']['model'], X_train_prof,
#                                  titre="RandomForest - Feature Importances (Professionnels)")

In [81]:
# Fonction pour extraire les feature importances d'un modèle donné ---
def get_feature_importance(model, X):
    """
    Extrait les importances de features pour un modèle qui possède l'attribut feature_importances_.
    Retourne un DataFrame avec deux colonnes : 'feature' et 'importance'.
    """
    if hasattr(model, 'feature_importances_'):
        importance = model.feature_importances_
        # On suppose que l'ordre correspond aux colonnes de X
        return pd.DataFrame({'feature': X.columns, 'importance': importance})
    else:
        print("Ce modèle ne fournit pas d'attribut 'feature_importances_'.")
        return None

In [82]:
pd.set_option('display.max_columns', None)
df_students.head()

Unnamed: 0,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Financial Stress,Depression,Gender_Male,Dietary Habits_Moderate,Dietary Habits_Unhealthy,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes,Age Category_Jeune Adulte,Age Category_Adulte,Age Category_Senior Actif,Age Category_Retraité,Degree Category_Bac+5 et plus,Degree Category_Doctorat et plus,Degree Category_Secondaire,Work/Study Hours Category_Temps Partiel Léger,Work/Study Hours Category_Temps Partiel,Work/Study Hours Category_Temps Plein,City Category_Petite Ville,City Category_Ville Moyenne
17,2.0,6.51,4.0,7.5,2,No,True,True,False,True,True,False,True,False,False,False,False,False,False,False,True,True,False
19,4.0,7.48,5.0,5.5,1,No,True,False,False,True,True,False,True,False,False,False,True,False,False,False,True,False,False
28,1.0,7.21,3.0,5.5,4,Yes,True,False,True,True,False,False,True,False,False,False,True,False,False,False,True,True,False
33,1.0,9.9,4.0,9.0,2,No,True,False,True,True,True,True,False,False,False,True,False,False,False,False,True,False,False
35,1.0,5.97,5.0,9.0,2,No,False,False,False,True,True,False,True,False,False,True,False,False,False,True,False,True,False


In [83]:
pd.set_option('display.max_columns', None)
df_professionals.head()

Unnamed: 0,Work Pressure,Job Satisfaction,Sleep Duration,Financial Stress,Depression,Gender_Male,Dietary Habits_Moderate,Dietary Habits_Unhealthy,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes,Age Category_Jeune Adulte,Age Category_Adulte,Age Category_Senior Actif,Age Category_Retraité,Degree Category_Bac+5 et plus,Degree Category_Doctorat et plus,Degree Category_Secondaire,Work/Study Hours Category_Temps Partiel Léger,Work/Study Hours Category_Temps Partiel,Work/Study Hours Category_Temps Plein,City Category_Petite Ville,City Category_Ville Moyenne,Profession_Architect,Profession_Business Analyst,Profession_Chef,Profession_Chemist,Profession_Civil Engineer,Profession_Consultant,Profession_Content Writer,Profession_Customer Support,Profession_Data Scientist,Profession_Digital Marketer,Profession_Doctor,Profession_Educational Consultant,Profession_Electrician,Profession_Entrepreneur,Profession_Finanancial Analyst,Profession_Financial Analyst,Profession_Graphic Designer,Profession_HR Manager,Profession_Investment Banker,Profession_Judge,Profession_Lawyer,Profession_Manager,Profession_Marketing Manager,Profession_Mechanical Engineer,Profession_Pharmacist,Profession_Pilot,Profession_Plumber,Profession_Research Analyst,Profession_Researcher,Profession_Sales Executive,Profession_Software Engineer,Profession_Teacher,Profession_Travel Consultant,Profession_UX/UI Designer,Profession_Unknown
0,2.0,4.0,7.5,2,No,False,True,False,False,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,4.0,3.0,5.5,4,No,True,False,True,True,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2.0,3.0,5.5,2,No,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,3.0,5.0,7.5,2,No,False,False,False,True,True,False,False,True,False,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,4.0,3.0,7.5,5,No,True,True,False,True,True,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
