In [1]:

import os
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score


In [2]:

# ðŸ”¹ DÃ©finir le dossier oÃ¹ enregistrer les modÃ¨les
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)  # CrÃ©e le dossier s'il n'existe pas

# ðŸ”¹ Charger les donnÃ©es
data = pd.read_csv("C:/Users/grego/Documents/USPN M2/8 RISQUE DE CREDIT/Projet/Data/credit_risk_dataset.csv")

# ðŸ”¹ Suppression des valeurs aberrantes
data = data[data['person_age'] <= 80]  # Garde les Ã¢ges <= 80 ans
data = data[data['person_emp_length'] <= 60]  # Garde les emplois <= 60 ans
data.dropna(inplace=True)  # Supprime les lignes avec valeurs manquantes

# ðŸ”¹ CrÃ©ation de nouvelles features catÃ©goriques
data['age_group'] = pd.cut(data['person_age'], bins=[20, 26, 36, 46, 56, 66], labels=['20-25', '26-35', '36-45', '46-55', '56-65'])
data['income_group'] = pd.cut(data['person_income'], bins=[0, 25000, 50000, 75000, 100000, float('inf')], labels=['low', 'low-middle', 'middle', 'high-middle', 'high'])
data['loan_amount_group'] = pd.cut(data['loan_amnt'], bins=[0, 5000, 10000, 15000, float('inf')], labels=['small', 'medium', 'large', 'very large'])

# ðŸ”¹ VÃ©rification de la prÃ©sence des nouvelles colonnes
print("âœ… Colonnes crÃ©Ã©es avec succÃ¨s !")
print(data[['age_group', 'income_group', 'loan_amount_group']].head())

# ðŸ”¹ DÃ©finition des colonnes Ã  encoder (ðŸš¨ `loan_grade` SUPPRIMÃ‰)
ohe_columns = ['cb_person_default_on_file', 'person_home_ownership', 'loan_intent', 'income_group', 'age_group', 'loan_amount_group']

# ðŸ”¹ EntraÃ®nement du OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(data[ohe_columns])

# ðŸ”¹ Sauvegarde du OneHotEncoder
joblib.dump(ohe, os.path.join(MODEL_DIR, "onehot_encoder.pkl"))
print("âœ… OneHotEncoder sauvegardÃ© sous 'models/onehot_encoder.pkl'")

# ðŸ”¹ CrÃ©ation des nouvelles features numÃ©riques
data['loan_to_income_ratio'] = data['loan_amnt'] / data['person_income']
data['loan_to_emp_length_ratio'] = data['person_emp_length'] / data['loan_amnt']
data['int_rate_to_loan_amt_ratio'] = data['loan_int_rate'] / data['loan_amnt']

# ðŸ”¹ DÃ©finition des colonnes numÃ©riques Ã  normaliser
normal_cols = ['person_income', 'person_age', 'person_emp_length', 'loan_amnt', 'loan_int_rate',
               'cb_person_cred_hist_length', 'loan_percent_income', 'loan_to_income_ratio', 
               'loan_to_emp_length_ratio', 'int_rate_to_loan_amt_ratio']

# ðŸ”¹ EntraÃ®nement du StandardScaler
scaler = StandardScaler()
scaler.fit(data[normal_cols])

# ðŸ”¹ Sauvegarde du StandardScaler
joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
print("âœ… StandardScaler sauvegardÃ© sous 'models/scaler.pkl'")


âœ… Colonnes crÃ©Ã©es avec succÃ¨s !
  age_group income_group loan_amount_group
1     20-25          low             small
2     20-25          low            medium
3     20-25       middle        very large
4     20-25       middle        very large
5     20-25          low             small
âœ… OneHotEncoder sauvegardÃ© sous 'models/onehot_encoder.pkl'
âœ… StandardScaler sauvegardÃ© sous 'models/scaler.pkl'


In [3]:

# ðŸ”¹ SÃ©paration des donnÃ©es en features (X) et target (y)
X = data.drop(columns=['loan_status'])
y = data['loan_status']

# ðŸ”¹ Division en train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
print(f"ðŸ“Š Taille du jeu d'entraÃ®nement : {X_train.shape}")
print(f"ðŸ“Š Taille du jeu de test : {X_test.shape}")

# ðŸ”¹ Transformation des donnÃ©es (OneHotEncoding + Normalisation)
X_train_encoded = pd.DataFrame(ohe.transform(X_train[ohe_columns]), columns=ohe.get_feature_names_out())
X_test_encoded = pd.DataFrame(ohe.transform(X_test[ohe_columns]), columns=ohe.get_feature_names_out())

X_train_scaled = pd.DataFrame(scaler.transform(X_train[normal_cols]), columns=normal_cols)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[normal_cols]), columns=normal_cols)

# ðŸ”¹ Fusion des features encodÃ©es et normalisÃ©es
X_train_final = pd.concat([X_train_encoded, X_train_scaled], axis=1)
X_test_final = pd.concat([X_test_encoded, X_test_scaled], axis=1)

# ðŸ”¹ VÃ©rification des dimensions avant entraÃ®nement du modÃ¨le
print(f"âœ… X_train_final shape: {X_train_final.shape}")
print(f"âœ… X_test_final shape: {X_test_final.shape}")

# ðŸ”¹ DÃ©finition du modÃ¨le XGBoost
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100
)

# ðŸ”¹ EntraÃ®nement du modÃ¨le XGBoost
xgb_model.fit(X_train_final, y_train)

# ðŸ”¹ PrÃ©dictions et Ã©valuation
y_pred = xgb_model.predict(X_test_final)
accuracy = accuracy_score(y_test, y_pred)
print(f"ðŸŽ¯ PrÃ©cision du modÃ¨le XGBoost : {accuracy:.4f}")

# ðŸ”¹ Sauvegarde du modÃ¨le
joblib.dump(xgb_model, os.path.join(MODEL_DIR, "xgboost_credit_risk.pkl"))
print("âœ… ModÃ¨le sauvegardÃ© sous 'models/xgboost_credit_risk.pkl'")



ðŸ“Š Taille du jeu d'entraÃ®nement : (22904, 17)
ðŸ“Š Taille du jeu de test : (5727, 17)
âœ… X_train_final shape: (22904, 37)
âœ… X_test_final shape: (5727, 37)
ðŸŽ¯ PrÃ©cision du modÃ¨le XGBoost : 0.9265
âœ… ModÃ¨le sauvegardÃ© sous 'models/xgboost_credit_risk.pkl'


Parameters: { "use_label_encoder" } are not used.

