In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [2]:
# Load the credit-risk dataset from the working directory and preview
# its first five rows (the default for `head`).
df = pd.read_csv("credit_risk_dataset.csv")
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
# Count the missing values in each column.
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [4]:
# Handle missing values by dropping every row that contains at least one
# NaN. No imputation is performed here.
df = df.dropna(axis=0, how="any")

In [5]:
# Encode categorical (object-dtype) columns as integer codes.
# NOTE: this overwrites the columns of `df` in place, so later cells see the
# encoded frame. Each fitted encoder is kept under its column name so the
# same mapping can be reused or inverted later.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separate the features from the target column.
X = df.drop(columns=["loan_status"])
y = df["loan_status"]

# Split into training and test sets BEFORE scaling, so the scaler's mean and
# variance are estimated from training rows only. Fitting the scaler on the
# full dataset would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize: fit on the training split, then apply the same transform to
# the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Persist the fitted scaler itself (not the scaled array) so the identical
# transform can be applied to new data at prediction time.
joblib.dump(scaler, "scaler.joblib")

# Kept for backward compatibility: the full feature matrix scaled with the
# training-set statistics.
X_scaled = scaler.transform(X)


### 📊 Étape 2 : Analyse Exploratoire des Données (EDA)

In [None]:

# Histogram of every column.
df.hist(figsize=(12, 10))
plt.show()

# Pairwise correlation matrix, shown as an annotated heatmap.
fig_corr, ax_corr = plt.subplots(figsize=(10, 6))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax_corr)
ax_corr.set_title("Matrice de corrélation")
plt.show()

# One boxplot per column, with the x tick labels rotated vertically so the
# column names stay readable.
fig_box, ax_box = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax_box)
plt.setp(ax_box.get_xticklabels(), rotation=90)
ax_box.set_title("Boxplot des variables")
plt.show()


### 🏗️ Étape 3 : Construction des Modèles

In [None]:
# Logistic regression: fit on the training split (`fit` returns the
# estimator itself), then predict class labels for the test split.
log_model = LogisticRegression().fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)


In [None]:
# Evaluate the logistic regression on the test split.
log_report = classification_report(y_test, y_pred_log)
print("Régression Logistique:\n", log_report)

In [None]:
# Decision tree: fit on the training split, then predict class labels for
# the test split.
# `random_state` is fixed (same seed as the train/test split) because the
# tree permutes features randomly at each split; without a seed, the fitted
# tree and its scores can change from one run to the next.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)


In [None]:
# Evaluate the decision tree on the test split.
tree_report = classification_report(y_test, y_pred_tree)
print("Arbre de Décision:\n", tree_report)

In [None]:
# Compare the ROC-AUC of the two models on the test split.
# ROC-AUC measures how well a model ranks positives above negatives, so it
# must be computed from continuous scores, not from thresholded 0/1
# predictions (those reduce the ROC curve to a single operating point).
# Column 1 of `predict_proba` is the probability of the positive class
# (loan_status == 1).
y_score_log = log_model.predict_proba(X_test)[:, 1]
y_score_tree = tree_model.predict_proba(X_test)[:, 1]
print("AUC-ROC Régression Logistique:", roc_auc_score(y_test, y_score_log))
print("AUC-ROC Arbre de Décision:", roc_auc_score(y_test, y_score_tree))


### 🏆 Étape 4 : Sélection du Meilleur Modèle

In [None]:
# Select the model with the higher test ROC-AUC. The score is computed from
# the predicted probability of the positive class (column 1 of
# `predict_proba`) rather than from hard 0/1 labels, which would reduce the
# ROC curve to a single point. On a tie the decision tree is kept.
auc_log = roc_auc_score(y_test, log_model.predict_proba(X_test)[:, 1])
auc_tree = roc_auc_score(y_test, tree_model.predict_proba(X_test)[:, 1])
best_model = log_model if auc_log > auc_tree else tree_model

# Persist the selected model (joblib is already imported in the first cell).
joblib.dump(best_model, "model.joblib")
