In [30]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE


# 1. Charger les données
df = pd.read_csv("loan.csv", low_memory=False)

# 2. Garder uniquement lesdonnées  type prêts "Fully Paid" et "Charged Off"
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]
df['target'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})

# 3. Sélection des variables explicatifs
features = ['loan_amnt', 'term', 'int_rate', 'emp_length',
            'annual_inc', 'purpose', 'dti', 'total_acc']
df = df[features + ['target']].copy()

# 4. Nettoyage
df['int_rate'] = df['int_rate'].astype(str).str.rstrip('%').astype(float)
df[['term', 'emp_length', 'purpose']] = df[['term', 'emp_length', 'purpose']].fillna('Unknown')

# 5. Variables catégorielles
le_dict = {}
for col in ['term', 'emp_length', 'purpose']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

# 6. Supprimer les lignes avec NaN restantes
df = df.dropna()

# 7. Séparer X et y
X = df.drop('target', axis=1)
y = df['target']

# 8. Standardiser
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 9. Equilibrage des données avec SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 10. Train/Test Split sur les données rééquilibrées
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 10. Entraînement du model LogisticRegression 
model = LogisticRegression(max_iter=3000, class_weight='balanced')
model.fit(X_train, y_train)

# 11. Évaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# 12. Sauvegarde
joblib.dump(model, "credit_model_logreg_v3.pkl")
joblib.dump(le_dict, "label_encoders_v3.pkl")
joblib.dump(scaler, "scaler_v3.pkl")



              precision    recall  f1-score   support

           0       0.65      0.65      0.65     41787
           1       0.65      0.64      0.64     41303

    accuracy                           0.65     83090
   macro avg       0.65      0.65      0.65     83090
weighted avg       0.65      0.65      0.65     83090



['scaler_v3.pkl']

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# 1. Charger les données
df = pd.read_csv("loan.csv", low_memory=False)

# 2. Garder uniquement les prêts "Fully Paid" et "Charged Off"
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]
df['target'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})

# 3. Sélection des variables
features = ['loan_amnt', 'term', 'int_rate', 'emp_length',
            'annual_inc', 'purpose', 'dti']
df = df[features + ['target']].copy()

# 4. Nettoyage
df['int_rate'] = df['int_rate'].astype(str).str.rstrip('%').astype(float)
df[['term', 'emp_length', 'purpose']] = df[['term', 'emp_length', 'purpose']].fillna('Unknown')

# 5. Encodage des variables catégorielles
le_dict = {}
for col in ['term', 'emp_length', 'purpose']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

# 6. Supprimer les lignes avec NaN restantes
df = df.dropna()

# 7. Séparer X et y
X = df.drop('target', axis=1)
y = df['target']

# 8. Standardiser
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 9. Appliquer SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 10. Train/Test Split sur les données rééquilibrées
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 11. Entraînement Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 12. Prédiction et évaluation
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

# 13. Sauvegarde des objets
joblib.dump(rf, "credit_model_rf_smote.pkl")
joblib.dump(le_dict, "label_encoders_rf_smote.pkl")
joblib.dump(scaler, "scaler_rf_smote.pkl")

              precision    recall  f1-score   support

           0       0.86      0.87      0.86     41787
           1       0.86      0.86      0.86     41303

    accuracy                           0.86     83090
   macro avg       0.86      0.86      0.86     83090
weighted avg       0.86      0.86      0.86     83090



['scaler_rf_smote.pkl']