In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

Pour évaluer la pertinence du modèle que nous sélectionnerons, nous devons créer une fonction de score qui pénalise les mauvaises prédictions, et plus particulièrement les faux négatifs (False Negatives). Un faux négatif est une prédiction pour laquelle le modèle prédit 0 alors que la valeur attendue est 1. Cette erreur est la plus dangereuse pour la pertinence du modèle : si nous prédisons qu'un crédit sera probablement remboursé et que ce n'est pas le cas, cela impacte directement la rentabilité de la banque, ce qui n'est pas souhaitable. C'est pourquoi, dans la fonction ci-dessous, un faux négatif est pénalisé dix fois plus qu'un faux positif.

In [2]:
def custom_penalty(y_test, y_pred):
    """Score binary predictions, penalising false negatives 10x more than false positives.

    Each false negative (true label 1 predicted as 0) adds 10 to the penalty and
    each false positive (true label 0 predicted as 1) adds 1. The total penalty
    is averaged over the number of samples, scaled by 0.1 and subtracted from 1,
    so a perfect prediction scores 1.0 and higher is better.

    Parameters
    ----------
    y_test : array-like
        True binary labels (single-column DataFrame, Series, ndarray or list).
        Flattened to 1-D before comparison.
    y_pred : array-like
        Predicted binary labels, with as many elements as ``y_test``.

    Returns
    -------
    float
        ``1 - 0.1 * (10 * FN + FP) / n_samples``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # np.asarray accepts pandas objects, ndarrays and plain sequences alike,
    # whereas the pandas-only ``.to_numpy()`` fails on ndarray/list inputs.
    y_true = np.asarray(y_test).ravel()
    y_hat = np.asarray(y_pred).ravel()

    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("custom_penalty requires at least one sample")
    if len(y_hat) != n_samples:
        raise ValueError(
            f"y_test and y_pred must have the same length "
            f"(got {n_samples} and {len(y_hat)})"
        )

    # Vectorised confusion counts replace the per-row Python loop.
    false_negatives = np.count_nonzero((y_true == 1) & (y_hat == 0))
    false_positives = np.count_nonzero((y_true == 0) & (y_hat == 1))
    penalty = 10.0 * false_negatives + 1.0 * false_positives

    score = (penalty / n_samples) * 0.1
    return 1 - score

In [3]:
# Load the pre-split train/test features and targets of the "hybrid" dataset
# variant from CSV (files are read in the order train X, test X, train y, test y).
X_train_hybrid, X_test_hybrid, y_train_hybrid, y_test_hybrid = map(
    pd.read_csv,
    (
        'X_train_CSV_hybrid.csv',
        'X_test_CSV_hybrid.csv',
        'y_train_CSV_hybrid.csv',
        'y_test_CSV_hybrid.csv',
    ),
)

In [4]:
# Standardise the hybrid features: the scaler is fitted on the training split
# only and then applied unchanged to both splits.
# NOTE: `scaler`, `X_train_scaled` and `X_test_scaled` are reassigned further
# down for the "under" dataset, so these values are only valid until then.
scaler = StandardScaler().fit(X_train_hybrid)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train_hybrid, X_test_hybrid)
)

In [5]:
# Random baseline: draw a 0/1 label uniformly at random for every row of the
# hybrid test set. A dedicated, seeded Generator makes the baseline metrics
# reproducible after a kernel restart; perf_counter() is the clock intended
# for measuring short elapsed intervals.
baseline_rng = np.random.default_rng(seed=42)
start_time = time.perf_counter()
y_pred_random = baseline_rng.integers(0, 2, size=y_test_hybrid.shape[0])
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Le temps de traitement est de : {execution_time} secondes")

Le temps de traitement est de : 0.0002162456512451172 secondes


In [6]:
# Custom false-negative-weighted score of the random baseline (hybrid test labels).
custom_penalty(y_test=y_test_hybrid, y_pred=y_pred_random)

0.9158

In [7]:
# F1-score of the random baseline against the hybrid test labels.
f1 = f1_score(y_true=y_test_hybrid, y_pred=y_pred_random)
print(f"F1-score : {f1}")

F1-score : 0.12790494495893764


In [8]:
# ROC curve points and AUC of the random baseline on the hybrid test labels.
# roc_curve / roc_auc_score are imported with the other dependencies in the
# notebook's import cell rather than in the middle of the analysis.
fpr, tpr, thresholds = roc_curve(y_test_hybrid, y_pred_random)
auc_score_hybrid = roc_auc_score(y_test_hybrid, y_pred_random)
auc_score_hybrid

0.4958715224021558

In [9]:
# Load the pre-split train/test features and targets of the "under" dataset
# variant from CSV (files are read in the order train X, test X, train y, test y).
X_train_under, X_test_under, y_train_under, y_test_under = map(
    pd.read_csv,
    (
        'X_train_CSV_under.csv',
        'X_test_CSV_under.csv',
        'y_train_CSV_under.csv',
        'y_test_CSV_under.csv',
    ),
)

In [10]:
# Standardise the "under" features: the scaler is fitted on the training split
# only and then applied unchanged to both splits.
# NOTE: this rebinds `scaler`, `X_train_scaled` and `X_test_scaled`, replacing
# the values computed earlier for the hybrid dataset.
scaler = StandardScaler().fit(X_train_under)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train_under, X_test_under)
)

In [11]:
# Random baseline: draw a 0/1 label uniformly at random for every row of the
# "under" test set. A dedicated, seeded Generator makes the baseline metrics
# reproducible after a kernel restart; perf_counter() is the clock intended
# for measuring short elapsed intervals.
baseline_rng = np.random.default_rng(seed=42)
start_time = time.perf_counter()
y_pred_random = baseline_rng.integers(0, 2, size=y_test_under.shape[0])
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Le temps de traitement est de : {execution_time} secondes")

Le temps de traitement est de : 0.0005259513854980469 secondes


In [12]:
# Custom false-negative-weighted score of the random baseline ("under" test labels).
custom_penalty(y_test=y_test_under, y_pred=y_pred_random)

0.91571

In [13]:
# F1-score of the random baseline against the "under" test labels.
f1 = f1_score(y_true=y_test_under, y_pred=y_pred_random)
print(f"F1-score : {f1}")

F1-score : 0.12852233676975947


In [14]:
# ROC curve points and AUC of the random baseline on the "under" test labels.
# y_pred_random was regenerated with y_test_under's length just above, so it
# must be scored against y_test_under (not y_test_hybrid). The result gets its
# own name so the hybrid AUC computed earlier is not overwritten.
fpr, tpr, thresholds = roc_curve(y_test_under, y_pred_random)
auc_score_under = roc_auc_score(y_test_under, y_pred_random)
auc_score_under

0.49641702369114776