### Chargement du dataset

In [1]:
import kagglehub as kh
import pandas as pd

# Télécharger le dataset dans un dossier local
dataset_ref = "jacklizhi/creditcard"
path = kh.dataset_download(dataset_ref)

# Localiser le fichier CSV dans le dossier téléchargé
csv_file = f"{path}/creditcard.csv"

# Charger le fichier CSV dans un DataFrame
df = pd.read_csv(csv_file)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df['Class'].value_counts(normalize=True)

Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

### Preprocessing

In [5]:
# inliers = 1, outliers = -1
df['Class'] = df['Class'].replace(1,-1)
df['Class'] = df['Class'].replace(0,1)

df['Class'].value_counts(normalize=True)

Class
 1    0.998273
-1    0.001727
Name: proportion, dtype: float64

In [6]:
df = df.drop('Time',axis=1)

In [None]:
# REEQUILIBRAGE Inliers / Outliers pour classification supervisée
df = df.sample(frac=1)

frauds = df[df['Class'] == -1]
non_frauds = df[df['Class'] == 1][:500]

new_df = pd.concat([non_frauds, frauds])
df = new_df.sample(frac = 1, random_state = 42)

In [17]:
# Séparation train/test
X = df.drop('Class',axis=1)
y = df['Class']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
# REEQUILIBRAGE Inliers / Outliers pour classification supervisée
y_train_knn = pd.concat([y_train[(y_train==1)][:500],y_train[(y_train==-1)]])
X_train_knn = X_train.loc[y_train_knn.index]

In [25]:
# NORMALISATION des données
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
X_train_knn_norm = scaler.fit_transform(X_train_knn)
X_test_norm = scaler.transform(X_test)

### Entraînement supervisé

In [30]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.metrics import f1_score,make_scorer

# Grille de paramétrage
param_grid = {
    'n_neighbors' : np.arange(1,25)
}

# 3 folds stratifiés pour la validation croisée
skf = StratifiedKFold(n_splits = 3)

# Métrique pour la validation croisée : rappel
resc = make_scorer(f1_score,pos_label = -1)

# GridSearch
grid = GridSearchCV(KNeighborsClassifier(),param_grid,scoring=resc,cv=skf,n_jobs=-1)
grid.fit(X_train_knn_norm,y_train_knn)

0,1,2
,estimator,KNeighborsClassifier()
,param_grid,"{'n_neighbors': array([ 1, 2..., 22, 23, 24])}"
,scoring,make_scorer(f... pos_label=-1)
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,np.int64(12)
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [31]:
# Prédiction sur le jeu de test
best_estim = grid.best_estimator_
y_pred = best_estim.predict(X_test_norm)

# Matrice de confusion
pd.crosstab(y_test,y_pred,rownames=['Réel'],colnames=["Prédit"])

Prédit,-1,1
Réel,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,87,11
1,718,56146


In [32]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.11      0.89      0.19        98
           1       1.00      0.99      0.99     56864

    accuracy                           0.99     56962
   macro avg       0.55      0.94      0.59     56962
weighted avg       1.00      0.99      0.99     56962



In [33]:
from sklearn.metrics import roc_auc_score, matthews_corrcoef
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
print("#"*50)
print("MCC Score:", matthews_corrcoef(y_test, y_pred))


ROC-AUC Score: 0.9375642420727436
##################################################
MCC Score: 0.3072583336620065


In [40]:
bilan = X_test
bilan["Fraude Réelle"]=(y_test==-1)
bilan["Fraude Prédite"]=(y_pred==-1)
bilan = bilan.loc[bilan["Fraude Prédite"]].drop("Fraude Prédite",axis=1)

display(bilan)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Fraude Réelle
43428,-16.526507,8.584972,-18.649853,9.505594,-13.793819,-2.832404,-16.701694,7.517344,-8.507059,-14.110184,...,1.190739,-1.127670,-2.358579,0.673461,-1.413700,-0.462762,-2.018575,-1.042804,364.19,True
68279,0.676855,1.587029,-1.508147,1.443815,1.316790,-1.160342,1.149049,-0.419393,-0.090565,-0.323058,...,-0.191418,0.101451,0.058298,-0.135311,-0.604406,-0.452763,0.046430,-0.383642,0.89,False
219257,-29.942972,-25.831782,-16.227512,6.690679,-20.787846,13.085694,17.256623,-9.161746,5.003041,-2.431466,...,-2.494699,-0.660297,-8.537816,0.400804,-0.643023,0.496903,6.267709,-2.765070,3502.11,False
260891,1.691918,0.060811,-3.159448,0.759007,0.802683,-1.476673,0.755862,-0.324032,0.354501,-1.361956,...,0.051500,-0.094622,-0.235420,-0.621991,0.297541,-0.532990,-0.010200,0.030144,179.70,False
271159,1.887761,0.628270,-2.759571,1.622396,0.944622,-1.154559,0.364581,-0.135389,0.034552,-0.974695,...,0.106824,0.293626,-0.136295,-0.730963,0.263144,-0.419546,0.029239,0.019826,48.80,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190373,-1.020601,0.408378,1.346063,2.442500,1.872342,0.740481,0.198278,0.446584,-1.090081,0.550943,...,0.179631,0.395384,-0.375630,-1.127751,0.275364,0.079483,0.075846,0.131860,0.00,False
258316,1.709786,1.007509,-1.754137,3.955383,1.803040,1.263957,0.179941,0.388057,-1.181031,0.098890,...,-0.299799,-0.666118,0.421406,-1.676892,-0.546952,-0.213418,0.070210,0.004708,11.37,False
234313,0.028130,2.766468,-3.313335,5.464774,-0.390974,-0.281124,1.212967,0.490753,-2.157466,-0.306736,...,0.019584,0.237740,0.328578,-0.355494,-0.314413,0.492333,0.202147,-0.159341,290.80,False
34071,-1.920081,-0.815548,-0.585533,1.737926,4.230568,-2.461522,-0.354317,-0.099259,-0.922947,-1.177254,...,0.063216,-0.345277,0.022152,-0.832397,0.167621,-0.275638,-0.074467,0.038481,1.00,False
