### Chargement du dataset

In [1]:
import kagglehub as kh
import pandas as pd

# Télécharger le dataset dans un dossier local
dataset_ref = "jacklizhi/creditcard"
path = kh.dataset_download(dataset_ref)

# Localiser le fichier CSV dans le dossier téléchargé
csv_file = f"{path}/creditcard.csv"

# Charger le fichier CSV dans un DataFrame
df = pd.read_csv(csv_file)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df['Class'].value_counts(normalize=True)

Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

### Preprocessing

In [5]:
# inliers = 1, outliers = -1
df['Class'] = df['Class'].replace(1,-1)
df['Class'] = df['Class'].replace(0,1)

df['Class'].value_counts(normalize=True)

Class
 1    0.998273
-1    0.001727
Name: proportion, dtype: float64

In [6]:
df = df.drop('Time',axis=1)

In [7]:
# Séparation train/test
X = df.drop('Class',axis=1)
y = df['Class']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# On sélectionne uniquement les labels qui correspondent aux INLIERS
y_train_inliers = y_train[y_train.values==1]

# On récupère les variables explicatives de ces index
liste = list(y_train_inliers.index.values)
X_train_inliers = X_train.loc[liste]

In [None]:
# NORMALISATION des données
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
X_train_inliers = scaler.fit_transform(X_train_inliers)
X_test = scaler.transform(X_test)

### Entraînement non supervisé (sur inliers)

In [16]:
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.metrics import recall_score,make_scorer

# Grille de paramétrage
param_grid = {
    'nu':np.linspace(0.01,0.05,5)
}

# 3 folds stratifiés pour la validation croisée
skf = StratifiedKFold(n_splits = 3)

# Métrique pour la validation croisée : rappel
resc = make_scorer(recall_score,pos_label = -1)

# GridSearch
grid = GridSearchCV(OneClassSVM(),param_grid,scoring=resc,cv=skf,n_jobs=-1)
grid.fit(X_train_inliers,y_train_inliers)

0,1,2
,estimator,OneClassSVM()
,param_grid,"{'nu': array([0.01, ..., 0.04, 0.05])}"
,scoring,make_scorer(r... pos_label=-1)
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,nu,np.float64(0.01)
,shrinking,True
,cache_size,200
,verbose,False
,max_iter,-1


In [17]:
# Prédiction sur le jeu de test
best_estim = grid.best_estimator_
y_pred = best_estim.predict(X_test)

# Matrice de confusion
pd.crosstab(y_test,y_pred,rownames=['Réel'],colnames=["Prédit"])

Prédit,-1,1
Réel,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,78,20
1,591,56273


In [18]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.12      0.80      0.20        98
           1       1.00      0.99      0.99     56864

    accuracy                           0.99     56962
   macro avg       0.56      0.89      0.60     56962
weighted avg       1.00      0.99      0.99     56962



In [19]:
from sklearn.metrics import roc_auc_score, matthews_corrcoef
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
print("#"*50)
print("MCC Score:", matthews_corrcoef(y_test, y_pred))


ROC-AUC Score: 0.8927625742193332
##################################################
MCC Score: 0.30217016275641956
