### Chargement du dataset

In [2]:
import kagglehub as kh
import pandas as pd

# Télécharger le dataset dans un dossier local
dataset_ref = "jacklizhi/creditcard"
path = kh.dataset_download(dataset_ref)

# Localiser le fichier CSV dans le dossier téléchargé
csv_file = f"{path}/creditcard.csv"

# Charger le fichier CSV dans un DataFrame
df = pd.read_csv(csv_file)

In [3]:
df['Class'].value_counts(normalize=True)

Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

### Preprocessing

In [6]:
# inliers = 1, outliers = -1
df['Class'] = df['Class'].replace(1,-1)
df['Class'] = df['Class'].replace(0,1)

df['Class'].value_counts(normalize=True)

Class
 1    0.998273
-1    0.001727
Name: proportion, dtype: float64

In [7]:
df = df.drop('Time',axis=1)

In [8]:
# Séparation train/test
X = df.drop('Class',axis=1)
y = df['Class']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
# On sélectionne uniquement les labels qui correspondent aux INLIERS
y_train_inliers = y_train[y_train.values==1]

# On récupère les variables explicatives de ces index
liste = list(y_train_inliers.index.values)
X_train_inliers = X_train.loc[liste]

In [10]:
# NORMALISATION des données
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
X_train_inliers = scaler.fit_transform(X_train_inliers)
X_test = scaler.transform(X_test)

### Entraînement non supervisé (sur inliers)

In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# Définition de l'auto-encoder
input_dim = X_train_inliers.shape[1]

inputs = layers.Input(shape=(input_dim,))
encoded = layers.Dense(16, activation="relu")(inputs)
encoded = layers.Dense(8, activation="relu")(encoded)

decoded = layers.Dense(16, activation="relu")(encoded)
decoded = layers.Dense(input_dim, activation="tanh")(decoded) # sortie entre -1 et 1

autoencoder = models.Model(inputs, decoded)
autoencoder.compile(optimizer="adam", loss="mse")

In [13]:
# Entraînement
history = autoencoder.fit(
    X_train_inliers, X_train_inliers,
    epochs=20,
    batch_size=128,
    shuffle=True,
    validation_split=0.1,
    verbose=1
)

Epoch 1/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0120 - val_loss: 0.0049
Epoch 2/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0046 - val_loss: 0.0044
Epoch 3/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0042 - val_loss: 0.0041
Epoch 4/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0040 - val_loss: 0.0039
Epoch 5/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0036 - val_loss: 0.0033
Epoch 6/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0033 - val_loss: 0.0032
Epoch 7/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0032 - val_loss: 0.0032
Epoch 8/20
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0032 - val_loss: 0.0031
Epoch 9/20
[1m1600/1600

In [14]:
# 4. Évaluation sur le test set
reconstructions = autoencoder.predict(X_test)
errors = np.mean(np.square(reconstructions - X_test), axis=1)

# 5. Définition d’un seuil (ici 95e percentile)
threshold = np.percentile(errors, 95)
y_pred = (errors > threshold).astype(int)
y_pred[y_pred==1]=-1
y_pred[y_pred==0]=1

# Matrice de confusion
pd.crosstab(y_test,y_pred,rownames=['Réel'],colnames=["Prédit"])

[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 766us/step


Prédit,-1,1
Réel,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,83,15
1,2766,54098


In [15]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.03      0.85      0.06        98
           1       1.00      0.95      0.97     56864

    accuracy                           0.95     56962
   macro avg       0.51      0.90      0.52     56962
weighted avg       1.00      0.95      0.97     56962



In [16]:
from sklearn.metrics import roc_auc_score, matthews_corrcoef
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
print("#"*50)
print("MCC Score:", matthews_corrcoef(y_test, y_pred))


ROC-AUC Score: 0.8991482003606168
##################################################
MCC Score: 0.15177452474022185
