**DETECTION DE FRAUDES FINANCIERES** \
**ANDRIANTAOLO Valisoaniony Anouchka** \
Version 3.0

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
import random
import lightgbm as lgbm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as mk 
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# 1. ANALYSE DE FORME

In [None]:
# Run the processus_ml1.0.ipynb notebook first
# (presumably it produces the CSV read below — TODO confirm)
df = pd.read_csv('../data/dataset_fraud_detection.csv')

In [3]:
# Parse the timestamps, then store every integer column as object dtype
# so that it is handled as a categorical identifier rather than a number
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
int_like_cols = list(df.select_dtypes(include='int').columns)
df = df.astype({name: 'object' for name in int_like_cols})

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6924041 entries, 0 to 6924040
Data columns (total 10 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Timestamp           datetime64[ns]
 1   From Bank           object        
 2   From Account        object        
 3   To Bank             object        
 4   To Account          object        
 5   Amount Received     float64       
 6   Receiving Currency  object        
 7   Amount Paid         float64       
 8   Payment Currency    object        
 9   Payment Format      object        
dtypes: datetime64[ns](1), float64(2), object(7)
memory usage: 528.3+ MB


In [5]:
# Snapshot of the typed frame taken before the EDA adds helper columns to df;
# the train / test / demo split in the preprocessing section is made from this copy
datacopy = df.copy()

In [7]:
df.describe(include='all')

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format
count,6924041,6924041.0,6924041,6924041.0,6924041,6924041.0,6924041,6924041.0,6924041,6924041
unique,,41814.0,681281,21588.0,576176,,15,,15,7
top,,70.0,10042B660,11.0,10042B660,,US Dollar,,US Dollar,Cheque
freq,,609991.0,222037,66055.0,1553,,2537242,,2553886,2503158
mean,2022-09-05 07:09:11.304066560,,,,,6324074.0,,4676041.0,,
min,2022-09-01 00:00:00,,,,,1e-06,,1e-06,,
25%,2022-09-02 04:26:00,,,,,174.21,,175.38,,
50%,2022-09-05 12:12:00,,,,,1397.63,,1399.45,,
75%,2022-09-08 03:04:00,,,,,12296.53,,12226.87,,
max,2022-09-17 15:28:00,,,,,3644854000000.0,,3644854000000.0,,


# 2. ANALYSE DU FOND (EDA)

In [None]:
# QUALITATIVE VARIABLES: every object-dtype column of df
# NOTE: the name cat_cols is rebound later by the preprocessing(df_train) call,
# so the EDA cells that use it must run before that call (or after re-running this cell)
cat_cols = df.select_dtypes('object').columns
vars_quals = df[cat_cols]
vars_quals.head()

In [None]:
# QUANTITATIVE VARIABLES: every float column of df
# NOTE: run this before the cell that adds the float column 'Delta' to df; with a
# third float column the 1x2 subplot grids indexed by cont_cols would overflow
cont_cols = df.select_dtypes('float').columns
vars_quants = df[cont_cols]
vars_quants.head()

## 2.1. ANALYSE UNIVARIEE

### 2.1.1. Variables qualitatives

In [None]:
# NUMBER OF DISTINCT VALUES per qualitative column
distinct_counts = df[cat_cols].nunique()
for col, n_distinct in distinct_counts.items():
    print(f"Colonne : {col :.<50} {n_distinct} valeurs uniques")

In [None]:
# CATEGORIES AND THEIR RELATIVE FREQUENCIES
def _annotate_bar_heights(ax):
    """Write the height of every bar of ``ax`` above it, formatted as a percentage."""
    for bar in ax.patches:
        ax.annotate(f'{bar.get_height():.2%}',
                    (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')


for col in cat_cols:
    # Relative frequencies are computed once per column and reused by both branches
    freqs = df[col].value_counts(normalize=True)

    if len(freqs) > 15:
        # High-cardinality column: show the most and the least active banks/accounts
        fig, axes = plt.subplots(1, 2, figsize=(12, 8))
        top10 = freqs.nlargest(10)
        bottom10 = freqs.nsmallest(10)
        # Shared y-limit so that the two panels are directly comparable
        y_max = max(top10.max(), bottom10.max()) * 1.1
        axtop = top10.plot(kind='bar', ax=axes[0],
                           title=f'{col} (les 10 plus actifs)',
                           ylim=(0, y_max),
                           xlabel="Modalité",
                           ylabel="Fréquence")
        axbottom = bottom10.plot(kind='bar', ax=axes[1],
                                 title=f'{col} (les 10 moins actifs)',
                                 ylim=(0, y_max),
                                 xlabel="Modalité",
                                 ylabel="Fréquence")
        _annotate_bar_heights(axtop)
        _annotate_bar_heights(axbottom)
    else:
        # Low-cardinality column: show every category to spot dominant and rare values
        fig, ax = plt.subplots(figsize=(10, 8))
        freqs.plot(kind='bar', ax=ax, title=f'{col}',
                   xlabel="Modalité",
                   ylabel="Fréquence")
        _annotate_bar_heights(ax)



### 2.1.2. Variables quantitatives

In [None]:
# HISTOGRAM: inspect the shape of each amount distribution (symmetry, flatness)
fig, axes = plt.subplots(1, 2, figsize=(10, 8))
for i, col in enumerate(cont_cols):
    panel = axes[i]
    sns.histplot(df[col], bins=30, kde=True, ax=panel)
    panel.set(title=f'{col}', xlabel='Montants', ylabel='Fréquence')

In [None]:
# BOXPLOT: reveal outliers / extreme values of the amounts
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=vars_quants, ax=ax)
ax.set_title('Boxplot des montants')

In [None]:
# QQ-PLOT: compare each amount distribution with a normal law
fig, axes = plt.subplots(1, 2, figsize=(10, 8))
for i, col in enumerate(cont_cols):
    panel = axes[i]
    stats.probplot(df[col], dist='norm', plot=panel)
    panel.set_title(f'{col}')

## 2.2. ANALYSE BIVARIEE

### 2.2.1. Variables quantitatives

In [None]:
# SCATTER PLOT: check the kind of relationship (linear or not)
# Amount Paid vs Amount Received; points on the dashed diagonal are unchanged amounts
paid, received = df['Amount Paid'], df['Amount Received']
max_val = max(paid.max(), received.max())

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(paid, received, alpha=0.4)
ax.plot([0, max_val], [0, max_val], color='red', ls='--', label='Amount Paid = Amount Received')
ax.set_xlabel('Amount Paid')
ax.set_ylabel('Amount Received')
ax.set_title('Amount Paid vs Amount Received')
ax.legend()

In [None]:
# SCATTER PLOT: check whether the paid/received gap grows with the amount
# Delta vs Amount Paid / Amount Received
df['Delta'] = df['Amount Paid'].sub(df['Amount Received'])

for col in cont_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df[col], df['Delta'], alpha=0.4)
    ax.axhline(0, color='red', ls='--')  # zero line: no difference between paid and received
    ax.set(xlabel=f'{col}', ylabel='Delta', title=f'Delta vs {col}')

In [None]:
# HEATMAP of the correlations between the amounts; normality is not verified,
# which is presumably why the rank-based Spearman coefficient is used here
spearman_corr = vars_quants.corr(method='spearman')
sns.heatmap(spearman_corr, annot=True, cmap='YlGnBu')

In [None]:
# SPEARMAN TEST

# H0: there is no monotonic correlation between the two variables (p-value >= 0.05)
# H1: there is a significant monotonic correlation between the two variables (p-value < 0.05)
stats.spearmanr(vars_quants['Amount Paid'],vars_quants['Amount Received'])

### 2.2.2. Variables qualitatives

In [None]:
"""From Bank vs To Bank"""
# Cross distribution between banks --> most used banking channels,
# i.e. the most frequent sender/receiver bank associations
from_bank_ids = vars_quals['From Bank'].value_counts().nlargest(10).index
to_bank_ids = vars_quals['To Bank'].value_counts().nlargest(10).index
top10_from_bank = vars_quals[vars_quals['From Bank'].isin(from_bank_ids)]
top10_to_bank = vars_quals[vars_quals['To Bank'].isin(to_bank_ids)]

# HEATMAP
# pd.crosstab aligns the two Series on their shared index labels, so only the
# transactions whose sender AND receiver banks are both in their top 10 are counted
bank_flows = pd.crosstab(top10_from_bank['From Bank'], top10_to_bank['To Bank'])
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(bank_flows, cmap='YlGnBu', annot=True, fmt='d', ax=ax)
ax.set_title('Flux bancaires : From Bank vers To Bank (sur les top 10)')

In [None]:
"""From Account vs To Account"""
# Cross distribution between accounts --> accounts that are always connected,
# i.e. suspicious associations between frequent accounts
from_account_ids = vars_quals['From Account'].value_counts().nlargest(20).index
to_account_ids = vars_quals['To Account'].value_counts().nlargest(20).index
top20_from_account = vars_quals[vars_quals['From Account'].isin(from_account_ids)]
top20_to_account = vars_quals[vars_quals['To Account'].isin(to_account_ids)]

# HEATMAP
# The two Series are aligned on their shared index labels by pd.crosstab, so only
# transactions whose two accounts are both in their top 20 are counted
account_links = pd.crosstab(top20_from_account['From Account'], top20_to_account['To Account'])
sns.heatmap(account_links, cmap='YlGnBu', annot=True, fmt='d')
plt.title('Connexion des comptes : From Account vers To Account (sur les top 20)')

In [None]:
# Number of distinct recipients per sending account --> mass sending
to_account_by_top20_from_account = (
    vars_quals.groupby('From Account')['To Account']
    .nunique()
    .nlargest(20)
)

# BAR PLOT
fig, ax = plt.subplots()
sns.barplot(x=to_account_by_top20_from_account.index,
            y=to_account_by_top20_from_account.values,
            ax=ax)
ax.set_title('Nombre de récepteurs par From Account (top 20)')
ax.set_ylabel('Nombre (To Account)')
ax.tick_params(axis='x', rotation=90)


In [None]:
# Number of distinct senders per receiving account --> mass reception
from_account_by_top20_to_account = (
    vars_quals.groupby('To Account')['From Account']
    .nunique()
    .nlargest(20)
)

# BAR PLOT
fig, ax = plt.subplots()
sns.barplot(x=from_account_by_top20_to_account.index,
            y=from_account_by_top20_to_account.values,
            ax=ax)
ax.set_title("Nombre d'émetteurs par To Account (top 20)")
ax.set_ylabel('Nombre (From Account)')
ax.tick_params(axis='x', rotation=90)

In [None]:
"""From Bank ve From/To Account"""

# Accounts linked to each sending bank (20 largest counts)
from_account_by_top20_from_bank = vars_quals.groupby('From Bank')['From Account'].nunique().nlargest(20)
to_account_by_top20_from_bank = vars_quals.groupby('From Bank')['To Account'].nunique().nlargest(20)

# Common y-limit so that both panels share the same scale
y_max = max(from_account_by_top20_from_bank.max(), to_account_by_top20_from_bank.max())*1.1

# BAR PLOTS
fig, axes = plt.subplots(1, 2, figsize=(8, 6))
panels = (
    # how many accounts a bank uses to send money
    (axes[0], from_account_by_top20_from_bank,
     'From Bank <--> From Account', 'Nombre de comptes (From Account)'),
    # how many beneficiary accounts received money from it
    (axes[1], to_account_by_top20_from_bank,
     'From Bank <--> To Account', 'Nombre de comptes (To Account)'),
)
for panel, counts, panel_title, panel_ylabel in panels:
    sns.barplot(x=counts.index, y=counts.values, ax=panel, order=counts.index)
    panel.set_title(panel_title)
    panel.set_ylabel(panel_ylabel)
    panel.tick_params(axis='x', rotation=90)
    panel.set_ylim(0, y_max)


In [None]:
"""To Bank vs From/To Account"""

# Accounts linked to each receiving bank (20 largest counts)
from_account_by_top20_to_bank = vars_quals.groupby('To Bank')['From Account'].nunique().nlargest(20)
to_account_by_top20_to_bank = vars_quals.groupby('To Bank')['To Account'].nunique().nlargest(20)

# Common y-limit so that both panels share the same scale
y_max = max(from_account_by_top20_to_bank.max(), to_account_by_top20_to_bank.max())*1.1

fig, axes = plt.subplots(1, 2, figsize=(8, 6))
panels = (
    # how many accounts sent money to the bank
    (axes[0], from_account_by_top20_to_bank,
     'To Bank <--> From Account', 'Nombre de comptes (From Account)'),
    # how many receiving accounts the bank hosts
    (axes[1], to_account_by_top20_to_bank,
     'To Bank <--> To Account', 'Nombre de comptes (To Account)'),
)
for panel, counts, panel_title, panel_ylabel in panels:
    sns.barplot(x=counts.index, y=counts.values, ax=panel, order=counts.index)
    panel.set_title(panel_title)
    panel.set_ylabel(panel_ylabel)
    panel.tick_params(axis='x', rotation=90)
    panel.set_ylim(0, y_max)


In [None]:
"""Payment Currency vs Receiving Currency"""
# Cross distribution between currencies --> conversions,
# used to spot unusual currency pairs

# HEATMAP
currency_pairs = pd.crosstab(vars_quals['Payment Currency'], vars_quals['Receiving Currency'])
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(currency_pairs, cmap='YlGnBu', annot=True, fmt='d', ax=ax)
ax.set_title('Payment Currency vers Receiving Currency')

In [None]:
"""Source : From Bank/Payment Format vs Payment Currency"""

# The ten most active sending banks, computed once and reused both for the
# filter and for the bar order of every plot in the loop
top10_from_bank_ids = vars_quals['From Bank'].value_counts().nlargest(10).index
top10_from_bank = vars_quals[vars_quals['From Bank'].isin(top10_from_bank_ids)]

# COUNT PLOT: one figure per payment currency, bars split by payment format
for value in top10_from_bank['Payment Currency'].value_counts().index:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=top10_from_bank[top10_from_bank['Payment Currency'] == value],
                  x='From Bank', hue='Payment Format', order=top10_from_bank_ids, ax=ax)
    ax.set_title(f'banque/format : {value}')
    ax.set_ylabel('Nombres')

In [None]:
"""destinataire : To Bank/Payment Format vs Receiving Currency"""

# The ten most active receiving banks, computed once and reused both for the
# filter and for the bar order of every plot in the loop
top10_to_bank_ids = vars_quals['To Bank'].value_counts().nlargest(10).index
top10_to_bank = vars_quals[vars_quals['To Bank'].isin(top10_to_bank_ids)]

# COUNT PLOT: one figure per receiving currency, bars split by payment format
for value in top10_to_bank['Receiving Currency'].value_counts().index:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=top10_to_bank[top10_to_bank['Receiving Currency'] == value],
                  x='To Bank', hue='Payment Format', order=top10_to_bank_ids, ax=ax)
    ax.set_title(f'banque/format : {value}')
    ax.set_ylabel('Nombres')

In [None]:
"""Payment Format vs Payment/Received Currency"""
# Inconsistencies between payment format and currency

# STACKED BAR PLOTS, currencies ordered from most to least frequent
currency_views = (
    ('Payment Currency', 'Format de paiement par devise payée'),
    ('Receiving Currency', 'Format de paiement par devise reçue'),
)
for currency_col, figure_title in currency_views:
    format_counts = pd.crosstab(vars_quals[currency_col], vars_quals['Payment Format'])
    by_frequency = format_counts.reindex(vars_quals[currency_col].value_counts().index)
    ax = by_frequency.plot(kind='bar', stacked=True, figsize=(10, 6))
    ax.set_title(figure_title)
    ax.set_ylabel('Nombres')

In [None]:
# CHI-SQUARED TEST OF INDEPENDENCE
#
# H0: there is no association between the two variables (p-value >= 0.05)
# H1: there is an association between the two variables (p-value < 0.05)

def _keep_frequent_levels(series, max_levels=15, keep=30):
    """Restrict a high-cardinality Series to the rows holding its most frequent values.

    A Series with at most ``max_levels`` distinct values is returned unchanged;
    otherwise only the rows whose value is among the ``keep`` most frequent
    ones are kept (original index labels preserved).
    """
    if series.nunique() <= max_levels:
        return series
    frequent = series.value_counts().nlargest(keep).index
    return series[series.isin(frequent)]


# Each column is filtered once, whichever side of a pair it ends up on
restricted = {col: _keep_frequent_levels(vars_quals[col]) for col in cat_cols}

results = []

for col1, col2 in itertools.combinations(cat_cols, 2):
    # pd.crosstab aligns both Series on their shared index labels, so only the
    # rows kept for BOTH variables enter the contingency table
    contingency = pd.crosstab(restricted[col1], restricted[col2])

    chi2, p, dof, expected = stats.chi2_contingency(contingency)

    results.append({
        'Variable 1' : col1,
        'Variable 2' : col2,
        'Statistique du Chi2': chi2,
        'p-valeur' : p,
        'Degrés de liberté' : dof,
        'Significative' : p < 0.05,
    })

    # When significant, report the strength of the association (Tschuprow's T)
    # and the cells that contribute most to the statistic
    if p < 0.05:
        print(f'\n --- Analyse : {col1} vs {col2} ---')

        # Tschuprow's T = sqrt(chi2 / (n * sqrt((r - 1) * (c - 1))))
        # (dividing by n * (min(r, c) - 1) instead would give Cramér's V)
        n_obs = contingency.to_numpy().sum()
        n_rows, n_cols = contingency.shape
        tschuprow_t = np.sqrt(chi2 / (n_obs * np.sqrt((n_rows - 1) * (n_cols - 1))))
        print(f'Coefficient T de Tschuprow  entre {col1} et {col2}: {tschuprow_t}')

        # Share of the chi2 statistic brought by each cell, in percent
        contrib = (contingency - expected)**2 / expected
        contrib_percent = 100 * contrib / chi2

        # One row per (col1 value, col2 value) cell
        contrib_flat = contrib_percent.stack().reset_index()
        contrib_flat.columns = [col1, col2, 'Contribution (%)']

        # Keep only the largest contributors (> 5 %), largest first
        contrib_flat = contrib_flat[contrib_flat['Contribution (%)'] > 5]
        contrib_flat = contrib_flat.sort_values(by='Contribution (%)', ascending=False)
        display(contrib_flat)

print('\n')
print(f'\n ------ TEST DE CHI2 -----')
chi2_results = pd.DataFrame(results).sort_values(by='p-valeur')
display(chi2_results)

### 2.2.3. Variables quantitatives et qualitatives

In [None]:
# BOXPLOT of each amount per category (high-cardinality columns limited to their 10 most frequent values)
for col1 in cat_cols:
    fig, axes = plt.subplots(1, 2, figsize=(10, 8))

    if df[col1].nunique() > 15:
        frequent_values = df[col1].value_counts().nlargest(10).index
        plot_data = df[df[col1].isin(frequent_values)]
    else:
        plot_data = df

    for i, col2 in enumerate(cont_cols):
        sns.boxplot(data=plot_data, x=col1, y=col2, ax=axes[i])
        axes[i].tick_params(axis='x', rotation=90)

In [None]:
# KRUSKAL-WALLIS TEST

# H0: all groups share the same distribution (p-value >= 0.05)
# H1: at least one group's distribution differs from the others (p-value < 0.05)

results = []

for cat, num in itertools.product(cat_cols, cont_cols):
    # One array of `num` values per level of the categorical variable
    groups = [members[num].values for _, members in df.groupby(cat)]

    # Skip variables with a single group or with an empty group
    if len(groups) <= 1 or any(len(g) == 0 for g in groups):
        continue

    k_stat, p = stats.kruskal(*groups)
    results.append({
        'Variable catégorielle' : cat,
        'Variable numérique' : num,
        'Statistique de Kruskal-Wallis': k_stat,
        'p-valeur' : p,
        'Significative' : p < 0.05,
    })

kruskal_results = pd.DataFrame(results).sort_values(by='p-valeur')
display(kruskal_results)

### 2.2.4. Analyse temporelle

In [None]:
# Time handling: order the transactions chronologically and derive calendar fields
df = df.sort_values('Timestamp')
df = df.assign(
    Hour=df['Timestamp'].dt.hour,
    Date=df['Timestamp'].dt.date,
    Day=df['Timestamp'].dt.day_of_week,
)

df.head()

In [None]:
# TRANSACTION VOLUME --> spot peaks

# LINE PLOT: daily and hourly counts on the same axes, one panel per amount column
fig, axes = plt.subplots(1, 2, figsize=(10, 8))
for i, col in enumerate(cont_cols):
    panel = axes[i]
    counts_per_timestamp = df.groupby('Timestamp')[col].agg('count')
    counts_per_timestamp.resample('D').sum().plot(ax=panel, label='par date')
    counts_per_timestamp.resample('h').sum().plot(ax=panel, label='par heure')
    panel.set_title(f'{col}')
    panel.set_xlabel('Période couverte (date et heure)')
    panel.set_ylabel('Nombre des transactions')
    panel.legend()

In [None]:
# TOTAL AMOUNTS --> spot peaks

# LINE PLOT: daily curves first, then hourly curves, for both amount columns
fig, ax = plt.subplots(figsize=(10, 8))
for rule, period_label in (('D', 'date'), ('h', 'heure')):
    for amount_col in ('Amount Paid', 'Amount Received'):
        totals = df.groupby('Timestamp')[amount_col].agg('sum').resample(rule).sum()
        totals.plot(ax=ax, label=f'{amount_col} par {period_label}')
ax.set_title('Montants totals des transactions')
ax.set_xlabel('Période couverte (date et heure)')
ax.set_ylabel('Montants totals')
ax.legend()

In [None]:
# BY HOUR OF THE DAY --> spot activity patterns
amounts_by_hour = df.groupby('Hour')[cont_cols]

# BAR PLOTS
# The axes are created explicitly and handed to DataFrame.plot: calling
# plt.figure() first would leave an extra blank figure, because DataFrame.plot
# opens its own figure when no ax is given
fig, ax = plt.subplots()
amounts_by_hour.agg('count').plot(kind='bar', ax=ax)
ax.set_title('Volume des transactions par heure dans la journée')
ax.set_xlabel('Période couverte (heure)')
ax.set_ylabel('Nombre des transactions')

fig, ax = plt.subplots()
amounts_by_hour.agg('sum').plot(kind='bar', ax=ax)
ax.set_title('Montants totals par heure dans la journée')
ax.set_xlabel('Période couverte (heure)')
ax.set_ylabel('Montants totals')

In [None]:
# BY CALENDAR DATE --> spot activity patterns
amounts_by_date = df.groupby('Date')[cont_cols]

# BAR PLOTS
# The axes are created explicitly and handed to DataFrame.plot: calling
# plt.figure() first would leave an extra blank figure, because DataFrame.plot
# opens its own figure when no ax is given
fig, ax = plt.subplots()
amounts_by_date.agg('count').plot(kind='bar', ax=ax)
ax.set_title('Volume des transactions par date')
ax.set_xlabel('Période couverte (date)')
ax.set_ylabel('Nombre des transactions')

fig, ax = plt.subplots()
amounts_by_date.agg('sum').plot(kind='bar', ax=ax)
ax.set_title('Montants totals par date')
ax.set_xlabel('Période couverte (date)')
ax.set_ylabel('Montants totals')

In [None]:
# BY DAY OF THE WEEK --> spot activity patterns
amounts_by_weekday = df.groupby('Day')[cont_cols]

# BAR PLOTS
# The axes are created explicitly and handed to DataFrame.plot: calling
# plt.figure() first would leave an extra blank figure, because DataFrame.plot
# opens its own figure when no ax is given
fig, ax = plt.subplots()
amounts_by_weekday.agg('count').plot(kind='bar', ax=ax)
ax.set_title('Volume des transactions par jour de la semaine')
ax.set_xlabel('Période couverte (jour)')
ax.set_ylabel('Nombre des transactions')

fig, ax = plt.subplots()
amounts_by_weekday.agg('sum').plot(kind='bar', ax=ax)
ax.set_title('Montants totals par jour de la semaine')
ax.set_xlabel('Période couverte (jour)')
ax.set_ylabel('Montants totals')

In [None]:
# TEMPORAL PATTERNS (day of week x hour) --> spot activity patterns

# HEATMAP of the number of transactions per (day of week, hour) cell
day_hour_counts = df.groupby(['Day', 'Hour']).size().unstack()
fig, ax = plt.subplots()
sns.heatmap(day_hour_counts, cmap='YlGnBu', ax=ax)
ax.set_title('Volume des transactions (jour x heure)')
ax.set_xlabel('Heure')
ax.set_ylabel('Jour')

In [None]:
# TEMPORAL PATTERNS (date x hour) --> spot activity patterns

# HEATMAP of the number of transactions per (date, hour) cell
date_hour_counts = df.groupby(['Date', 'Hour']).size().unstack()
fig, ax = plt.subplots()
sns.heatmap(date_hour_counts, cmap='YlGnBu', ax=ax)
ax.set_title('Volume des transactions (date x heure)')
ax.set_xlabel('Heure')
ax.set_ylabel('Date')

In [None]:
# TEMPORAL PATTERNS (date x day of week) --> spot activity patterns

# HEATMAP of the number of transactions per (date, day of week) cell
date_day_counts = df.groupby(['Date', 'Day']).size().unstack()
fig, ax = plt.subplots()
sns.heatmap(date_day_counts, cmap='YlGnBu', ax=ax)
ax.set_title('Volume des transactions (date x jour)')
ax.set_xlabel('Jour')
ax.set_ylabel('Date')

In [None]:
# DISTRIBUTION ACROSS TIME --> periodic temporal anomalies (outliers)

# BOXPLOT
# NOTE(review): only x is mapped (no y), so this looks like a single box summarising
# the Timestamp values themselves rather than the amounts held in vars_quants —
# confirm this is the intended plot
sns.boxplot(data=vars_quants, x=df['Timestamp'], )
plt.title('Distribution des transactions en fonction du temps')
plt.xlabel('Période couverte (Date et heure)')
plt.xticks(rotation=90)

# 3. PREPROCESSING

In [6]:
# 70 % of the rows go to training; the remaining 30 % is split again 60/40,
# i.e. about 18 % of all rows for the test set and about 12 % for the demo set
df_train, df_test_streamlit = train_test_split(datacopy, test_size=0.3, random_state=0, shuffle=True)
df_test, df_demo = train_test_split(df_test_streamlit, test_size=0.4, random_state=0, shuffle=True)

In [7]:
print(f'Train set : {df_train.shape}') # 70% of the rows
print(f'Test set : {df_test.shape}') # 18% of the rows (60% of the 30% hold-out)
print(f'Demo set : {df_demo.shape}') # 12% of the rows (40% of the 30% hold-out)

Train set : (4846828, 10)
Test set : (1246327, 10)
Demo set : (830886, 10)


In [10]:
# PREPARE A TRANSACTION FRAME before it is handed to the model transformers
def preprocessing(df, bank_freq=None):
    """Derive the model features from raw transactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions holding a datetime ``Timestamp`` column and the
        ``From Bank``, ``To Bank``, ``From Account``, ``To Account``,
        ``Amount Paid`` and ``Amount Received`` columns. Not modified.
    bank_freq : mapping, optional
        ``{'From Bank': counts, 'To Bank': counts}`` where each ``counts`` is a
        Series or dict mapping a bank identifier to its number of transactions.
        Pass the counts measured on the training data to encode new data
        consistently with it. A column missing from the mapping, or the whole
        argument left to ``None``, falls back to counting inside ``df`` itself.

    Returns
    -------
    tuple
        ``(features, num_cols, cat_cols)``: the transformed copy of ``df``, the
        names of its non-object columns, and the three categorical column names.
    """
    df = df.copy()

    # feature extraction from the timestamp
    df['Hour'] = df['Timestamp'].dt.hour
    df['Day'] = df['Timestamp'].dt.day
    df['DayOfWeek'] = df['Timestamp'].dt.dayofweek  # 0 = Monday, 6 = Sunday

    # log transform of the paid amount and of the absolute paid/received gap
    df['Log_Amount_Paid'] = np.log1p(df['Amount Paid'])
    df['Log_Amount_Diff'] = np.log1p(np.abs(df['Amount Paid'] - df['Amount Received']))

    # raw columns that are replaced by the derived features above
    drop_cols = ['From Account', 'To Account', 'Amount Paid', 'Amount Received', 'Timestamp']
    df = df.drop(columns=drop_cols)

    # From/To Bank: frequency encoding followed by log1p;
    # banks absent from the counts are encoded as 0
    supplied_freq = bank_freq if bank_freq is not None else {}
    for col in ['From Bank', 'To Bank']:
        freq = supplied_freq.get(col)
        if freq is None:
            freq = df[col].value_counts()
        df[col] = np.log1p(df[col].map(freq).fillna(0))

    num_cols = df.select_dtypes(exclude='object').columns.tolist()
    cat_cols = ['Receiving Currency', 'Payment Currency', 'Payment Format']

    return df, num_cols, cat_cols

In [None]:
# PIPELINE 1: COLUMN TRANSFORMERS

# Numeric features: robust scaling only
num_pipeline = Pipeline(steps=[
    ('scaler', RobustScaler())
])

# Categorical features: integer codes (unseen categories become -1), then robust scaling
# NOTE(review): the integer codes give the categories an arbitrary order that the
# scaler and the downstream models will treat as numeric — confirm this is intended
cat_pipeline = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', RobustScaler())
])

# Categorical features without scaling (presumably for the LightGBM model — TODO confirm)
cat_pipeline_lgbm = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# 4. MODELISATION (Pipeline : transformeur + modèle)

## Phase 1 : Détection d'anomalies (autoencoder)

In [12]:
# AUTOENCODER MODEL (PyTorch)
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 32 -> 16 -> 32 -> input_dim.

    ReLU follows every linear layer except the last one, so the reconstruction
    is an unbounded linear output of the same width as the input.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units, code_units = 32, 16

        # Encoder layers are created before the decoder ones
        compress = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, code_units),
            nn.ReLU(),
        ]
        expand = [
            nn.Linear(code_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, input_dim),
        ]
        self.encoder = nn.Sequential(*compress)
        self.decoder = nn.Sequential(*expand)

    def forward(self, x):
        """Encode ``x`` to the 16-unit code and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [13]:
# Make the autoencoder training reproducible
def set_seed(seed=0):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators and make cuDNN deterministic."""
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_generator in seeders:
        seed_generator(seed)

    # Deterministic kernels, and no auto-tuner picking algorithms at run time
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# scikit-learn compatible wrapper (fit / transform / predict) around the PyTorch autoencoder
class AutoEncoderWrapper(BaseEstimator, TransformerMixin):
    """Train an ``Autoencoder`` and expose its reconstruction error as an anomaly score.

    Parameters
    ----------
    epochs : int
        Number of passes over the training data.
    batch_size : int
        Mini-batch size used during training.
    lr : float
        Learning rate of the Adam optimizer.
    verbose : int
        When truthy, the mean loss is printed after every epoch.
    device : str, optional
        Torch device; when falsy, 'cuda' is used if available, otherwise 'cpu'.
        The choice is resolved once, in the constructor.
    seed : int
        Seed passed to ``set_seed`` at the start of ``fit``.
    """

    # Rows scored per forward pass in score_samples; bounds the peak memory on large inputs
    _SCORE_CHUNK_SIZE = 65536

    def __init__(self, epochs=20, batch_size=256, lr=1e-3, verbose=1, device=None, seed=0):
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.verbose = verbose
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = None  # set by fit()
        self.seed = seed

    def fit(self, X, y=None):
        """Train a fresh autoencoder to reconstruct the rows of ``X``; ``y`` is ignored.

        Returns ``self``.
        """
        set_seed(self.seed)
        X = np.array(X, dtype=np.float32)
        self.input_dim = X.shape[1]
        self.model = Autoencoder(self.input_dim).to(self.device)
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.MSELoss()

        dataset = torch.utils.data.TensorDataset(torch.from_numpy(X))
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.epochs):
            epoch_loss = 0.0
            for batch_x, in loader:
                batch_x = batch_x.to(self.device)
                optimizer.zero_grad()
                outputs = self.model(batch_x)
                loss = criterion(outputs, batch_x)
                loss.backward()
                optimizer.step()
                # weight by the batch size so the epoch loss is a true per-row mean
                epoch_loss += loss.item() * batch_x.size(0)
            epoch_loss /= len(loader.dataset)
            if self.verbose:
                print(f"Epoch {epoch+1}/{self.epochs} - Loss: {epoch_loss:.6f}")
        return self

    def score_samples(self, X):
        """Return the per-row mean squared reconstruction error (the anomaly score).

        Rows are scored in chunks of ``_SCORE_CHUNK_SIZE`` so that a very large
        ``X`` is never pushed through the network in a single forward pass.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if self.model is None:
            raise RuntimeError("AutoEncoderWrapper is not fitted yet; call fit() before scoring.")

        X = np.array(X, dtype=np.float32)
        errors = np.empty(X.shape[0], dtype=np.float32)

        self.model.eval()  # inference mode, whatever state fit() left the network in
        with torch.no_grad():
            for start in range(0, X.shape[0], self._SCORE_CHUNK_SIZE):
                stop = start + self._SCORE_CHUNK_SIZE
                chunk = X[start:stop]
                recon = self.model(torch.from_numpy(chunk).to(self.device)).cpu().numpy()
                errors[start:stop] = np.mean((chunk - recon) ** 2, axis=1)
        return errors

    def transform(self, X):
        """Return the anomaly scores of ``X`` (same 1-D array as ``score_samples``)."""
        return self.score_samples(X)

    def predict(self, X, threshold=None):
        """Return 1 for rows whose score is strictly above ``threshold``, else 0.

        When ``threshold`` is None, the 99th percentile of the scores of ``X``
        itself is used.
        """
        scores = self.score_samples(X)
        if threshold is None:
            threshold = np.percentile(scores, 99)
        return (scores > threshold).astype(int)

In [15]:
# Build the autoencoder features from the training split.
# NOTE: this rebinds cat_cols (the EDA's Index of object columns) to the list of
# the three categorical feature names returned by preprocessing()
X_AE, num_cols, cat_cols = preprocessing(df_train)

In [16]:
# PIPELINE 2: AUTOENCODER MODEL (column transformer + model)
ae_pipeline = make_pipeline(
    # encoding + scaling of the numeric and categorical feature columns
    ColumnTransformer(transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ]),  
    # final step: trains the autoencoder and exposes score_samples / predict
    AutoEncoderWrapper(epochs=20, batch_size=256, verbose=1)
)

In [None]:
# Train the autoencoder pipeline on the training features
ae_pipeline.fit(X_AE)

In [None]:
# Save the trained autoencoder pipeline
joblib.dump(ae_pipeline,'../model/ae_pipeline.pkl')

In [17]:
# If the pipeline was already trained and saved --> load it instead of retraining
ae_pipeline = joblib.load('../model/ae_pipeline.pkl')

In [None]:
# Per-row reconstruction error from the pipeline, stored as a new column of X_AE
X_AE['Anomaly_Score'] = ae_pipeline.score_samples(X_AE)

In [None]:
scores = X_AE['Anomaly_Score']

In [None]:
# Distribution of the scores --> used to choose the anomaly threshold

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(scores, kde=True, bins=50, ax=ax)
ax.set_title("Distribution des scores d'anomalie")
ax.set_xlabel("Score (erreur de reconstruction)")
ax.set_ylabel("Fréquence")

In [None]:
# Flag the most abnormal rows (the 1 % highest scores)
# --> non-parametric threshold: a quantile (relative position), because the scores
#     follow their own empirical distribution
threshold = np.percentile(scores, 99)
# Same rule as the wrapper's predict (score strictly above the threshold), applied to
# the scores already computed instead of running a second inference pass over X_AE
X_AE['isAnomaly'] = (scores > threshold).astype(int)

In [None]:
# pourcentage des anomalies
X_AE['isAnomaly'].value_counts(normalize=True).plot(kind='pie')

In [None]:
# Distribution of the scores with the chosen threshold drawn as a dashed red line
fig, ax = plt.subplots()
ax.hist(scores, bins=100)
ax.axvline(threshold, color='r', linestyle='--')
ax.set_title("Distribution des scores d'anomalie")

In [None]:
# Anomaly dataset: back to the raw (untransformed) training rows, plus the anomaly flag
df_anomalies = df_train.copy()
# X_AE was derived from df_train, so the flag is written back by index label
df_anomalies.loc[X_AE.index, 'isAnomaly'] = X_AE['isAnomaly']

In [None]:
anomalies = df_anomalies[df_anomalies['isAnomaly'] == 1].drop(columns=['isAnomaly'])

In [None]:
# Payment Format of the flagged rows: number of anomalies per format
anomalies['Payment Format'].value_counts().plot(kind='bar', ylabel='Fréquence', title='Payement Format (anomalies)')

In [None]:
# (rows, columns) of the flagged transactions; DataFrame exposes .shape, not .shapes
anomalies.shape

## Phase 2 : Pseudo-labellisation (Clustering des anomalies)

In [None]:
# Re-run the feature preparation on the flagged rows only.
# NOTE: preprocessing() recomputes the bank frequency encoding from the frame it
# receives, so the bank features here are counts within the anomalies, not within df_train
X_KMEANS, num_cols, cat_cols = preprocessing(anomalies)

In [None]:
# pipeline global de transformation
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

In [None]:
X_cluster = preprocessor.fit_transform(X_KMEANS)

In [None]:
# ELBOW METHOD: choose k from the inertia curve
K_range = range(1, 10)
inertias = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X_cluster)
    inertias.append(kmeans.inertia_)

# Display
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, inertias, marker='o')
ax.set_xlabel('Nombre de clusters K')
ax.set_ylabel('Inertie (somme des distances intra-cluster)')
ax.set_title("Méthode du coude (Elbow Method)")
ax.grid(True)
plt.show()

In [None]:
# PIPELINE 3: KMEANS (column transformer + model)

kmeans_pipeline = make_pipeline(
    ColumnTransformer(transformers=[
         ('num', num_pipeline, num_cols),
         ('cat', cat_pipeline, cat_cols)
    ]), # encoding + scaling
    # 3 clusters (presumably read off the elbow curve above — TODO confirm)
    KMeans(n_clusters=3, random_state=0)
)

In [None]:
# Train the KMeans pipeline on the anomaly features
kmeans_pipeline.fit(X_KMEANS)

In [None]:
joblib.dump(kmeans_pipeline, '../model/kmeans_pipeline.pkl')

In [None]:
kmeans_pipeline  = joblib.load('../model/kmeans_pipeline.pkl')

In [None]:
labels = kmeans_pipeline.predict(X_KMEANS)

In [None]:
X_KMEANS['Cluster'] = labels

In [None]:
X_KMEANS['Cluster'].value_counts().plot(kind='pie')

In [None]:
# Raw flagged transactions plus their cluster label, written back by index label
df_cluster = anomalies.copy()
df_cluster.loc[X_KMEANS.index, 'Cluster'] = X_KMEANS['Cluster']

In [None]:
df_cluster.head()

In [None]:
df_cluster.shape

In [None]:
# RFM FEATURE BUILDER (the resulting columns feed the fraud alert signals)
def rfm_features(df, burst_minutes=3, small_amount_threshold=200):
    """Return a copy of ``df`` enriched with Recency / Frequency / Monetary features.

    All per-account aggregates are keyed on the sending account ('From Account').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Timestamp' (datetime64), 'From Account', 'To Account',
        'From Bank', 'To Bank', 'Amount Paid' and 'Amount Received'.
        The row index must be unique: 'isBrust' is computed on a sorted copy
        and aligned back on the index.
    burst_minutes : float, default 3
        A transaction is flagged in 'isBrust' when it happens at most this many
        minutes after the previous transaction of the same sending account.
    small_amount_threshold : float, default 200
        'Amount Paid' strictly below this value counts as a small transaction.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the added columns
        Hour, Day, DayOfWeek, Recency_Days, isNight, isWeekend, Freq_Tx,
        Unique_To_per_From, isBrust, Amount_Diff, Amount_Mean, Amount_Max,
        Small_Amount, Nb_Small_Tx, Log_Amount_Paid, Log_Amount_Diff,
        Log_Amount_Mean, Log_Amount_Max and Same_Bank_Transfer.
    """
    df = df.copy()

    # ------ RECENCY (R) ------
    timestamps = df['Timestamp']
    df['Hour'] = timestamps.dt.hour
    df['Day'] = timestamps.dt.day
    df['DayOfWeek'] = timestamps.dt.dayofweek  # 0 = Monday, 6 = Sunday

    # Whole days between the latest timestamp of the frame and the sender's last transaction
    last_tx = df.groupby('From Account')['Timestamp'].transform('max')
    df['Recency_Days'] = (timestamps.max() - last_tx).dt.days

    # Night = hours 0-5 and 23 (hour 22 is not counted as night)
    df['isNight'] = ((df['Hour'] < 6) | (df['Hour'] > 22)).astype(int)
    df['isWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

    # ------ FREQUENCY (F) ------
    # Number of transactions sent by the account
    df['Freq_Tx'] = df.groupby('From Account')['Timestamp'].transform('count')

    # Number of distinct recipients per sending account
    df['Unique_To_per_From'] = df.groupby('From Account')['To Account'].transform('nunique')

    # Burst: minutes elapsed since the sender's previous transaction.
    # The first transaction of each account has no predecessor (NaN); the
    # comparison below is False for NaN, so it is flagged 0.
    minutes_since_previous = (
        df.sort_values(['From Account', 'Timestamp'])
          .groupby('From Account')['Timestamp']
          .diff()
          .dt.total_seconds() / 60
    )
    # Column name 'isBrust' is kept as-is because downstream code refers to it.
    df['isBrust'] = (minutes_since_previous <= burst_minutes).astype(int)

    # ------ MONETARY (M) ------
    # Paid minus received: a single variable summarising both base amounts
    df['Amount_Diff'] = df['Amount Paid'] - df['Amount Received']

    # Mean and max amount sent by the account
    df['Amount_Mean'] = df.groupby('From Account')['Amount Paid'].transform('mean')
    df['Amount_Max'] = df.groupby('From Account')['Amount Paid'].transform('max')

    # Frequent small amounts (smurfing)
    df['Small_Amount'] = (df['Amount Paid'] < small_amount_threshold).astype(int)
    df['Nb_Small_Tx'] = df.groupby('From Account')['Small_Amount'].transform('sum')

    # log1p to stabilise the heavy-tailed amount distributions
    df['Log_Amount_Paid'] = np.log1p(df['Amount Paid'])
    df['Log_Amount_Diff'] = np.log1p(np.abs(df['Amount_Diff']))
    df['Log_Amount_Mean'] = np.log1p(df['Amount_Mean'])
    df['Log_Amount_Max'] = np.log1p(df['Amount_Max'])

    # ------ OTHER ------
    # Minimal information kept from 'From Bank' / 'To Bank'
    df['Same_Bank_Transfer'] = (df['From Bank'] == df['To Bank']).astype(int)

    return df

In [None]:
def signals_frauds(df, cluster_col='Cluster', confidence_threshold=0.05):
    """Build binary fraud signals and map every cluster to a pseudo-label.

    Each fraud type is described by a list of binary signals. A cluster's score
    for a type is the mean, over that cluster's rows, of the type's signals.
    Types are first assigned greedily (best score first, one type per cluster,
    one cluster per type, score >= ``confidence_threshold``), and only for types
    whose payment formats occur in the cluster. Remaining clusters receive the
    still-unassigned type they score highest on, or "légitime" when their best
    candidate score is below the threshold or no type is left.

    The assignment is deterministic: types are visited in declaration order and
    ties are resolved by that order, never by set iteration order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cluster_col``, 'Payment Format', 'Receiving Currency',
        'Payment Currency', 'Log_Amount_Mean', 'Log_Amount_Max', 'Nb_Small_Tx',
        'Unique_To_per_From', 'Freq_Tx' and 'Recency_Days'. 'Same_Bank_Transfer',
        'isNight', 'isWeekend' and 'isBrust' are used when present.
    cluster_col : str, default 'Cluster'
        Column holding the cluster id.
    confidence_threshold : float, default 0.05
        Minimum score for a fraud type to be assigned to a cluster.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of ``df`` with the signal columns and 'Pseudo_Labels', and the
        cluster id -> label mapping.
    """
    df = df.copy()

    # SIGNALS
    df['High_Amount'] = ((df['Log_Amount_Mean'] > df['Log_Amount_Mean'].quantile(0.95)) |
                         (df['Log_Amount_Max'] > df['Log_Amount_Max'].quantile(0.95))).astype(int)
    df['isInternational'] = (df['Receiving Currency'] != df['Payment Currency']).astype(int)
    df['Freq_Small_Tx'] = (df['Nb_Small_Tx'] > df['Nb_Small_Tx'].quantile(0.9)).astype(int)
    df['Many_Dests'] = (df['Unique_To_per_From'] > df['Unique_To_per_From'].quantile(0.95)).astype(int)
    df['High_Freq_From'] = (df['Freq_Tx'] > df['Freq_Tx'].quantile(0.95)).astype(int)
    # df.get falls back to the scalar 0 when 'Same_Bank_Transfer' is absent,
    # which makes the second operand False for every row.
    df['Similary'] = ((df['Receiving Currency'] == df['Payment Currency']) |
                      (df.get('Same_Bank_Transfer', 0) == 1)).astype(int)
    df['Wire_ACH_Bitcoin_Format'] = df['Payment Format'].isin(['Wire', 'Bitcoin', 'ACH']).astype(int)

    df['Very_Recent'] = (df['Recency_Days'] < 1).astype(int)
    df['Reactivation_Suspect'] = ((df['Recency_Days'] > 10) & (df['Freq_Tx'] > 2)).astype(int)
    df['Cash_Bitcoin_Format'] = df['Payment Format'].isin(['Cash', 'Bitcoin']).astype(int)

    df['Credit_Format'] = (df['Payment Format'] == 'Credit Card').astype(int)

    # Signals describing each fraud type (declaration order is the tie-break order)
    frauds = {
        'blanchiment': ['High_Amount','isInternational','Freq_Small_Tx','Many_Dests','High_Freq_From','Similary','Wire_ACH_Bitcoin_Format'],
        'fraude par carte': ['Very_Recent','isNight','isWeekend', 'isBrust', 'Credit_Format'],
        'fraude par compte mule': ['Very_Recent','Reactivation_Suspect','Many_Dests','High_Freq_From','Cash_Bitcoin_Format']
    }

    # A type is a candidate for a cluster only if one of these formats occurs in it
    required_formats = {
        'blanchiment': {'Wire', 'ACH', 'Bitcoin'},
        'fraude par carte': {'Credit Card'},
        'fraude par compte mule': {'Cash', 'Bitcoin'},
    }

    # Order-preserving de-duplication (a plain set would iterate in hash order)
    all_signals = list(dict.fromkeys(sig for sigs in frauds.values() for sig in sigs))
    existing_signals = [s for s in all_signals if s in df.columns]
    cluster_profiles = df.groupby(cluster_col)[existing_signals].mean()

    def type_score(profile, fraud_type):
        """Mean of the type's available signals in a cluster profile, or None if none exist."""
        present_signals = [s for s in frauds[fraud_type] if s in profile.index]
        return profile[present_signals].mean() if present_signals else None

    def fallback_score(profile, fraud_type):
        """Same score, usable as a sort key: missing/NaN scores rank below any real score."""
        score = type_score(profile, fraud_type)
        return -1.0 if score is None or pd.isna(score) else score

    # Candidate (cluster, type, score) triples, gated by the payment formats of the cluster
    cluster_scores = []
    for cluster_id, row in cluster_profiles.iterrows():
        formats_present = set(df.loc[df[cluster_col] == cluster_id, 'Payment Format'].unique())
        for fraud_type in frauds:
            if formats_present.isdisjoint(required_formats[fraud_type]):
                continue
            score = type_score(row, fraud_type)
            if score is not None:
                cluster_scores.append((cluster_id, fraud_type, score))

    # Highest score first; the sort is stable, so ties keep cluster / declaration order
    cluster_scores.sort(key=lambda x: x[2], reverse=True)

    cluster_to_fraud = {}
    assigned_types = set()
    assigned_clusters = set()

    # Greedy one-to-one assignment: each type goes to its best-scoring free cluster
    for cluster_id, fraud_type, score in cluster_scores:
        if fraud_type not in assigned_types and cluster_id not in assigned_clusters and score >= confidence_threshold:
            cluster_to_fraud[cluster_id] = fraud_type
            assigned_types.add(fraud_type)
            assigned_clusters.add(cluster_id)

    # Clusters left over get one of the still-unassigned types, unless their best
    # candidate score is too low (or no type is left): then they are "légitime".
    remaining_types = [t for t in frauds if t not in assigned_types]

    for cluster_id in cluster_profiles.index:
        if cluster_id in assigned_clusters:
            continue

        # Best candidate score of this cluster
        scores_for_cluster = [score for cid, _, score in cluster_scores if cid == cluster_id]
        max_score = max(scores_for_cluster) if scores_for_cluster else 0

        if max_score < confidence_threshold or not remaining_types:
            cluster_to_fraud[cluster_id] = "légitime"
        else:
            # Pick the remaining type this cluster scores highest on; max() returns
            # the first maximum, so ties fall back to declaration order.
            profile = cluster_profiles.loc[cluster_id]
            fraud_type = max(remaining_types, key=lambda t: fallback_score(profile, t))
            remaining_types.remove(fraud_type)
            cluster_to_fraud[cluster_id] = fraud_type
            assigned_types.add(fraud_type)
            assigned_clusters.add(cluster_id)

    df['Pseudo_Labels'] = df[cluster_col].map(cluster_to_fraud)

    # Summary of the mapping
    for cluster_id, fraud_type in cluster_to_fraud.items():
        print(f"Cluster {cluster_id} attribué à : {fraud_type}")

    return df, cluster_to_fraud


In [None]:
# Add the RFM features to the clustered anomalies (rfm_features works on a copy)
RFM = rfm_features(df_cluster)

In [None]:
# Compute the fraud signals and the cluster -> pseudo-label mapping
df_frauds, mapping = signals_frauds(RFM)

In [None]:
# Show the cluster -> pseudo-label mapping
mapping

In [None]:
# Count of transactions per pseudo-label, split by payment format
plt.figure(figsize=(10, 6))
sns.countplot(data=df_frauds, x='Pseudo_Labels', hue='Payment Format')
plt.title('Répartition des formats de paiement par type de fraude')
plt.xlabel('Type de fraude')
plt.ylabel('Nombre de transactions')
plt.xticks(rotation=15)  # slight rotation so the label names do not overlap
plt.legend(title='Format de paiement')

In [None]:
# Dataset with the pseudo-labels but without the engineered RFM/signal columns:
# the labels are copied onto the clustered anomalies, aligned on the row index.
df_labels = df_cluster.copy()
df_labels.loc[df_frauds.index, 'Pseudo_Labels'] = df_frauds['Pseudo_Labels']

In [None]:
# Pie chart of the pseudo-label distribution
df_labels['Pseudo_Labels'].value_counts().plot(kind='pie')

In [None]:
# Preview the labelled anomalies
df_labels.head()

In [None]:
# (rows, columns) of the labelled dataset
df_labels.shape

## Phase 3 : Classification multi-classe

In [None]:
def preprocessing_lgbm(df):
    """Split a pseudo-labelled frame into features and target for LightGBM.

    A copy of ``df`` is passed through the notebook's ``preprocessing`` helper;
    the 'Pseudo_Labels' column of its output becomes the target and every other
    column a feature.

    Returns
    -------
    (X, y, cat_cols)
        Feature frame, target series, and the categorical column list returned
        by ``preprocessing``.
    """
    prepared, _num_cols, cat_cols = preprocessing(df.copy())
    target = prepared['Pseudo_Labels']
    features = prepared.drop(columns='Pseudo_Labels')
    return features, target, cat_cols

In [None]:
# Training features / target built from the pseudo-labelled anomalies
# (this rebinds the notebook-level `cat_cols`)
X_train, y_train, cat_cols = preprocessing_lgbm(df_labels)

In [None]:
# PIPELINE 4: LIGHTGBM
# NOTE(review): only `cat_cols` are listed in the ColumnTransformer and its default is
# remainder='drop', so the other columns of X appear to be discarded before the
# classifier — confirm this is intended.
# NOTE(review): with KMeans(n_clusters=3) at most 3 distinct pseudo-labels can exist,
# while num_class is set to 4 — verify how LightGBM treats this value.
lgbm_pipeline = make_pipeline(
    ColumnTransformer(transformers=[
    ('cat', cat_pipeline_lgbm, cat_cols)
]),
    lgbm.LGBMClassifier(
        objective='multiclass',
        num_class=4,  # legitimate + 3 fraud types
        random_state=0,
        n_jobs=-1,
        class_weight='balanced'  # useful when classes are imbalanced
    )
)

In [None]:
# Train LightGBM: the string pseudo-labels are first encoded as integers,
# and the fitted encoder is kept to encode the test labels the same way.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
lgbm_pipeline.fit(X_train, y_train_encoded)

In [None]:
# Save the trained LightGBM pipeline together with the label encoder it was fitted with
joblib.dump(lgbm_pipeline, '../model//lgbm_pipeline.pkl')
joblib.dump(label_encoder, '../model/label_encoder.pkl')

In [None]:
def preprocessing_test(df, cluster_to_fraud=None):
    """Run the whole labelling chain on a held-out frame and return LightGBM inputs.

    Steps: anomaly detection with the notebook-level ``ae_pipeline`` and
    ``threshold``, clustering of the anomalies with ``kmeans_pipeline``,
    pseudo-labelling, then ``preprocessing_lgbm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions, in the format expected by ``preprocessing``.
    cluster_to_fraud : dict, optional
        Cluster id -> pseudo-label mapping, e.g. the one returned by
        ``signals_frauds`` on the training data. When given, it is applied
        directly to the predicted clusters, so the test labels follow the same
        convention as the training labels. When omitted (default), the mapping
        is re-derived from ``df`` itself with ``rfm_features`` and
        ``signals_frauds``; it can then differ from the training mapping and
        may produce labels the fitted ``label_encoder`` has never seen.

    Returns
    -------
    (X_test, y_test, cat_cols)
        Output of ``preprocessing_lgbm`` on the labelled anomalies.
    """
    df = df.copy()

    # Phase 1: flag anomalies with the autoencoder pipeline
    X_AE, _, _ = preprocessing(df)
    X_AE['isAnomaly'] = ae_pipeline.predict(X_AE, threshold = threshold)
    df_anomalies = df.copy()
    df_anomalies.loc[X_AE.index, 'isAnomaly'] = X_AE['isAnomaly']
    anomalies = df_anomalies[df_anomalies['isAnomaly'] == 1].drop(columns='isAnomaly')

    # Phase 2: cluster the anomalies with the already-fitted KMeans pipeline
    X_KMEANS, _, _ = preprocessing(anomalies)
    X_KMEANS['Cluster'] = kmeans_pipeline.predict(X_KMEANS)
    df_cluster = anomalies.copy()
    df_cluster.loc[X_KMEANS.index, 'Cluster'] = X_KMEANS['Cluster']

    # Pseudo-labels: reuse the supplied mapping, or derive one from this frame
    df_labels = df_cluster.copy()
    if cluster_to_fraud is None:
        RFM = rfm_features(df_cluster)
        df_frauds, _ = signals_frauds(RFM)
        df_labels.loc[df_frauds.index, 'Pseudo_Labels'] = df_frauds['Pseudo_Labels']
    else:
        df_labels['Pseudo_Labels'] = df_cluster['Cluster'].map(cluster_to_fraud)

    # Phase 3 inputs
    X_test, y_test, cat_cols = preprocessing_lgbm(df_labels)
    return X_test, y_test, cat_cols

In [None]:
# Build the test features / labels from the held-out transactions
# (`df_test` is defined earlier in the notebook; `cat_cols` is rebound here)
X_test, y_test, cat_cols = preprocessing_test(df_test)

In [None]:
# PREDICTION
# The test labels are encoded with the encoder fitted on the training labels;
# LabelEncoder.transform raises on a label it has not seen during fit.

y_test_encoded = label_encoder.transform(y_test)
y_pred = lgbm_pipeline.predict(X_test)

In [None]:
# Evaluation: per-class precision / recall / F1, then the confusion matrix.
# Classes are shown by their encoded integer id (see label_encoder.classes_ for the names).

print(classification_report(y_test_encoded, y_pred))
print(confusion_matrix(y_test_encoded, y_pred))

In [None]:
# Class probabilities for the training and the test set
y_train_prob = lgbm_pipeline.predict_proba(X_train)
y_test_prob = lgbm_pipeline.predict_proba(X_test)

# Class names in encoded order (index i is the class encoded as i)
classes_encoded = label_encoder.classes_
n_classes = len(classes_encoded)

# One colour per class. plt.get_cmap is used because matplotlib.cm.get_cmap
# (reached through plt.cm.get_cmap) was removed in Matplotlib 3.9.
colors = plt.get_cmap('viridis', n_classes)

plt.figure(figsize=(10, 7))

for i in range(n_classes):
    # ROC curve and AUC for class i against all others (one-vs-rest)
    fpr_train, tpr_train, _ = roc_curve(y_train_encoded, y_train_prob[:, i], pos_label=i)
    roc_auc_train = auc(fpr_train, tpr_train)

    fpr_test, tpr_test, _ = roc_curve(y_test_encoded, y_test_prob[:, i], pos_label=i)
    roc_auc_test = auc(fpr_test, tpr_test)

    color = colors(i)
    class_name = classes_encoded[i]

    # Training set: solid line
    plt.plot(fpr_train, tpr_train, color=color, lw=2,
             label=f'ROC (Train) {class_name} (AUC = {roc_auc_train:.2f})')

    # Test set: dashed line, same colour as the training curve of the class
    plt.plot(fpr_test, tpr_test, color=color, lw=2, linestyle='--',
             label=f'ROC (Test) {class_name} (AUC = {roc_auc_test:.2f})')

# Diagonal = random guess
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axis limits, labels and title
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Faux positifs')
plt.ylabel('Vrais positifs')
plt.title('Receiver Operating Characteristic (ROC) par classe')
plt.legend(loc="lower right", bbox_to_anchor=(1.05, 0), borderaxespad=0.)
plt.grid(True)
plt.tight_layout()
plt.show()