<a href="https://colab.research.google.com/github/Jaber521/fraud_detection/blob/main/final_fraud_hybrid_workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔍 Modèle Hybride Final : Chargement → Entraînement → Évaluation → Interface Gradio

In [1]:
!pip install pandas numpy scikit-learn tensorflow gradio joblib

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [2]:

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import gradio as gr
import traceback


In [4]:

# Chargement des données enrichies
df = pd.read_csv("/content/fraud_dataset_final_with_time_parts.csv")

# Extraction des colonnes temporelles

df.drop(columns=["transaction_datetime"], inplace=True, errors="ignore")


In [5]:

enc = {}
for col in ['transaction_country', 'merchant_category_code', 'transaction_method', 'transaction_status']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    enc[col] = le

num_columns = [
    'transaction_amount', 'average_amount_card', 'transaction_country',
    'merchant_category_code', 'transaction_method', 'transaction_status',
    'latitude', 'longitude', 'hour', 'minute', 'second',
    'day', 'month', 'year', 'is_night'
]
sc = MinMaxScaler()
df[num_columns] = sc.fit_transform(df[num_columns])


In [6]:

tx_counts = df['card_id'].value_counts()
cards_lstm = tx_counts[tx_counts >= 2].index
df_lstm = df[df['card_id'].isin(cards_lstm)]

sequences, labels = [], []
for _, group in df_lstm.groupby('card_id'):
    group = group.sort_values(by=['hour', 'minute', 'second'])
    sequences.append(group[num_columns].values)
    labels.append(group['fraud_label'].values)

X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)
X_lstm_train_pad = pad_sequences(X_lstm_train, padding='post')
X_lstm_test_pad = pad_sequences(X_lstm_test, padding='post')
y_lstm_train_pad = np.expand_dims(pad_sequences(y_lstm_train, padding='post'), -1)
y_lstm_test_pad = np.expand_dims(pad_sequences(y_lstm_test, padding='post'), -1)


In [7]:

model_lstm = Sequential([
    Masking(mask_value=0.0, input_shape=(None, X_lstm_train_pad.shape[2])),
    LSTM(64, return_sequences=True),
    Dense(1, activation='sigmoid')
])
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.fit(X_lstm_train_pad, y_lstm_train_pad, epochs=5, batch_size=32, validation_split=0.1)


  super().__init__(**kwargs)


Epoch 1/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 140ms/step - accuracy: 0.6507 - loss: 0.6903 - val_accuracy: 0.7542 - val_loss: 0.6791
Epoch 2/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.7759 - loss: 0.6689 - val_accuracy: 0.7542 - val_loss: 0.6758
Epoch 3/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - accuracy: 0.7758 - loss: 0.6624 - val_accuracy: 0.7542 - val_loss: 0.6761
Epoch 4/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - accuracy: 0.7714 - loss: 0.6553 - val_accuracy: 0.7542 - val_loss: 0.6681
Epoch 5/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 86ms/step - accuracy: 0.7756 - loss: 0.6544 - val_accuracy: 0.7542 - val_loss: 0.6626


<keras.src.callbacks.history.History at 0x7eb7f7b3db10>

In [8]:

cards_rf = tx_counts[tx_counts < 2].index
df_rf = df[df['card_id'].isin(cards_rf)]

if len(df_rf) > 0:
    X_rf = df_rf[num_columns]
    y_rf = df_rf['fraud_label']
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_rf, y_rf)
else:
    rf_model = None


In [9]:

# Évaluation LSTM
y_pred_lstm = (model_lstm.predict(X_lstm_test_pad) > 0.5).astype(int)
print("LSTM:")
print(confusion_matrix(y_lstm_test_pad.flatten(), y_pred_lstm.flatten()))
print(classification_report(y_lstm_test_pad.flatten(), y_pred_lstm.flatten()))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 388ms/step
LSTM:
[[1451    0]
 [ 344    5]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.89      1451
           1       1.00      0.01      0.03       349

    accuracy                           0.81      1800
   macro avg       0.90      0.51      0.46      1800
weighted avg       0.85      0.81      0.73      1800



In [10]:

model_lstm.save("lstm_fraud_model_v3.h5")
joblib.dump(sc, "scaler_v3.pkl")
joblib.dump(enc, "encoders_v3.pkl")
if rf_model: joblib.dump(rf_model, "rf_model_v3.pkl")




In [11]:

enc = joblib.load("encoders_v3.pkl")
sc = joblib.load("scaler_v3.pkl")
lstm_model = load_model("lstm_fraud_model_v3.h5")
try:
    rf_model = joblib.load("rf_model_v3.pkl")
except:
    rf_model = None




In [12]:

def predict_rf(transaction_amount, average_amount_card, transaction_country,
               merchant_category_code, transaction_method, transaction_status,
               hour, minute, second, day, month, year, latitude, longitude):
    try:
        is_night = 1 if 0 <= int(hour) < 7 else 0
        merchant_category_code = str(merchant_category_code)
        for col, val in zip(['transaction_country', 'merchant_category_code', 'transaction_method', 'transaction_status'],
                            [transaction_country, merchant_category_code, transaction_method, transaction_status]):
            if val not in enc[col].classes_:
                return f"❌ Valeur inconnue dans {col} : {val}"
        transaction_country = enc['transaction_country'].transform([transaction_country])[0]
        merchant_category_code = enc['merchant_category_code'].transform([merchant_category_code])[0]
        transaction_method = enc['transaction_method'].transform([transaction_method])[0]
        transaction_status = enc['transaction_status'].transform([transaction_status])[0]
        data = [[
            transaction_amount, average_amount_card, transaction_country,
            merchant_category_code, transaction_method, transaction_status,
            latitude, longitude, hour, minute, second, day, month, year, is_night
        ]]
        scaled = sc.transform(data)
        proba = rf_model.predict_proba(scaled)[0][1]
        return f"✅ Proba fraude : {float(proba):.2f} → {'FRAUDE' if proba > 0.5 else 'OK'}"
    except Exception:
        return "❌ Erreur RF :\n" + traceback.format_exc()

def predict_lstm(df):
    try:
        df = df.copy()
        df['merchant_category_code'] = df['merchant_category_code'].astype(str)
        df['is_night'] = df['hour'].apply(lambda h: 1 if 0 <= int(h) < 7 else 0)
        for col in ['transaction_country', 'merchant_category_code', 'transaction_method', 'transaction_status']:
            if not all(df[col].isin(enc[col].classes_)):
                return f"❌ Valeur inconnue dans {col}"
            df[col] = enc[col].transform(df[col])
        X = sc.transform(df[num_columns])
        X_seq = np.expand_dims(X, axis=0)
        preds = lstm_model.predict(X_seq)[0].flatten()
        df['prob_fraude'] = preds
        df['fraude'] = df['prob_fraude'].apply(lambda x: "✅" if x > 0.5 else "🟢")
        return df[['hour', 'minute', 'second', 'prob_fraude', 'fraude']]
    except Exception:
        return "❌ Erreur LSTM :\n" + traceback.format_exc()

def charger_csv_lstm(file):
    try:
        return pd.read_csv(file.name)
    except Exception as e:
        return f"❌ Erreur : {e}"


In [13]:

dropdown_vals = {col: enc[col].classes_.tolist() for col in enc}

import gradio as gr
example_df = pd.DataFrame({
    "transaction_amount": [120.0],
    "average_amount_card": [100.0],
    "transaction_country": ["France"],
    "merchant_category_code": ["5411"],
    "transaction_method": ["Online"],
    "transaction_status": ["approved"],
    "latitude": [48.8566],
    "longitude": [2.3522],
    "hour": [13], "minute": [45], "second": [0],
    "day": [1], "month": [5], "year": [2025]
})

with gr.Blocks() as demo:
    gr.Markdown("# 🔐 Détection de Fraude - Modèle Hybride")

    mode = gr.Radio(["Transaction isolée (RF)", "Séquence (LSTM)"], label="Mode")

    rf_inputs = [
        gr.Number(label="Montant"),
        gr.Number(label="Moyenne"),
        gr.Dropdown(dropdown_vals['transaction_country'], label="Pays"),
        gr.Dropdown(dropdown_vals['merchant_category_code'], label="MCC"),
        gr.Dropdown(dropdown_vals['transaction_method'], label="Méthode"),
        gr.Dropdown(dropdown_vals['transaction_status'], label="Statut"),
        gr.Number(label="Heure"), gr.Number(label="Minute"), gr.Number(label="Seconde"),
        gr.Number(label="Jour"), gr.Number(label="Mois"), gr.Number(label="Année"),
        gr.Number(label="Latitude"), gr.Number(label="Longitude")
    ]

    file_input = gr.File(label="📁 Charger CSV LSTM", file_types=[".csv"])
    df_input = gr.Dataframe(headers=example_df.columns.tolist(), row_count=(2, "dynamic"))
    file_input.change(fn=charger_csv_lstm, inputs=file_input, outputs=df_input)

    submit = gr.Button("Prédire")
    output = gr.Dataframe()

    def route(mode, *args):
        return predict_rf(*args[:-1]) if mode == "Transaction isolée (RF)" else predict_lstm(args[-1])

    submit.click(fn=route, inputs=[mode] + rf_inputs + [df_input], outputs=output)

demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0edbda9bdff70e9430.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


