In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Data: load the five source tables (CSV) from the project's GitHub repository.
# The shared URL prefix is kept in a single constant so the data location can
# be changed in one place instead of in every read_csv call.
DATA_BASE_URL = "https://raw.githubusercontent.com/Tharzalab/Temporaire/refs/heads/main"

# NOTE(review): the variable names do not mirror the file names (e.g.
# `customers` is read from account_activity.csv and `transactions` from
# amount_data.csv) — confirm this mapping is intended.
transactions = pd.read_csv(f"{DATA_BASE_URL}/amount_data.csv")
customers = pd.read_csv(f"{DATA_BASE_URL}/account_activity.csv")
merchants = pd.read_csv(f"{DATA_BASE_URL}/merchant_data.csv")
transactions_categories = pd.read_csv(f"{DATA_BASE_URL}/transaction_records.csv")
fraud_indic = pd.read_csv(f"{DATA_BASE_URL}/fraud_indicators.csv")


In [None]:
# Quick overview of the data
def explore_df(df, name=""):
    """Print a short plain-text summary of a DataFrame.

    The report contains, in order: a title line with ``name``, the frame's
    shape, the dtype of every column, the number of missing values per
    column, the first three rows, and a blank separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    name : str, optional
        Label shown in the title line (defaults to an empty string).

    Returns
    -------
    None
        Everything is written to standard output.
    """
    preview_rows = 3

    # Gather the summaries first so the reporting section below reads linearly.
    dimensions = df.shape
    column_types = df.dtypes
    missing_counts = df.isnull().sum()
    first_rows = df.head(preview_rows)

    print(f"Exploration de {name}")
    print("Dimension :", dimensions)
    print("Colonnes et types :\n", column_types)
    print("Valeurs manquantes par colonne :\n", missing_counts)
    print("Aperçu des premières lignes :")
    print(first_rows)
    # Blank lines separate consecutive reports.
    print("\n\n")

# Print the same summary for every loaded table, in loading order.
for _label, _frame in (
    ("transactions", transactions),
    ("customers", customers),
    ("merchants", merchants),
    ("transactions_categories", transactions_categories),
    ("fraud_indic", fraud_indic),
):
    explore_df(_frame, _label)

In [None]:
# Cleaning (per the original note: unnecessary for this table since it has no
# missing values, but the other datasets do contain some).
def preprocess_table(df, drop_cols=None, fill_strategy="mean"):
    """Drop unwanted columns and impute missing numeric values.

    The input frame is never modified: the function always works on, and
    returns, a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean.
    drop_cols : list of str, optional
        Columns to remove. Names that are not present in ``df`` are ignored.
    fill_strategy : str, optional
        ``"mean"`` or ``"median"``: each numeric column that has missing
        values gets them replaced by that column's mean / median (computed
        over its non-missing values). Any other value disables imputation.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy. Non-numeric columns are left untouched; numeric
        columns without missing values keep their original dtype; columns
        that are entirely missing are left as they are (no statistic exists
        to fill them with).
    """
    # Always work on a fresh frame so the caller's data is not mutated,
    # whether or not columns are dropped.
    if drop_cols:
        cleaned = df.drop(columns=drop_cols, errors="ignore")
    else:
        cleaned = df.copy()

    # Unknown strategies are a deliberate no-op (no imputation performed).
    if fill_strategy not in ("mean", "median"):
        return cleaned

    for col in cleaned.select_dtypes(include=[np.number]).columns:
        column = cleaned[col]
        missing = column.isna()
        # Nothing to fill, or nothing to compute a statistic from.
        if not missing.any() or missing.all():
            continue
        if fill_strategy == "mean":
            fill_value = column.mean()
        else:
            fill_value = column.median()
        cleaned[col] = column.fillna(fill_value)

    return cleaned

# The median is used for every table below; it can be swapped for the mean
# depending on the needs and the dataset (preprocess_table only implements
# "mean" and "median"). The column names passed to drop_cols that do not exist
# in a table are ignored by preprocess_table.
transactions = preprocess_table(
    transactions,
    drop_cols=["unneeded_col1", "unneeded_col2"],
    fill_strategy="median",
)
customers = preprocess_table(
    customers,
    drop_cols=["some_unused_col"],
    fill_strategy="median",
)

# No columns to drop for the remaining tables (drop_cols defaults to None).
merchants = preprocess_table(merchants, fill_strategy="median")
transactions_categories = preprocess_table(transactions_categories, fill_strategy="median")
fraud_indic = preprocess_table(fraud_indic, fill_strategy="median")