<a href="https://colab.research.google.com/github/Koutouf/D-tection-de-fraudes-pour-les-transactions-bancaires/blob/main/D_F_T_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
spark = SparkSession.builder.appName("FraudDetection").getOrCreate()

In [3]:
from google.colab import drive
drive.mount("/content/mydrive")

Mounted at /content/mydrive


In [10]:
import warnings

# Suppression de tous les avertissements pour un affichage plus clair des sorties
warnings.filterwarnings("ignore")

In [20]:
import pandas as pd
import zipfile
# Chemin du fichier zip
data_path = "/content/mydrive/MyDrive/D_F_T_B/PS_20174392719_1491204439457_log.csv.zip"
# Extract the CSV file
with zipfile.ZipFile(data_path, 'r') as zip_ref:
    zip_ref.extractall('/content')
    csv_newdata = zip_ref.namelist()[0]

# Lire le fichier CSV
df = spark.read.csv(f"/content/{csv_newdata}", header=True, inferSchema=True)

# Afficher les premières lignes du DataFrame
print(df.head())
df.show(5)
df.printSchema()

Row(step=1, type='PAYMENT', amount=9839.64, nameOrig='C1231006815', oldbalanceOrg=170136.0, newbalanceOrig=160296.36, nameDest='M1979787155', oldbalanceDest=0.0, newbalanceDest=0.0, isFraud=0, isFlaggedFraud=0)
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|  

In [21]:
indexer = StringIndexer(inputCol="type", outputCol="type_indexed")
df = indexer.fit(df).transform(df)
df = df.withColumn("label", col("isFraud").cast("double"))


In [22]:
feature_columns = ["type_indexed", "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df).select("features", "label")


In [23]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)


In [None]:
# Initialisation du modèle
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Définition de la grille de paramètres pour le tuning
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

# Configuration de la validation croisée
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3)

# Entraînement du modèle avec la validation croisée
cv_model = crossval.fit(train_data)
