In [1]:
import os, sys
python_exec = sys.executable
os.environ['PYSPARK_PYTHON'] = python_exec
os.environ['PYSPARK_DRIVER_PYTHON'] = python_exec

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("CreditCardFraudExperiments") \
    .config("spark.pyspark.python", python_exec) \
    .config("spark.executorEnv.PYSPARK_PYTHON", python_exec) \
    .getOrCreate()

25/06/25 03:26:05 WARN Utils: Your hostname, Molphie resolves to a loopback address: 127.0.1.1; using 192.168.6.223 instead (on interface enp4s0)
25/06/25 03:26:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/25 03:26:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/25 03:26:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/06/25 03:26:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.clustering import KMeans, GaussianMixture
from pyspark.ml.evaluation import BinaryClassificationEvaluator, ClusteringEvaluator
from pyspark.sql.functions import udf

In [3]:
data_path = "creditcard.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

                                                                                

In [4]:
def stratified_split(df, label_col='Class', train_frac=0.8, seed=42):
    fractions = df.select(label_col).distinct().rdd.map(lambda r: (r[0], train_frac)).collectAsMap()
    train_df = df.stat.sampleBy(label_col, fractions, seed)
    test_df = df.subtract(train_df)
    return train_df, test_df

train_df, test_df = stratified_split(df)
print(f"Train size: {train_df.count()}, Test size: {test_df.count()}")

25/06/25 03:26:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

Train size: 228225, Test size: 56237


                                                                                

In [5]:
feat_cols = [f"V{i}" for i in range(1, 29)] + ["Amount", "Time"]
assembler = VectorAssembler(inputCols=feat_cols, outputCol="rawFeatures")
scaler = StandardScaler(inputCol="rawFeatures", outputCol="features")

# Супервайзд-классификация (RandomForest)

Научиться надёжно отделять нормальные транзакции от мошеннических

In [6]:
rf = RandomForestClassifier(labelCol="Class", featuresCol="features", numTrees=100, maxDepth=6, seed=42)
pipeline_clf = Pipeline(stages=[assembler, scaler, rf])
model_clf = pipeline_clf.fit(train_df)
preds_clf = model_clf.transform(test_df)

                                                                                

In [7]:
ev_roc = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
ev_pr  = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="rawPrediction", metricName="areaUnderPR")
print("== Classification Metrics ==")
print("ROC AUC:", ev_roc.evaluate(preds_clf))
print("PR  AUC:", ev_pr.evaluate(preds_clf))

== Classification Metrics ==


                                                                                

ROC AUC: 0.979442015045451


[Stage 76:====>                                                   (1 + 12) / 13]

PR  AUC: 0.8249789182256847


                                                                                

# Неконтролируемая кластеризация (KMeans)

Найти естественные группы транзакций и проверить, есть ли кластер, где фродовых примеров заметно больше.

In [8]:
scaled_df = Pipeline(stages=[assembler, scaler]).fit(df).transform(df)
kmeans = KMeans(k=4, seed=42, featuresCol="features", predictionCol="cluster")
model_km = kmeans.fit(scaled_df)
clusters = model_km.transform(scaled_df)

                                                                                

In [9]:
ev_sil = ClusteringEvaluator(featuresCol="features", predictionCol="cluster", metricName="silhouette")
silhouette_score = ev_sil.evaluate(clusters)
training_cost = model_km.summary.trainingCost
print("\n== Clustering Metrics ==")
print(f"Silhouette: {silhouette_score}")
print(f"WSSSE: {training_cost}")
clusters.groupBy("cluster").agg(
    F.count("*").alias("total"),
    F.sum("Class").alias("frauds")
).withColumn("fraud_rate", F.col("frauds")/F.col("total")).show()

                                                                                


== Clustering Metrics ==
Silhouette: 0.07065676641022597
WSSSE: 7707699.754617099
+-------+------+------+--------------------+
|cluster| total|frauds|          fraud_rate|
+-------+------+------+--------------------+
|      1|119789|   169|0.001410814014642413|
|      3| 43081|   194|0.004503145238039...|
|      2|  4451|    10|0.002246686137946...|
|      0|117486|   119|0.001012886641812...|
+-------+------+------+--------------------+



                                                                                

# Детекция аномалий (GaussianMixture)

Использовать распределение данных, чтобы найти аномалии.

In [10]:
pipeline_gmm = Pipeline(stages=[assembler, scaler, GaussianMixture(k=4, featuresCol="features", predictionCol="gmm_cluster", probabilityCol="probability")])
model_gmm = pipeline_gmm.fit(df)
preds_gmm = model_gmm.transform(df)
max_prob_udf = udf(lambda v: float(max(v)), DoubleType())
preds_gmm = preds_gmm.withColumn("maxProb", max_prob_udf("probability")).withColumn("anomalyScore", 1 - F.col("maxProb"))

                                                                                

In [11]:
roc_anom = ev_roc.evaluate(preds_gmm.withColumn("rawPrediction", F.col("anomalyScore")))
total = preds_gmm.count()
k = int(total * 0.001)
topK = preds_gmm.orderBy(F.desc("anomalyScore")).limit(k)
prec_at_k = topK.filter(F.col("Class") == 1).count() / k if k>0 else None
print("\n== Anomaly Detection Metrics ==")
print("ROC AUC:", roc_anom)
print(f"Precision@{k}:", prec_at_k)




== Anomaly Detection Metrics ==
ROC AUC: 0.9492781251872102
Precision@284: 0.2007042253521127


                                                                                