In [116]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, datediff, current_date
from pyspark.sql.types import DoubleType, IntegerType
import math
from hdfs import InsecureClient
import pandas as pd
from pyspark.sql.types import *


# Create the Spark session for this notebook (or reuse one that is already running)
spark = (
    SparkSession.builder
    .appName("FraudDetectionSparkML")
    .getOrCreate()
)

# Load data

    
# Haversine formula as a Spark UDF
def haversine(lat, lon, merch_lat, merch_lon):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat: Latitude of the first point, in decimal degrees.
        lon: Longitude of the first point, in decimal degrees.
        merch_lat: Latitude of the second point, in decimal degrees.
        merch_lon: Longitude of the second point, in decimal degrees.

    Returns:
        The distance in kilometres (using an Earth radius of 6371.0 km) as a
        float, or None when any coordinate is None. Returning None lets the
        function be used as a Spark UDF on nullable columns: a null input
        yields a null distance instead of failing the task with a TypeError.
    """
    if lat is None or lon is None or merch_lat is None or merch_lon is None:
        return None

    R = 6371.0
    lat1 = math.radians(lat)
    lon1 = math.radians(lon)
    lat2 = math.radians(merch_lat)
    lon2 = math.radians(merch_lon)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c


# Explicit schema for the raw transactions CSV, listed in file column order.
#
# It mirrors the column names and types that `inferSchema=True` reports for
# this file (see the printSchema() output below), so it can be passed as
# `schema=schema` to skip the extra inference pass over the data.
#
# Spark applies a user-supplied CSV schema by position, so the field list has
# to match the file exactly:
#   * "_c0" is the unnamed leading row-index column of the file.
#   * "age" and "distance" are NOT listed: they are derived later in the
#     notebook and do not exist in the raw file.
#   * cc_num is a LongType (not DoubleType) so card numbers keep every digit.
schema = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("trans_date_trans_time", TimestampType(), True),
    StructField("cc_num", LongType(), True),
    StructField("merchant", StringType(), True),
    StructField("category", StringType(), True),
    StructField("amt", DoubleType(), True),
    StructField("first", StringType(), True),
    StructField("last", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("street", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("zip", IntegerType(), True),
    StructField("lat", DoubleType(), True),
    StructField("long", DoubleType(), True),
    StructField("city_pop", IntegerType(), True),
    StructField("job", StringType(), True),
    StructField("dob", DateType(), True),
    StructField("trans_num", StringType(), True),
    StructField("unix_time", IntegerType(), True),
    StructField("merch_lat", DoubleType(), True),
    StructField("merch_long", DoubleType(), True),
    StructField("is_fraud", IntegerType(), True),
])


# Load the raw transactions CSV from HDFS. The first line is treated as the
# header and column types are inferred by Spark; the hand-written `schema`
# defined above is not applied here.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://hadoop-namenode:9000/data")
)

# Show the inferred column names and types
df.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)



In [117]:
from pyspark.sql.functions import col, sum as spark_sum

# Count the null entries in every column of the loaded frame.
# Each expression turns "is null" into 0/1 and sums it, so the result is a
# single row with one count per original column name.
null_count_exprs = [
    spark_sum(col(name).isNull().cast("int")).alias(name)
    for name in df.columns
]
null_counts = df.select(null_count_exprs)
null_counts.show()

+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|_c0|trans_date_trans_time|cc_num|merchant|category|amt|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|trans_num|unix_time|merch_lat|merch_long|is_fraud|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|  0|                    0|     0|       0|       0|  0|    0|   0|     0|     0|   0|    0|  0|  0|   0|       0|  0|  0|        0|        0|        0|         0|       0|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+



In [118]:

# Wrap the pure-Python haversine helper so it can be applied to Spark columns
haversine_udf = udf(haversine, DoubleType())

# Feature engineering
df = df.withColumn("dob", col("dob").cast("date"))
# Age in whole years, measured from date of birth to the day this cell runs.
# NOTE(review): current_date() makes this feature depend on the run date rather
# than on the transaction date; confirm this matches how age is computed at
# scoring time before changing it.
df = df.withColumn("age", (datediff(current_date(), col("dob")) / 365.25).cast(IntegerType()))
# Customer-to-merchant distance in km
df = df.withColumn("distance", haversine_udf(col("lat"), col("long"), col("merch_lat"), col("merch_long")))

# Drop unused columns.
# "_c0" is the name Spark gives the CSV's unnamed leading row-index column
# (see the printSchema() output after loading). It must be removed here,
# otherwise it survives into `df` and is picked up as a numeric feature.
# "Unnamed: 0" is kept for files that carry that header instead; drop() is a
# no-op for names that are not present.
drop_cols = ["_c0", "Unnamed: 0", "trans_date_trans_time", "trans_num", "dob", "unix_time",
             "lat", "long", "merch_lat", "merch_long", "first", "last"]
df = df.drop(*drop_cols)

# Balance the classes by down-sampling the non-fraud rows to roughly the
# number of fraud rows (sample() is probabilistic, so the sizes only match
# approximately).
fraud_df = df.filter(col("is_fraud") == 1)
all_nonfraud_df = df.filter(col("is_fraud") == 0)

fraud_count = fraud_df.count()
nonfraud_count = all_nonfraud_df.count()
if nonfraud_count == 0:
    raise ValueError("No non-fraud rows (is_fraud == 0) found; cannot build a balanced dataset")

# sample() requires a fraction in [0, 1]; cap it in case fraud rows outnumber
# non-fraud rows.
sample_fraction = min(1.0, fraud_count / nonfraud_count)
nonfraud_df = all_nonfraud_df.sample(fraction=sample_fraction, seed=42)
df_balanced = fraud_df.union(nonfraud_df)

# Categorical & numeric columns



In [135]:

# Count the null entries in every column of the balanced frame.
# The column list is taken from df_balanced itself (not from `df`) so the
# check keeps working if the two frames ever stop sharing the same columns.
null_counts = df_balanced.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df_balanced.columns]
)
null_counts.show()

+------+--------+--------+---+------+------+----+-----+---+--------+---+--------+---+--------+
|cc_num|merchant|category|amt|gender|street|city|state|zip|city_pop|job|is_fraud|age|distance|
+------+--------+--------+---+------+------+----+-----+---+--------+---+--------+---+--------+
|     0|       0|       0|  0|     0|     0|   0|    0|  0|       0|  0|       0|  0|       0|
+------+--------+--------+---+------+------+----+-----+---+--------+---+--------+---+--------+



In [136]:
# Print the class sizes of the balanced set: fraud (1) first, then non-fraud (0)
for label in (1, 0):
    print(df_balanced.filter(col("is_fraud") == label).count())


7506
7544


In [137]:

# Columns that must never become model inputs: the label, and the CSV's
# row-index column ("_c0") in case it has not been dropped yet at this point.
non_feature_cols = {"is_fraud", "_c0"}

# Derive the feature lists from the frame the pipeline is actually trained on.
feature_fields = [
    field for field in df_balanced.schema.fields
    if field.name not in non_feature_cols
]

# String-typed columns are treated as categorical, everything else as numeric.
# NOTE(review): this puts identifier-like columns such as cc_num and zip into
# the numeric features, and the high-cardinality street column into the
# one-hot encoded features; confirm that is intended.
categorical_cols = [field.name for field in feature_fields
                    if isinstance(field.dataType, StringType)]

numeric_cols = [field.name for field in feature_fields
                if not isinstance(field.dataType, StringType)]


In [138]:
# Show the two feature lists, one per line
print(categorical_cols, numeric_cols, sep="\n")

['merchant', 'category', 'gender', 'street', 'city', 'state', 'job']
['cc_num', 'amt', 'zip', 'city_pop', 'age', 'distance']


In [140]:
# Remove the CSV row-index column ("_c0" in the loaded schema) from the
# balanced frame so it cannot be used as a feature.
df_balanced = df_balanced.drop("_c0")

In [141]:
# Confirm the columns and types that will be fed to the ML pipeline
df_balanced.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- distance: double (nullable = true)



['merchant', 'category', 'gender', 'street', 'city', 'state', 'job']
['cc_num', 'amt', 'zip', 'city_pop', 'age', 'distance']


In [142]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

# Build one StringIndexer and one OneHotEncoder per categorical column.
# Both use handleInvalid="keep" so that values not seen during fitting do not
# abort transform().
indexers = []
encoders = []
for cat_name in categorical_cols:
    index_name = cat_name + "_index"
    indexers.append(
        StringIndexer(inputCol=cat_name, outputCol=index_name, handleInvalid="keep")
    )
    encoders.append(
        OneHotEncoder(inputCol=index_name, outputCol=cat_name + "_ohe", handleInvalid="keep")
    )

# Feature vector: the one-hot blocks first, then the raw numeric columns
ohe_cols = [cat_name + "_ohe" for cat_name in categorical_cols]
assembler = VectorAssembler(
    inputCols=ohe_cols + numeric_cols,
    outputCol="features",
    handleInvalid="keep",
)

# Gradient-boosted trees predicting the is_fraud label
classifier = GBTClassifier(labelCol="is_fraud", featuresCol="features", maxIter=50)

# Stage order: every indexer, then every encoder, then assembler and classifier
stages = indexers + encoders + [assembler, classifier]
pipeline = Pipeline(stages=stages)

# 80/20 train/test split with a fixed seed
train_df, test_df = df_balanced.randomSplit([0.8, 0.2], seed=42)


In [51]:
# Sanity check: number of rows in the balanced frame.
# NOTE(review): the recorded output for this cell (0) carries an older
# execution count (In [51]) than the surrounding cells and contradicts the
# class counts printed earlier; it looks stale, so re-run before relying on it.
df_balanced.count()

0

In [143]:
# Fit every pipeline stage on the training split
model = pipeline.fit(train_df)

# Score the held-out split and preview the first few predictions
predictions = model.transform(test_df)
(
    predictions
    .select("is_fraud", "prediction", "probability")
    .show(n=5, truncate=False)
)



+--------+----------+-----------------------------------------+
|is_fraud|prediction|probability                              |
+--------+----------+-----------------------------------------+
|1       |1.0       |[0.037192793436581766,0.9628072065634182]|
|1       |1.0       |[0.29603400128229684,0.7039659987177032] |
|1       |1.0       |[0.10274545613195854,0.8972545438680415] |
|1       |1.0       |[0.027282491075663655,0.9727175089243364]|
|1       |1.0       |[0.027282491075663655,0.9727175089243364]|
+--------+----------+-----------------------------------------+
only showing top 5 rows



In [144]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Accuracy on the test split: the evaluator compares the is_fraud label with
# the pipeline's prediction column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="is_fraud",
    metricName="accuracy",
)
test_accuracy = evaluator.evaluate(predictions)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9625779625779626


In [150]:
from pyspark.sql.functions import col, count

# Confusion counts in long form: one row per (actual label, predicted label) pair
confusion_df = predictions.groupBy("is_fraud", "prediction").count()
confusion_df.show()

# Same counts as a matrix: rows are actual labels, columns are predicted
# labels; pairs that never occur are shown as 0.
confusion_matrix = (
    confusion_df
    .groupBy("is_fraud")
    .pivot("prediction")
    .sum("count")
    .fillna(0)
)
confusion_matrix.show()

+--------+----------+-----+
|is_fraud|prediction|count|
+--------+----------+-----+
|       1|       0.0|   59|
|       1|       1.0| 1381|
|       0|       0.0| 1397|
|       0|       1.0|   49|
+--------+----------+-----+

+--------+----+----+
|is_fraud| 0.0| 1.0|
+--------+----+----+
|       1|  59|1381|
|       0|1397|  49|
+--------+----+----+



In [146]:
# Persist the fitted pipeline (all feature stages plus the classifier).
# write().overwrite() is used instead of save() because save() raises when the
# target path already exists, which breaks every re-run of the notebook.
# NOTE(review): the relative path is resolved by Spark against its default
# filesystem; confirm where the model is expected to land.
model.write().overwrite().save("saved_model/fraud_detection_sparkml_model")
