In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("FraudDetection").getOrCreate()

# Load the dataset
df = spark.read.csv("fraudTest.csv", header=True, inferSchema=True)

# Show dataset structure
df.printSchema()
df.show(5)

root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: timestamp (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)

+---+---------------------+----------------+--------------------+--------------+-----+------+--------

In [6]:
from pyspark.sql.functions import count

# Count missing values
df.select([count(col(c)).alias(c) for c in df.columns]).show()

# Drop rows with null values (if any)
df = df.dropna()

+------+---------------------+------+--------+--------+------+------+------+------+------+------+------+------+------+------+--------+------+------+---------+---------+---------+----------+--------+
|   _c0|trans_date_trans_time|cc_num|merchant|category|   amt| first|  last|gender|street|  city| state|   zip|   lat|  long|city_pop|   job|   dob|trans_num|unix_time|merch_lat|merch_long|is_fraud|
+------+---------------------+------+--------+--------+------+------+------+------+------+------+------+------+------+------+--------+------+------+---------+---------+---------+----------+--------+
|555719|               555719|555719|  555719|  555719|555719|555719|555719|555719|555719|555719|555719|555719|555719|555719|  555719|555719|555719|   555719|   555719|   555719|    555719|  555719|
+------+---------------------+------+--------+--------+------+------+------+------+------+------+------+------+------+------+--------+------+------+---------+---------+---------+----------+--------+



In [7]:
from pyspark.sql.functions import to_timestamp, hour, dayofweek

# Convert transaction date to timestamp
df = df.withColumn("trans_date_trans_time", to_timestamp(col("trans_date_trans_time"), "yyyy-MM-dd HH:mm:ss"))

# Extract time-based features
df = df.withColumn("hour", hour(col("trans_date_trans_time")))
df = df.withColumn("day_of_week", dayofweek(col("trans_date_trans_time")))

df.select("trans_date_trans_time", "hour", "day_of_week").show(5)

+---------------------+----+-----------+
|trans_date_trans_time|hour|day_of_week|
+---------------------+----+-----------+
|  2020-06-21 12:14:25|  12|          1|
|  2020-06-21 12:14:33|  12|          1|
|  2020-06-21 12:14:53|  12|          1|
|  2020-06-21 12:15:15|  12|          1|
|  2020-06-21 12:15:17|  12|          1|
+---------------------+----+-----------+
only showing top 5 rows



In [8]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum as spark_sum, count as spark_count

# Define window by user and transaction date
window_spec = Window.partitionBy("cc_num", "trans_date_trans_time")

# Total daily spending per user
df = df.withColumn("daily_spending", spark_sum("amt").over(window_spec))

# Transaction count per day
df = df.withColumn("daily_transactions", spark_count("cc_num").over(window_spec))

df.select("cc_num", "amt", "daily_spending", "daily_transactions").show(5)

+-----------+-----+--------------+------------------+
|     cc_num|  amt|daily_spending|daily_transactions|
+-----------+-----+--------------+------------------+
|60416207185| 4.39|          4.39|                 1|
|60416207185| 9.33|          9.33|                 1|
|60416207185|  3.0|           3.0|                 1|
|60416207185|25.04|         25.04|                 1|
|60416207185| 5.78|          5.78|                 1|
+-----------+-----+--------------+------------------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import radians, cos, sin, atan2, sqrt

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    return 2 * R * atan2(
        sqrt(sin((radians(lat2) - radians(lat1)) / 2) ** 2 +
             cos(radians(lat1)) * cos(radians(lat2)) *
             sin((radians(lon2) - radians(lon1)) / 2) ** 2),
        sqrt(1 - (sin((radians(lat2) - radians(lat1)) / 2) ** 2 +
                  cos(radians(lat1)) * cos(radians(lat2)) *
                  sin((radians(lon2) - radians(lon1)) / 2) ** 2))
    )

# Add a new column for distance
df = df.withColumn("distance", haversine(col("lat"), col("long"), col("merch_lat"), col("merch_long")))

df.select("lat", "long", "merch_lat", "merch_long", "distance").show(5)

+-------+------------------+------------------+-----------+------------------+
|    lat|              long|         merch_lat| merch_long|          distance|
+-------+------------------+------------------+-----------+------------------+
|33.9659|          -80.9355|         33.986391| -81.200714| 24.56146172635633|
|40.3207|          -110.436|39.450497999999996|-109.960431| 104.9250922447634|
|40.6729|          -73.5365|          40.49581| -74.196111| 59.08007772921541|
|28.5697|          -80.8191|28.812397999999998| -80.883061|27.698567290865142|
|44.2529|-85.01700000000001|         44.959148| -85.884734|104.33510630013764|
+-------+------------------+------------------+-----------+------------------+
only showing top 5 rows



In [10]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Convert category names to numerical index
indexer = StringIndexer(inputCol="category", outputCol="category_index")
df = indexer.fit(df).transform(df)

# One-hot encode category
encoder = OneHotEncoder(inputCol="category_index", outputCol="category_encoded")
df = encoder.fit(df).transform(df)

df.select("category", "category_index", "category_encoded").show(5)

+--------------+--------------+----------------+
|      category|category_index|category_encoded|
+--------------+--------------+----------------+
| personal_care|           7.0|  (13,[7],[1.0])|
| personal_care|           7.0|  (13,[7],[1.0])|
|health_fitness|           9.0|  (13,[9],[1.0])|
|      misc_pos|          10.0| (13,[10],[1.0])|
|        travel|          13.0|      (13,[],[])|
+--------------+--------------+----------------+
only showing top 5 rows



In [11]:
from pyspark.sql.functions import when

# Count fraud and non-fraud transactions
fraud_count = df.filter(df.is_fraud == 1).count()
non_fraud_count = df.filter(df.is_fraud == 0).count()

# Downsample non-fraud transactions
fraud_df = df.filter(df.is_fraud == 1)
non_fraud_df = df.filter(df.is_fraud == 0).sample(False, fraud_count / non_fraud_count)

# Combine balanced dataset
df = fraud_df.union(non_fraud_df)

# Check final distribution
df.groupBy("is_fraud").count().show()

+--------+-----+
|is_fraud|count|
+--------+-----+
|       1| 2145|
|       0| 2072|
+--------+-----+

