In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, when, count, isnan, to_timestamp, hour, dayofweek, countDistinct, sum, to_date, radians, cos, sin, atan2, sqrt, lit, datediff, floor
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [2]:
spark = SparkSession.builder.appName("FraudDetection").getOrCreate()

df = spark.read.csv("fraudTrain.csv", header=True, inferSchema=True)

In [3]:
# Show dataset structure
df.printSchema()
df.show(5)

root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)

+---+---------------------+----------------+--------------------+-------------+------+---------+-------+--

In [4]:
# Check number or rows
df.count()

1296675

In [5]:
# Check missing value
df.select([
    count(when(col(c).isNull(), c)).alias(c)
    for c in df.columns
]).show()

+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|_c0|trans_date_trans_time|cc_num|merchant|category|amt|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|trans_num|unix_time|merch_lat|merch_long|is_fraud|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|  0|                    0|     0|       0|       0|  0|    0|   0|     0|     0|   0|    0|  0|  0|   0|       0|  0|  0|        0|        0|        0|         0|       0|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+



In [6]:
# Convert transaction date to timestamp
df = df.withColumn("trans_date_trans_time", to_timestamp(col("trans_date_trans_time"), "yyyy-MM-dd HH:mm:ss"))

# Extract time-based features
df = df.withColumn("hour", hour(col("trans_date_trans_time")))
df = df.withColumn("day_of_week", dayofweek(col("trans_date_trans_time")))

df.select("trans_date_trans_time", "hour", "day_of_week").show(5)

+---------------------+----+-----------+
|trans_date_trans_time|hour|day_of_week|
+---------------------+----+-----------+
|  2019-01-01 00:00:18|   0|          3|
|  2019-01-01 00:00:44|   0|          3|
|  2019-01-01 00:00:51|   0|          3|
|  2019-01-01 00:01:16|   0|          3|
|  2019-01-01 00:03:06|   0|          3|
+---------------------+----+-----------+
only showing top 5 rows


In [7]:
df = df.withColumn("trans_date_only", to_date(col("trans_date_trans_time")))

window_spec_daily = Window.partitionBy("cc_num", "trans_date_only")

# Calculate total daily spending per user and daily transaction count
df = df.withColumn("daily_spending", sum("amt").over(window_spec_daily))
df = df.withColumn("daily_transactions", count("cc_num").over(window_spec_daily))

df = df.drop("trans_date_only")

df.select("cc_num", "trans_date_trans_time", "amt", "daily_spending", "daily_transactions").show(10)

+-----------+---------------------+-----+--------------+------------------+
|     cc_num|trans_date_trans_time|  amt|daily_spending|daily_transactions|
+-----------+---------------------+-----+--------------+------------------+
|60416207185|  2019-01-17 14:20:15| 6.11|         77.73|                 4|
|60416207185|  2019-01-17 19:23:25|62.62|         77.73|                 4|
|60416207185|  2019-01-17 19:24:17| 1.84|         77.73|                 4|
|60416207185|  2019-01-17 23:34:58| 7.16|         77.73|                 4|
|60416207185|  2019-01-18 23:23:44|53.89|         53.89|                 1|
|60416207185|  2019-01-20 21:28:25|  3.6|           3.6|                 1|
|60416207185|  2019-01-29 12:11:27| 16.2|         72.16|                 3|
|60416207185|  2019-01-29 14:55:32|20.22|         72.16|                 3|
|60416207185|  2019-01-29 15:05:16|35.74|         72.16|                 3|
|60416207185|  2019-02-10 03:13:18|50.77|         50.77|                 1|
+-----------

In [8]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    return 2 * R * atan2(
        sqrt(sin((radians(lat2) - radians(lat1)) / 2) ** 2 +
             cos(radians(lat1)) * cos(radians(lat2)) *
             sin((radians(lon2) - radians(lon1)) / 2) ** 2),
        sqrt(1 - (sin((radians(lat2) - radians(lat1)) / 2) ** 2 +
                  cos(radians(lat1)) * cos(radians(lat2)) *
                  sin((radians(lon2) - radians(lon1)) / 2) ** 2))
    )

df = df.withColumn("distance", haversine(col("lat"), col("long"), col("merch_lat"), col("merch_long")))

df.select("lat", "long", "merch_lat", "merch_long", "distance").show(5)

+-------+---------+------------------+-----------+------------------+
|    lat|     long|         merch_lat| merch_long|          distance|
+-------+---------+------------------+-----------+------------------+
|36.0788| -81.1781|         36.011293| -82.048315|  78.5975684882306|
|48.8878|-118.2105|49.159046999999994|-118.186462|30.212175719210443|
|42.1808| -112.262|         43.150704|-112.154481|108.20608258720067|
|46.2306|-112.1138|         47.034331|-112.561071| 95.67323113819748|
|38.4207| -79.4629|         38.674999| -78.632459|  77.5567436258178|
+-------+---------+------------------+-----------+------------------+
only showing top 5 rows


In [9]:
def encode_categorical_column(df: DataFrame, input_col: str) -> DataFrame:
    # Define output column names
    indexed_col = f"{input_col}_index"
    encoded_col = f"{input_col}_encoded"

    # StringIndexer: Converts string categories to numerical indices
    indexer = StringIndexer(inputCol=input_col, outputCol=indexed_col, handleInvalid="keep")
    df = indexer.fit(df).transform(df)

    # OneHotEncoder: Converts numerical indices to one-hot encoded sparse vectors
    encoder = OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col)
    df = encoder.fit(df).transform(df)

    return df

In [10]:
# Check distinct category
df.select(countDistinct(col("category"))).show()

+------------------------+
|count(DISTINCT category)|
+------------------------+
|                      14|
+------------------------+



In [11]:
df = encode_categorical_column(df, "category")

df.select("category", "category_index", "category_encoded").show(5)

+-------------+--------------+----------------+
|     category|category_index|category_encoded|
+-------------+--------------+----------------+
|     misc_net|          11.0| (14,[11],[1.0])|
|  grocery_pos|           1.0|  (14,[1],[1.0])|
|entertainment|           6.0|  (14,[6],[1.0])|
|gas_transport|           0.0|  (14,[0],[1.0])|
|     misc_pos|          10.0| (14,[10],[1.0])|
+-------------+--------------+----------------+
only showing top 5 rows


In [12]:
df = encode_categorical_column(df, "gender")

df.select("gender", "gender_index", "gender_encoded").show(5)

+------+------------+--------------+
|gender|gender_index|gender_encoded|
+------+------------+--------------+
|     F|         0.0| (2,[0],[1.0])|
|     F|         0.0| (2,[0],[1.0])|
|     M|         1.0| (2,[1],[1.0])|
|     M|         1.0| (2,[1],[1.0])|
|     M|         1.0| (2,[1],[1.0])|
+------+------------+--------------+
only showing top 5 rows


In [13]:
# Check distinct merchant
df.select(countDistinct(col("merchant"))).show()

+------------------------+
|count(DISTINCT merchant)|
+------------------------+
|                     693|
+------------------------+



In [14]:
df = encode_categorical_column(df, "merchant")

df.select("merchant", "merchant_index", "merchant_encoded").show(5)

+--------------------+--------------+-----------------+
|            merchant|merchant_index| merchant_encoded|
+--------------------+--------------+-----------------+
|fraud_Rippin, Kub...|         571.0|(693,[571],[1.0])|
|fraud_Heller, Gut...|          81.0| (693,[81],[1.0])|
|fraud_Lind-Buckridge|         319.0|(693,[319],[1.0])|
|fraud_Kutch, Herm...|          40.0| (693,[40],[1.0])|
| fraud_Keeling-Crist|         519.0|(693,[519],[1.0])|
+--------------------+--------------+-----------------+
only showing top 5 rows


In [15]:
df = encode_categorical_column(df, "job")

df.select("job", "job_index", "job_encoded").show(5)

+--------------------+---------+-----------------+
|                 job|job_index|      job_encoded|
+--------------------+---------+-----------------+
|Psychologist, cou...|    141.0|(494,[141],[1.0])|
|Special education...|     61.0| (494,[61],[1.0])|
|Nature conservati...|    457.0|(494,[457],[1.0])|
|     Patent attorney|    232.0|(494,[232],[1.0])|
|Dance movement ps...|    297.0|(494,[297],[1.0])|
+--------------------+---------+-----------------+
only showing top 5 rows


In [16]:
df = df.withColumn("dob_date", to_date(col("dob")))
df = df.withColumn("transaction_date", to_date(col("trans_date_trans_time")))

# Calculate age in years
df = df.withColumn(
    "age",
    floor(datediff(col("transaction_date"), col("dob_date")) / 365.25)
)

df = df.drop("dob_date", "transaction_date")

df.select("dob", "trans_date_trans_time", "age").show(5)

+----------+---------------------+---+
|       dob|trans_date_trans_time|age|
+----------+---------------------+---+
|1988-03-09|  2019-01-01 00:00:18| 30|
|1978-06-21|  2019-01-01 00:00:44| 40|
|1962-01-19|  2019-01-01 00:00:51| 56|
|1967-01-12|  2019-01-01 00:01:16| 51|
|1986-03-28|  2019-01-01 00:03:06| 32|
+----------+---------------------+---+
only showing top 5 rows


In [17]:
df.groupBy("is_fraud").count().show()

+--------+-------+
|is_fraud|  count|
+--------+-------+
|       1|   7506|
|       0|1289169|
+--------+-------+



In [18]:
# Calculate the weight based on counts for Gradient Boost Tree Model
is_fraud = 7506
is_not_fraud = 1289169
class_weight_for_fraud = is_not_fraud / is_fraud

# Add a 'class_weight' column to DataFrame
df = df.withColumn(
    "class_weight",
    when(col("is_fraud") == 1, lit(class_weight_for_fraud))
    .otherwise(lit(1.0))
)

In [19]:
df.show(5)

+-----+---------------------+-----------+--------------------+--------------+-----+-----+----+------+----------------+-------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-------------------+--------+----+-----------+--------------+------------------+-----------------+--------------+----------------+------------+--------------+--------------+-----------------+---------+-----------------+---+------------+
|  _c0|trans_date_trans_time|     cc_num|            merchant|      category|  amt|first|last|gender|          street|         city|state|  zip|    lat|     long|city_pop|                 job|       dob|           trans_num| unix_time|         merch_lat|         merch_long|is_fraud|hour|day_of_week|daily_spending|daily_transactions|         distance|category_index|category_encoded|gender_index|gender_encoded|merchant_index| merchant_encoded|job_index|      job_encoded|age|class_weight|
+-----+-----------

In [20]:
feature_cols = [
    "merchant_encoded",
    "category_encoded",
    "amt",
    "gender_encoded",
    "lat",
    "long",
    "city_pop",
    "job_encoded",
    "merch_lat",
    "merch_long",
    "hour",
    "day_of_week",
    "daily_spending",
    "daily_transactions",
    "distance",
    "age",
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_final = assembler.transform(df)

In [21]:
df_final = df_final.withColumn("indexedLabel", col("is_fraud").cast(DoubleType()))

In [22]:
df_final.show(5)

+-----+---------------------+-----------+--------------------+--------------+-----+-----+----+------+----------------+-------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+----------+------------------+-------------------+--------+----+-----------+--------------+------------------+-----------------+--------------+----------------+------------+--------------+--------------+-----------------+---------+-----------------+---+------------+--------------------+------------+
|  _c0|trans_date_trans_time|     cc_num|            merchant|      category|  amt|first|last|gender|          street|         city|state|  zip|    lat|     long|city_pop|                 job|       dob|           trans_num| unix_time|         merch_lat|         merch_long|is_fraud|hour|day_of_week|daily_spending|daily_transactions|         distance|category_index|category_encoded|gender_index|gender_encoded|merchant_index| merchant_encoded|job_index|      job_encoded|ag

In [23]:
df_final.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- daily_spending: doubl

In [24]:
(trainingData, testData) = df_final.randomSplit([0.8, 0.2], seed=42)
trainingData.cache() # Cache for faster iterative training (e.g., hyperparameter tuning)
testData.cache()

DataFrame[_c0: int, trans_date_trans_time: timestamp, cc_num: bigint, merchant: string, category: string, amt: double, first: string, last: string, gender: string, street: string, city: string, state: string, zip: int, lat: double, long: double, city_pop: int, job: string, dob: date, trans_num: string, unix_time: int, merch_lat: double, merch_long: double, is_fraud: int, hour: int, day_of_week: int, daily_spending: double, daily_transactions: bigint, distance: double, category_index: double, category_encoded: vector, gender_index: double, gender_encoded: vector, merchant_index: double, merchant_encoded: vector, job_index: double, job_encoded: vector, age: bigint, class_weight: double, features: vector, indexedLabel: double]

### LogisticRegression

In [25]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    labelCol="indexedLabel",
    featuresCol="features",
    weightCol="class_weight",
    maxIter=100,
    regParam=0.01
)

lr_model = lr.fit(trainingData)

lr_predictions = lr_model.transform(testData)

print("\nLogistic Regression Predictions (first 5 rows):")
lr_predictions.select("indexedLabel", "rawPrediction", "probability", "prediction").show(5, False)


Logistic Regression Predictions (first 5 rows):
+------------+----------------------------------------+-----------------------------------------+----------+
|indexedLabel|rawPrediction                           |probability                              |prediction|
+------------+----------------------------------------+-----------------------------------------+----------+
|0.0         |[1.0330668493612598,-1.0330668493612598]|[0.7375100363918495,0.26248996360815047] |0.0       |
|0.0         |[3.608720512128473,-3.608720512128473]  |[0.9736278472981281,0.026372152701871876]|0.0       |
|0.0         |[2.54616843932897,-2.54616843932897]    |[0.9273156849038388,0.07268431509616124] |0.0       |
|0.0         |[3.2670706156013827,-3.2670706156013827]|[0.9632816997028799,0.03671830029712009] |0.0       |
|0.0         |[2.201779495840918,-2.201779495840918]  |[0.9004091964154171,0.09959080358458294] |0.0       |
+------------+----------------------------------------+------------------------

### RandomForest

In [26]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    weightCol="class_weight",
    numTrees=100,
    maxDepth=10
)

rf_model = rf.fit(trainingData)

rf_predictions = rf_model.transform(testData)

print("\nRandom Forest Predictions (first 5 rows):")
rf_predictions.select("indexedLabel", "rawPrediction", "probability", "prediction").show(5, False)


Random Forest Predictions (first 5 rows):
+------------+--------------------------------------+----------------------------------------+----------+
|indexedLabel|rawPrediction                         |probability                             |prediction|
+------------+--------------------------------------+----------------------------------------+----------+
|0.0         |[48.88941174279169,51.110588257208306]|[0.4888941174279169,0.5111058825720831] |1.0       |
|0.0         |[68.90732323873907,31.09267676126092] |[0.6890732323873908,0.31092676761260923]|0.0       |
|0.0         |[63.85051944614395,36.14948055385606] |[0.6385051944614395,0.36149480553856056]|0.0       |
|0.0         |[72.80757996821463,27.192420031785367]|[0.7280757996821463,0.2719242003178537] |0.0       |
|0.0         |[61.252220472225446,38.74777952777454]|[0.6125222047222545,0.38747779527774545]|0.0       |
+------------+--------------------------------------+----------------------------------------+----------+
onl

### Gradient Boost Tree (GBT)

In [27]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    weightCol="class_weight",
    maxDepth=5,
    maxIter=20
)

gbt_model = gbt.fit(trainingData)

gbt_predictions = gbt_model.transform(testData)

print("\nGradient Boost Tree Predictions (first 5 rows):")
gbt_predictions.select("indexedLabel", "rawPrediction", "probability", "prediction").show(5, False)


Gradient Boost Tree Predictions (first 5 rows):
+------------+----------------------------------------+----------------------------------------+----------+
|indexedLabel|rawPrediction                           |probability                             |prediction|
+------------+----------------------------------------+----------------------------------------+----------+
|0.0         |[1.0881283848293455,-1.0881283848293455]|[0.8980970063876447,0.10190299361235533]|0.0       |
|0.0         |[1.4922350344582056,-1.4922350344582056]|[0.9518675847180936,0.04813241528190637]|0.0       |
|0.0         |[1.5430274772735555,-1.5430274772735555]|[0.956313846600244,0.04368615339975601] |0.0       |
|0.0         |[1.5430274772735555,-1.5430274772735555]|[0.956313846600244,0.04368615339975601] |0.0       |
|0.0         |[1.3787116232513754,-1.3787116232513754]|[0.9403312204006815,0.05966877959931849]|0.0       |
+------------+----------------------------------------+--------------------------------

### Evaluate Models

In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import udf

def evaluate_model_performance(predictions_df, model_name, chosen_threshold=0.5):
    print(f"\n--- Evaluating {model_name} ---")

    # AUC-ROC
    evaluator_roc = BinaryClassificationEvaluator(
        labelCol="indexedLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
    )
    auc_roc = evaluator_roc.evaluate(predictions_df)
    print(f"Area Under ROC (AUC-ROC): {auc_roc:.4f}")

    # AUC-PR
    evaluator_pr = BinaryClassificationEvaluator(
        labelCol="indexedLabel", rawPredictionCol="rawPrediction", metricName="areaUnderPR"
    )
    auc_pr = evaluator_pr.evaluate(predictions_df)
    print(f"Area Under PR (AUC-PR): {auc_pr:.4f}")

    # Apply custom threshold for Precision/Recall/F1
    predict_at_threshold_udf = udf(lambda prob_vec: 1.0 if prob_vec[1] >= chosen_threshold else 0.0, DoubleType())
    predictions_tuned = predictions_df.withColumn(
        "tuned_prediction",
        predict_at_threshold_udf(col("probability"))
    )

    # F1 Score with tuned threshold
    evaluator_f1_tuned = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="tuned_prediction", metricName="f1"
    )
    f1_score_tuned = evaluator_f1_tuned.evaluate(predictions_tuned)
    print(f"F1 Score (tuned threshold {chosen_threshold}): {f1_score_tuned:.4f}")

    # Precision with tuned threshold
    evaluator_precision_tuned = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="tuned_prediction", metricName="precisionByLabel"
    )
    precision_tuned = evaluator_precision_tuned.evaluate(predictions_tuned, {evaluator_precision_tuned.metricLabel: 1.0})
    print(f"Precision (tuned threshold {chosen_threshold}, for positive class 1.0): {precision_tuned:.4f}")

    # Recall with tuned threshold
    evaluator_recall_tuned = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="tuned_prediction", metricName="recallByLabel"
    )
    recall_tuned = evaluator_recall_tuned.evaluate(predictions_tuned, {evaluator_recall_tuned.metricLabel: 1.0})
    print(f"Recall (tuned threshold {chosen_threshold}, for positive class 1.0): {recall_tuned:.4f}")


chosen_threshold_for_eval = 0.75

evaluate_model_performance(lr_predictions, "Logistic Regression", chosen_threshold=chosen_threshold_for_eval)
evaluate_model_performance(rf_predictions, "Random Forest Classifier", chosen_threshold=chosen_threshold_for_eval)
evaluate_model_performance(gbt_predictions, "GBT Classifier", chosen_threshold=chosen_threshold_for_eval)


--- Evaluating Logistic Regression ---
Area Under ROC (AUC-ROC): 0.9834
Area Under PR (AUC-PR): 0.4918
F1 Score (tuned threshold 0.75): 0.9945
Precision (tuned threshold 0.75, for positive class 1.0): 0.4721
Recall (tuned threshold 0.75, for positive class 1.0): 0.8214

--- Evaluating Random Forest Classifier ---
Area Under ROC (AUC-ROC): 0.9850
Area Under PR (AUC-PR): 0.7294
F1 Score (tuned threshold 0.75): 0.9950
Precision (tuned threshold 0.75, for positive class 1.0): 0.9715
Recall (tuned threshold 0.75, for positive class 1.0): 0.3174

--- Evaluating GBT Classifier ---
Area Under ROC (AUC-ROC): 0.9974
Area Under PR (AUC-PR): 0.8224
F1 Score (tuned threshold 0.75): 0.9932
Precision (tuned threshold 0.75, for positive class 1.0): 0.4007
Recall (tuned threshold 0.75, for positive class 1.0): 0.9668
