In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_timestamp, hour, dayofweek,
    to_date, datediff, floor,
    radians, cos, sin, atan2, sqrt,
    sum, count, when, lit, udf
)
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("EvaluatePipeline").getOrCreate()

df = spark.read.csv("fraudTest.csv", header=True, inferSchema=True)
print(f"Total rows: {df.count()}")

Total rows: 555719


In [2]:
# Date/Time Features
df = df.withColumn("trans_date_trans_time", to_timestamp(col("trans_date_trans_time"), "yyyy-MM-dd HH:mm:ss"))
df = df.withColumn("hour", hour(col("trans_date_trans_time")))
df = df.withColumn("day_of_week", dayofweek(col("trans_date_trans_time")))

# Daily Spending/Transactions (Window functions)
df = df.withColumn("trans_date_only", to_date(col("trans_date_trans_time")))

window_spec_daily = Window.partitionBy("cc_num", "trans_date_only")

df = df.withColumn("daily_spending", sum("amt").over(window_spec_daily))
df = df.withColumn("daily_transactions", count("cc_num").over(window_spec_daily))

df = df.drop("trans_date_only")

# Haversine Distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    return 2 * R * atan2(
        sqrt(sin((radians(lat2) - radians(lat1)) / 2) ** 2 +
             cos(radians(lat1)) * cos(radians(lat2)) *
             sin((radians(lon2) - radians(lon1)) / 2) ** 2),
        sqrt(1 - (sin((radians(lat2) - radians(lat1)) / 2) ** 2 +
                  cos(radians(lat1)) * cos(radians(lat2)) *
                  sin((radians(lon2) - radians(lon1)) / 2) ** 2))
    )

df = df.withColumn("distance", haversine(col("lat"), col("long"), col("merch_lat"), col("merch_long")))

# Age Calculation
df = df.withColumn("dob_date", to_date(col("dob")))
df = df.withColumn("transaction_date", to_date(col("trans_date_trans_time")))
df = df.withColumn("age", floor(datediff(col("transaction_date"), col("dob_date")) / 365.25))
df = df.drop("dob_date", "transaction_date")

# Class Weight (for handling imbalance)
is_fraud_count = df.filter(col("is_fraud") == 1).count()
is_not_fraud_count = df.filter(col("is_fraud") == 0).count()

class_weight_for_fraud = is_not_fraud_count / is_fraud_count

df = df.withColumn("class_weight", when(col("is_fraud") == 1, lit(class_weight_for_fraud)).otherwise(lit(1.0)))

# Label preparation (cast to DoubleType, required for MLlib)
df = df.withColumn("indexedLabel", col("is_fraud").cast(DoubleType()))

In [3]:
# Load the Trained Pipeline Model
model_path = "./Pipeline"

loaded_pipeline_model = PipelineModel.load(model_path)

predictions = loaded_pipeline_model.transform(df)
predictions.cache() # Cache for faster multiple evaluations

chosen_threshold = 0.75 # Use the threshold you decided on during your analysis

predict_at_threshold_udf = udf(lambda prob_vec: 1.0 if prob_vec[1] >= chosen_threshold else 0.0, DoubleType())

# Add the new thresholded prediction column to the predictions DataFrame
predictions_with_threshold = predictions.withColumn(
    "tuned_prediction",
    predict_at_threshold_udf(col("probability"))
)

predictions_with_threshold.select("indexedLabel", "rawPrediction", "probability", "prediction", "tuned_prediction").show(5, False)

+------------+----------------------------------------+----------------------------------------+----------+----------------+
|indexedLabel|rawPrediction                           |probability                             |prediction|tuned_prediction|
+------------+----------------------------------------+----------------------------------------+----------+----------------+
|0.0         |[0.8928531792839728,-0.8928531792839728]|[0.856400056381766,0.14359994361823403] |0.0       |0.0             |
|0.0         |[1.5466820417606335,-1.5466820417606335]|[0.9566181885897809,0.04338181141021913]|0.0       |0.0             |
|0.0         |[1.5466820417606335,-1.5466820417606335]|[0.9566181885897809,0.04338181141021913]|0.0       |0.0             |
|0.0         |[1.5466820417606335,-1.5466820417606335]|[0.9566181885897809,0.04338181141021913]|0.0       |0.0             |
|0.0         |[1.5466820417606335,-1.5466820417606335]|[0.9566181885897809,0.04338181141021913]|0.0       |0.0             |


In [4]:
# Evaluate Model Performance
# AUC-ROC
evaluator_roc = BinaryClassificationEvaluator(
    labelCol="indexedLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)
auc_roc = evaluator_roc.evaluate(predictions_with_threshold)
print(f"Area Under ROC (AUC-ROC): {auc_roc:.4f}")

# AUC-PR
evaluator_pr = BinaryClassificationEvaluator(
    labelCol="indexedLabel", rawPredictionCol="rawPrediction", metricName="areaUnderPR"
)
auc_pr = evaluator_pr.evaluate(predictions_with_threshold)
print(f"Area Under PR (AUC-PR): {auc_pr:.4f}")

# F1 Score with tuned threshold
evaluator_f1_tuned = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="tuned_prediction", metricName="f1"
)
f1_score_tuned = evaluator_f1_tuned.evaluate(predictions_with_threshold)
print(f"F1 Score (tuned threshold {chosen_threshold}): {f1_score_tuned:.4f}")

# Precision with tuned threshold
evaluator_precision_tuned = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="tuned_prediction", metricName="precisionByLabel"
)
precision_tuned = evaluator_precision_tuned.evaluate(predictions_with_threshold, {evaluator_precision_tuned.metricLabel: 1.0})
print(f"Precision (tuned threshold {chosen_threshold}, for positive class 1.0): {precision_tuned:.4f}")

# Recall with tuned threshold
evaluator_recall_tuned = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="tuned_prediction", metricName="recallByLabel"
)
recall_tuned = evaluator_recall_tuned.evaluate(predictions_with_threshold, {evaluator_recall_tuned.metricLabel: 1.0})
print(f"Recall (tuned threshold {chosen_threshold}, for positive class 1.0): {recall_tuned:.4f}")

Area Under ROC (AUC-ROC): 0.9964
Area Under PR (AUC-PR): 0.7652
F1 Score (tuned threshold 0.75): 0.9947
Precision (tuned threshold 0.75, for positive class 1.0): 0.3552
Recall (tuned threshold 0.75, for positive class 1.0): 0.9385


In [5]:
spark.stop()