In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import GBTClassificationModel

# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("TestModel")
    .getOrCreate()
)

# Restore the previously trained GBT classifier from its saved-model folder
model_path = "fraud_detection_model"
model = GBTClassificationModel.load(model_path)

In [2]:
# Read the test CSV: the first row supplies the column names and Spark
# infers each column's type from the data
test_df = spark.read.csv(
    "Test_Process_Data.csv",
    inferSchema=True,
    header=True,
)

# Preview the first five rows
test_df.show(n=5)

+-----+---------------------+-----------+--------------------+------------+-----+-----+----+------+----------------+-------------+-----+-----+-------+---------+--------+--------------------+-------------------+--------------------+----------+---------+-----------+--------+----+-----------+--------------+------------------+-----------------+--------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+
|  _c0|trans_date_trans_time|     cc_num|            merchant|    category|  amt|first|last|gender|          street|         city|state|  zip|    lat|     long|city_pop|                 job|                dob|           trans_num| unix_time|merch_lat| merch_long|is_fraud|hour|day_of_week|daily_spending|daily_transactions|         distance|category_index|category_0|category_1|category_2|category_3|category_4|category_5|category_6|category_7|category_8|category_9|category_10|category_11|categ

In [3]:
from pyspark.ml.feature import VectorAssembler

# Columns that are packed into the model's input vector.
# NOTE(review): the order here presumably has to match the order used when the
# model was trained — confirm against the training code.
feature_columns = [
    "amt", "hour", "day_of_week", "distance",
    "category_0", "category_1", "category_2", "category_3",
    "category_4", "category_5", "category_6", "category_7",
    "category_8", "category_9", "category_10", "category_11",
    "category_12",
]

# Assemble the feature columns into a single vector column named "features"
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Drop any "features" column left over from an earlier run of this cell before
# transforming. Because the result is assigned back to `test_df`, a second run
# would otherwise fail: VectorAssembler refuses to write to an output column
# that already exists. DataFrame.drop is a no-op when the column is absent,
# so the first run behaves exactly as before.
test_df = assembler.transform(test_df.drop(assembler.getOutputCol()))

# Show the DataFrame with the new "features" column
test_df.select("features", "is_fraud").show(5, truncate=False)

+-------------------------------------------------------+--------+
|features                                               |is_fraud|
+-------------------------------------------------------+--------+
|(17,[0,1,2,3],[4.39,23.0,7.0,96.33152031871192])       |0       |
|(17,[0,1,2,3,7],[9.33,18.0,4.0,90.995324297606,1.0])   |0       |
|(17,[0,1,2,3,7],[3.0,21.0,1.0,102.6031791487097,1.0])  |0       |
|(17,[0,1,2,3,6],[25.04,12.0,5.0,84.34864729903813,1.0])|0       |
|(17,[0,1,2,3,7],[5.78,21.0,7.0,77.01270612365549,1.0]) |0       |
+-------------------------------------------------------+--------+
only showing top 5 rows



In [4]:
# Score the assembled test rows with the loaded model
predictions = model.transform(test_df)

# Preview the true label next to the predicted class and class probabilities
(
    predictions
    .select("is_fraud", "prediction", "probability")
    .show(n=5)
)

+--------+----------+--------------------+
|is_fraud|prediction|         probability|
+--------+----------+--------------------+
|       0|       0.0|[0.88514621284719...|
|       0|       0.0|[0.93879369741640...|
|       0|       0.0|[0.95449695527846...|
|       0|       0.0|[0.95484087950618...|
|       0|       0.0|[0.95449695527846...|
+--------+----------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Configure an evaluator that compares the model's raw prediction column
# with the "is_fraud" label using the area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="is_fraud",
)

# Evaluate the scored test set and report the metric
auc = evaluator.evaluate(predictions)
print(f"AUC-ROC: {auc}")

AUC-ROC: 0.9841213371321361


In [6]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

# MulticlassMetrics is fed an RDD of (prediction, label) float pairs, so keep
# only those two columns and cast each value to float
prediction_and_labels = (
    predictions
    .select("prediction", "is_fraud")
    .rdd
    .map(lambda row: (float(row.prediction), float(row.is_fraud)))
)

# Build the metrics object from the (prediction, label) pairs
metrics = MulticlassMetrics(prediction_and_labels)

# Confusion matrix, converted to an array for printing
# NOTE(review): per the Spark docs, predicted classes are in the columns and
# classes are ordered by ascending label — confirm for the Spark version in use
confusion_matrix = metrics.confusionMatrix().toArray()
print(f"Confusion Matrix:\n{confusion_matrix}")



Confusion Matrix:
[[5.39306e+05 1.42680e+04]
 [1.80000e+02 1.96500e+03]]
