In [1]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("FraudDetectionTraining").getOrCreate()

# Load the preprocessed training data
df = spark.read.csv("Train_Process_Data.csv", header=True, inferSchema=True)

# Show the dataset
df.show(5)

+------+---------------------+-----------+--------------------+-------------+-------+-------+------+------+--------------------+-------------+-----+-----+-------+---------+--------+--------------------+-------------------+--------------------+----------+------------------+-------------------+--------+----+-----------+--------------+------------------+------------------+--------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+
|   _c0|trans_date_trans_time|     cc_num|            merchant|     category|    amt|  first|  last|gender|              street|         city|state|  zip|    lat|     long|city_pop|                 job|                dob|           trans_num| unix_time|         merch_lat|         merch_long|is_fraud|hour|day_of_week|daily_spending|daily_transactions|          distance|category_index|category_0|category_1|category_2|category_3|category_4|category_5|category_6|ca

In [2]:
from pyspark.ml.feature import VectorAssembler

# List of feature columns
feature_columns = [
    "amt", "hour", "day_of_week", "distance", 
    "category_0", "category_1", "category_2", "category_3", 
    "category_4", "category_5", "category_6", "category_7", 
    "category_8", "category_9", "category_10", "category_11", 
    "category_12"
]

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

# Show the DataFrame with the new "features" column
df.select("features", "is_fraud").show(5, truncate=False)

+---------------------------------------------------------+--------+
|features                                                 |is_fraud|
+---------------------------------------------------------+--------+
|(17,[0,1,2,3,12],[20.41,23.0,7.0,116.98342957838344,1.0])|1       |
|(17,[0,1,2,3,8],[19.23,23.0,7.0,18.887326963782808,1.0]) |1       |
|(17,[0,1,2,3,9],[877.57,22.0,5.0,74.43446018543847,1.0]) |1       |
|(17,[0,1,2,3,5],[317.39,2.0,5.0,59.94420126750621,1.0])  |1       |
|(17,[0,1,2,3,9],[1042.06,22.0,5.0,76.37441112979597,1.0])|1       |
+---------------------------------------------------------+--------+
only showing top 5 rows



In [3]:
# Split the data into training (80%) and testing (20%) sets
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

print(f"Training set count: {train_df.count()}")
print(f"Testing set count: {test_df.count()}")

Training set count: 20141
Testing set count: 4910


In [4]:
#Train with logistic Regression
from pyspark.ml.classification import LogisticRegression

# Initialize the Logistic Regression model
lr = LogisticRegression(featuresCol="features", labelCol="is_fraud")

# Train the model
model = lr.fit(train_df)

# Make predictions on the test set
predictions = model.transform(test_df)

# Show predictions
predictions.select("is_fraud", "prediction", "probability").show(5)

+--------+----------+--------------------+
|is_fraud|prediction|         probability|
+--------+----------+--------------------+
|       0|       0.0|[0.92374855301772...|
|       0|       0.0|[0.96136278172699...|
|       0|       0.0|[0.96150760843079...|
|       1|       0.0|[0.83634179460630...|
|       1|       0.0|[0.83099748428856...|
+--------+----------+--------------------+
only showing top 5 rows



In [5]:
#Train with RandomForest
from pyspark.ml.classification import RandomForestClassifier

# Initialize the Random Forest model
rf = RandomForestClassifier(featuresCol="features", labelCol="is_fraud")

# Train the model
rf_model = rf.fit(train_df)

# Make predictions
rf_predictions = rf_model.transform(test_df)

# Show predictions
rf_predictions.select("is_fraud", "prediction", "probability").show(5)

+--------+----------+--------------------+
|is_fraud|prediction|         probability|
+--------+----------+--------------------+
|       0|       0.0|[0.93202960854054...|
|       0|       0.0|[0.92667713738301...|
|       0|       0.0|[0.75711825144993...|
|       1|       0.0|[0.65372378593618...|
|       1|       0.0|[0.61721859562477...|
+--------+----------+--------------------+
only showing top 5 rows



In [6]:
#Train with Gradient Boosted Trees 
from pyspark.ml.classification import GBTClassifier

# Initialize the GBT model
gbt = GBTClassifier(featuresCol="features", labelCol="is_fraud")

# Train the model
gbt_model = gbt.fit(train_df)

# Make predictions
gbt_predictions = gbt_model.transform(test_df)

# Show predictions
gbt_predictions.select("is_fraud", "prediction", "probability").show(5)

+--------+----------+--------------------+
|is_fraud|prediction|         probability|
+--------+----------+--------------------+
|       0|       0.0|[0.95484087950618...|
|       0|       0.0|[0.95484087950618...|
|       0|       0.0|[0.93569677559213...|
|       1|       1.0|[0.05135804435962...|
|       1|       1.0|[0.05314662713574...|
+--------+----------+--------------------+
only showing top 5 rows



In [7]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Initialize the evaluator
evaluator = BinaryClassificationEvaluator(
    labelCol="is_fraud", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderROC"
)

# Calculate AUC-ROC
LRauc = evaluator.evaluate(predictions)
print(f"AUC-ROC LogisticRegression: {LRauc}")
RFauc = evaluator.evaluate(rf_predictions)
print(f"AUC-ROC RandomForest: {RFauc}")
GBTauc = evaluator.evaluate(gbt_predictions)
print(f"AUC-ROC GradientBoostedTrees : {GBTauc}")

AUC-ROC LogisticRegression: 0.8997612205023143
AUC-ROC RandomForest: 0.970529879448365
AUC-ROC GradientBoostedTrees : 0.9910664180071832


In [10]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

# Convert predictions to an RDD for MulticlassMetrics
prediction_and_labels = predictions.select("prediction", "is_fraud").rdd.map(lambda row: (float(row["prediction"]), float(row["is_fraud"])))

# Calculate metrics
metrics = MulticlassMetrics(prediction_and_labels)

# Confusion matrix LogisticRegression
confusion_matrix = metrics.confusionMatrix().toArray()
print(f"Logistic Regression Confusion Matrix:\n{confusion_matrix}")

Logistic Regression Confusion Matrix:
[[3377.   61.]
 [ 400. 1072.]]


In [11]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

# Convert predictions to an RDD for MulticlassMetrics
rf_prediction_and_labels = rf_predictions.select("prediction", "is_fraud").rdd.map(
    lambda row: (float(row["prediction"]), float(row["is_fraud"]))
)

# Calculate metrics
rf_metrics = MulticlassMetrics(rf_prediction_and_labels)

# Confusion matrix RandomForest
rf_confusion_matrix = rf_metrics.confusionMatrix().toArray()
print(f"Random Forest Confusion Matrix:\n{rf_confusion_matrix}")

Random Forest Confusion Matrix:
[[3359.   79.]
 [ 363. 1109.]]


In [12]:
# Convert predictions to an RDD for MulticlassMetrics
gbt_prediction_and_labels = gbt_predictions.select("prediction", "is_fraud").rdd.map(
    lambda row: (float(row["prediction"]), float(row["is_fraud"]))
)

# Calculate metrics
gbt_metrics = MulticlassMetrics(gbt_prediction_and_labels)

# Confusion matrix Gradient Boosted Trees
gbt_confusion_matrix = gbt_metrics.confusionMatrix().toArray()
print(f"Gradient Boosted Trees Confusion Matrix:\n{gbt_confusion_matrix}")

Gradient Boosted Trees Confusion Matrix:
[[3350.   88.]
 [ 101. 1371.]]


In [16]:
# Gradient Boosted Trees have the best AUC-ROC score and least false positive, false negative
# Save the Gradient Boosted Trees model
gbt_model.save("fraud_detection_model")