In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import unix_timestamp
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("test3CreditCardFraudDetection")
    .getOrCreate()
)

# Both CSVs have a header row; column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
fraudTrain = spark.read.csv("fraudTrain.csv", **csv_options)
fraudTest = spark.read.csv("fraudTest.csv", **csv_options)

In [3]:
# Columns removed from both splits before any feature engineering.
drop_cols = [
    '_c0', 'cc_num', 'first', 'last', 'gender', 'street', 'city',
    'state', 'zip', 'job', 'dob', 'trans_num', 'merchant',
]

# Apply the identical drop to the train and test frames.
fraudTrain, fraudTest = (
    split.drop(*drop_cols) for split in (fraudTrain, fraudTest)
)

In [None]:
# Timestamp conversion: replace the raw transaction datetime column with an
# epoch-seconds column ("trans_date_ts") in both splits.
fraudTrain, fraudTest = [
    split.withColumn("trans_date_ts", unix_timestamp("trans_date_trans_time"))
         .drop("trans_date_trans_time")
    for split in (fraudTrain, fraudTest)
]

In [5]:
# String indexing: encode the "category" column as a numeric index.
# The mapping is learned from the training split only and then reused for the
# test split; handleInvalid="keep" stops unseen categories from raising.
indexer = StringIndexer(
    inputCol="category",
    outputCol="category_index",
    handleInvalid="keep",
)
indexer_model = indexer.fit(fraudTrain)

fraudTrain, fraudTest = (
    indexer_model.transform(split) for split in (fraudTrain, fraudTest)
)

In [6]:
# Feature assembly: pack the model inputs into one "features" vector column.
# The order of this list fixes the order of the vector's components.
features = [
    'amt', 'lat', 'long', 'city_pop', 'unix_time',
    'merch_lat', 'merch_long', 'trans_date_ts', 'category_index',
]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Keep only what the models need: the feature vector and the label.
model_columns = ("features", "is_fraud")
fraudTrain = assembler.transform(fraudTrain).select(*model_columns)
fraudTest = assembler.transform(fraudTest).select(*model_columns)

In [7]:
# Train Models
# All four estimators read the assembled "features" vector and the "is_fraud"
# label from fraudTrain.

# Fixed seed so the tree-based models give the same result on every
# Restart & Run All. Without it PySpark picks a default seed that is not
# guaranteed to be the same across kernel restarts.
SEED = 42

# LogisticRegression has no seed parameter.
lr = LogisticRegression(labelCol="is_fraud", featuresCol="features")
rf = RandomForestClassifier(labelCol="is_fraud", featuresCol="features", numTrees=50, seed=SEED)
gbt = GBTClassifier(labelCol="is_fraud", featuresCol="features", maxIter=20, seed=SEED)
dt = DecisionTreeClassifier(labelCol="is_fraud", featuresCol="features", seed=SEED)

lr_model = lr.fit(fraudTrain)
rf_model = rf.fit(fraudTrain)
gbt_model = gbt.fit(fraudTrain)
dt_model = dt.fit(fraudTrain)

In [8]:
# Evaluate models: score each fitted model on the held-out test split.
# No metricName is passed, so the evaluator uses its default metric.
evaluator = BinaryClassificationEvaluator(labelCol="is_fraud")

fitted_models = (
    ("Logistic Regression", lr_model),
    ("Random Forest", rf_model),
    ("GBT", gbt_model),
    ("Decision Tree", dt_model),
)
for model_name, fitted_model in fitted_models:
    print(f"{model_name} AUC:", evaluator.evaluate(fitted_model.transform(fraudTest)))

Logistic Regression AUC: 0.8458183862789208
Random Forest AUC: 0.9064916128862412
GBT AUC: 0.9846902016826905
Decision Tree AUC: 0.7826415354790967


Model Evaluation

In [17]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# If re-enabled it would, for each fitted model, score fraudTest and collect
# accuracy, weighted precision/recall, F1, MSE, R2 and AUC into metrics_dict,
# then pretty-print that dict.

# from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

# models = {
#     "Logistic Regression": lr_model,
#     "Random Forest": rf_model,
#     "GBT": gbt_model,
#     "Decision Tree": dt_model
# }

# metrics_dict = {}

# for name, model in models.items():
#     predictions = model.transform(fraudTest)

#     # Classification Evaluators
#     acc_eval = MulticlassClassificationEvaluator(labelCol="is_fraud", predictionCol="prediction", metricName="accuracy")
#     prec_eval = MulticlassClassificationEvaluator(labelCol="is_fraud", predictionCol="prediction", metricName="weightedPrecision")
#     rec_eval = MulticlassClassificationEvaluator(labelCol="is_fraud", predictionCol="prediction", metricName="weightedRecall")
#     f1_eval = MulticlassClassificationEvaluator(labelCol="is_fraud", predictionCol="prediction", metricName="f1")

#     # Regression-like Evaluators
#     # NOTE(review): these compare the "prediction" column (the class label the
#     # model outputs) with is_fraud, not a probability — confirm that MSE/R2 on
#     # class labels is really what is wanted before re-enabling.
#     mse_eval = RegressionEvaluator(labelCol="is_fraud", predictionCol="prediction", metricName="mse")
#     r2_eval = RegressionEvaluator(labelCol="is_fraud", predictionCol="prediction", metricName="r2")

#     # AUC
#     auc_eval = BinaryClassificationEvaluator(labelCol="is_fraud", metricName="areaUnderROC")

#     # Collect Metrics
#     metrics_dict[name] = {
#         "Accuracy": round(acc_eval.evaluate(predictions), 4),
#         "Precision": round(prec_eval.evaluate(predictions), 4),
#         "Recall": round(rec_eval.evaluate(predictions), 4),
#         "F1-Score": round(f1_eval.evaluate(predictions), 4),
#         "MSE": round(mse_eval.evaluate(predictions), 4),
#         "R2": round(r2_eval.evaluate(predictions), 4),
#         "AUC": round(auc_eval.evaluate(predictions), 4)
#     }

# import pprint
# pprint.pprint(metrics_dict)


In [9]:
# Predict for user input: one hand-written transaction, pushed through the
# same fitted indexer and assembler that prepared the training data.
# NOTE(review): unix_time and trans_date_ts are supplied independently here,
# whereas for the training data trans_date_ts is derived from
# trans_date_trans_time — confirm the two values describe the same moment.
user_input = dict(
    amt=100.0,
    lat=37.7749,
    long=-122.4194,
    city_pop=50000,
    unix_time=1325376018,
    merch_lat=37.0,
    merch_long=-122.0,
    trans_date_ts=1577836800,
    category='misc_pos',
)

# Single-row DataFrame -> category_index added -> "features" vector added.
input_df = assembler.transform(
    indexer_model.transform(
        spark.createDataFrame([Row(**user_input)])
    )
)

In [10]:
# Individual model predictions: show each model's predicted class and class
# probabilities for the single user-input row.
prediction_columns = ("prediction", "probability")
for model_name, fitted_model in (
    ("Logistic Regression", lr_model),
    ("Random Forest", rf_model),
    ("GBT", gbt_model),
    ("Decision Tree", dt_model),
):
    print(f"{model_name} Prediction:")
    fitted_model.transform(input_df).select(*prediction_columns).show()
    # Blank line between models, as in the original layout.
    if model_name != "Decision Tree":
        print()

Logistic Regression Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       1.0|[2.55771090264348...|
+----------+--------------------+

Random Forest Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       0.0|[0.99648296591945...|
+----------+--------------------+

GBT Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       0.0|[0.95642324394935...|
+----------+--------------------+

Decision Tree Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       0.0|[0.99955450010580...|
+----------+--------------------+



In [11]:
# Ensemble (Voting)
# Each fitted model casts one vote: its predicted class for the single row in
# input_df. The final label is the strict majority of those votes.
preds = [
    fitted_model.transform(input_df).select("prediction").first()[0]
    for fitted_model in (rf_model, lr_model, gbt_model, dt_model)
]

fraud_votes = sum(1 for vote in preds if vote == 1.0)
other_votes = len(preds) - fraud_votes

# With an even number of voters a split vote is possible. The earlier
# round(mean) form resolved that case through Python's round-half-to-even
# (round(0.5) == 0) without saying so; the rule is now explicit and a tie is
# reported. The default keeps the previous outcome (0) for ties.
TIE_BREAK_LABEL = 0

if fraud_votes == other_votes:
    print(f"Vote tied {fraud_votes}-{other_votes}; falling back to label {TIE_BREAK_LABEL}")
    final_vote = TIE_BREAK_LABEL
else:
    final_vote = int(fraud_votes > other_votes)

print(f"Ensembled (majority voting) prediction: {final_vote}")

Ensembled (majority voting) prediction: 0
