In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import unix_timestamp, col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [3]:
# Start (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("CreditCardFraudDetection")
    .getOrCreate()
)

# Both CSV files are read the same way: first line is the header and
# column types are inferred by Spark.
_csv_options = {"header": True, "inferSchema": True}
fraudTrain = spark.read.csv("fraudTrain.csv", **_csv_options)
fraudTest = spark.read.csv("fraudTest.csv", **_csv_options)

In [4]:
# Columns removed from both splits before any feature engineering.
drop_cols = [
    '_c0', 'cc_num', 'first', 'last', 'gender', 'street', 'city',
    'state', 'zip', 'job', 'dob', 'trans_num', 'merchant',
]

# Apply the same column removal to the training and the test split.
fraudTrain, fraudTest = (
    split.drop(*drop_cols) for split in (fraudTrain, fraudTest)
)

In [5]:
# Replace the "trans_date_trans_time" column with a numeric "trans_date_ts"
# column (result of unix_timestamp) on both splits, so the transaction time
# can be used as a model feature.
fraudTrain, fraudTest = (
    split
    .withColumn("trans_date_ts", unix_timestamp("trans_date_trans_time"))
    .drop("trans_date_trans_time")
    for split in (fraudTrain, fraudTest)
)

In [6]:
# Remove rows with null values in "category" from both splits.
#
# The previous version of this cell followed the filter with
# fillna({"category": "unknown"}). That call could never change anything,
# because no null "category" survives the filter, and it made the cell read
# as if two contradictory strategies (drop vs. impute) were both in effect.
# Only the strategy that actually took effect (drop) is kept.
category_present = col("category").isNotNull()
fraudTrain = fraudTrain.filter(category_present)
fraudTest = fraudTest.filter(category_present)


In [7]:
# Cast "category" to string on both splits so the indexer always sees the
# same column type.
category_as_string = col("category").cast("string")
fraudTrain = fraudTrain.withColumn("category", category_as_string)
fraudTest = fraudTest.withColumn("category", category_as_string)

# String indexing: learn the category -> numeric index mapping on the
# training split only, then apply that same mapping to both splits.
# NOTE: handleInvalid="skip" drops rows whose category is invalid for the
# fitted mapping (e.g. a label not seen during fit) instead of raising.
indexer = StringIndexer(
    inputCol="category",
    outputCol="category_index",
    handleInvalid="skip",
)
indexer_model = indexer.fit(fraudTrain)
fraudTrain, fraudTest = (
    indexer_model.transform(split) for split in (fraudTrain, fraudTest)
)

In [8]:
# 6. Feature Assembly
# Input columns packed, in this order, into the single "features" vector.
features = [
    'amt', 'lat', 'long', 'city_pop', 'unix_time',
    'merch_lat', 'merch_long', 'trans_date_ts', 'category_index',
]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# After assembly keep only the feature vector and the label column.
_model_columns = ("features", "is_fraud")
fraudTrain = assembler.transform(fraudTrain).select(*_model_columns)
fraudTest = assembler.transform(fraudTest).select(*_model_columns)

In [9]:
# 7. Train Models
# Configure the four estimators. All of them read the assembled "features"
# vector and learn the "is_fraud" label. Constructing an estimator does not
# train anything; training happens in fit() below.
lr = LogisticRegression(labelCol="is_fraud", featuresCol="features")
rf = RandomForestClassifier(labelCol="is_fraud", featuresCol="features", numTrees=50)
gbt = GBTClassifier(labelCol="is_fraud", featuresCol="features", maxIter=20)
dt = DecisionTreeClassifier(labelCol="is_fraud", featuresCol="features")

# Fit each estimator on the training split. Each "...Trained" message is
# printed only after its own fit() has returned, so the output reflects what
# has really happened (previously the messages were printed before any
# training had started).
lr_model = lr.fit(fraudTrain)
print("Logistic Regression model...Trained")

rf_model = rf.fit(fraudTrain)
print("Random Forest model...Trained")

gbt_model = gbt.fit(fraudTrain)
print("Gradient Boosted Trees model...Trained")

dt_model = dt.fit(fraudTrain)
print("Decision Tree model...Trained")


In [15]:
# 8. Evaluate Models
# Score every fitted model on the test split with the same evaluator
# (the evaluator's default metric is used) and print one line per model.
evaluator = BinaryClassificationEvaluator(labelCol="is_fraud")
print("Model Evaluation: ")

_models_to_score = (
    ("Logistic Regression AUC:", lr_model),
    ("Random Forest AUC:", rf_model),
    ("GBT AUC:", gbt_model),
    ("Decision Tree AUC:", dt_model),
)
for _label, _fitted in _models_to_score:
    print(_label, evaluator.evaluate(_fitted.transform(fraudTest)))



Model Evaluation: 
Logistic Regression AUC: 0.8458159503176068
Random Forest AUC: 0.9138470328134222
GBT AUC: 0.9855016572411177
Decision Tree AUC: 0.7390320633397441


In [11]:
# Predict user input
# One hand-written transaction. The keys are the raw columns the pipeline
# needs: the numeric entries of `features` plus the raw "category" string,
# which the fitted indexer turns into "category_index".
user_input = dict(
    amt=100.0,
    lat=37.7749,
    long=-122.4194,
    city_pop=50000,
    unix_time=1325376018,
    merch_lat=37.0,
    merch_long=-122.0,
    trans_date_ts=1577836800,
    category='misc_pos',
)

# Build a one-row DataFrame and push it through the same fitted indexer and
# assembler that were applied to the training data.
_single_row = spark.createDataFrame([Row(**user_input)])
input_df = assembler.transform(indexer_model.transform(_single_row))



In [12]:
# Individual Model Predictions
# For each fitted model: print a heading, then show the predicted class and
# the class-probability vector for the single input row, separated by a
# blank line from the next model.
_models_to_show = (
    ("Logistic Regression Prediction:", lr_model),
    ("Random Forest Prediction:", rf_model),
    ("GBT Prediction:", gbt_model),
    ("Decision Tree Prediction:", dt_model),
)
for _heading, _fitted in _models_to_show:
    print(_heading)
    _fitted.transform(input_df).select("prediction", "probability").show()



Logistic Regression Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       1.0|[2.55771088320235...|
+----------+--------------------+

Random Forest Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       0.0|[0.99660064607331...|
+----------+--------------------+

GBT Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       0.0|[0.95626712005716...|
+----------+--------------------+

Decision Tree Prediction:
+----------+--------------------+
|prediction|         probability|
+----------+--------------------+
|       0.0|[0.99962934054242...|
+----------+--------------------+



In [13]:
# Ensemble (Voting)
# Collect the hard 0.0/1.0 prediction of every model for the single input row.
preds = []
for _fitted in (rf_model, lr_model, gbt_model, dt_model):
    _first_row = _fitted.transform(input_df).select("prediction").first()
    if _first_row is None:
        # first() returns None on an empty DataFrame. That happens here when
        # the indexer (handleInvalid="skip") dropped the input row; without
        # this check the cell failed with an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            "input_df contains no rows to score; check that the input "
            "'category' is a value the StringIndexer was fitted on."
        )
    preds.append(_first_row[0])

# Strict majority: predict 1 only when more than half of the models vote 1.
# With four models a 2-2 tie therefore resolves to 0. This is the same result
# the former round(sum(preds) / len(preds)) produced, but that relied on
# round() rounding 0.5 to the nearest even integer; the rule is now explicit.
final_vote = int(sum(preds) > len(preds) / 2)
print(f"Ensembled (majority voting) prediction: {final_vote}")

Ensembled (majority voting) prediction: 0
