In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder\
    .appName("credit-card-fraud-detection")\
    .master("local[*]")\
    .config("spark.log.level", "ERROR")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/10 17:32:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


# Data preparation

In [2]:
# Change the path to the CSV file as needed
# Load the dataset
df = spark.read.csv("../../data/creditcard.csv", header=True, inferSchema=True)
df.show(5)

                                                                                

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

# Data preprocessing

**Understanding the data**:
- According to the dataset description, the input variables are the result of a PCA transformation except "Time" and "Amount" so the features are previously scaled. 
- Every value in the dataset is not null so imputing is also not needed.
- The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. To deal with this problem, we have 2 methods:
    - Cost-sensitive learning: the lost function will be adjusted to favor the detection of the minority class.
    - Undersampling, oversampling technique or a combination of the two.

Because of the reasons above and the fact that I will choose the oversampling method to deal with the highly unbalanced nature of the dataset, this data processing step will include:
- Using the VectorAssembler class to assemble feature columns into a single vector column
- Splitting the dataset into train and test set.
- Oversample the minority class (Class = 1) 

In [3]:
# Use all columns as features exclude the target column "Class"
input_cols = df.columns[:-1]

# Assemble the features into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
df = assembler.transform(df)
df = df.select("features", "Class")

# Sample training data in a stratified fashion
train_df = df.sampleBy("Class", {1: 0.8, 0: 0.8}, seed=42)

# Get test data as the remaining set
test_df = df.subtract(train_df)

# Oversample the train df to deal with class imbalance
# Calculate class counts in the training data
class_counts = train_df.groupBy("Class").count().orderBy("Class").collect()
major_count, minor_count = class_counts[0]["count"], class_counts[1]["count"]
# Calculate the desired oversampling ratio
ratio = float(major_count) / minor_count
# Filter out and oversample the minor class 
oversampled_minor_df = train_df\
    .filter(col("Class") == 1)\
    .sample(withReplacement=True, fraction=ratio, seed=42)
# Combine the minor with the train df
train_df = train_df\
    .filter(col("Class") == 0)\
    .union(oversampled_minor_df)

# Train the Logistic Regression model using spark.ml

In [4]:
# Initialize the Logistic Regression estimator
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Class",
)

# Fit the model
model = lr.fit(train_df)

                                                                                

# Evaluate the obtained model

In [5]:
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

summary = model.summary
print("Accuracy:", summary.accuracy)
print("Area under ROC:", summary.areaUnderROC)
print("Precision:", summary.precisionByLabel)
print("Recall:", summary.recallByLabel)

Coefficients: [-1.1309613997836921e-05,1.0575146905678012,0.37187403083503523,0.3621571131535163,0.9928752230265231,0.8683937847855047,-0.549018270279921,-0.9346498393330334,-0.5091200105183902,-0.9162622414829977,-1.7520304262299293,0.41761632344260896,-1.1646986410886988,-0.37874594035348685,-1.5143775748471835,-0.22869965604488743,-1.0284618415091276,-1.260125126584444,-0.2449533405440587,0.6518054724077349,-1.5799836683001411,0.2755090309667702,0.9164506102458377,0.4930653381465653,-0.3424322104467807,-0.15849601385485856,-0.37285911122554016,-1.4030783113272092,0.26051747112516116,0.0099777173517977]
Intercept: -3.726361012393294


                                                                                

Accuracy: 0.9506457625260388


                                                                                

Area under ROC: 0.9888904925032321
Precision: [0.928193963054259, 0.9755331887763358]
Recall: [0.9767724605303008, 0.9245630255414803]


# Evaluate on test set

In [6]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(
    labelCol="Class",
    predictionCol="prediction"
)

# AUC-ROC and AUC-PR
binary_evaluator = BinaryClassificationEvaluator(
    labelCol="Class",
    rawPredictionCol="rawPrediction"
)

# Predict on the test set
predictions = model.transform(test_df)

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = [evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 0.0}),
             evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1.0})]
recall = [evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0.0}),
        evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1.0})]
auc_roc = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
auc_pr = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderPR"})

print("Accuracy:", accuracy)
print("Area under ROC:", auc_roc)
print("Area under PR:", auc_pr)
print("Precision:", precision)
print("Recall:", recall)

                                                                                

Accuracy: 0.9768835464196169
Area under ROC: 0.9703564462692146
Area under PR: 0.7173726040790815
Precision: [0.9998541662868914, 0.06376811594202898]
Recall: [0.9769865160934077, 0.9166666666666666]
