In [1]:
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder\
    .appName("credit-card-fraud-detection")\
    .master("local[*]")\
    .config("spark.log.level", "ERROR")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/03 20:42:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


# Data preparation

In [2]:
# Change the path to the CSV file as needed
# Load the dataset
df = spark.read.csv("../../data/creditcard.csv", header=True, inferSchema=True)

                                                                                

# Data preprocessing

**Understanding the data**:
- According to the dataset description, the input variables are the result of a PCA transformation except "Time" and "Amount" so the features are previously scaled. 
- Every value in the dataset is not null so imputing is also not needed.
- The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. To deal with this problem, we have 2 methods:
    - Cost-sensitive learning: the lost function will be adjusted to favor the detection of the minority class.
    - Undersampling, oversampling technique or a combination of the two.

Because of the reasons above and the fact that I will choose the oversampling method to deal with the highly unbalanced nature of the dataset, this data processing step will include:
- Using the VectorAssembler class to assemble feature columns into a single vector column
- Splitting the dataset into train and test set.
- Oversample the minority class (Class = 1) 

In [None]:
# Use all columns as features exclude the target column "Class"
input_cols = df.columns[:-1]

# Assemble the features into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
df = assembler.transform(df)
df = df.select("features", "Class")

# Sample training data in a stratified fashion
train_df = df.sampleBy("Class", {1: 0.8, 0: 0.8}, seed=42)

# Get test data as the remaining set
test_df = df.subtract(train_df)

# Oversample the train df to deal with class imbalance
# Calculate class counts in the training data
class_counts = train_df.groupBy("Class").count().collect()
major_count = next((row['count'] for row in class_counts if row['Class'] == 0), 0)
minor_count = next((row['count'] for row in class_counts if row['Class'] == 1), 0)
# Calculate the desired oversampling ratio
ratio = float(major_count) / minor_count
# Filter out and oversample the minor class 
oversampled_minor_df = train_df\
    .filter(col("Class") == 1)\
    .sample(withReplacement=True, fraction=ratio, seed=42)
# Combine the minor into the train df
train_df = train_df\
    .filter(col("Class") == 0)\
    .union(oversampled_minor_df)

# Train the Logistic Regression model using spark.ml

In [4]:
# Initialize the Logistic Regression estimator
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Class"
)

# Fit the model
model = lr.fit(train_df)

                                                                                

# Evaluate the obtained model

In [5]:
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

summary = model.summary
print("Accuracy:", summary.accuracy)
print("Area under ROC:", summary.areaUnderROC)
print("Precision:", summary.precisionByLabel)
print("Recall:", summary.recallByLabel)

Coefficients: [-5.072919216040532e-06,0.16032398462139535,-0.038098422937629786,-0.04588383718109761,0.7639024881965766,0.11393107652483428,-0.11670485875053566,-0.11739974242966567,-0.18268219074469103,-0.34139033159368437,-0.9021537931553096,-0.16906977637661197,0.2478004554489767,-0.4062627768700422,-0.6185203932606864,-0.06352243286229126,-0.3175113805005493,-0.04142497489779145,0.07999330524501906,0.14226985067241435,-0.5429696453570624,0.4238613359741888,0.6848894116402684,-0.10008728402951969,0.10235134086221194,-0.05006994175913812,0.087387702118653,-0.9264996689917785,-0.34964449580186685,0.000978891398308182]
Intercept: -8.46709645408243


                                                                                

Accuracy: 0.9992244495563588


                                                                                

Area under ROC: 0.9797735958050905
Precision: [0.9993726474278545, 0.8798586572438163]
Recall: [0.999850767887005, 0.6352040816326531]


# Evaluate on test set

In [6]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(
    labelCol="Class",
    predictionCol="prediction"
)

# AUC-ROC and AUC-PR
binary_evaluator = BinaryClassificationEvaluator(
    labelCol="Class",
    rawPredictionCol="rawPrediction"
)

# Predict on the test set
predictions = model.transform(test_df)

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = [evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 0.0}),
             evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1.0})]
recall = [evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0.0}),
        evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1.0})]
auc_roc = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
auc_pr = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderPR"})

print("Accuracy:", accuracy)
print("Area under ROC:", auc_roc)
print("Area under PR:", auc_pr)
print("Precision:", precision)
print("Recall:", recall)

                                                                                

Accuracy: 0.9992353788431104
Area under ROC: 0.9543174032050252
Area under PR: 0.6998219082805189
Precision: [0.9993946515685647, 0.8732394366197183]
Recall: [0.9998396893535918, 0.6458333333333334]
