In [1]:
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder\
    .appName("credit-card-fraud-detection")\
    .master("local[*]")\
    .config("spark.log.level", "ERROR")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/03 11:47:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


# Data preparation

In [3]:
# Change the path to the CSV file as needed
# Load the dataset
df = spark.read.csv("../../data/creditcard.csv", header=True, inferSchema=True)

                                                                                

# Data preprocessing

**Understanding the data**:
- According to the dataset description, the input variables are the result of a PCA transformation except "Time" and "Amount" so the features are previously scaled. 
- Every value in the dataset is not null so imputing is also not needed.
- The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. To deal with this problem, we have 2 methods:
    - Cost-sensitive learning: the lost function will be adjusted to favor the detection of the minority class.
    - Undersampling, oversampling technique or a combination of the two.

Because of the reasons above and the fact that I will choose the cost-sensitive learning method to deal with the highly unbalanced nature of the dataset, this data processing step will include:
- Adding a weight column of value 0.99828 whenever the label is 1 (minority) and 0.00172 when the label is 0 (majority) 
- Using the VectorAssembler class to assemble feature columns into a single vector column
- Splitting the dataset into train and test set.

In [4]:
# Use all columns as features exclude the target column "Class"
input_cols = df.columns[:-1]

# Assemble the features into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
df = assembler.transform(df)
df = df.select("features", "Class")

# Split the data into training and test sets in a stratified fashion
# Compute fraction for each class (80% for train, 20% for test)
fractions = df.groupBy("Class").count().withColumn("fraction", col("count") * 0.8 / col("count"))
fractions_dict = {row["Class"]: row["fraction"] for row in fractions.collect()}

# Sample training data using fractions
train_df = df.sampleBy("Class", fractions_dict, seed=42)

# Get test data as the remaining set
test_df = df.subtract(train_df)

# Add a weight column for the class imbalance
train_df = train_df.withColumn("weight", when(df.Class == 0, 0.00172).otherwise(0.99828))

# Train the Logistic Regression model using spark.ml

In [5]:
# Initialize the Logistic Regression estimator
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Class",
    weightCol="weight"
)

# Fit the model
model = lr.fit(train_df)

                                                                                

# Evaluate the obtained model

In [6]:
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

summary = model.summary
print("Accuracy:", summary.accuracy)
print("Area under ROC:", summary.areaUnderROC)

Coefficients: [-1.1304776212373613e-05,1.1446646282802129,0.4487875981835195,0.41065340104236464,1.0029877858313643,0.9517032475310695,-0.5791569027531398,-1.0387979511217196,-0.5110226174207108,-0.9565685022022705,-1.8437666725605057,0.4518793584715885,-1.2281340781090573,-0.3977433871920711,-1.5716065956273857,-0.23816492811551268,-1.0915377866785836,-1.3580135250379064,-0.2786183868844708,0.6790906063180608,-1.7049488476778822,0.27423389270389287,0.9579861820429577,0.5831291045579751,-0.33599455197926154,-0.16959463703556144,-0.32986384864204804,-1.4944112826653644,0.32081550914282697,0.010985096581117312]
Intercept: -3.8982867300131296


                                                                                

Accuracy: 0.9515085906954918


                                                                                

Area under ROC: 0.9892048346905004


# Evaluate on test set

In [7]:
# Evaluate the model
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="Class",
    predictionCol="prediction",
    metricName="accuracy"
)

# AUC-ROC and AUC-PR
binary_evaluator = BinaryClassificationEvaluator(
    labelCol="Class",
    rawPredictionCol="rawPrediction"
)

accuracy = accuracy_evaluator.evaluate(model.transform(test_df))
auc_roc = binary_evaluator.evaluate(model.transform(test_df), {binary_evaluator.metricName: "areaUnderROC"})

print("Accuracy:", accuracy)
print("Area under ROC:", auc_roc)



Accuracy: 0.9770613652933122
Area under ROC: 0.9701848173942983


                                                                                