In [77]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics

spark = SparkSession.builder\
    .appName("credit-card-fraud-detection")\
    .master("local[*]")\
    .config("spark.log.level", "ERROR")\
    .getOrCreate()

# Data preparation

In [78]:
# Change the path to the CSV file as needed
# Load the dataset
df = spark.read.csv("../../data/creditcard.csv", header=True, inferSchema=True)
df.show(5)

                                                                                

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

# Data preprocessing

**Understanding the data**:
- According to the dataset description, the input variables are the result of a PCA transformation except "Time" and "Amount" so the features are previously scaled. 
- Every value in the dataset is not null so imputing is also not needed.
- The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. To deal with this problem, we have 2 methods:
    - Cost-sensitive learning: the lost function will be adjusted to favor the detection of the minority class.
    - Undersampling, oversampling technique or a combination of the two.

Because of the reasons above and the fact that I will choose the cost-sensitive learning method to deal with the highly unbalanced nature of the dataset, this data processing step will include:
- Adding a weight column of value 0.99828 whenever the label is 1 (minority) and 0.00172 when the label is 0 (majority) 
- Using the VectorAssembler class to assemble feature columns into a single vector column
- Splitting the dataset into train and test set.

When using DataFrame-based MLlib, the model will standardize the Time and Amount column first. With the RDD-based MLlib, this is not the case so I will need to standardize them by myself.

In [None]:
# Normalize Time and Amount using min-max scaling
# Compute statistics
time_stats = df.agg(min("Time").alias("min_time"), F.max("Time").alias("max_time")).collect()[0]
amount_stats = df.agg(min("Amount").alias("min_amount"), F.max("Amount").alias("max_amount")).collect()[0]

# Apply normalization (Min-Max scaling)
df = df.withColumn("Time", 
                  (col("Time") - time_stats["min_time"]) / 
                  (time_stats["max_time"] - time_stats["min_time"]))

df = df.withColumn("Amount", 
                  (col("Amount") - amount_stats["min_amount"]) / 
                  (amount_stats["max_amount"] - amount_stats["min_amount"]))

# Use all columns as features exclude the target column "Class"
input_cols = df.columns[:-1]

# Assemble the features into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
df = assembler.transform(df).select("features", "Class")

# Sample training data in a stratified fashion
train_df = df.sampleBy("Class", {1: 0.8, 0: 0.8}, seed=42)

# Get test data as the remaining set
test_df = df.subtract(train_df)

# Oversample the train df to deal with class imbalance
# Calculate class counts in the training data
class_counts = train_df.groupBy("Class").count().collect()
major_count = next((row['count'] for row in class_counts if row['Class'] == 0), 0)
minor_count = next((row['count'] for row in class_counts if row['Class'] == 1), 0)
# Calculate the desired oversampling ratio
ratio = float(major_count) / minor_count
# Filter out and oversample the minor class 
oversampled_minor_df = train_df\
    .filter(col("Class") == 1)\
    .sample(withReplacement=True, fraction=ratio, seed=42)
# Combine the minor into the train df
train_df = train_df\
    .filter(col("Class") == 0)\
    .union(oversampled_minor_df)

# Train the Logistic Regression model using spark.mllib

In [82]:
# Convert the DataFrame into an RDD of LabeledPoint objects
train_rdd = train_df.rdd.map(lambda row: LabeledPoint(row.Class, DenseVector(row.features.values)))

# Train the logistic regression model
# model = LogisticRegressionWithLBFGS.train(train_rdd, intercept=True)

# Train the model using SGD
model = LogisticRegressionWithSGD.train(train_rdd, step=1, intercept=True, iterations=100)

                                                                                

# Evaluate on test set

In [83]:
print("Coefficients: ", model.weights)
print("Intercept: ", model.intercept)

test_rdd = test_df.rdd.map(lambda row: LabeledPoint(row.Class, DenseVector(row.features.values)))
predictionAndLabels = test_rdd.map(lambda p: (float(model.predict(p.features)), p.label))

# Calculate accuracy, precision, and recall
multiMetrics = MulticlassMetrics(predictionAndLabels)
accuracy = multiMetrics.accuracy
labels = predictionAndLabels.map(lambda x: x[1]).distinct().collect()
precision_by_label = {label: multiMetrics.precision(label) for label in labels}
recall_by_label = {label: multiMetrics.recall(label) for label in labels}

print("Accuracy: {}".format(accuracy))
print("Precision:", list(precision_by_label.values()))
print("Recall:", list(recall_by_label.values()))

# Calculate the area under the ROC curve and PR curve
binaryMetrics = BinaryClassificationMetrics(predictionAndLabels)
roc_auc = binaryMetrics.areaUnderROC
roc_pr = binaryMetrics.areaUnderPR

print("Area under ROC: {:.4f}".format(roc_auc))
print("Area under PR: {:.4f}".format(roc_pr))


Coefficients:  [-1.0409482336703346,-0.1554068524341427,0.09340311065745453,-0.49937226452937206,0.559723197394129,-0.14484663190704042,-0.13618982707538577,-0.3976244866232469,-0.05499241701336593,-0.2694559188324656,-0.6083801909877051,0.3866671931784822,-0.7631844695985326,-0.10960052671668699,-0.9691166940532628,-0.05689768392003591,-0.5887090517109573,-0.8685733113731122,-0.24887362257688883,0.09090664606598867,0.06396430777819669,0.13915508760604234,0.08416415903967822,-0.08522876820847165,-0.03994130565569177,-0.007095639373137107,-0.04339372072375452,0.001447579999732893,0.01102788190649151,-0.0050911726274490594]
Intercept:  -1.0827907889518473


                                                                                

Accuracy: 0.9768835464196169
Precision: [0.999817721149815, 0.0625]
Recall: [0.9770221406814984, 0.8958333333333334]
Area under ROC: 0.9364
Area under PR: 0.0593


                                                                                

# Experiment with parameters

In [71]:
# Perform hyperparameter tuning for step_size and max_iterations in SGD
# Define parameters to test
step_sizes = [5, 1.0, 0.1, 0.01]
max_iterations_list = [20, 50, 100, 200]
k_folds = 4

# Sample training data in a stratified fashion WITHOUT oversampling yet
# We'll use the original df to create train/test split
train_df = df.sampleBy("Class", {1: 0.8, 0: 0.8}, seed=42)
test_df = df.subtract(train_df)

# Store results
results = {}

print(f"Starting hyperparameter tuning with {k_folds}-fold cross-validation...")
for step_size in step_sizes:
    for iterations in max_iterations_list:
        accuracy_values = []
        
        # Key for storing results
        param_key = (step_size, iterations)
        
        # Perform k-fold cross-validation using sampleBy and subtract
        for i in range(k_folds):
            # Create a stratified validation fold (1/k of the data)
            # Using different seed for each fold
            validation_df = train_df.sampleBy("Class", {1: 1.0/k_folds, 0: 1.0/k_folds}, seed=42+i)
            
            # Create training fold as the remaining data
            train_fold_df = train_df.subtract(validation_df)
            
            # Now apply oversampling to just this training fold
            # Calculate class counts in the training fold
            fold_class_counts = train_fold_df.groupBy("Class").count().collect()
            fold_major_count = next((row['count'] for row in fold_class_counts if row['Class'] == 0), 0)
            fold_minor_count = next((row['count'] for row in fold_class_counts if row['Class'] == 1), 0)
            
            # Calculate the desired oversampling ratio for this fold
            fold_ratio = float(fold_major_count) / fold_minor_count
            
            # Oversample the minority class in this fold
            oversampled_minor_df = train_fold_df\
                .filter(col("Class") == 1)\
                .sample(withReplacement=True, fraction=fold_ratio, seed=42+i)
                
            # Create balanced training fold
            balanced_train_fold_df = train_fold_df\
                .filter(col("Class") == 0)\
                .union(oversampled_minor_df)
            
            # Convert to RDD format for model training
            train_fold_rdd = balanced_train_fold_df.rdd.map(lambda row: LabeledPoint(row.Class, DenseVector(row.features.values)))
            validation_fold_rdd = validation_df.rdd.map(lambda row: LabeledPoint(row.Class, DenseVector(row.features.values)))
            
            # Train model with current parameters
            model = LogisticRegressionWithSGD.train(
                train_fold_rdd, 
                iterations=iterations,
                step=step_size, 
                intercept=True,
            )
            
            # Evaluate on validation data using accuracy instead of precision
            predictionAndLabels = validation_fold_rdd.map(lambda p: (float(model.predict(p.features)), p.label))
            
            # Use MulticlassMetrics to get accuracy
            multiMetrics = MulticlassMetrics(predictionAndLabels)
            
            # Calculate accuracy
            accuracy = multiMetrics.accuracy
            accuracy_values.append(accuracy)
        
        # Calculate average accuracy across folds
        avg_accuracy = sum(accuracy_values) / len(accuracy_values)
        results[param_key] = avg_accuracy
        
        print(f"Step size: {step_size}, Iterations: {iterations}, Average Accuracy: {avg_accuracy:.4f}")

Starting hyperparameter tuning with 4-fold cross-validation...


                                                                                

Step size: 5, Iterations: 20, Average Accuracy: 0.9743


                                                                                

Step size: 5, Iterations: 50, Average Accuracy: 0.9774


                                                                                

Step size: 5, Iterations: 100, Average Accuracy: 0.9772


                                                                                

Step size: 5, Iterations: 200, Average Accuracy: 0.9772


                                                                                

Step size: 1.0, Iterations: 20, Average Accuracy: 0.9422


                                                                                

Step size: 1.0, Iterations: 50, Average Accuracy: 0.9711


                                                                                

Step size: 1.0, Iterations: 100, Average Accuracy: 0.9752


                                                                                2]

Step size: 1.0, Iterations: 200, Average Accuracy: 0.9765


                                                                                12]

Step size: 0.1, Iterations: 20, Average Accuracy: 0.0018


                                                                                ]]

Step size: 0.1, Iterations: 50, Average Accuracy: 0.0019


                                                                                ]]

Step size: 0.1, Iterations: 100, Average Accuracy: 0.0671


                                                                                2]

Step size: 0.1, Iterations: 200, Average Accuracy: 0.5153


                                                                                2]]

Step size: 0.01, Iterations: 20, Average Accuracy: 0.0018


                                                                                ]]

Step size: 0.01, Iterations: 50, Average Accuracy: 0.0018


Traceback (most recent call last):
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib64/python3.13/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib64/python3.13/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ~~~~~~~~~~~~~~~~~~~~^^
  File "/usr/lib64/python3.13/socket.py", line 719, in readinto
    return self._sock.recv_into(b)
           ~~~~~~~~~~~~~~~~~~~~^^^
KeyboardInterrupt
25/04/09 21:29:54 ERROR Executor: Exception in task 13.0 in stage 118034.0 (TID 356337)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
    ~~~~~~~^^
  File "/home/keineik/Projects/lab03-spark-ml/venv/l

KeyboardInterrupt: 

In [None]:
# Find best parameters based on accuracy
best_accuracy = -1
best_params = None
for params, accuracy in results.items():
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

# Extract individual parameters
best_step_size, best_iterations = best_params

print("\nBest parameters:")
print(f"Step size: {best_step_size}")
print(f"Iterations: {best_iterations}")
print(f"Best average accuracy: {best_accuracy:.4f}")

# Train final model with best parameters
print(f"\nTraining final SGD model with step size = {best_step_size}, iterations = {best_iterations}...")
sgd_model = LogisticRegressionWithSGD.train(
    train_rdd,
    iterations=best_iterations,
    step=best_step_size,
    intercept=True,
)

# Evaluate on test set
sgd_predictionAndLabels = test_rdd.map(lambda p: (float(sgd_model.predict(p.features)), p.label))

# Calculate metrics
sgd_multiMetrics = MulticlassMetrics(sgd_predictionAndLabels)
sgd_binaryMetrics = BinaryClassificationMetrics(sgd_predictionAndLabels)

# Print evaluation metrics
sgd_accuracy = sgd_multiMetrics.accuracy
sgd_labels = sgd_predictionAndLabels.map(lambda x: x[1]).distinct().collect()
sgd_precision_by_label = {label: sgd_multiMetrics.precision(label) for label in sgd_labels}
sgd_recall_by_label = {label: sgd_multiMetrics.recall(label) for label in sgd_labels}
sgd_roc_auc = sgd_binaryMetrics.areaUnderROC
sgd_roc_pr = sgd_binaryMetrics.areaUnderPR

print(f"\nSGD Model Evaluation (with best step size = {best_step_size}, iterations = {best_iterations}):")
print(f"Accuracy: {sgd_accuracy:.4f}")
print("Precision:", {l: f"{p:.4f}" for l, p in sgd_precision_by_label.items()})
print("Recall:", {l: f"{r:.4f}" for l, r in sgd_recall_by_label.items()})
print(f"Area under ROC: {sgd_roc_auc:.4f}")
print(f"Area under PR: {sgd_roc_pr:.4f}")


Best parameters:
Step size: 5
Iterations: 50
Best average accuracy: 0.9774

Training final SGD model with step size = 5, iterations = 50...


                                                                                


SGD Model Evaluation (with best step size = 5, iterations = 50):
Accuracy: 0.9788
Precision: {0.0: '0.9998', 1.0: '0.0685'}
Recall: {0.0: '0.9789', 1.0: '0.9062'}
Area under ROC: 0.9426
Area under PR: 0.0654

Comparison with LBFGS model:
SGD (step_size = 5, iterations = 50) Precision: 0.0685
SGD AUC: 0.9426
LBFGS AUC: 0.9364
