In [1]:
from pyspark import SparkContext
import numpy as np

# Initialize Spark context
sc = SparkContext("local", "LogisticRegression")

# Load data and create RDD
file_path = "../creditcard.csv/creditcard.csv"
# data_path = "creditcard.csv"  # Update with your path
raw_rdd = sc.textFile(file_path)

In [2]:
indexed_rdd = raw_rdd.zipWithIndex()

header = indexed_rdd.filter(lambda x: x[1] == 0).map(lambda x: x[0]).collect()
if header:
    header = header[0]
    data = indexed_rdd.filter(lambda x: x[1] > 0).map(lambda x: x[0])

# Skip header and parse data
parsed_rdd = raw_rdd.filter(lambda line: line != header).map(
    lambda line: [float(x.strip('"')) if x.strip('"').isdigit() else 0.0 
    for x in line.split(",")]
)

In [3]:
def parse_record(record):
    """Convert record to (features, label) pair"""
    # Extract features (V1-V28 and Amount)
    features = [float(x) for x in record[1:29]] + [float(record[29])]
    # Standardize Amount (last feature)
    features[-1] = (features[-1] - amount_mean) / amount_std
    # Add intercept term (1.0)
    features = [1.0] + features
    # Get label (last column)
    label = int(record[30])
    return (np.array(features), label)

# Calculate mean and std of Amount for standardization
amount_stats = parsed_rdd.map(lambda x: float(x[29])).stats()
amount_mean = amount_stats.mean()
amount_std = amount_stats.stdev()

# Create final RDD of (features, label) pairs
data_rdd = parsed_rdd.map(parse_record).cache()

In [4]:
def sigmoid(z):
    """Sigmoid function"""
    return 1 / (1 + np.exp(-z))

def compute_gradient(record, weights):
    """Compute gradient for a single record"""
    features, label = record
    z = np.dot(weights, features)
    error = sigmoid(z) - label
    return error * features

def update_weights(weights, gradients, learning_rate, num_points):
    """Update weights using averaged gradients"""
    avg_gradient = gradients.reduce(lambda a, b: a + b) / num_points
    return weights - learning_rate * avg_gradient

def predict(features, weights, threshold=0.5):
    """Make prediction for a single record"""
    return int(sigmoid(np.dot(weights, features)) >= threshold)

In [5]:
# Modify the stratified split to return RDDs without preserving partitions (improve efficiency)
def stratified_split_rdd(rdd, test_size=0.2, seed=None):
    """Stratified split for RDD maintaining class ratios"""
    # Separate classes
    class_0_rdd = rdd.filter(lambda x: x[1] == 0)
    class_1_rdd = rdd.filter(lambda x: x[1] == 1)
    
    # Calculate splits
    split_0 = int(class_0_rdd.count() * test_size)
    split_1 = int(class_1_rdd.count() * test_size)
    
    # Split each class
    test_rdd = (
        class_0_rdd.take(split_0)  # This is a list, not RDD
        + class_1_rdd.take(split_1)
    )
    train_rdd = (
        class_0_rdd.filter(lambda x: x not in test_rdd)
        .union(class_1_rdd.filter(lambda x: x not in test_rdd))
    )
    
    # Convert back to RDD using the original context
    train_rdd = sc.parallelize(train_rdd.collect())
    test_rdd = sc.parallelize(test_rdd)
    
    return train_rdd, test_rdd

In [6]:
# Split the data (after parse_record step)
train_rdd, test_rdd = stratified_split_rdd(data_rdd, test_size=0.2, seed=42)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 27) (DESKTOP-VH5UT1J executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1239, in process
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\util.py", line 83, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\029at\AppData\Local\Temp\ipykernel_12044\471116915.py", line 18, in <lambda>
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1046)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1046)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:407)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1045)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1239, in process
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "D:\spark-3.5.0-bin-hadoop3\spark-3.5.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\util.py", line 83, in wrapper
    return f(*args, **kwargs)
  File "C:\Users\029at\AppData\Local\Temp\ipykernel_12044\471116915.py", line 18, in <lambda>
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1046)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
# Downsample majority class in the train set
def downsample_train(train_rdd, seed=42):
    # Separate classes
    class_0 = train_rdd.filter(lambda x: x[1] == 0)
    class_1 = train_rdd.filter(lambda x: x[1] == 1)
    
    count_0 = class_0.count()
    count_1 = class_1.count()
    
    # Identify majority and minority
    if count_0 > count_1:
        majority_rdd = class_0
        minority_rdd = class_1
        minority_count = count_1
    else:
        majority_rdd = class_1
        minority_rdd = class_0
        minority_count = count_0
    
    # Downsample majority to match minority count
    fraction = minority_count / majority_rdd.count()
    downsampled_majority = majority_rdd.sample(False, fraction, seed=seed)
    
    # Combine and cache
    balanced_train = downsampled_majority.union(minority_rdd).cache()
    return balanced_train

In [None]:
# Apply downsampling
balanced_train_rdd = downsample_train(train_rdd, seed=42)

In [None]:
# Verify new class balance
print(f"Balanced Train count: {balanced_train_rdd.count()}")
print(f"Fraud ratio in balanced train: {balanced_train_rdd.filter(lambda x: x[1]==1).count() / balanced_train_rdd.count():.4f}")

In [None]:
try:
    first_record = balanced_train_rdd.first()
except Exception as e:
    print(f"Error getting first element: {str(e)}")

In [None]:
len(first_record[0])

In [None]:
# Initialize parameters
num_features = len(first_record[0])
weights = np.zeros(num_features)
learning_rate = 0.01
iterations = 100
num_points = balanced_train_rdd.count()

In [None]:
# Training loop
for i in range(iterations):
    # Compute gradients for all points
    gradients = balanced_train_rdd.map(lambda record: compute_gradient(record, weights))
    
    # Update weights
    weights = update_weights(weights, gradients, learning_rate, num_points)
    
    # Periodically print progress
    if i % 10 == 0:
        # print(f"Iteration {i}: weights = {weights[:5]}...")  # Print first 5 weights
        train_pred = balanced_train_rdd.map(lambda r: (r[1], predict(r[0], weights)))
        train_acc = train_pred.filter(lambda x: x[0]==x[1]).count()/num_points
        print(f"Iter {i}: Train Accuracy = {train_acc:.4f}")

In [None]:
# Make predictions on training data (for demonstration)
predictions_rdd = test_rdd.map(lambda record: (
    record[1],  # Actual label
    predict(record[0], weights)  # Predicted label
))

# Calculate confusion matrix
confusion = predictions_rdd.map(lambda x: (x, 1)) \
                          .reduceByKey(lambda a, b: a + b) \
                          .collectAsMap()

In [None]:
# confusion = dict(confusion)

print("Confusion Matrix:")
print("True Negatives:", confusion.get((0, 0), 0))
print("False Positives:", confusion.get((0, 1), 0))
print("False Negatives:", confusion.get((1, 0), 0))
print("True Positives:", confusion.get((1, 1), 0))

# Calculate metrics
tn = confusion.get((0,0), 0)
fp = confusion.get((0,1), 0)
fn = confusion.get((1,0), 0)
tp = confusion.get((1,1), 0)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2*(precision*recall)/(precision+recall) if (precision+recall) > 0 else 0

print(f"\nPrecision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Convert RDDs to MLlib format
train_data_mllib = train_rdd.map(lambda x: LabeledPoint(x[1], x[0]))
test_data_mllib = test_rdd.map(lambda x: LabeledPoint(x[1], x[0]))

# Train MLlib model (with same parameters as manual implementation)
mllib_model = LogisticRegressionWithSGD.train(
    train_data_mllib,
    iterations=100,
    step=0.01,
    intercept=True  # Our manual implementation already has intercept
)

In [None]:
def evaluate_model(model, test_data, is_mllib=False):
    """Evaluate model and return metrics"""
    if is_mllib:
        # MLlib predictions
        preds_labels = test_data.map(lambda lp: (
            lp.label,
            1 if model.predict(lp.features) > 0.5 else 0
        ))
    else:
        # Manual implementation predictions
        preds_labels = test_data.map(lambda x: (
            x[1],  # true label
            predict(x[0], model)  # predicted label (model=weights for manual)
        ))
    
    # Calculate confusion matrix
    confusion = preds_labels.map(lambda x: (x, 1)) \
                          .reduceByKey(lambda a,b: a+b) \
                          .collectAsMap()
    
    # Extract components
    tn = confusion.get((0,0), 0)
    fp = confusion.get((0,1), 0)
    fn = confusion.get((1,0), 0)
    tp = confusion.get((1,1), 0)
    
    # Calculate metrics
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2*(precision*recall)/(precision+recall) if (precision+recall) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    return {
        'confusion_matrix': {'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp},
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy
    }

In [None]:
# Evaluate MLlib implementation
mllib_metrics = evaluate_model(mllib_model, test_data_mllib, is_mllib=True)
mllib_metrics