In [1]:
from pyspark import SparkContext
import math
import time

sc = SparkContext.getOrCreate()

In [2]:
# 1. Improved Data Loading and Parsing
def load_and_parse_data(sc, filepath):
    """Load and parse CSV data, handling headers and malformed records"""
    try:
        # More robust header handling
        lines = sc.textFile(filepath)
        # header = lines.first()

        indexed_rdd = lines.zipWithIndex()

        header = indexed_rdd.filter(lambda x: x[1] == 0).map(lambda x: x[0]).collect()
        if header:
            header = header[0]
            data = indexed_rdd.filter(lambda x: x[1] > 0).map(lambda x: x[0])
        
        # Skip header and parse data
        data = lines.filter(lambda line: line != header).map(
            lambda line: [float(x.strip('"')) if x.strip('"').isdigit() else 0.0 
            for x in line.split(",")]
        )
        
        # Create feature-label pairs, handle empty lines
        rdd_data = data.filter(lambda cols: len(cols) > 1).map(
            lambda cols: (cols[:-1], cols[-1])
        )
        
        # Cache as we'll reuse this RDD
        rdd_data.cache()
        
        # Count features for verification
        num_features = len(rdd_data.first()[0]) if not rdd_data.isEmpty() else 0
        print(f"Loaded dataset with {rdd_data.count()} records and {num_features} features")
        
        return rdd_data
        
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return sc.emptyRDD()


In [3]:
# Load data
file_path = "../creditcard.csv/creditcard.csv"
rdd_data = load_and_parse_data(sc, file_path)

Loaded dataset with 284807 records and 30 features


In [4]:
rdd_data

PythonRDD[4] at RDD at PythonRDD.scala:53

In [5]:
# 2. Enhanced Initialization with Feature Scaling
def feature_scaling(rdd):
    """Scale features to zero mean and unit variance"""
    # Calculate stats for each feature
    feature_stats = rdd.map(lambda x: x[0]).zipWithIndex().flatMap(
        lambda x: [(i, (val, val**2, 1)) for i, val in enumerate(x[0])]
    ).reduceByKey(
        lambda a, b: (a[0]+b[0], a[1]+b[1], a[2]+b[2])
    ).collectAsMap()
    
    # Calculate mean and std for each feature
    scaling_params = {}
    for i, (sum_x, sum_x2, count) in feature_stats.items():
        mean = sum_x / count
        std = math.sqrt((sum_x2 / count) - mean**2)
        scaling_params[i] = (mean, std if std != 0 else 1.0)
    
    # Scale features
    scaled_rdd = rdd.map(
        lambda x: (
            [(val - scaling_params[i][0])/scaling_params[i][1] for i, val in enumerate(x[0])],
            x[1]
        )
    )
    return scaled_rdd

In [6]:
# Scale features (important for gradient descent)
scaled_data = feature_scaling(rdd_data)

In [7]:
scaled_data

PythonRDD[14] at RDD at PythonRDD.scala:53

In [8]:
try:
    first_record = rdd_data.first()
except Exception as e:
    print(f"Error getting first element: {str(e)}")

In [11]:
len(first_record[0])

30

In [12]:
# Initialize parameters
num_features = len(first_record[0])
initial_weights = [0.0] * num_features
learning_rate = 0.1  # Can be larger with scaled features
num_iterations = 50
regularization_param = 0.01  # L2 regularization

In [13]:
# 3. Enhanced Helper Functions
def sigmoid(z):
    """Numerically stable sigmoid function"""
    # Prevent overflow
    z = max(min(z, 20), -20)
    return 1.0 / (1.0 + math.exp(-z))

def predict(features, weights):
    """Compute prediction with bias term"""
    # Add bias term (1.0) to features
    extended_features = features + [1.0]
    extended_weights = weights + [0.0]  # Bias term weight
    z = sum(w * f for w, f in zip(extended_weights, extended_features))
    return sigmoid(z)

def compute_gradient(point, weights):
    """Compute gradient with regularization"""
    features, label = point
    prediction = predict(features, weights)
    error = prediction - label
    
    # Gradient for features
    gradient = [error * f for f in features]
    
    # Add regularization (excluding bias term)
    gradient = [g + regularization_param * w for g, w in zip(gradient, weights)]
    
    # Gradient for bias term (always 1.0)
    bias_gradient = error
    
    return gradient + [bias_gradient]

def compute_loss(point, weights):
    """Compute regularized logistic loss"""
    features, label = point
    prediction = predict(features, weights)
    
    # Avoid log(0)
    epsilon = 1e-15
    prediction = max(min(prediction, 1 - epsilon), epsilon)
    
    # Log loss
    loss = -label * math.log(prediction) - (1 - label) * math.log(1 - prediction)
    
    # L2 regularization
    reg_loss = 0.5 * regularization_param * sum(w**2 for w in weights)
    
    return loss + reg_loss


In [14]:
# 4. Enhanced Training with Early Stopping
def train_logistic_regression(rdd, initial_weights, learning_rate, max_iter):
    """Train logistic regression with early stopping"""
    weights = initial_weights.copy()
    best_weights = initial_weights.copy()
    best_loss = float('inf')
    no_improvement_count = 0
    
    for i in range(max_iter):
        start_time = time.time()
        
        # Compute gradients (map-reduce)
        gradients = rdd.map(
            lambda point: compute_gradient(point, weights)
        ).reduce(
            lambda a, b: [x + y for x, y in zip(a, b)]
        )
        
        # Average gradients
        num_points = rdd.count()
        gradients = [g / num_points for g in gradients]
        
        # Update weights
        weights = [w - learning_rate * g for w, g in zip(weights, gradients)]
        
        # Compute loss
        total_loss = rdd.map(
            lambda point: compute_loss(point, weights)
        ).sum()
        avg_loss = total_loss / num_points
        
        # Early stopping check
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_weights = weights.copy()
            no_improvement_count = 0
        else:
            no_improvement_count += 1
            if no_improvement_count >= 5:
                print(f"Early stopping at iteration {i}")
                break
        
        # Print progress
        iteration_time = time.time() - start_time
        print(f"Iteration {i}: Loss = {avg_loss:.6f}, Time = {iteration_time:.2f}s")
    
    return best_weights


In [15]:
# Train model
print("\nStarting training...")
final_weights = train_logistic_regression(
    scaled_data, 
    initial_weights + [0.0],  # Add bias term
    learning_rate, 
    num_iterations
)
print("\nTraining completed.")


Starting training...
Iteration 0: Loss = 0.668642, Time = 16.65s
Iteration 1: Loss = 0.645371, Time = 15.80s
Iteration 2: Loss = 0.623270, Time = 15.63s
Iteration 3: Loss = 0.602279, Time = 17.83s
Iteration 4: Loss = 0.582338, Time = 16.19s
Iteration 5: Loss = 0.563391, Time = 19.88s
Iteration 6: Loss = 0.545384, Time = 15.47s
Iteration 7: Loss = 0.528266, Time = 15.40s
Iteration 8: Loss = 0.511987, Time = 14.96s
Iteration 9: Loss = 0.496502, Time = 15.49s
Iteration 10: Loss = 0.481766, Time = 15.57s
Iteration 11: Loss = 0.467736, Time = 15.34s
Iteration 12: Loss = 0.454375, Time = 15.23s
Iteration 13: Loss = 0.441644, Time = 15.54s
Iteration 14: Loss = 0.429508, Time = 15.21s
Iteration 15: Loss = 0.417935, Time = 15.67s
Iteration 16: Loss = 0.406892, Time = 15.60s
Iteration 17: Loss = 0.396351, Time = 15.60s
Iteration 18: Loss = 0.386284, Time = 15.10s
Iteration 19: Loss = 0.376666, Time = 15.80s
Iteration 20: Loss = 0.367470, Time = 15.20s
Iteration 21: Loss = 0.358676, Time = 15.67

In [16]:
# 5. Comprehensive Evaluation
def evaluate_model(rdd, weights, threshold=0.5):
    """Evaluate model with multiple metrics"""
    # Make predictions
    predictions = rdd.map(
        lambda point: (
            predict(point[0], weights),  # Probability
            point[1]  # Actual label
        )
    ).cache()
    
    # Calculate metrics at different thresholds
    results = {}
    for threshold in [0.3, 0.5, 0.7]:
        # Classify based on threshold
        classified = predictions.map(
            lambda x: (1 if x[0] >= threshold else 0, x[1])
        )
        
        # Calculate confusion matrix
        true_pos = classified.filter(lambda x: x[0] == 1 and x[1] == 1).count()
        false_pos = classified.filter(lambda x: x[0] == 1 and x[1] == 0).count()
        true_neg = classified.filter(lambda x: x[0] == 0 and x[1] == 0).count()
        false_neg = classified.filter(lambda x: x[0] == 0 and x[1] == 1).count()
        
        # Calculate metrics
        accuracy = (true_pos + true_neg) / (true_pos + false_pos + true_neg + false_neg)
        precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
        recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        results[threshold] = {
            'threshold': threshold,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'confusion_matrix': {
                'true_pos': true_pos,
                'false_pos': false_pos,
                'true_neg': true_neg,
                'false_neg': false_neg
            }
        }
    
    return results


In [17]:
# Evaluate model
print("\nEvaluating model...")
eval_results = evaluate_model(scaled_data, final_weights)

for threshold, metrics in eval_results.items():
    print(f"\nMetrics at threshold {threshold}:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    print("Confusion Matrix:")
    print(f"True Positives: {metrics['confusion_matrix']['true_pos']}")
    print(f"False Positives: {metrics['confusion_matrix']['false_pos']}")
    print(f"True Negatives: {metrics['confusion_matrix']['true_neg']}")
    print(f"False Negatives: {metrics['confusion_matrix']['false_neg']}")

# Save final weights
print("\nFinal weights (including bias term):")
print(final_weights)


Evaluating model...

Metrics at threshold 0.3:
Accuracy: 0.9983
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Confusion Matrix:
True Positives: 0
False Positives: 0
True Negatives: 284315
False Negatives: 492

Metrics at threshold 0.5:
Accuracy: 0.9983
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Confusion Matrix:
True Positives: 0
False Positives: 0
True Negatives: 284315
False Negatives: 492

Metrics at threshold 0.7:
Accuracy: 0.9983
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Confusion Matrix:
True Positives: 0
False Positives: 0
True Negatives: 284315
False Negatives: 492

Final weights (including bias term):
[-0.001659404997448309, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0005283581234271225, -1.506893951529394]
