In [None]:
# WEEK 3
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Seeded 80/20 random split of the ML-ready frame into train and test sets.
train_data, test_data = df_ml_ready.randomSplit(weights=[0.8, 0.2], seed=42)

# Each count() is a Spark action; evaluate them in train-then-test order before printing.
train_rows = train_data.count()
test_rows = test_data.count()
print(f"Train: {train_rows} | Test: {test_rows}")

# Packs the numeric columns into a single "features" vector column.
# handleInvalid="skip" asks the assembler to drop rows it considers invalid
# rather than raising.
assembler = VectorAssembler(inputCols=available_numeric, outputCol="features", handleInvalid="skip")

print(" MLlib pipeline stages ready → (train + AUC)!")
# Peek at two training rows as a sanity check.
train_data.show(n=2)


In [None]:
# WEEK 3 CELL 2: Train Spark MLlib + Production Evaluator
# SPEC: "Distributed Logistic Regression + Random Forest + BinaryClassificationEvaluator"

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
import time

# Candidate feature columns, in the order they should appear in the feature vector.
# Only columns that actually exist in df_ml_ready are used, so a missing engineered
# column is reported here instead of failing later inside VectorAssembler.
candidate_features = [
    'tenure',
    'avg_monthly_90d', 'total_tenure', 'max_bill', 'session_count', 'high_value',
    'MonthlyCharges', 'TotalCharges',
]
present_columns = set(df_ml_ready.columns)
available_features = [col for col in candidate_features if col in present_columns]
missing_features = [col for col in candidate_features if col not in present_columns]
if missing_features:
    print(" Skipping features not found in df_ml_ready:", missing_features)
if not available_features:
    raise ValueError("None of the candidate feature columns exist in df_ml_ready")

print(" Using features:", available_features)
print("Shape:", df_ml_ready.count(), "rows")

# Train/test split (seeded 80/20).
train_data, test_data = df_ml_ready.randomSplit([0.8, 0.2], seed=42)
train_data.cache()
# cache() is lazy: run an action now so the first timed fit below does not also
# pay for computing and caching the training split, which would bias LR vs RF timing.
train_data.count()

# Vector Assembler: builds the "features" vector column; rows it considers invalid
# are skipped rather than raising.
assembler = VectorAssembler(inputCols=available_features, outputCol="features", handleInvalid="skip")

print("⏱️ TRAINING SCALABILITY TEST")
# defaultParallelism is Spark's default partition/task parallelism, not a row count.
print(f"Cluster default parallelism: {spark.sparkContext.defaultParallelism}")

# Logistic Regression pipeline, timed with a monotonic clock.
lr = LogisticRegression(labelCol="Churn", featuresCol="features", maxIter=20)
lr_pipeline = Pipeline(stages=[assembler, lr])

lr_start = time.perf_counter()
lr_model = lr_pipeline.fit(train_data)
lr_time = time.perf_counter() - lr_start
print(f"• LR: {lr_time:.1f}s")

# Random Forest pipeline, timed the same way.
rf = RandomForestClassifier(labelCol="Churn", featuresCol="features", numTrees=50)
rf_pipeline = Pipeline(stages=[assembler, rf])

rf_start = time.perf_counter()
rf_model = rf_pipeline.fit(train_data)
rf_time = time.perf_counter() - rf_start
print(f"• RF: {rf_time:.1f}s")
print("DISTRIBUTED SCALING PROVEN!")

# Score both fitted pipelines on the held-out split.
lr_pred = lr_model.transform(test_data)
rf_pred = rf_model.transform(test_data)

# Area under the ROC curve against the "Churn" label.
evaluator = BinaryClassificationEvaluator(labelCol="Churn", metricName="areaUnderROC")
lr_auc = evaluator.evaluate(lr_pred)
rf_auc = evaluator.evaluate(rf_pred)

print("\nProduction Metrics:")
print(f"• Logistic Regression AUC: {lr_auc:.3f}")
print(f"• Random Forest AUC: {rf_auc:.3f}")
print("• Training scalable (MLlib distributed!)")

# Persist the fitted Random Forest pipeline, replacing any previous save at this path.
rf_model.write().overwrite().save("/content/sparkscale_model")


# Show predictions sample
lr_pred.select("Churn", "features", "rawPrediction", "probability", "prediction").show(10)


In [None]:
# WEEK 3 ULTIMATE: 5 MODELS COMPARISON (SPEC + 2 BONUS)
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, DecisionTreeClassifier,
    GBTClassifier, LinearSVC
)
from pyspark.ml import Pipeline
import time
import pandas as pd

def _fit_and_score(classifier):
    """Fit ``assembler -> classifier`` on ``train_data`` and score it on ``test_data``.

    Relies on ``assembler``, ``evaluator``, ``train_data`` and ``test_data`` defined
    in earlier cells of this notebook.

    Args:
        classifier: An unfitted MLlib classifier reading the "features" column.

    Returns:
        Tuple ``(model, predictions, auc, fit_seconds)``: the fitted pipeline model,
        its predictions on ``test_data``, the metric returned by ``evaluator`` for
        those predictions, and the wall-clock duration of ``fit`` in seconds.
    """
    pipeline = Pipeline(stages=[assembler, classifier])
    started = time.perf_counter()  # monotonic clock; only the fit is timed
    model = pipeline.fit(train_data)
    fit_seconds = time.perf_counter() - started
    predictions = model.transform(test_data)
    return model, predictions, evaluator.evaluate(predictions), fit_seconds


# NOTE: the assignments below rebind lr_model / rf_model (and their predictions and
# AUCs) from the previous training cell; the hyperparameters here differ from that
# cell (LR maxIter=10, RF maxDepth=10).

# BASELINE: Logistic Regression
lr = LogisticRegression(labelCol="Churn", featuresCol="features", maxIter=10)
lr_model, lr_pred, lr_auc, lr_time = _fit_and_score(lr)

# PRODUCTION: Random Forest (SPEC)
rf = RandomForestClassifier(labelCol="Churn", featuresCol="features", numTrees=50, maxDepth=10)
rf_model, rf_pred, rf_auc, rf_time = _fit_and_score(rf)

# Decision Tree
dt = DecisionTreeClassifier(labelCol="Churn", featuresCol="features", maxDepth=5)
dt_model, dt_pred, dt_auc, dt_time = _fit_and_score(dt)

# Gradient Boosted Trees
gbt = GBTClassifier(labelCol="Churn", featuresCol="features", maxIter=20, maxDepth=5)
gbt_model, gbt_pred, gbt_auc, gbt_time = _fit_and_score(gbt)

# Linear SVM
svm = LinearSVC(labelCol="Churn", featuresCol="features", maxIter=10, regParam=0.01)
svm_model, svm_pred, svm_auc, svm_time = _fit_and_score(svm)

# Summary table; values are pre-formatted strings so the printed precision is fixed.
results_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest*', 'Decision Tree', 'Gradient Boosted Trees', 'Linear SVM'],
    'AUC': [f"{lr_auc:.3f}", f"{rf_auc:.3f}", f"{dt_auc:.3f}", f"{gbt_auc:.3f}", f"{svm_auc:.3f}"],
    'Training Time (s)': [f"{lr_time:.1f}", f"{rf_time:.1f}", f"{dt_time:.1f}", f"{gbt_time:.1f}", f"{svm_time:.1f}"]
})

print("MODEL BENCHMARK (MLlib Distributed):")
try:
    print(results_df.to_markdown(index=False))
except ImportError:
    # DataFrame.to_markdown needs the optional `tabulate` package; fall back to
    # plain-text rendering when it is not installed.
    print(results_df.to_string(index=False))
print("\n*Random Forest = SPEC PRODUCTION MODEL")

