In [None]:
# week4_batch_predict.py
#!/usr/bin/env python3

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.sql.functions import col
from pyspark.ml.functions import vector_to_array
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

print("Daily Batch Churn Prediction")

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.appName("DailyChurn").getOrCreate()

# Fitted pipeline persisted by the Week 3 step.
model = PipelineModel.load("/content/sparkscale_model")

# Column layout of the simulated, feature-ready batch.
# "Churn" is only a placeholder column; every row below sets it to 0.0.
_batch_columns = [
    ("id", IntegerType()),
    ("avg_monthly_90d", DoubleType()),
    ("total_tenure", IntegerType()),
    ("max_bill", DoubleType()),
    ("session_count", DoubleType()),
    ("high_value", DoubleType()),
    ("Churn", DoubleType()),
]
schema = StructType(
    [StructField(name, dtype) for name, dtype in _batch_columns]
)

# One tuple per user, in the same order as the schema fields above.
new_users_data = [
    (1, 45.0, 24, 150.5, 120.0, 1.0, 0.0),
    (2, 30.0, 12, 85.2, 45.0, 0.0, 0.0),
    (3, 65.0, 36, 220.1, 200.0, 1.0, 0.0),
    (4, 92.5, 5, 115.0, 28.0, 1.0, 0.0),
]

new_data = spark.createDataFrame(data=new_users_data, schema=schema)

# Batch inference: score the whole batch with the loaded pipeline.
preds = model.transform(new_data)

# Sanity check: the "probability" column must be present in the scored output.
preds.select("id", "probability", "prediction").show(truncate=False)

# Final batch output (CSV).
# Element 1 of the probability vector is scaled to a percentage and exported
# alongside the id and the predicted label.
churn_risk_percent = (
    vector_to_array(col("probability"))[1] * 100
).alias("churn_risk_percent")
daily_report = preds.select(col("id"), churn_risk_percent, col("prediction"))

# coalesce(1) collapses the result to a single partition before it is written;
# "overwrite" replaces any previous contents of the output directory.
(
    daily_report.coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv("daily_predictions_out")
)



In [None]:
# Dashboard: 2x3 grid of model and data summary charts.
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

# Reuse (or create) the Spark session for this cell.
spark = SparkSession.builder.appName("Dashboard").getOrCreate()

# Every figure created from here on uses the light plotly theme.
pio.templates.default = "plotly_white"

# Bring the data to pandas for plotting: a sample (fraction 0.1) of the
# ML-ready frame, plus the label/prediction/probability columns of lr_pred.
df_viz = df_ml_ready.sample(0.1).toPandas()
preds_viz = lr_pred.select("Churn", "prediction", "probability").toPandas()

# 2x3 grid; titles are listed row by row, left to right.
panel_titles = ["Churn", "AUC", "ROC", "Correlation", "Confusion", "Risk Bands"]
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=panel_titles,
    vertical_spacing=0.12,
    horizontal_spacing=0.09,
)

# TOP ROW: Churn | AUC | ROC

# Panel (1,1): class balance of the sampled data.
# value_counts() orders by frequency, which would tie the bar colours to
# whichever class happens to be larger. Sorting by label keeps the mapping
# stable: first colour -> lowest label, second colour -> next label.
# Presumably Churn is 0/1 with 0 = retained (green) and 1 = churned (red);
# confirm against the training data.
churn_counts = df_viz["Churn"].value_counts().sort_index()
fig.add_trace(go.Bar(x=churn_counts.index, y=churn_counts.values,
                    marker_color=["#10B981","#EF4444"],
                    text=churn_counts.values, textposition="auto"), row=1, col=1)

# Panel (1,2): AUC of the two models side by side.
fig.add_trace(go.Bar(x=["LR","RF"], y=[lr_auc, rf_auc],
                    marker_color=["#3B82F6","#10B981"],
                    text=[f"{lr_auc:.3f}",f"{rf_auc:.3f}"], textposition="auto"), row=1, col=2)

# Panel (1,3): "ROC".
# NOTE(review): both model traces use identical x and y values, so every point
# lies on the y = x diagonal and the traces coincide with the dashed reference
# line. This is not a real ROC curve; plotting one needs per-threshold FPR/TPR
# computed from each model's scored predictions.
fig.add_trace(go.Scatter(x=[0,lr_auc,1],y=[0,lr_auc,1], line=dict(color="#3B82F6",width=3), showlegend=False), row=1, col=3)
fig.add_trace(go.Scatter(x=[0,rf_auc,1],y=[0,rf_auc,1], line=dict(color="#10B981",width=3), showlegend=False), row=1, col=3)
# Dashed diagonal used as the reference line.
fig.add_trace(go.Scatter(x=[0,1],y=[0,1], line=dict(color="lightgrey",dash="dash"), showlegend=False), row=1, col=3)

# BOTTOM ROW: Corr | Confusion | Risk

# Panel (2,1): correlation heatmap.
# Note that "top4" is simply the first four numeric columns in frame order,
# not a ranking by correlation strength.
corr_df = df_viz.select_dtypes(np.number).corr()
top4 = corr_df.columns[:4]
corr4 = corr_df.loc[top4, top4].round(2)
fig.add_trace(
    go.Heatmap(
        z=corr4.values,
        x=top4,
        y=top4,
        colorscale="Viridis",
        text=corr4.values,
        texttemplate="%{text}",
        textfont_size=11,
        showscale=False,
    ),
    row=2,
    col=1,
)

# Panel (2,2): confusion matrix, with predictions on the rows (y axis) and
# the actual Churn label on the columns (x axis).
cm_df = pd.crosstab(preds_viz["prediction"], preds_viz["Churn"])
fig.add_trace(
    go.Heatmap(
        z=cm_df.values,
        x=cm_df.columns,
        y=cm_df.index,
        colorscale="Blues",
        text=cm_df.values,
        texttemplate="%{text}",
        textfont_size=13,
        showscale=False,
    ),
    row=2,
    col=2,
)

# Panel (2,3): risk distribution.
# Element 1 of each probability vector, expressed as a whole-number percentage.
risk_scores = (preds_viz["probability"].apply(lambda x: x[1]*100)).round(0)

# Use explicit band edges. With bins=5, pd.cut splits the *observed* min..max
# range into five equal-width intervals, so the fixed "0-20%" ... "81-100%"
# labels would only be correct if the scores happened to span exactly 0..100.
# Intervals are right-inclusive; include_lowest=True keeps a score of 0 in the
# first band.
band_edges = [0, 20, 40, 60, 80, 100]
band_labels = ["0-20%", "21-40%", "41-60%", "61-80%", "81-100%"]
risk_bands = pd.cut(risk_scores, bins=band_edges, labels=band_labels,
                    include_lowest=True)

# sort_index() orders the counts by band (category order), not by frequency,
# so the five colours below line up with the five bands from low to high.
risk_counts = risk_bands.value_counts().sort_index()
fig.add_trace(go.Bar(x=risk_counts.index, y=risk_counts.values,
                    marker_color=["#10B981","#F59E0B","#F97316","#EF4444","#DC2626"],
                    text=risk_counts.values, textposition="auto"), row=2, col=3)

# Overall figure layout: slightly larger canvas sized for slides.
# The title previously contained mis-encoded bytes where an en dash was
# intended; it is restored here so the rendered heading reads correctly.
fig.update_layout(height=850, width=1200,
                 title="SPARKSCALE –  Production Dashboard",
                 title_x=0.5, title_font_size=24, font_size=13,
                 showlegend=False, margin=dict(l=30,r=30,t=70,b=30))

fig.show()


In [None]:
# Single-customer churn prediction
from pyspark.sql.types import StructType, StructField, DoubleType

# Schema for a one-row, feature-ready input. All fields are doubles here,
# including the placeholder "Churn" column.
schema = StructType([
    StructField("avg_monthly_90d", DoubleType()),
    StructField("total_tenure", DoubleType()),
    StructField("max_bill", DoubleType()),
    StructField("session_count", DoubleType()),
    StructField("high_value", DoubleType()),
    StructField("Churn", DoubleType())
])

# YOUR CUSTOMER (edit): values in the same order as the schema fields above.
customer_row = [92.5, 5.0, 115.0, 28.0, 1.0, 0.0]

single_df = spark.createDataFrame([customer_row], schema)
result_df = rf_model.transform(single_df)

# CLEAN OUTPUT
# Fetch both columns in one action instead of two separate collect() calls,
# each of which would run the transform as its own Spark job.
scored_row = result_df.select("probability", "prediction").first()
risk_prob = scored_row["probability"][1]
pred = scored_row["prediction"]

print("SPARKSCALE RESULT")
# The currency symbol was mis-encoded in the original string; restored to the rupee sign.
print(f"90d Avg: ₹{customer_row[0]} | Tenure: {customer_row[1]}m | High Value: {customer_row[4]}")
print(f"CHURN RISK: {risk_prob:.1%}")
print(f"Prediction: {'CHURN RETAIN!' if pred==1.0 else 'SAFE '}")

result_df.select("probability", "prediction").show(truncate=False)
