In [0]:
# ------------------------------------
# Cell 1 — Configuration & Imports
# ------------------------------------

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

# Obtain the Spark session: getOrCreate() reuses an active session if there is
# one (Databricks already provides it), otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName("TelecomChurn_Transformation")
    .getOrCreate()
)

# Fully qualified names of the cleaned source tables
train_table = "kusha_solutions.telecom_churn_ml.telecom_train_clean"
test_table = "kusha_solutions.telecom_churn_ml.telecom_test_clean"

# Bind each table to a DataFrame (train first, then test)
train_df, test_df = (spark.table(name) for name in (train_table, test_table))

print("✅ Cleaned Train & Test tables loaded successfully.")

# Preview the first five training rows
display(train_df.limit(5))


In [0]:
# NOTE(review): this cell previously contained only a bare `c`, which raises
# NameError on Restart & Run All. The cell headers jump from "Cell 1" to
# "Cell 3", so the original "Cell 2" content appears to be missing — restore
# it here if it is still needed, otherwise delete this cell.

In [0]:
# ------------------------------------
# Cell 3 — Feature Scaling (Standardization)
# ------------------------------------
# Packs the numeric columns into a single vector column and standardizes it
# (withMean=True, withStd=True). The scaler is fitted on the training split
# only and the same fitted model is applied to both train and test.

numeric_cols = ['tenure', 'MonthlyCharges_log', 'TotalCharges_log']

# Columns produced by this cell. They are dropped up front so the cell can be
# re-run: train_df / test_df are reassigned below, and on a second run the
# assembler would otherwise fail because its output column already exists.
# DataFrame.drop ignores names that are absent, so a first run is unaffected.
scaling_output_cols = ['num_features_unscaled', 'num_features_scaled']
train_df = train_df.drop(*scaling_output_cols)
test_df = test_df.drop(*scaling_output_cols)

# Step 1: Assemble features into vector (each split assembled once and reused)
assembler = VectorAssembler(inputCols=numeric_cols, outputCol='num_features_unscaled')
train_assembled = assembler.transform(train_df)
test_assembled = assembler.transform(test_df)

# Step 2: Fit scaler on train only, so no test statistics enter the scaler
scaler = StandardScaler(
    inputCol='num_features_unscaled',
    outputCol='num_features_scaled',
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(train_assembled)

# Step 3: Apply same scaler to train and test
train_df = scaler_model.transform(train_assembled)
test_df = scaler_model.transform(test_assembled)

print("✅ Standard scaling applied to numeric columns.")
display(train_df.select("tenure", "MonthlyCharges_log", "TotalCharges_log", "num_features_scaled").limit(5))


In [0]:
# ------------------------------------
# Cell 4 — Encode Categorical Columns
# ------------------------------------
# Index each categorical column with StringIndexer, then one-hot encode the
# indices. Both stages are fitted on the training split only and the fitted
# models are applied to train and test alike.

cat_cols = ['gender', 'Partner', 'Dependents', 'InternetService', 'Contract', 'PaymentMethod']
index_cols = [f"{c}_index" for c in cat_cols]
encoded_cols = [f"{c}_encoded" for c in cat_cols]

# Drop this cell's own outputs first so the cell can be re-run: train_df /
# test_df are reassigned below, and a second run would otherwise fail because
# the output columns already exist. DataFrame.drop ignores absent names, so a
# first run is unaffected.
train_df = train_df.drop(*index_cols, *encoded_cols)
test_df = test_df.drop(*index_cols, *encoded_cols)

# One multi-column StringIndexer instead of one fit per column.
# handleInvalid="keep" puts labels unseen during fit into an extra index
# rather than raising when the test split is transformed.
indexer = StringIndexer(inputCols=cat_cols, outputCols=index_cols, handleInvalid="keep")
index_model = indexer.fit(train_df)
train_df = index_model.transform(train_df)
test_df = index_model.transform(test_df)

# One-hot encode all index columns with a single encoder fitted on train
encoder = OneHotEncoder(inputCols=index_cols, outputCols=encoded_cols)
encoder_model = encoder.fit(train_df)
train_df = encoder_model.transform(train_df)
test_df = encoder_model.transform(test_df)

print("✅ Categorical columns encoded successfully.")
display(train_df.select(encoded_cols).limit(3))


In [0]:
# ------------------------------------
# Cell 6 — Schema Consistency Check
# ------------------------------------
# Compares the column-name sets of the two DataFrames and reports any name
# present in one but not the other. Only names are compared, not data types.

train_cols, test_cols = set(train_df.columns), set(test_df.columns)

# Names found on one side only
missing_in_test = train_cols.difference(test_cols)
missing_in_train = test_cols.difference(train_cols)

print("Columns missing in Test:", missing_in_test)
print("Columns missing in Train:", missing_in_train)

# Any one-sided column means the schemas differ; this only reports, it does
# not stop the notebook.
if missing_in_test or missing_in_train:
    print("⚠️ Schema mismatch detected. Please align manually.")
else:
    print("✅ Train & Test schema are consistent.")


In [0]:
# ------------------------------------
# Cell 7 — Save Transformed Delta Tables
# ------------------------------------
# Writes both transformed DataFrames as Delta tables, train first and then
# test. mode("overwrite") replaces whatever the target table currently holds.

for _frame, _target in (
    (train_df, "kusha_solutions.telecom_churn_ml.train_transformed"),
    (test_df, "kusha_solutions.telecom_churn_ml.test_transformed"),
):
    _writer = _frame.write.format("delta").mode("overwrite")
    _writer.saveAsTable(_target)

print("✅ Transformed Train & Test tables stored successfully in Delta Lake.")
