In [0]:
# ------------------------------------
# Cell 1 — Configuration and Imports
# ------------------------------------
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
import pandas as pd
from imblearn.over_sampling import SMOTE
from pyspark.sql import functions as F

# Fully qualified name of the transformed training table.
TRAIN_TABLE = "kusha_solutions.telecom_churn_ml.train_transformed"

# Spark session: Databricks already provides one, in which case getOrCreate()
# returns the active session instead of building a new one.
spark = (
    SparkSession.builder
    .appName("SMOTE_Balancing")
    .getOrCreate()
)

# Load the transformed training dataset as a Spark DataFrame.
train_df = spark.table(TRAIN_TABLE)

print("✅ Training data loaded successfully.")


In [0]:
# ------------------------------------
# Cell 2 — Check Target Class Distribution
# ------------------------------------
# Row count per "Churn" value, largest class first. The aggregation is
# described once and rendered twice: as plain text via .show() and as a
# rich table via display().
churn_distribution = (
    train_df
    .groupBy("Churn")
    .count()
    .orderBy(F.col("count").desc())
)

churn_distribution.show()

display(churn_distribution)


In [0]:
# ------------------------------------
# Cell 3 — Encode Churn Column for SMOTE
# ------------------------------------
# This cell reassigns `train_df`, so after one run the frame already carries
# "Churn_index". StringIndexer refuses to transform a DataFrame whose output
# column already exists, which made a second run of this cell fail. Dropping
# the column first keeps the cell re-runnable; DataFrame.drop is a no-op when
# the column is not present, so the first run is unaffected.
train_df = train_df.drop("Churn_index")

# Map each distinct "Churn" label to a numeric index in "Churn_index".
# With StringIndexer's default ordering (frequencyDesc) the most frequent
# label receives index 0.0.
indexer = StringIndexer(inputCol="Churn", outputCol="Churn_index")
train_df = indexer.fit(train_df).transform(train_df)

# Show the label -> index mapping that was produced.
train_df.select("Churn", "Churn_index").distinct().show()
print("✅ Target column encoded successfully.")


In [0]:
# ------------------------------------
# Cell 4 — Convert to Pandas for SMOTE
# ------------------------------------
# Collect the Spark DataFrame into a pandas DataFrame.
train_pd = train_df.toPandas()

# Target is the indexed label; the feature frame is everything except the
# raw label and its index.
y = train_pd["Churn_index"]
X = train_pd.drop(columns=["Churn", "Churn_index"])

print("✅ Converted to Pandas for SMOTE application.")
print("Shape before SMOTE:", X.shape)


In [0]:
# ------------------------------------
# Cell 5 — Apply SMOTE (Fixed Version)
# ------------------------------------
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd

# Columns kept out of the feature matrix: the identifier, the target (raw and
# indexed), the listed categorical columns with their *_encoded counterparts,
# and the two num_features_* columns.
columns_to_exclude = [
    "customerID", "Churn", "Churn_index",
    "gender", "Partner", "Dependents", "InternetService",
    "Contract", "PaymentMethod",
    "gender_encoded", "Partner_encoded", "Dependents_encoded",
    "InternetService_encoded", "Contract_encoded", "PaymentMethod_encoded",
    "num_features_unscaled", "num_features_scaled",
]

# errors='ignore' means names missing from train_pd are skipped silently.
X = train_pd.drop(columns=columns_to_exclude, errors='ignore')
y = train_pd["Churn_index"]

# Any column still stored with object dtype is cast to str and replaced by
# integer label codes so that SMOTE receives a purely numeric matrix.
# NOTE(review): SMOTE interpolates between neighbours, so these codes are
# treated as continuous values — if real categorical columns end up here,
# consider SMOTENC instead; confirm against the actual schema.
string_columns = X.select_dtypes(include=['object']).columns
if not string_columns.empty:
    print("Encoding string columns before SMOTE:", list(string_columns))
    for column_name in string_columns:
        as_text = X[column_name].astype(str)
        X[column_name] = LabelEncoder().fit_transform(as_text)

# Print the dtypes so the result of the encoding step can be inspected.
print("✅ Data types after encoding:")
print(X.dtypes)

# Oversample with SMOTE; the fixed random_state keeps the result reproducible.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("✅ SMOTE applied successfully.")
class_counts_before = y.value_counts().to_dict()
print("Before SMOTE:", class_counts_before)
class_counts_after = pd.Series(y_resampled).value_counts().to_dict()
print("After SMOTE:", class_counts_after)



In [0]:
# ------------------------------------
# Cell 6 — Convert Back to Spark
# ------------------------------------
# Put the resampled features and the resampled target side by side
# (column-wise), then hand the combined pandas frame back to Spark.
resampled_parts = [X_resampled, y_resampled]
balanced_pd = pd.concat(resampled_parts, axis=1)
balanced_df = spark.createDataFrame(balanced_pd)

print("✅ Converted balanced dataset back to Spark DataFrame.")

# Per-class row counts of the balanced Spark DataFrame.
(
    balanced_df
    .groupBy("Churn_index")
    .count()
    .show()
)


In [0]:
# ------------------------------------
# Cell 7 — Verify Balanced Spark DataFrame
# ------------------------------------
# This cell used to be an exact copy of Cell 6, which rebuilt `balanced_pd`
# and re-ran the pandas -> Spark conversion for no benefit. It now reuses the
# objects created by Cell 6 and only checks them.

# The conversion must neither lose nor duplicate rows.
expected_rows = len(balanced_pd)
actual_rows = balanced_df.count()
assert actual_rows == expected_rows, (
    f"Row count changed during pandas -> Spark conversion: "
    f"expected {expected_rows}, got {actual_rows}"
)

print(f"✅ Balanced Spark DataFrame verified: {actual_rows} rows.")

# Per-class row counts of the balanced dataset.
balanced_df.groupBy("Churn_index").count().show()
