In [0]:

# Cell 1 - Load Cleaned Data
from pyspark.sql import SparkSession, functions as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Get (or reuse) the Spark session and read the cleaned training table that
# the rest of the notebook analyses.
spark = SparkSession.builder.appName("Telecom_EDA").getOrCreate()

df = spark.table("kusha_solutions.telecom_churn_ml.telecom_train_clean")

# The check mark is written as an escape so the message cannot be garbled by a
# wrong file encoding again (it had been stored as mis-decoded UTF-8 bytes).
print("\u2705 Data loaded successfully")
# df.count() scans the table; len(df.columns) only reads the schema.
print(f"Rows: {df.count()} | Columns: {len(df.columns)}")


In [0]:
# Cell 2 - Basic Info
# Print the schema, then split the columns into the numeric and categorical
# groups that the later cells iterate over.
print("Data types and column overview:")
for c, t in df.dtypes:
    print(f"{c:25} {t}")

# Type names as reported by df.dtypes. 'tinyint' and 'smallint' are included so
# that narrow integer columns are not silently dropped from the numeric group.
# Decimal columns are intentionally left out: toPandas() returns them as Python
# Decimal objects, which the plotting/correlation cells do not treat as numeric.
numeric_types = {'tinyint', 'smallint', 'int', 'bigint', 'float', 'double'}

# SeniorCitizen is always routed to the categorical group, whatever its dtype
# (presumably a 0/1 flag - confirm). Churn is kept out of the categorical list
# because later cells use it as the grouping/target column.
numeric_cols = [c for c, t in df.dtypes if t in numeric_types and c != 'SeniorCitizen']
categorical_cols = [c for c, t in df.dtypes if (t == 'string' and c != 'Churn') or c == 'SeniorCitizen']

print("\nNumeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

# Report columns that landed in neither group (e.g. boolean, decimal, date) so
# their absence from the analysis below is visible rather than silent.
unclassified = [
    f"{c} ({t})"
    for c, t in df.dtypes
    if c != 'Churn' and c not in numeric_cols and c not in categorical_cols
]
if unclassified:
    print("Unclassified columns (excluded from the EDA below):", unclassified)

In [0]:
# Cell 3 - Numeric Summary
# Descriptive statistics for the numeric columns, computed in Spark and shown
# as a table.
numeric_only = df.select(*numeric_cols)
numeric_summary = numeric_only.summary()
display(numeric_summary)


In [0]:
# Cell 4 - Target Distribution
# Count rows per Churn value in Spark; only the small aggregated result is
# pulled into pandas.
churn_df = df.groupBy("Churn").count().toPandas()

# Share of each Churn value, in percent of all rows.
total_rows = churn_df["count"].sum()
churn_df["percentage"] = churn_df["count"] / total_rows * 100
display(churn_df)

# Bar chart of the absolute counts per Churn value.
fig, ax = plt.subplots(figsize=(5, 4))
sns.barplot(data=churn_df, x="Churn", y="count", ax=ax)
ax.set_title("Churn Distribution")
plt.show()


In [0]:
# Cell 5 - Numeric Distributions
# Collect the numeric columns to the driver as a pandas frame; `pdf` is reused
# by the boxplot cell that follows.
numeric_sdf = df.select(*numeric_cols)
pdf = numeric_sdf.toPandas()

# One 20-bin histogram per numeric column on a shared figure.
pdf.hist(figsize=(14, 8), bins=20)
plt.suptitle("Numeric Feature Distributions")
plt.show()


In [0]:
# Cell 6 - Boxplots
# One horizontal boxplot per numeric column, drawn from the pandas frame `pdf`
# built in the numeric-distributions cell.
for feature in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=pdf[feature], ax=ax)
    ax.set_title(f"Boxplot for {feature}")
    plt.show()


In [0]:
# Cell 7 - Categorical Frequencies
# For every categorical column, plot the ten most frequent values. The
# aggregation runs in Spark; only the top-10 rows are converted to pandas.
for cat_col in categorical_cols:
    value_counts = df.groupBy(cat_col).count()
    top10_pdf = (
        value_counts
        .orderBy(F.col("count").desc())
        .limit(10)
        .toPandas()
    )

    plt.figure(figsize=(6, 3))
    sns.barplot(data=top10_pdf, x=cat_col, y="count")
    plt.title(f"Top categories for {cat_col}")
    # Tilt the labels so long category names do not overlap.
    plt.xticks(rotation=45)
    plt.show()


In [0]:
# Cell 8 - Numeric vs Churn
# Pull the numeric columns together with Churn into pandas, then draw one
# boxplot per numeric column split by Churn value.
num_pdf = df.select(*numeric_cols, "Churn").toPandas()

for feature in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=num_pdf, x="Churn", y=feature, ax=ax)
    ax.set_title(f"{feature} vs Churn")
    plt.show()


In [0]:
# Cell 9 - Categorical vs Churn
# For each categorical column, compute the share of every Churn value within
# each category and plot it as a grouped bar chart.

from pyspark.sql.window import Window

# Result tables with this many (category, Churn) rows or more are not plotted;
# the chart would be too crowded to read.
max_plot_rows = 30

for c in categorical_cols:
    # count      = rows per (category, Churn) pair
    # total      = rows per category (window sum over the pair counts)
    # percentage = the pair's share of its category, in percent
    temp = (df.groupBy(c, "Churn").count()
              .withColumn("total", F.sum("count").over(Window.partitionBy(c)))
              .withColumn("percentage", F.col("count")/F.col("total")*100)
              .orderBy(c))
    temp_pdf = temp.toPandas()
    if len(temp_pdf) < max_plot_rows:
        plt.figure(figsize=(8,4))
        sns.barplot(x=c, y="percentage", hue="Churn", data=temp_pdf)
        plt.title(f"Churn % by {c}")
        plt.xticks(rotation=45)
        plt.show()
    else:
        # Say so explicitly instead of silently omitting the column.
        print(f"Skipping plot for {c}: {len(temp_pdf)} (category, Churn) rows "
              f"(limit is {max_plot_rows})")



In [0]:
# Cell 10 - Correlation Matrix
# Pairwise correlations between the numeric columns, computed in pandas with
# its default method and rendered as an annotated heatmap.
numeric_pdf = df.select(*numeric_cols).toPandas()
corr = numeric_pdf.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()
