<a href="https://colab.research.google.com/github/HellxAngel69/Project-Scalable-and-Distributed-Computing-Skills/blob/main/Project%20Scalable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1 - Import & Load data (Pandas)**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Direct-download link to the CSV file hosted on Google Drive.
url = "https://drive.google.com/uc?export=download&id=1d4GYM9oYttQnwK2L8lBgKaG0wFx3FB0X"

# Read the remote CSV into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Last expression of the cell: the notebook renders the first five rows.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


**2 - Create Spark Session, Convert Pandas → Spark DataFrame and Explore dataset**

In [None]:
from pyspark.sql import SparkSession

# Reuse an already running session if there is one, otherwise start a new
# one under this application name.
spark = (
    SparkSession.builder
    .appName("CustomerData-DataFrame")
    .getOrCreate()
)

# Convert the pandas DataFrame loaded earlier into a Spark DataFrame and
# preview it.
spark_df = spark.createDataFrame(df)
spark_df.show(n=5)

# Column names and the types Spark derived for them.
spark_df.printSchema()

# Size of the dataset: number of rows and number of columns.
n_rows = spark_df.count()
n_cols = len(spark_df.columns)
print("Rows:", n_rows)
print("Columns:", n_cols)

# Summary statistics table produced by DataFrame.describe().
summary_df = spark_df.describe()
summary_df.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|          No|No phone service|            DSL|            No|         Yes|              No|         No|    

**3 - Check and handle missing values**


In [None]:
from pyspark.sql.functions import avg, col, trim, when
# Spark's aggregate `sum` is imported under an alias so that Python's builtin
# sum() is not shadowed for the rest of the notebook.
from pyspark.sql.functions import sum as spark_sum

# Treat blank / whitespace-only TotalCharges entries as missing (null) and
# convert the column to double so it can be averaged and imputed below.
spark_df = spark_df.withColumn(
    "TotalCharges",
    when(trim(col("TotalCharges")) == "", None)
    .otherwise(col("TotalCharges"))
    .cast("double")
)

# Show a single row holding the number of null values in every column.
spark_df.select([
    spark_sum(col(c).isNull().cast("int")).alias(c)
    for c in spark_df.columns
]).show()

# Compute the three column means in one aggregation (a single Spark job)
# instead of one job per column. The returned Row unpacks like a tuple, in
# the same order as the select list.
mean_tenure, mean_monthly, mean_total = spark_df.select(
    avg("tenure"),
    avg("MonthlyCharges"),
    avg("TotalCharges"),
).first()

# Mean imputation: replace missing values in each numeric column with that
# column's mean.
# NOTE(review): if tenure is an integer column, Spark may convert the
# fractional mean to the column's type when filling - confirm this is intended.
df_clean = spark_df.fillna({
    "tenure": mean_tenure,
    "MonthlyCharges": mean_monthly,
    "TotalCharges": mean_total
})

# Preview the imputed numeric columns.
df_clean.select("tenure", "MonthlyCharges", "TotalCharges").show(5)


+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

**7 - Encode categorical variables**



In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Step 1: index the string-valued "gender" column into "gender_index".
gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
gender_indexer_model = gender_indexer.fit(df_clean)
df_indexed = gender_indexer_model.transform(df_clean)

# Step 2: one-hot encode the index column into the vector column "gender_vec".
gender_encoder = OneHotEncoder(inputCol="gender_index", outputCol="gender_vec")
gender_encoder_model = gender_encoder.fit(df_indexed)
df_encoded = gender_encoder_model.transform(df_indexed)


**8 - Assemble & Scale numerical features**



In [None]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Combine the three numeric columns into one vector column.
numeric_columns = ["tenure", "MonthlyCharges", "TotalCharges"]
assembler = VectorAssembler(
    inputCols=numeric_columns,
    outputCol="numerical_features",
)
df_features = assembler.transform(df_encoded)
df_features.select("numerical_features").show(5)

# Standardise the assembled vector; both centring (withMean) and scaling
# (withStd) are switched on.
scaler = StandardScaler(
    inputCol="numerical_features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df_features)
df_final = scaler_model.transform(df_features)

# Preview the scaled numeric vector next to the encoded gender vector.
df_final.select("scaled_features", "gender_vec").show(5)


+-------------------+
| numerical_features|
+-------------------+
|  [1.0,29.85,29.85]|
|[34.0,56.95,1889.5]|
| [2.0,53.85,108.15]|
|[45.0,42.3,1840.75]|
|  [2.0,70.7,151.65]|
+-------------------+
only showing top 5 rows
+--------------------+-------------+
|     scaled_features|   gender_vec|
+--------------------+-------------+
|[-1.2773553380283...|    (1,[],[])|
|[0.06633494828367...|(1,[0],[1.0])|
|[-1.2366374505643...|(1,[0],[1.0])|
|[0.51423171038768...|(1,[0],[1.0])|
|[-1.2366374505643...|    (1,[],[])|
+--------------------+-------------+
only showing top 5 rows
