In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from pyspark.sql import SparkSession

In [3]:
spark.sparkContext.setLogLevel("ERROR")


In [4]:
spark = SparkSession.builder.appName("KMeansexample").getOrCreate()

In [5]:
print(spark.version)


3.5.3


In [None]:
data = [(1, "foo"), (2, "bar"), (3, "baz")]
df = spark.createDataFrame(data, ["id", "value"])
df.show()

# checking computational capacity

In [9]:
import pandas as pd
import numpy as np
import time

# Generate a large datase
data = np.random.rand(1_100_000_000, 1)
df = pd.DataFrame(data, columns=["value"])

# Measure time
start_time = time.time()
sum_result = df["value"].sum()
end_time = time.time()

print("Without Spark:")
print(f"Sum: {sum_result}")
print(f"Time taken: {end_time - start_time:.2f} seconds")


Without Spark:
Sum: 550013308.2265407
Time taken: 116.92 seconds


In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum
import time

# Generate a large dataset
data = [(float(i),) for i in np.random.rand(1_1_000_000)]
spark_df = spark.createDataFrame(data, ["value"])

# Measure time
start_time = time.time()
sum_result = spark_df.select(spark_sum("value")).collect()[0][0]
end_time = time.time()

print("With Spark:")
print(f"Sum: {sum_result}")
print(f"Time taken: {end_time - start_time:.2f} seconds")


[Stage 3:>                                                          (0 + 8) / 8]

With Spark:
Sum: 5497746.262147221
Time taken: 4.25 seconds


                                                                                

In [None]:
df = pd.read_csv("smoking_driking_dataset_Ver01.csv.gz")

In [None]:
df

# Data Preparation
## Check for Missing Values

In [None]:
df.isnull().sum()

## Convert Object Data types into Numerical Data types

In [None]:
if "sex" in df.columns:
    one_hot = pd.get_dummies(df["sex"], prefix = 'Is_')
    df_ = pd.concat([df, one_hot], axis = 1)
    df = df_.drop(["sex"], axis = 1)

In [None]:
df

In [None]:
if "DRK_YN" in df.columns:    
    df = df.drop(["DRK_YN"], axis = 1)

    

## Standardising

In [None]:
pipeline = Pipeline([
    ('standardiser', StandardScaler())
])

In [None]:
Standard_df = pipeline.fit_transform(df)

In [None]:
Standard_df

# Model

## Rough Model

In [None]:
kmeans = KMeans(n_clusters =5,  n_init = 10, random_state = 42)
kmeans_pred = kmeans.fit_predict(Standard_df)
wcss = kmeans.inertia_

In [None]:
centroids = kmeans.cluster_centers_

In [None]:
plt.figure(figsize = (8,10))

plt.scatter(Standard_df[:,-1], Standard_df[:, -2], c = kmeans_pred, cmap = "viridis", alpha = 0.6)
plt.scatter(centroids[:,0], centroids[:, 1], c = "red", marker = 'X')

### The graph seems cluttered so Decided to try plot with the sample.

In [None]:
np.random.seed(42)
sample_indices = np.random.choice(Standard_df.shape[0], replace = False, size = 1000)
sample_df = Standard_df[sample_indices]
sample_preds = kmeans_pred[sample_indices]

In [None]:
plt.figure(figsize = (8,10))

plt.scatter(sample_df[:,2], sample_df[:, 8], c = sample_preds, cmap = "viridis", alpha = 0.6)
plt.scatter(centroids[:,0], centroids[:, 1], c = "red", marker = 'X')


In [None]:
silhouette_score(Standard_df, kmeans_pred)