In [27]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, PCA, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture
from pyspark.ml.evaluation import ClusteringEvaluator, RegressionEvaluator
import findspark
from pyspark.sql.functions import col
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
findspark.init()

In [3]:
spark = SparkSession.builder.appName("Clustering").getOrCreate()

In [4]:
spark

In [5]:
df = spark.read.csv(r"C:\Users\Robyi\Documents\Data Science Dataset\boston.csv", header = True, inferSchema = True)

In [9]:
df.show()

+--------------------+-----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|                CRIM|   ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+--------------------+-----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|              0.7842|  0.0| 8.14|   0|0.538| 5.99| 81.7|4.2579|  4|307.0|   21.0|386.75|14.67|17.5|
|             0.08187|  0.0| 2.89|   0|0.445| 7.82| 36.9|3.4952|  2|276.0|   18.0|393.53| 3.57|43.8|
|             0.16211| 20.0| 6.96|   0|0.464| 6.24| 16.3| 4.429|  3|223.0|   18.6| 396.9| 6.59|25.2|
|0.035019999999999996| 80.0| 4.95|   0|0.411|6.861| 27.9|5.1167|  4|245.0|   19.2| 396.9| 3.33|28.5|
|             0.14455| 12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311.0|   15.2| 396.9|19.15|27.1|
|             1.38799|  0.0| 8.14|   0|0.538| 5.95| 82.0|  3.99|  4|307.0|   21.0| 232.6|27.71|13.2|
|             0.15445| 25.0| 5.13|   0|0.453|6.145| 29.2|7.8148|  8|284.0|   19.7|390.68| 6

In [7]:
df = df.dropDuplicates()

In [8]:
df = df.dropna(how="any")

In [14]:
numerical_columns = [
    "CRIM", "ZN", "INDUS","CHAS","NX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"]

In [15]:
for column in numerical_columns:

    quantiles = df.approxQuantile(column, [0.25, 0.75], 0.05)
    Q1, Q3 = quantiles
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df = df.filter((col(column) >= lower_bound) & (col(column) <= upper_bound))

In [16]:
df.show()

+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+
|                CRIM| ZN|INDUS|CHAS|   NX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+
|             0.62976|0.0| 8.14|   0|0.538|5.949|61.8|4.7075|  4|307.0|   21.0| 396.9| 8.26|20.4|
|             0.21977|0.0| 6.91|   0|0.448|5.602|62.0|6.0877|  3|233.0|   17.9| 396.9| 16.2|19.4|
|             0.19802|0.0|10.59|   0|0.489|6.182|42.4|3.9454|  4|277.0|   18.6|393.63| 9.47|25.0|
|0.030410000000000003|0.0| 5.19|   0|0.515|5.895|59.6| 5.615|  5|224.0|   20.2|394.81|10.56|18.5|
|             0.19539|0.0|10.81|   0|0.413|6.245| 6.2|5.2873|  4|305.0|   19.2|377.17| 7.54|23.4|
|0.037380000000000004|0.0| 5.19|   0|0.515| 6.31|38.5|6.4584|  5|224.0|   20.2| 389.4| 6.75|20.7|
|              0.1676|0.0| 7.38|   0|0.493|6.426|52.3|4.5404|  5|287.0|   19.6| 396.9|  7.2|23.8|
|             0.0298

In [17]:
assembler = VectorAssembler(inputCols=[
    "CRIM", "ZN", "INDUS","CHAS","NX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"], 
    outputCol="features")
df = assembler.transform(df)

In [18]:
MinMax = MinMaxScaler(inputCol="features", outputCol="features_scaled")
df = MinMax.fit(df).transform(df)

In [19]:
df.show()

+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+--------------------+--------------------+
|                CRIM| ZN|INDUS|CHAS|   NX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|            features|     features_scaled|
+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+--------------------+--------------------+
|             0.62976|0.0| 8.14|   0|0.538|5.949|61.8|4.7075|  4|307.0|   21.0| 396.9| 8.26|20.4|[0.62976,0.0,8.14...|[0.93435192972619...|
|             0.21977|0.0| 6.91|   0|0.448|5.602|62.0|6.0877|  3|233.0|   17.9| 396.9| 16.2|19.4|[0.21977,0.0,6.91...|[0.29579790050774...|
|             0.19802|0.0|10.59|   0|0.489|6.182|42.4|3.9454|  4|277.0|   18.6|393.63| 9.47|25.0|[0.19802,0.0,10.5...|[0.26192256175435...|
|0.030410000000000003|0.0| 5.19|   0|0.515|5.895|59.6| 5.615|  5|224.0|   20.2|394.81|10.56|18.5|[0.03041000000000...|[8.72192629972279...|
|             0.1953

In [21]:
evaluator = ClusteringEvaluator(featuresCol="features_scaled", metricName="silhouette")

In [22]:
for k in range(2, 10):
    kmeans = KMeans(k=k, seed=42)
    model = kmeans.fit(df)
    predictions = model.transform(df)
    
    silhouette = evaluator.evaluate(predictions)
    print(f"K: {k}, Silhouette Score: {silhouette}")

K: 2, Silhouette Score: 0.28914861046363655
K: 3, Silhouette Score: 0.19455624283645367
K: 4, Silhouette Score: 0.19161862356804352
K: 5, Silhouette Score: 0.19678415630060875
K: 6, Silhouette Score: 0.15521919923328034
K: 7, Silhouette Score: 0.10777001962623141
K: 8, Silhouette Score: 0.08291673269533589
K: 9, Silhouette Score: 0.07068593983021136


In [23]:
for k in range(2, 10):
    gmm = GaussianMixture(k=k, seed=42)
    model = gmm.fit(df)
    
    log_likelihood = model.summary.logLikelihood
    print(f"K: {k}, Log-Likelihood: {log_likelihood}")

K: 2, Log-Likelihood: -2005.9113549095578
K: 3, Log-Likelihood: -1578.6846012885217
K: 4, Log-Likelihood: -1193.1031636567566
K: 5, Log-Likelihood: -1508.9759605015636
K: 6, Log-Likelihood: -1664.1520859376515
K: 7, Log-Likelihood: -543.0716579887458
K: 8, Log-Likelihood: 775.1933461781736
K: 9, Log-Likelihood: 538.8490251290315


In [None]:
kmeans = KMeans(featuresCol="features_scaled", seed =42)

param_grid = ParamGridBuilder().addGrid(kmeans.k,(list(range(2,10)))).build()

crossval = CrossValidator(estimator=kmeans, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)
cv_model = crossval.fit(df)

best_k = cv_model.bestModel.summary.k
print(best_k)

In [None]:
pca = PCA(inputCol="features_scaled", outputCol="pca_features")

evaluator = RegressionEvaluator(labelCol="pca_features", metricName="r2")

param_grid = ParamGridBuilder().addGrid(pca.k, list(range(1, 10))).build()

crossval = CrossValidator(estimator=pca,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=3)

cv_model = crossval.fit(df)
best_n = cv_model.bestModel.getK()

print(f"Jumlah komponen PCA terbaik: {best_n}")

In [24]:
kmeans = KMeans(k = 2,featuresCol="features_scaled", seed=42)
kmeans_model = kmeans.fit(df)

df_kmeans = kmeans_model.transform(df)
df_kmeans.show()

+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+--------------------+--------------------+----------+
|                CRIM| ZN|INDUS|CHAS|   NX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|            features|     features_scaled|prediction|
+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+--------------------+--------------------+----------+
|             0.62976|0.0| 8.14|   0|0.538|5.949|61.8|4.7075|  4|307.0|   21.0| 396.9| 8.26|20.4|[0.62976,0.0,8.14...|[0.93435192972619...|         0|
|             0.21977|0.0| 6.91|   0|0.448|5.602|62.0|6.0877|  3|233.0|   17.9| 396.9| 16.2|19.4|[0.21977,0.0,6.91...|[0.29579790050774...|         1|
|             0.19802|0.0|10.59|   0|0.489|6.182|42.4|3.9454|  4|277.0|   18.6|393.63| 9.47|25.0|[0.19802,0.0,10.5...|[0.26192256175435...|         1|
|0.030410000000000003|0.0| 5.19|   0|0.515|5.895|59.6| 5.615|  5|224.0|   20.2|394.81|10.56|18

In [25]:
gmm = GaussianMixture(k = 8,featuresCol="features_scaled", seed=42)
gmm_model = gmm.fit(df)

df_gmm = gmm_model.transform(df)
df_gmm.show()

+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+--------------------+--------------------+--------------------+----------+
|                CRIM| ZN|INDUS|CHAS|   NX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|            features|     features_scaled|         probability|prediction|
+--------------------+---+-----+----+-----+-----+----+------+---+-----+-------+------+-----+----+--------------------+--------------------+--------------------+----------+
|             0.62976|0.0| 8.14|   0|0.538|5.949|61.8|4.7075|  4|307.0|   21.0| 396.9| 8.26|20.4|[0.62976,0.0,8.14...|[0.93435192972619...|[2.78569950852054...|         2|
|             0.21977|0.0| 6.91|   0|0.448|5.602|62.0|6.0877|  3|233.0|   17.9| 396.9| 16.2|19.4|[0.21977,0.0,6.91...|[0.29579790050774...|[1.19469473027134...|         5|
|             0.19802|0.0|10.59|   0|0.489|6.182|42.4|3.9454|  4|277.0|   18.6|393.63| 9.47|25.0|[0.19802,0.0,10.5...|[0.26192256175435...|[

In [26]:
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score: {silhouette:.4f}")

Silhouette Score: 0.0707


In [28]:
from pyspark.ml.feature import PCA

best_n = None
target_variance = 0.95
selected_n = None

for n in range(1, 10): 
    pca = PCA(k=n, inputCol="features_scaled", outputCol="pca_features")
    model = pca.fit(df)

    explained_variance = model.explainedVariance.sum()
    print(f"n: {n}, Explained Variance: {explained_variance}")

    if explained_variance >= target_variance:
        selected_n = n
        break 

print(f"Jumlah komponen PCA terbaik: {selected_n}")

n: 1, Explained Variance: 0.3393436101906492
n: 2, Explained Variance: 0.4883416800359247
n: 3, Explained Variance: 0.6048106908735508
n: 4, Explained Variance: 0.710989432768875
n: 5, Explained Variance: 0.7934402632813414
n: 6, Explained Variance: 0.866805416995867
n: 7, Explained Variance: 0.9100604021334169
n: 8, Explained Variance: 0.9423722690650482
n: 9, Explained Variance: 0.966993502530331
Jumlah komponen PCA terbaik: 9


In [29]:
pca = PCA(k=9, inputCol="features_scaled", outputCol="pca_features")
pca_model = pca.fit(df)

df_pca = pca_model.transform(df)

df_pca.select("pca_features").show()

+--------------------+
|        pca_features|
+--------------------+
|[-1.1522394750978...|
|[-0.5988605401915...|
|[-0.7121075032445...|
|[-0.9534365810806...|
|[-0.2611894173277...|
|[-0.6372368486260...|
|[-0.8605857274124...|
|[-0.0579792028613...|
|[-1.0894627031523...|
|[-1.2808950783335...|
|[-0.7431726418486...|
|[-0.5516567859162...|
|[-1.2141682363766...|
|[-0.0522792232206...|
|[-0.7393135494079...|
|[-1.5860151094695...|
|[-0.3168126584615...|
|[-1.7131028356626...|
|[-0.8930048960612...|
|[-1.5632728862878...|
+--------------------+
only showing top 20 rows

