# Code Sample

In [1]:
import sys

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, col
from pyspark.sql.functions import avg, count
from pyspark.sql.types import IntegerType

## Initialize Spark Session

In [2]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("TaxiDataAnalysis")
    .getOrCreate()
)
# Only surface ERROR-level messages so progress prints stay readable.
spark.sparkContext.setLogLevel("ERROR")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/25 16:14:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Set I/O

In [3]:
# Set I/O
# local_test=True reads the bundled sample; local_test=False takes the input
# CSV and the output folder from the command line (sys.argv[1], sys.argv[2]).
local_test = True
if local_test:
    input_file_path = 'sample/sample.csv'
    output_folder_path = 'output'
else:
    # Fail with a usage message instead of a bare IndexError when the
    # two positional arguments are missing.
    if len(sys.argv) < 3:
        sys.exit("Usage: <script> <input_file_path> <output_folder_path>")
    input_file_path = sys.argv[1]
    output_folder_path = sys.argv[2]

# One read call for both modes so the CSV options cannot drift apart.
df = spark.read.csv(input_file_path, sep=',', header=True, inferSchema=True)

print(df.schema)




StructType([StructField('medallion', StringType(), True), StructField('hack_license', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropoff_datetime', TimestampType(), True), StructField('trip_time_in_secs', IntegerType(), True), StructField('trip_distance', DoubleType(), True), StructField('pickup_longitude', DoubleType(), True), StructField('pickup_latitude', DoubleType(), True), StructField('dropoff_longitude', DoubleType(), True), StructField('dropoff_latitude', DoubleType(), True), StructField('payment_type', StringType(), True), StructField('fare_amount', DoubleType(), True), StructField('surcharge', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('total_amount', DoubleType(), True)])


                                                                                

## Data cleaning

In [4]:
# Data cleaning
# Keep only trips with a positive fare, distance and duration.
df_filtered = df.filter((col('fare_amount') > 0) &
                        (col('trip_distance') > 0) &
                        (col('trip_time_in_secs') > 0))

# Drop trips where any pickup/dropoff coordinate is exactly 0.
df = df_filtered.filter((col('pickup_longitude') != 0) &
                        (col('pickup_latitude') != 0) &
                        (col('dropoff_longitude') != 0) &
                        (col('dropoff_latitude') != 0))

# Keep only trips whose pickup AND dropoff fall inside a coarse bounding box
# around New York (both bounds are inclusive).
nyc_latitude_bounds = (39, 42)
nyc_longitude_bounds = (-76, -72)

df = df.filter(
    col('pickup_longitude').between(*nyc_longitude_bounds) &
    col('pickup_latitude').between(*nyc_latitude_bounds) &
    col('dropoff_longitude').between(*nyc_longitude_bounds) &
    col('dropoff_latitude').between(*nyc_latitude_bounds)
)

# DataFrame.show() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df.show(5)
print("*** Filter finished ***")

+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|20A85CDF09AC1BE3A...|6AC3258720B5F6600...|2013-06-01 00:00:00|2013-06-01 00:06:00|              360|         0.99|      -74.008789|      40.704243|       -74.017204|       40.708412|         CRD|        6.0|

## Model

In [5]:
# Clustering configuration
SLOT_HOURS = 2                            # width of one time slot, in hours
NUM_TIME_SLOTS = 24 // SLOT_HOURS         # 12 slots per day
SLOT_SECONDS = SLOT_HOURS * 60.0 * 60.0   # slot length used for order density
K_CANDIDATES = range(7, 21)               # candidate k values: 7..20 inclusive
KMEANS_SEED = 1                           # fixed seed so runs are repeatable

# Add time_slot column: pickup hour 0-23 mapped to slot 0-11
df = df.withColumn("time_slot", (hour(col("pickup_datetime")) / SLOT_HOURS).cast(IntegerType()))
print("*** Time Slot Added ***")

# Prepare the features for KMeans (clustering is on pickup location only)
vecAssembler = VectorAssembler(inputCols=["pickup_longitude", "pickup_latitude"], outputCol="features")

# Evaluator for computing the silhouette score of each candidate model
evaluator = ClusteringEvaluator(predictionCol="cluster", featuresCol="features")

# Collected across all time slots
best_ks = []        # (time_slot, best_k)
cluster_stats = []  # one tuple per (time_slot, cluster), see stats_schema below

for time_slot in range(NUM_TIME_SLOTS):
    df_slot = vecAssembler.transform(df.filter(col("time_slot") == time_slot)).cache()
    try:
        # count() also materializes the cache before the repeated fits below.
        if df_slot.count() == 0:
            print(f"*** No trips in Time Slot {time_slot}; skipped ***")
            continue

        silhouette_scores = []
        best_k, best_silhouette, best_model = None, float("-inf"), None

        # Fit KMeans for every candidate k and keep the model with the highest
        # silhouette; strict '>' keeps the first k on ties.
        for k in K_CANDIDATES:
            model = KMeans(k=k, seed=KMEANS_SEED, featuresCol="features", predictionCol="cluster").fit(df_slot)
            predictions = model.transform(df_slot)
            silhouette = evaluator.evaluate(predictions)
            silhouette_scores.append((k, silhouette))
            print(f"Time Slot {time_slot}: K = {k} and the silhouette = {silhouette}")

            if best_model is None or silhouette > best_silhouette:
                best_k, best_silhouette, best_model = k, silhouette, model

        best_ks.append((time_slot, best_k))
        print(f"*** The Best K is {best_k} for Time Slot {time_slot} ***")

        # Reuse the winning model instead of refitting with the same k and seed.
        best_predictions = best_model.transform(df_slot)
        centroids = best_model.clusterCenters()

        # One aggregation job for all clusters. Iterating over the clusters that
        # actually received points (rather than range(best_k)) stays correct when
        # KMeans returns fewer than k clusters or a cluster ends up empty.
        slot_rows = (
            best_predictions
            .withColumn("tip_percentage", col("tip_amount") / (col("total_amount") - col("tip_amount")))
            .groupBy("cluster")
            .agg(
                count("*").alias("orders"),
                avg("fare_amount").alias("avg_unit_price"),
                avg("tip_amount").alias("avg_tips"),
                avg("tip_percentage").alias("percentage_tips"),
            )
            .orderBy("cluster")
            .collect()
        )

        for row in slot_rows:
            cluster = row["cluster"]

            # Features were assembled as [longitude, latitude]
            centroid_longitude, centroid_latitude = centroids[cluster]

            # Orders per second of slot length
            order_density = row["orders"] / SLOT_SECONDS

            cluster_stats.append((time_slot, cluster, float(centroid_latitude), float(centroid_longitude),
                                  order_density, row["avg_unit_price"], row["avg_tips"],
                                  row["percentage_tips"]))

            print(f"*** Cluster {cluster} Statistics for Time SLot {time_slot} Calculated***")
    finally:
        # Release the cached slot even if fitting or evaluation failed.
        df_slot.unpersist()

print("*** Cluster Statistics for Time Slot Calculated ***")

# Convert the best k's to a DataFrame
best_ks_df = spark.createDataFrame(best_ks, schema=["Time_slot", "Best_K"])

# Convert the cluster stats to a DataFrame
stats_schema = ["Time_slot", "Cluster number", "Centroid Latitude", "Centroid Longitude", "Order density",
                "Average unit price per customer", "Average Tips per customer", "Percentage of tips"]

stats_df = spark.createDataFrame(cluster_stats, schema=stats_schema)

# Best k for each time slot
best_ks_df.show()

# Final result
stats_df.show()


*** Time Slot Added ***


                                                                                

Time Slot 0: K = 7 and the silhouette = 0.5064992199093005
Time Slot 0: K = 8 and the silhouette = 0.545110305875861
Time Slot 0: K = 9 and the silhouette = 0.5201905050131413
Time Slot 0: K = 10 and the silhouette = 0.5583138945494259
Time Slot 0: K = 11 and the silhouette = 0.5404954248364835
Time Slot 0: K = 12 and the silhouette = 0.5539472850845616


                                                                                

Time Slot 0: K = 13 and the silhouette = 0.5964565257829996
Time Slot 0: K = 14 and the silhouette = 0.5497093541377305


                                                                                

Time Slot 0: K = 15 and the silhouette = 0.5310592292533893
Time Slot 0: K = 16 and the silhouette = 0.5595477249288543


                                                                                

Time Slot 0: K = 17 and the silhouette = 0.5836019155143297


                                                                                

Time Slot 0: K = 18 and the silhouette = 0.5820227259053611


                                                                                

Time Slot 0: K = 19 and the silhouette = 0.556886959568952


                                                                                

Time Slot 0: K = 20 and the silhouette = 0.5622697481657748
*** The Best K is 13 for Time Slot 0 ***
*** Cluster 0 Statistics for Time SLot 0 Calculated***
*** Cluster 1 Statistics for Time SLot 0 Calculated***
*** Cluster 2 Statistics for Time SLot 0 Calculated***
*** Cluster 3 Statistics for Time SLot 0 Calculated***
*** Cluster 4 Statistics for Time SLot 0 Calculated***
*** Cluster 5 Statistics for Time SLot 0 Calculated***
*** Cluster 6 Statistics for Time SLot 0 Calculated***
*** Cluster 7 Statistics for Time SLot 0 Calculated***
*** Cluster 8 Statistics for Time SLot 0 Calculated***
*** Cluster 9 Statistics for Time SLot 0 Calculated***
*** Cluster 10 Statistics for Time SLot 0 Calculated***
*** Cluster 11 Statistics for Time SLot 0 Calculated***
*** Cluster 12 Statistics for Time SLot 0 Calculated***


                                                                                

Time Slot 1: K = 7 and the silhouette = 0.5458526787055832
Time Slot 1: K = 8 and the silhouette = 0.5267279486739135
Time Slot 1: K = 9 and the silhouette = 0.5700956036726145
Time Slot 1: K = 10 and the silhouette = 0.5609225201993941
Time Slot 1: K = 11 and the silhouette = 0.5631050062071224
Time Slot 1: K = 12 and the silhouette = 0.5583691489983433
Time Slot 1: K = 13 and the silhouette = 0.533829527132829
Time Slot 1: K = 14 and the silhouette = 0.5860250614006768
Time Slot 1: K = 15 and the silhouette = 0.5912952363780066
Time Slot 1: K = 16 and the silhouette = 0.5848407538603719
Time Slot 1: K = 17 and the silhouette = 0.6139256829996882
Time Slot 1: K = 18 and the silhouette = 0.6268941767779492
Time Slot 1: K = 19 and the silhouette = 0.5665888312858921
Time Slot 1: K = 20 and the silhouette = 0.5568346308006605
*** The Best K is 18 for Time Slot 1 ***
*** Cluster 0 Statistics for Time SLot 1 Calculated***
*** Cluster 1 Statistics for Time SLot 1 Calculated***
*** Cluster 2

                                                                                

Time Slot 2: K = 7 and the silhouette = 0.5139345275149714
Time Slot 2: K = 8 and the silhouette = 0.5153784122507269
Time Slot 2: K = 9 and the silhouette = 0.5065896702773712
Time Slot 2: K = 10 and the silhouette = 0.5519785666341938
Time Slot 2: K = 11 and the silhouette = 0.542119915762901
Time Slot 2: K = 12 and the silhouette = 0.5372458929807005
Time Slot 2: K = 13 and the silhouette = 0.5748122064784568
Time Slot 2: K = 14 and the silhouette = 0.59404356340654
Time Slot 2: K = 15 and the silhouette = 0.6034671676712564
Time Slot 2: K = 16 and the silhouette = 0.6047050632280873
Time Slot 2: K = 17 and the silhouette = 0.6096987063283912
Time Slot 2: K = 18 and the silhouette = 0.5532469079604935
Time Slot 2: K = 19 and the silhouette = 0.5481129313352429
Time Slot 2: K = 20 and the silhouette = 0.5708027582001872
*** The Best K is 17 for Time Slot 2 ***
*** Cluster 0 Statistics for Time SLot 2 Calculated***
*** Cluster 1 Statistics for Time SLot 2 Calculated***
*** Cluster 2 S

                                                                                

Time Slot 3: K = 7 and the silhouette = 0.5328604394325797
Time Slot 3: K = 8 and the silhouette = 0.5007148128417791
Time Slot 3: K = 9 and the silhouette = 0.5103206325075648
Time Slot 3: K = 10 and the silhouette = 0.5491638727877679
Time Slot 3: K = 11 and the silhouette = 0.532485637807516
Time Slot 3: K = 12 and the silhouette = 0.5504990383949102
Time Slot 3: K = 13 and the silhouette = 0.569262577419168
Time Slot 3: K = 14 and the silhouette = 0.5686640041070887
Time Slot 3: K = 15 and the silhouette = 0.5888833647120961
Time Slot 3: K = 16 and the silhouette = 0.5587761799984144
Time Slot 3: K = 17 and the silhouette = 0.5776721568128586


                                                                                

Time Slot 3: K = 18 and the silhouette = 0.5481685655723607


                                                                                

Time Slot 3: K = 19 and the silhouette = 0.589946741353141


                                                                                

Time Slot 3: K = 20 and the silhouette = 0.5650720439934582
*** The Best K is 19 for Time Slot 3 ***
*** Cluster 0 Statistics for Time SLot 3 Calculated***
*** Cluster 1 Statistics for Time SLot 3 Calculated***
*** Cluster 2 Statistics for Time SLot 3 Calculated***
*** Cluster 3 Statistics for Time SLot 3 Calculated***
*** Cluster 4 Statistics for Time SLot 3 Calculated***
*** Cluster 5 Statistics for Time SLot 3 Calculated***
*** Cluster 6 Statistics for Time SLot 3 Calculated***
*** Cluster 7 Statistics for Time SLot 3 Calculated***
*** Cluster 8 Statistics for Time SLot 3 Calculated***
*** Cluster 9 Statistics for Time SLot 3 Calculated***
*** Cluster 10 Statistics for Time SLot 3 Calculated***
*** Cluster 11 Statistics for Time SLot 3 Calculated***
*** Cluster 12 Statistics for Time SLot 3 Calculated***
*** Cluster 13 Statistics for Time SLot 3 Calculated***
*** Cluster 14 Statistics for Time SLot 3 Calculated***
*** Cluster 15 Statistics for Time SLot 3 Calculated***
*** Cluster 1

                                                                                

Time Slot 4: K = 7 and the silhouette = 0.5012703551997734
Time Slot 4: K = 8 and the silhouette = 0.4610078355323348
Time Slot 4: K = 9 and the silhouette = 0.49055002828868954
Time Slot 4: K = 10 and the silhouette = 0.503613724556238
Time Slot 4: K = 11 and the silhouette = 0.5009757891562752


                                                                                

Time Slot 4: K = 12 and the silhouette = 0.5087831753838159
Time Slot 4: K = 13 and the silhouette = 0.5060504122908335


                                                                                

Time Slot 4: K = 14 and the silhouette = 0.5216759837638817


                                                                                

Time Slot 4: K = 15 and the silhouette = 0.5141395489019629


                                                                                

Time Slot 4: K = 16 and the silhouette = 0.557296336975174


                                                                                

Time Slot 4: K = 17 and the silhouette = 0.5270458332658845


                                                                                

Time Slot 4: K = 18 and the silhouette = 0.5007219588083306


                                                                                

Time Slot 4: K = 19 and the silhouette = 0.5298837108184241


                                                                                

Time Slot 4: K = 20 and the silhouette = 0.5587280337949879
*** The Best K is 20 for Time Slot 4 ***
*** Cluster 0 Statistics for Time SLot 4 Calculated***
*** Cluster 1 Statistics for Time SLot 4 Calculated***
*** Cluster 2 Statistics for Time SLot 4 Calculated***
*** Cluster 3 Statistics for Time SLot 4 Calculated***
*** Cluster 4 Statistics for Time SLot 4 Calculated***
*** Cluster 5 Statistics for Time SLot 4 Calculated***
*** Cluster 6 Statistics for Time SLot 4 Calculated***
*** Cluster 7 Statistics for Time SLot 4 Calculated***
*** Cluster 8 Statistics for Time SLot 4 Calculated***
*** Cluster 9 Statistics for Time SLot 4 Calculated***
*** Cluster 10 Statistics for Time SLot 4 Calculated***
*** Cluster 11 Statistics for Time SLot 4 Calculated***
*** Cluster 12 Statistics for Time SLot 4 Calculated***
*** Cluster 13 Statistics for Time SLot 4 Calculated***
*** Cluster 14 Statistics for Time SLot 4 Calculated***
*** Cluster 15 Statistics for Time SLot 4 Calculated***
*** Cluster 1

                                                                                

Time Slot 5: K = 7 and the silhouette = 0.5268405654205215
Time Slot 5: K = 8 and the silhouette = 0.5219403183551019
Time Slot 5: K = 9 and the silhouette = 0.5301560010164038
Time Slot 5: K = 10 and the silhouette = 0.4932768960351441


                                                                                

Time Slot 5: K = 11 and the silhouette = 0.49544541111297064


                                                                                

Time Slot 5: K = 12 and the silhouette = 0.5245082581261683


                                                                                

Time Slot 5: K = 13 and the silhouette = 0.5284685843406238


                                                                                

Time Slot 5: K = 14 and the silhouette = 0.53670444581226


                                                                                

Time Slot 5: K = 15 and the silhouette = 0.548507200469485


                                                                                

Time Slot 5: K = 16 and the silhouette = 0.5262806560757056


                                                                                

Time Slot 5: K = 17 and the silhouette = 0.5237442823800753


                                                                                

Time Slot 5: K = 18 and the silhouette = 0.5450818774616848


                                                                                

Time Slot 5: K = 19 and the silhouette = 0.500554076690223


                                                                                

Time Slot 5: K = 20 and the silhouette = 0.5088302856045468
*** The Best K is 15 for Time Slot 5 ***
*** Cluster 0 Statistics for Time SLot 5 Calculated***
*** Cluster 1 Statistics for Time SLot 5 Calculated***
*** Cluster 2 Statistics for Time SLot 5 Calculated***
*** Cluster 3 Statistics for Time SLot 5 Calculated***
*** Cluster 4 Statistics for Time SLot 5 Calculated***
*** Cluster 5 Statistics for Time SLot 5 Calculated***
*** Cluster 6 Statistics for Time SLot 5 Calculated***
*** Cluster 7 Statistics for Time SLot 5 Calculated***
*** Cluster 8 Statistics for Time SLot 5 Calculated***
*** Cluster 9 Statistics for Time SLot 5 Calculated***
*** Cluster 10 Statistics for Time SLot 5 Calculated***
*** Cluster 11 Statistics for Time SLot 5 Calculated***
*** Cluster 12 Statistics for Time SLot 5 Calculated***
*** Cluster 13 Statistics for Time SLot 5 Calculated***
*** Cluster 14 Statistics for Time SLot 5 Calculated***


                                                                                

Time Slot 6: K = 7 and the silhouette = 0.477386755633965
Time Slot 6: K = 8 and the silhouette = 0.4890355913946751


                                                                                

Time Slot 6: K = 9 and the silhouette = 0.48890174471705766
Time Slot 6: K = 10 and the silhouette = 0.4909442705488521


                                                                                

Time Slot 6: K = 11 and the silhouette = 0.5138980613928285


                                                                                

Time Slot 6: K = 12 and the silhouette = 0.4766360853625308


                                                                                

Time Slot 6: K = 13 and the silhouette = 0.5201557351731866


                                                                                

Time Slot 6: K = 14 and the silhouette = 0.5108825461298754


                                                                                

Time Slot 6: K = 15 and the silhouette = 0.5135109957356265


                                                                                

Time Slot 6: K = 16 and the silhouette = 0.4634344247810479


                                                                                

Time Slot 6: K = 17 and the silhouette = 0.5113870824675016


                                                                                

Time Slot 6: K = 18 and the silhouette = 0.5410512332950665


                                                                                

Time Slot 6: K = 19 and the silhouette = 0.533539146449439


                                                                                

Time Slot 6: K = 20 and the silhouette = 0.5290699895248147
*** The Best K is 18 for Time Slot 6 ***
*** Cluster 0 Statistics for Time SLot 6 Calculated***
*** Cluster 1 Statistics for Time SLot 6 Calculated***
*** Cluster 2 Statistics for Time SLot 6 Calculated***
*** Cluster 3 Statistics for Time SLot 6 Calculated***
*** Cluster 4 Statistics for Time SLot 6 Calculated***
*** Cluster 5 Statistics for Time SLot 6 Calculated***
*** Cluster 6 Statistics for Time SLot 6 Calculated***
*** Cluster 7 Statistics for Time SLot 6 Calculated***
*** Cluster 8 Statistics for Time SLot 6 Calculated***
*** Cluster 9 Statistics for Time SLot 6 Calculated***
*** Cluster 10 Statistics for Time SLot 6 Calculated***
*** Cluster 11 Statistics for Time SLot 6 Calculated***
*** Cluster 12 Statistics for Time SLot 6 Calculated***
*** Cluster 13 Statistics for Time SLot 6 Calculated***
*** Cluster 14 Statistics for Time SLot 6 Calculated***
*** Cluster 15 Statistics for Time SLot 6 Calculated***
*** Cluster 1

                                                                                

Time Slot 7: K = 7 and the silhouette = 0.5059157700124721
Time Slot 7: K = 8 and the silhouette = 0.510849975532739
Time Slot 7: K = 9 and the silhouette = 0.5400921205003731
Time Slot 7: K = 10 and the silhouette = 0.5144953071886136
Time Slot 7: K = 11 and the silhouette = 0.5476723049545933


                                                                                

Time Slot 7: K = 12 and the silhouette = 0.5104262813583808


                                                                                

Time Slot 7: K = 13 and the silhouette = 0.5548118703987903


                                                                                

Time Slot 7: K = 14 and the silhouette = 0.5090025703633916


                                                                                

Time Slot 7: K = 15 and the silhouette = 0.4983531890550443


                                                                                

Time Slot 7: K = 16 and the silhouette = 0.5007295095508436


                                                                                

Time Slot 7: K = 17 and the silhouette = 0.49071989928884385


                                                                                

Time Slot 7: K = 18 and the silhouette = 0.5513090656517879


                                                                                

Time Slot 7: K = 19 and the silhouette = 0.5305841830596796


                                                                                

Time Slot 7: K = 20 and the silhouette = 0.5357247398458936
*** The Best K is 13 for Time Slot 7 ***
*** Cluster 0 Statistics for Time SLot 7 Calculated***
*** Cluster 1 Statistics for Time SLot 7 Calculated***
*** Cluster 2 Statistics for Time SLot 7 Calculated***
*** Cluster 3 Statistics for Time SLot 7 Calculated***
*** Cluster 4 Statistics for Time SLot 7 Calculated***
*** Cluster 5 Statistics for Time SLot 7 Calculated***
*** Cluster 6 Statistics for Time SLot 7 Calculated***
*** Cluster 7 Statistics for Time SLot 7 Calculated***
*** Cluster 8 Statistics for Time SLot 7 Calculated***
*** Cluster 9 Statistics for Time SLot 7 Calculated***
*** Cluster 10 Statistics for Time SLot 7 Calculated***
*** Cluster 11 Statistics for Time SLot 7 Calculated***
*** Cluster 12 Statistics for Time SLot 7 Calculated***


                                                                                

Time Slot 8: K = 7 and the silhouette = 0.5397024913413905
Time Slot 8: K = 8 and the silhouette = 0.5116283076152962
Time Slot 8: K = 9 and the silhouette = 0.503297910735147
Time Slot 8: K = 10 and the silhouette = 0.5145038069502511


                                                                                

Time Slot 8: K = 11 and the silhouette = 0.5455696857619088
Time Slot 8: K = 12 and the silhouette = 0.5498117967353798


                                                                                

Time Slot 8: K = 13 and the silhouette = 0.544040935715562


                                                                                

Time Slot 8: K = 14 and the silhouette = 0.5395134631894988


                                                                                

Time Slot 8: K = 15 and the silhouette = 0.5572928563940035


                                                                                

Time Slot 8: K = 16 and the silhouette = 0.5819751476462426


                                                                                

Time Slot 8: K = 17 and the silhouette = 0.5761055925972453


                                                                                

Time Slot 8: K = 18 and the silhouette = 0.5315431282958824


                                                                                

Time Slot 8: K = 19 and the silhouette = 0.5376326992264806


                                                                                

Time Slot 8: K = 20 and the silhouette = 0.5696533692268588
*** The Best K is 16 for Time Slot 8 ***
*** Cluster 0 Statistics for Time SLot 8 Calculated***
*** Cluster 1 Statistics for Time SLot 8 Calculated***
*** Cluster 2 Statistics for Time SLot 8 Calculated***
*** Cluster 3 Statistics for Time SLot 8 Calculated***
*** Cluster 4 Statistics for Time SLot 8 Calculated***
*** Cluster 5 Statistics for Time SLot 8 Calculated***
*** Cluster 6 Statistics for Time SLot 8 Calculated***
*** Cluster 7 Statistics for Time SLot 8 Calculated***
*** Cluster 8 Statistics for Time SLot 8 Calculated***
*** Cluster 9 Statistics for Time SLot 8 Calculated***
*** Cluster 10 Statistics for Time SLot 8 Calculated***
*** Cluster 11 Statistics for Time SLot 8 Calculated***
*** Cluster 12 Statistics for Time SLot 8 Calculated***
*** Cluster 13 Statistics for Time SLot 8 Calculated***
*** Cluster 14 Statistics for Time SLot 8 Calculated***
*** Cluster 15 Statistics for Time SLot 8 Calculated***


                                                                                

Time Slot 9: K = 7 and the silhouette = 0.5436678101282212


                                                                                

Time Slot 9: K = 8 and the silhouette = 0.5269608811500314


                                                                                

Time Slot 9: K = 9 and the silhouette = 0.5283811693214089


                                                                                

Time Slot 9: K = 10 and the silhouette = 0.5548195613783098


                                                                                

Time Slot 9: K = 11 and the silhouette = 0.5271976382518486


                                                                                

Time Slot 9: K = 12 and the silhouette = 0.5662382166337983


                                                                                

Time Slot 9: K = 13 and the silhouette = 0.563294654552742


                                                                                

Time Slot 9: K = 14 and the silhouette = 0.5136393128122027


                                                                                

Time Slot 9: K = 15 and the silhouette = 0.5251824984773538


                                                                                

Time Slot 9: K = 16 and the silhouette = 0.5571557914337684


                                                                                

Time Slot 9: K = 17 and the silhouette = 0.5032653417088502


                                                                                

Time Slot 9: K = 18 and the silhouette = 0.5511905314802008


                                                                                

Time Slot 9: K = 19 and the silhouette = 0.5223305192006455


                                                                                

Time Slot 9: K = 20 and the silhouette = 0.5139902184315611
*** The Best K is 12 for Time Slot 9 ***
*** Cluster 0 Statistics for Time SLot 9 Calculated***
*** Cluster 1 Statistics for Time SLot 9 Calculated***
*** Cluster 2 Statistics for Time SLot 9 Calculated***
*** Cluster 3 Statistics for Time SLot 9 Calculated***
*** Cluster 4 Statistics for Time SLot 9 Calculated***
*** Cluster 5 Statistics for Time SLot 9 Calculated***
*** Cluster 6 Statistics for Time SLot 9 Calculated***
*** Cluster 7 Statistics for Time SLot 9 Calculated***
*** Cluster 8 Statistics for Time SLot 9 Calculated***
*** Cluster 9 Statistics for Time SLot 9 Calculated***
*** Cluster 10 Statistics for Time SLot 9 Calculated***
*** Cluster 11 Statistics for Time SLot 9 Calculated***


                                                                                

Time Slot 10: K = 7 and the silhouette = 0.5108102421685868


                                                                                

Time Slot 10: K = 8 and the silhouette = 0.5145770803655304


                                                                                

Time Slot 10: K = 9 and the silhouette = 0.531620868790183


                                                                                

Time Slot 10: K = 10 and the silhouette = 0.4659890693770833


                                                                                

Time Slot 10: K = 11 and the silhouette = 0.5336859659504211


                                                                                

Time Slot 10: K = 12 and the silhouette = 0.5093551248049383


                                                                                

Time Slot 10: K = 13 and the silhouette = 0.5674756282787476


                                                                                

Time Slot 10: K = 14 and the silhouette = 0.5667154684166685


                                                                                

Time Slot 10: K = 15 and the silhouette = 0.5602688898906426


                                                                                

Time Slot 10: K = 16 and the silhouette = 0.5485680794773596


                                                                                

Time Slot 10: K = 17 and the silhouette = 0.5346823112259284


                                                                                

Time Slot 10: K = 18 and the silhouette = 0.5260049747668533


                                                                                

Time Slot 10: K = 19 and the silhouette = 0.5262185452592192


                                                                                

Time Slot 10: K = 20 and the silhouette = 0.5493489530826935
*** The Best K is 13 for Time Slot 10 ***
*** Cluster 0 Statistics for Time SLot 10 Calculated***
*** Cluster 1 Statistics for Time SLot 10 Calculated***
*** Cluster 2 Statistics for Time SLot 10 Calculated***
*** Cluster 3 Statistics for Time SLot 10 Calculated***
*** Cluster 4 Statistics for Time SLot 10 Calculated***
*** Cluster 5 Statistics for Time SLot 10 Calculated***
*** Cluster 6 Statistics for Time SLot 10 Calculated***
*** Cluster 7 Statistics for Time SLot 10 Calculated***
*** Cluster 8 Statistics for Time SLot 10 Calculated***
*** Cluster 9 Statistics for Time SLot 10 Calculated***
*** Cluster 10 Statistics for Time SLot 10 Calculated***
*** Cluster 11 Statistics for Time SLot 10 Calculated***
*** Cluster 12 Statistics for Time SLot 10 Calculated***


                                                                                

Time Slot 11: K = 7 and the silhouette = 0.631669971937867


                                                                                

Time Slot 11: K = 8 and the silhouette = 0.5254296085532992


                                                                                

Time Slot 11: K = 9 and the silhouette = 0.5253696632083618


                                                                                

Time Slot 11: K = 10 and the silhouette = 0.5126715733617844


                                                                                

Time Slot 11: K = 11 and the silhouette = 0.5399413964398206


                                                                                

Time Slot 11: K = 12 and the silhouette = 0.5464488046304259


                                                                                

Time Slot 11: K = 13 and the silhouette = 0.591127037495448


                                                                                

Time Slot 11: K = 14 and the silhouette = 0.5876881869333745


                                                                                

Time Slot 11: K = 15 and the silhouette = 0.5838951473037953


                                                                                

Time Slot 11: K = 16 and the silhouette = 0.598704101222019


                                                                                

Time Slot 11: K = 17 and the silhouette = 0.5910380466666328


                                                                                

Time Slot 11: K = 18 and the silhouette = 0.5467681649939939


                                                                                

Time Slot 11: K = 19 and the silhouette = 0.5559045746467173


                                                                                

Time Slot 11: K = 20 and the silhouette = 0.5716273186099744
*** The Best K is 7 for Time Slot 11 ***
*** Cluster 0 Statistics for Time SLot 11 Calculated***
*** Cluster 1 Statistics for Time SLot 11 Calculated***
*** Cluster 2 Statistics for Time SLot 11 Calculated***
*** Cluster 3 Statistics for Time SLot 11 Calculated***
*** Cluster 4 Statistics for Time SLot 11 Calculated***
*** Cluster 5 Statistics for Time SLot 11 Calculated***
*** Cluster 6 Statistics for Time SLot 11 Calculated***
*** Cluster Statistics for Time Slot Calculated ***
+---------+------+
|Time_slot|Best_K|
+---------+------+
|        0|    13|
|        1|    18|
|        2|    17|
|        3|    19|
|        4|    20|
|        5|    15|
|        6|    18|
|        7|    13|
|        8|    16|
|        9|    12|
|       10|    13|
|       11|     7|
+---------+------+

+---------+--------------+------------------+------------------+-------------------+-------------------------------+-------------------------+-------

## Save Clusters

In [6]:
# Persist stats_df as CSV with a header row, replacing any previous output.
# coalesce(1) collapses the data to one partition so Spark emits a single part
# file; note that Spark creates a directory at the target path (named
# "project-output.csv" here) and places that part file inside it.
(
    stats_df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .format("csv")
    .save(output_folder_path + "/project-output.csv")
)

# Shut down the Spark session now that the results have been written.
spark.stop()
