## Imports

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt

## Query C

In [None]:
# Start (or reuse) a Spark session running locally on all available cores,
# and keep a handle to its underlying SparkContext.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("kmeans")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Explicit schema for the trip CSV: every column is nullable. Coordinates are
# read as Decimal(18, 14); distance and money columns as Decimal(10, 2).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("medallion", StringType()),
        ("hack_license", StringType()),
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("trip_time_in_secs", IntegerType()),
        ("trip_distance", DecimalType(precision=10, scale=2)),
        ("pickup_longitude", DecimalType(precision=18, scale=14)),
        ("pickup_latitude", DecimalType(precision=18, scale=14)),
        ("dropoff_longitude", DecimalType(precision=18, scale=14)),
        ("dropoff_latitude", DecimalType(precision=18, scale=14)),
        ("payment_type", StringType()),
        ("fare_amount", DecimalType(precision=10, scale=2)),
        ("surcharge", DecimalType(precision=10, scale=2)),
        ("mta_tax", DecimalType(precision=10, scale=2)),
        ("tip_amount", DecimalType(precision=10, scale=2)),
        ("tolls_amount", DecimalType(precision=10, scale=2)),
        ("total_amount", DecimalType(precision=10, scale=2)),
    ]
])

In [None]:
# Load the trip records from CSV using the explicit schema.
data = spark.read.csv("sorted_data.csv", schema=schema)

# Keep only rows whose pickup and dropoff coordinates both lie inside the
# longitude/latitude bounding box (bounds inclusive) and whose trip duration
# is strictly positive.
data = data.where(
    data.pickup_longitude.between(-74.916578, -73.120778)
    & data.dropoff_longitude.between(-74.916578, -73.120778)
    & data.pickup_latitude.between(40.129715978, 41.477182778)
    & data.dropoff_latitude.between(40.129715978, 41.477182778)
    & (data.trip_time_in_secs > 0)
)

# Combine the pickup coordinates into a single "features" vector column,
# the input column name that KMeans reads by default.
assembler = VectorAssembler(
    inputCols=["pickup_latitude", "pickup_longitude"],
    outputCol="features",
)
dataPrepared = assembler.transform(data)

# Sweep k over 2..49. For each k, record the model's cost (used by the elbow
# plot) in `sse`, keyed by k, and append its silhouette score to `silhouette`
# in the same k order.
sse = {}
silhouette = []

# Constructed with defaults only, so one evaluator can score every model.
evaluator = ClusteringEvaluator()

for i in range(2, 50):
    kmeans = KMeans(k=i, seed=1)

    # Fit exactly once per k. The previous version trained a second model
    # (same k, same seed, same data) just to call transform(), doubling the
    # training time of the sweep; reusing the fitted model also guarantees
    # the cost and the silhouette describe the very same clustering.
    clusters = kmeans.fit(dataPrepared)

    # NOTE(review): KMeansModel.computeCost is deprecated in Spark 2.4 and
    # no longer available in Spark 3.x; on those versions the equivalent is
    # clusters.summary.trainingCost — confirm against the cluster's version.
    cost = clusters.computeCost(dataPrepared)
    sse[i] = cost

    # Assign each row to its cluster, then score the assignment.
    kmeans_preds = clusters.transform(dataPrepared)
    sil = evaluator.evaluate(kmeans_preds)
    silhouette.append(sil)

    print(str(i) + " -> " + str(cost))

## Elbow Method

In [None]:
# Elbow plot: cost recorded in `sse` against the number of clusters, with a
# dashed red vertical line at k=10. The figure is saved to PNG, then shown.
f = plt.figure(figsize=(10, 8))
plt.plot(list(sse), list(sse.values()), "-o")
plt.axvline(x=10, color="red", linestyle="--")
plt.title("K-Means: Elbow Method", fontsize=14, fontweight="bold")
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
f.savefig("kmeans_elbow_method.png", dpi=300, bbox_inches="tight")
plt.show()

## Silhouette Score

In [None]:
# Silhouette plot: silhouette score against the number of clusters, with a
# dashed red vertical line at k=10. The figure is saved to PNG, then shown.
f = plt.figure(figsize=(10, 8))
plt.title("K-Means: Silhouette Score", fontsize=14, fontweight="bold")
# Take the x-axis from the k values recorded during the sweep (the keys of
# `sse`, filled in the same order as `silhouette`) instead of repeating the
# sweep's range as a literal, so the plot stays aligned if the range changes.
plt.plot(list(sse.keys()), silhouette)
plt.axvline(x=10, color="red", linestyle="--")
plt.xlabel("Number of cluster")
plt.ylabel("Silhouette Score")
f.savefig("kmeans_sil.png", bbox_inches='tight', dpi=300)
plt.show()

## Stands

In [None]:
# Fit the final model with k=10 (the value marked on both plots) and print
# the resulting cluster centres.
kmeans = KMeans(k=10, seed=1)
clusters = kmeans.fit(dataPrepared)
cost = clusters.computeCost(dataPrepared)

print(f"Clusters centers = {clusters.clusterCenters()}")