In [21]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler

# initialise sparkContext
spark = SparkSession.builder \
    .master('local') \
    .appName('myAppName') \
    .config('spark.executor.memory', '2gb') \
    .config("spark.cores.max", "2") \
    .getOrCreate()



# using SQLContext to read parquet file
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# to read parquet file
df = sqlContext.read.parquet('../assets/nyc-dataset/')
print(df.count())
print(df.printSchema())



2463931
root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

None


In [24]:
df.limit(5).toPandas()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [47]:
df2.columns

['passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee',
 'features']

In [48]:
df2 = df.drop('VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag')
df2 = df2.select([col(c).cast("double") for c in df2.columns])
df2 = df2.dropna(how='any')
assembler = VectorAssembler(
    inputCols=df2.columns,
    outputCol="features"
)
df2 = assembler.transform(df2)
df2.limit(5).toPandas()


Unnamed: 0,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,features
0,2.0,3.8,1.0,142.0,236.0,1.0,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,"[2.0, 3.8, 1.0, 142.0, 236.0, 1.0, 14.5, 3.0, ..."
1,1.0,2.1,1.0,236.0,42.0,1.0,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,"[1.0, 2.1, 1.0, 236.0, 42.0, 1.0, 8.0, 0.5, 0...."
2,1.0,0.97,1.0,166.0,166.0,1.0,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,"[1.0, 0.97, 1.0, 166.0, 166.0, 1.0, 7.5, 0.5, ..."
3,1.0,1.09,1.0,114.0,68.0,2.0,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,"[1.0, 1.09, 1.0, 114.0, 68.0, 2.0, 8.0, 0.5, 0..."
4,1.0,4.3,1.0,68.0,163.0,1.0,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,"[1.0, 4.3, 1.0, 68.0, 163.0, 1.0, 23.5, 0.5, 0..."


In [49]:
df2.select([count(when(isnan(c), c)).alias(c) for c in df2.drop('features').columns]).show()


+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|              0|            0|         0|           0|           0|           0|          0|    0|      0|         0|           0|                    0|           0|                   0|          0|
+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+


In [50]:
from pyspark.ml.clustering import KMeans
model = KMeans().setK(5)
print(model.explainParams())
model_fitted = model.fit(df2)

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 5)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: -8192224382018474609)
solver

In [51]:
model_fitted.computeCost(df2)

AttributeError: 'KMeansModel' object has no attribute 'computeCost'

In [52]:



summary = model_fitted.summary
print(summary.clusterSizes) # number of points
centers = model_fitted.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

[534042, 1, 448717, 820498, 589170]
Cluster Centers: 
[1.37867026e+00 2.01244456e+00 1.23897658e+00 2.19185666e+02
 2.37978843e+02 1.21665062e+00 9.93335978e+00 1.03812865e+00
 4.89780075e-01 2.02627399e+00 7.48964441e-02 2.95887924e-01
 1.55221524e+01 2.38912420e+00 2.53597668e-03]
[1.0000000e+00 3.3000000e+00 1.0000000e+00 1.0700000e+02 1.4000000e+02
 4.0000000e+00 4.0109232e+05 2.5000000e+00 5.0000000e-01 0.0000000e+00
 0.0000000e+00 3.0000000e-01 4.0109562e+05 2.5000000e+00 0.0000000e+00]
[  1.40166935   4.33287057   1.80846595 117.2060295   71.20263497
   1.2699318   16.08383841   0.99910118   0.48696473   2.70106655
   0.60827958   0.29581755  22.70787742   1.99524777   0.18238411]
[1.39525625e+00 3.53142306e+00 1.39470572e+00 1.14534174e+02
 1.97106107e+02 1.22997116e+00 1.36270996e+01 1.04308423e+00
 4.92611348e-01 2.54494184e+00 5.33029868e-01 2.96937842e-01
 2.02242066e+01 2.27193623e+00 1.35293328e-01]
[1.38169953e+00 2.53266684e+00 1.30258502e+00 2.27523982e+02
 1.20556241e