# Use Case: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) 

In [20]:
!pip install folium



In [21]:
import pandas as pd
import numpy as np
import folium


In [22]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [23]:
#quick look at the data
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,pickup_borough,dropoff_borough,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
0,0,0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,...,0,0,0,0,0,0.640487,queens,queens,0,0
1,1,1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,...,0,0,0,0,0,5.25067,manhattan,manhattan,1,0
2,2,2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,...,0,0,0,0,0,0.863411,manhattan,manhattan,0,0
3,3,3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,...,0,0,0,0,0,1.739386,manhattan,manhattan,1,0
4,4,4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,...,0,0,0,0,0,1.242218,manhattan,manhattan,0,0


In [24]:
train.shape

(400000, 32)

In [25]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [26]:
coordinates.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.84161
1,40.711303,-74.016048,40.782004,-73.979268
2,40.76127,-73.982738,40.750562,-73.991242
3,40.733143,-73.98713,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655


## Clustering
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [27]:
from sklearn.cluster import KMeans

In [28]:
#define number of clusters and create instance
k=20
myKMeans=KMeans(n_clusters=k, n_jobs=-1)#parallelize to all cores

In [29]:
#train model
myKMeans.fit(coordinates.to_numpy())



KMeans(n_clusters=20, n_jobs=-1)

In [30]:
#get cluster centers
centers=myKMeans.cluster_centers_
labels=myKMeans.labels_    

In [31]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(k):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [32]:
cluster_map

In [33]:
labels

array([ 6, 12, 19, ...,  4,  4,  1])

In [34]:
#add cluster labels to DataFrame
train['clusterID']=labels

In [35]:
#GroupBy Clusters
clusters=train.groupby('clusterID')

In [36]:
clusters['fare_amount'].count()

clusterID
0     37022
1     42425
2      4787
3      4630
4     60459
5        98
6      3260
7      1705
8      7323
9      8366
10    47643
11     8120
12    40576
13       53
14     3111
15       92
16      721
17     8706
18    36877
19    84026
Name: fare_amount, dtype: int64

In [37]:
clusters.mean()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,200436.020312,204778.639971,204778.639971,12.747142,-73.978148,40.758648,-73.998127,40.725888,1.71101,15.589703,...,-73.998125,0.0,0.0,0.0,0.0,0.0,0.0,2.597586,0.038004,0.931905
1,199798.300224,204126.945692,204126.945692,10.475795,-73.980713,40.755507,-73.959388,40.778961,1.672882,15.641108,...,-73.959391,0.0,0.0,0.0,0.0,0.0,0.0,2.136227,0.143524,0.0
2,201548.479215,205915.184249,205915.184249,48.654126,-73.784774,40.646475,-73.9711,40.739403,1.800292,15.726551,...,-73.971108,0.975977,0.0,0.0,0.001044,0.0,0.011907,12.108439,0.0,0.17652
3,198879.874298,203188.490497,203188.490497,23.19157,-73.969652,40.774475,-73.924054,40.851532,1.708207,15.605184,...,-73.924057,0.001296,0.0,0.0,0.0,0.026782,0.0,6.05339,0.126134,0.0
4,200791.350337,205141.456574,205141.456574,7.626712,-73.997056,40.72836,-73.997833,40.727926,1.681272,15.680643,...,-73.997832,0.0,0.0,1.7e-05,0.0,0.0,0.0,1.038916,0.884699,0.917829
5,208037.27551,212544.540816,212544.540816,9.795102,-73.149918,41.366595,-73.147641,41.368939,1.387755,16.173469,...,-73.147255,0.0,0.0,0.0,0.0,0.0,0.0,0.242551,0.0,0.0
6,203392.495706,207799.057975,207799.057975,12.689316,-73.884595,40.762215,-73.880406,40.755495,1.711043,15.728221,...,-73.880396,0.0,0.00184,0.0,0.0,0.351227,0.122393,1.893311,0.0,0.0
7,201157.570088,205515.421701,205515.421701,23.604968,-73.787803,40.655686,-73.796721,40.688188,1.679765,15.698534,...,-73.796701,0.822287,0.333138,0.0,0.0,0.0,0.019941,3.346168,0.0,0.0
8,198402.235013,202700.554418,202700.554418,31.207162,-73.871448,40.770441,-73.978028,40.747638,1.726342,15.751741,...,-73.97803,0.0,0.0,0.0,0.0,0.916291,0.0,6.158722,0.0,0.200191
9,200994.635429,205349.250538,205349.250538,28.36224,-73.980267,40.752663,-73.874312,40.763333,1.68623,15.704518,...,-73.874316,0.0,0.0,0.0,0.0,0.0,0.582238,5.897509,0.24743,0.0


In [38]:
clusters.var()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13282100000.0,13859340000.0,13859340000.0,32.942141,0.000131,0.000125,0.000106,0.000161,1.743256,75.101634,...,0.000106,0.0,0.0,0.0,0.0,0.0,0.0,1.515137,0.036561,0.063459
1,13330240000.0,13909520000.0,13909520000.0,23.842483,0.00011,0.000146,0.000141,0.000159,1.693717,74.931678,...,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,1.601569,0.122928,0.0
2,13662490000.0,14256470000.0,14256470000.0,131.234187,0.000129,6.4e-05,0.001003,0.002172,1.902858,76.56562,...,0.001003,0.023451,0.0,0.0,0.001044,0.0,0.011768,3.441175,0.0,0.145391
3,13434570000.0,14018540000.0,14018540000.0,241.841152,0.000985,0.001679,0.001027,0.001089,1.817841,76.300772,...,0.001027,0.001294,0.0,0.0,0.0,0.02607,0.0,9.981786,0.110248,0.0
4,13285530000.0,13862940000.0,13862940000.0,20.859631,0.000112,0.000148,0.000101,0.000124,1.686528,75.249824,...,0.000101,0.0,0.0,1.7e-05,0.0,0.0,0.0,0.408302,0.102009,0.07542
5,13682800000.0,14276860000.0,14276860000.0,54.881506,0.005296,0.000353,0.004019,0.000394,0.610983,91.217021,...,0.004026,0.0,0.0,0.0,0.0,0.0,0.0,2.412272,0.0,0.0
6,13473460000.0,14059090000.0,14059090000.0,193.236821,0.000826,0.000577,0.001425,0.001005,1.822584,76.690151,...,0.001424,0.0,0.001838,0.0,0.0,0.227937,0.107446,3.613266,0.0,0.0
7,13359630000.0,13940090000.0,13940090000.0,362.918639,0.000508,0.000667,0.002215,0.002336,1.736591,76.007656,...,0.002216,0.146217,0.222287,0.0,0.0,0.0,0.019555,12.025979,0.0,0.0
8,13291080000.0,13868670000.0,13868670000.0,78.734538,0.00015,0.000116,0.00033,0.001001,1.795628,75.989165,...,0.00033,0.0,0.0,0.0,0.0,0.076712,0.0,2.360424,0.0,0.160137
9,13466120000.0,14051440000.0,14051440000.0,77.605699,0.000255,0.000486,0.000487,0.000357,1.689582,74.636171,...,0.000487,0.0,0.0,0.0,0.0,0.0,0.243266,2.48645,0.186231,0.0
