# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [61]:
import pandas as pd
import numpy as np
import folium


In [62]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/JonasZimmer1994/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [64]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv(path+'/DATA/train_cleaned.csv')

In [65]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [66]:
from sklearn.cluster import KMeans

In [67]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [69]:
#train model
data = coordinates.to_numpy()[:100000,:] #use only subset of the data to make it faster

myKMeans.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [70]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [71]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [72]:
#cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [73]:
cluster_map_self = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)

prediction = myKMeans.predict(data)

In [74]:
def show_cluster(cluster_number):
  folium.CircleMarker([centers[cluster_number,0], centers[cluster_number,1]], radius=3, color="black", fill_opacity=0.9).add_to(cluster_map_self)
  folium.CircleMarker([centers[cluster_number,2], centers[cluster_number,3]], radius=3, color="black", fill_opacity=0.9).add_to(cluster_map_self)

  for i in range(len(prediction)):
    if(prediction[i] == cluster_number):
      folium.CircleMarker([data[i, 0], data[i, 1]], radius=3, color="green", fill_opacity=0.9).add_to(cluster_map_self)
      folium.CircleMarker([data[i, 2], data[i, 3]], radius=3, color="red", fill_opacity=0.9).add_to(cluster_map_self)
      

    

In [75]:
show_cluster(61)

In [76]:
cluster_map_self

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [77]:
def cluster_var(cluster_number):

  cluster = []
  for i in range(len(prediction)):
    if(prediction[i] == cluster_number):
      cluster.append([data[i,0], data[i,1]])

  var = np.var(cluster)
  return var

In [78]:
#define number of clusters and create instance
clusters=100
myKMeansVar=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

myKMeansVar.fit(data)

prediction = myKMeansVar.predict(data)

In [79]:
clustervar= []
for i in range(clusters):
  clustervar.append(cluster_var(i))

clustervar

[3292.772857128628,
 3291.3922776118548,
 3285.837478960929,
 3277.7646528669898,
 3287.288707455121,
 3290.8839717104306,
 3273.545612374292,
 3290.4433816928026,
 3291.0108325006368,
 3290.414879872948,
 3273.6125200705033,
 3290.900716837621,
 3277.76465286699,
 3285.6141962608654,
 3283.705544461998,
 3291.0885595339673,
 3273.907793478889,
 3290.892032388876,
 3290.8999506980676,
 3291.301097667601,
 3290.509982393047,
 3291.2018062191405,
 3291.7379824633176,
 3273.5672332459226,
 3287.410385994547,
 3291.9216583920347,
 3291.199862300824,
 3291.1454638348227,
 3291.4438375756117,
 3290.6644194305923,
 3290.1040221398907,
 3291.855994894179,
 3285.6745233135416,
 3285.327281572663,
 3290.6738250264284,
 3290.6650895842504,
 3289.4738992189405,
 3289.9094201661896,
 3291.3223392513123,
 3290.2272165176455,
 3291.6839688501536,
 3290.848871052388,
 3273.684479927593,
 3303.6540982638435,
 3297.581415428783,
 3315.142308539256,
 3290.78295194815,
 3290.815010972619,
 3290.4243365210

In [80]:
#define number of clusters and create instance
clusters=10
myKMeansVar=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

myKMeansVar.fit(data)

prediction = myKMeansVar.predict(data)

In [81]:
clustervar= []
for i in range(clusters):
  clustervar.append(cluster_var(i))

clustervar

[3290.3455575085795,
 3290.7953201824203,
 3276.279417319257,
 3291.242074572827,
 3290.6110091566084,
 3278.1838941451488,
 3290.556375435738,
 3291.4188593688837,
 3285.885315078994,
 3273.5981189626423]