# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
!pip install folium



In [2]:
import pandas as pd
import numpy as np
import folium


In [3]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [4]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [5]:
coordinates

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.841610
1,40.711303,-74.016048,40.782004,-73.979268
2,40.761270,-73.982738,40.750562,-73.991242
3,40.733143,-73.987130,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655
...,...,...,...,...
399995,40.746032,-73.986585,40.724077,-73.990865
399996,40.742359,-73.992882,40.762318,-73.972649
399997,40.731558,-73.985598,40.728738,-73.987657
399998,40.740735,-74.007692,40.722847,-73.988455


## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [6]:
from sklearn.cluster import KMeans

In [7]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [8]:
%%time
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster



Wall time: 1min 5s


KMeans(n_clusters=100, n_jobs=-1)

In [9]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [10]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [11]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [12]:
model_predict = myKMeans.predict(coordinates.to_numpy()) #Pro Coordinate/Sample 1 Cluster Number

In [13]:

def show_cluster(cluster_number, model_predict, centers):
    cluster_koordinate = coordinates.to_numpy()[cluster_number == model_predict]
    anzahl = len(cluster_koordinate)
    print("Anzahl Einträge cluster: ", anzahl)
    #draw the map of the given cluster number
    map_cluster = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    
    #Zentren der Cluster Nummer einzeichnen
    folium.PolyLine([[centers[cluster_number,0],centers[cluster_number,1]] , [centers[cluster_number,2],centers[cluster_number,3]]  ], color="black", weight=2.5, opacity=1).add_to(map_cluster)
    
    for i in range(anzahl):
        folium.CircleMarker([cluster_koordinate[i,0], cluster_koordinate[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(map_cluster)
        folium.CircleMarker([cluster_koordinate[i,2], cluster_koordinate[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(map_cluster)
    return map_cluster
   

In [14]:
show_cluster(5, model_predict, centers)

Anzahl Einträge cluster:  97


## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [26]:
def cluster_var(cluster_number, model_predict, k):
    var_coordinates = coordinates.to_numpy()
    print("Varianz der verschiedenen Cluster")
    for i in range(k):
        coor_var = var_coordinates[model_predict == i]
        variance = np.var(coor_var, axis = 0)
        print(i, variance)
        
    cluster_koordinate = coordinates.to_numpy()[cluster_number == model_predict]
    groesse_cluster = len(cluster_koordinate)
    print (groesse_cluster)
    print("Varianz innerhalb des Clusters: ", cluster_number )
    variance_cluster = np.var(cluster_koordinate, axis = 0)
    print(variance_cluster)
    for z in range(groesse_cluster):
        test = np.var(cluster_koordinate, axis = 0)
        print(z, test)

In [27]:
cluster_var(5,model_predict, 10)

Varianz der verschiedenen Cluster
0 [2.39164574e-05 2.89478962e-05 3.12692664e-05 2.14529046e-05]
1 [0.00028521 0.00026678 0.00018839 0.00016745]
2 [3.66245616e-05 1.00893707e-04 1.15498419e-03 6.65557935e-04]
3 [3.12812207e-05 3.10185609e-05 2.67812428e-05 2.87823705e-05]
4 [2.06141733e-04 8.35927333e-05 9.45838103e-05 1.59649449e-04]
5 [9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
6 [9.61861602e-05 1.36058581e-04 2.19970435e-04 1.27004202e-04]
7 [5.64940667e-05 2.92558748e-04 1.16857111e-03 9.67978230e-04]
8 [0.00034787 0.00016052 0.00011448 0.00015216]
9 [5.65456629e-05 4.26500238e-05 6.23943557e-05 7.19034226e-05]
97
Varianz innerhalb des Clusters:  5
[9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
0 [9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
1 [9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
2 [9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
3 [9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
4 

In [25]:
cluster_var(5,model_predict, 100)

Varianz außerhalb eines Cluster --> Extra Varianz
0 [2.39164574e-05 2.89478962e-05 3.12692664e-05 2.14529046e-05]
1 [0.00028521 0.00026678 0.00018839 0.00016745]
2 [3.66245616e-05 1.00893707e-04 1.15498419e-03 6.65557935e-04]
3 [3.12812207e-05 3.10185609e-05 2.67812428e-05 2.87823705e-05]
4 [2.06141733e-04 8.35927333e-05 9.45838103e-05 1.59649449e-04]
5 [9.79454088e-05 2.88233672e-03 7.75491726e-05 1.09113459e-03]
6 [9.61861602e-05 1.36058581e-04 2.19970435e-04 1.27004202e-04]
7 [5.64940667e-05 2.92558748e-04 1.16857111e-03 9.67978230e-04]
8 [0.00034787 0.00016052 0.00011448 0.00015216]
9 [5.65456629e-05 4.26500238e-05 6.23943557e-05 7.19034226e-05]
10 [3.40110400e-05 2.32280283e-05 2.67752219e-05 2.61639532e-05]
11 [0.00293911 0.00454012 0.00098063 0.00242336]
12 [2.86539827e-05 4.25356086e-05 2.88440706e-05 3.96471355e-05]
13 [0.0003905  0.0002348  0.00042526 0.00036466]
14 [1.35325111e-04 1.09714982e-04 8.41342046e-05 6.15284676e-05]
15 [6.26119370e-05 5.60902644e-05 4.33769250e-05 

Varianz innerhalb eines Clusters ist immer gleich!!