In [20]:
import pickle
import numpy as np
import pandas as pd
import math
from math import radians, cos, sin, asin, sqrt
import seaborn as sns

from geolib import geohash

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import DBSCAN

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)

In [4]:
# Load the raw training data; later cells read its 'geohash6' and 'demand' columns.
df = pd.read_csv('./../data_raw/training.csv')

In [18]:
# Helper functions

# Build a lookup from every distinct geohash in the data to its decoded point.
# Entries are inserted in ascending geohash order, so iterating the dict later
# yields locations in a stable, sorted sequence.
geohash_set = list(set(df["geohash6"]))
geohash_dict = {code: geohash.decode(code) for code in sorted(geohash_set)}

# Returns lat & lon coordinates from geohash_dict
def getLatLon(df, geohash_lookup=None):
    """
    Look up the decoded point of every row's geohash.

    Parameters
    ----------
    df : DataFrame with a 'geohash6' column.
    geohash_lookup : optional mapping from geohash string to an indexable point.
        Defaults to the notebook-level geohash_dict.

    Returns
    -------
    (a, b) : two lists in row order, holding element 0 and element 1 of each
        point (callers store them as 'lat' and 'lon' respectively).

    Raises
    ------
    KeyError if a geohash is missing from the lookup.
    """
    if geohash_lookup is None:
        geohash_lookup = geohash_dict

    lats = []
    lons = []
    # Iterate over the column values directly. The earlier iterrows() +
    # df.loc[index, "geohash6"] form was slow and, when index labels repeat,
    # .loc returns a Series instead of a scalar, which broke the lookup.
    for code in df["geohash6"]:
        point = geohash_lookup[str(code)]
        lats.append(point[0])
        lons.append(point[1])
    return lats, lons


def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance in kilometres between two points given as
    (latitude, longitude) pairs in decimal degrees.
    """
    earth_radius_km = 6371  # use 3956 for miles

    # work in radians
    lon1, lat1, lon2, lat2 = (radians(angle) for angle in (lon1, lat1, lon2, lat2))

    # haversine formula
    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1
    hav = sin(delta_lat/2)**2 + cos(lat1) * cos(lat2) * sin(delta_lon/2)**2
    central_angle = 2 * asin(sqrt(hav))
    return central_angle * earth_radius_km

# Join every unclustered point to nearest cluster
# geohash_dict should be sorted
def deriveClusterIDs(geohash_dict, clusterList):
    """
    Assign every point to its closest cluster representative.

    geohash_dict : mapping whose values are points indexable as [0], [1]; the
        result follows the mapping's iteration order, so pass it sorted.
    clusterList  : sequence of cluster representatives, indexable the same way.

    Returns a list with one entry per mapping value: the position in
    clusterList of the representative with the smallest haversine distance
    (the earliest representative wins ties).
    """
    assignments = []
    for point in geohash_dict.values():
        # start with cluster 0 as the provisional winner
        nearest = 0
        nearest_km = haversine(clusterList[0][0], clusterList[0][1], point[0], point[1])
        for idx in range(1, len(clusterList)):
            center = clusterList[idx]
            km = haversine(center[0], center[1], point[0], point[1])
            if km < nearest_km:
                nearest, nearest_km = idx, km
        assignments.append(nearest)
    return assignments

In [15]:
# Total demand per geohash, with the decoded coordinates attached.

df_groupby = df.groupby('geohash6').agg({'demand': 'sum'})
df_groupby = df_groupby.reset_index()
# Fixed: this call used to pass `df3`, a name that is never defined in the notebook.
# The coordinates must be derived from df_groupby itself so they align row-for-row.
df_groupby['lat'], df_groupby['lon'] = getLatLon(df_groupby)

### Demand-based Clustering

In [44]:
# clusterDict collects the cluster assignments produced by the clustering cell below.
# Each key names one clustering variant (e.g. "demand-3500" for demandCutoff = 3500);
# each value is a list of cluster IDs, one per geohash, in the order of the sorted geohash_dict.
# TODO: concat to featureset as additional feature columns

clusterDict = {} 

In [53]:
# Total demand a geohash must exceed to be included in the clustering in the next cell.
# To generate the 5 cluster variants, re-run this cell and the next one
# for each of: 2000, 2500, 2800, 3000, 3500

demandCutoff = 3500 

In [54]:
# DBSCAN might not show good clusters on the full grid since all coordinates are
# equidistant, so keep only the locations whose total demand exceeds demandCutoff.

total_demand_df = df_groupby[df_groupby['demand'] > demandCutoff]

kms_per_radian = 6371.0088
eps = 1.5 / kms_per_radian  # 1.5 km neighbourhood radius, in radians for the haversine metric
# .values replaces DataFrame.as_matrix, which is deprecated and removed in pandas 1.0
dbscan_coords = np.array(total_demand_df[['lat', 'lon']].values, dtype=float)

# the haversine metric expects [lat, lon] in radians
db = DBSCAN(eps=eps, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(dbscan_coords))

cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([dbscan_coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

# create clusterList
# and to visualize in plotly
clusterList = []
coords = { 'lat': [], 'lon':[], 'demand':[] }
for members in clusters:
    # the first member of each cluster stands in for the cluster (it is not a centroid)
    point = members[0]
    clusterList.append(point)
    coords['lat'].append(point[0])
    coords['lon'].append(point[1])

# fixed marker size used when plotting the cluster representatives
coords['demand'] = [10]*len(coords['lat'])

# Join every unclustered point to nearest cluster, stored under a key
# that records the cutoff used (e.g. "demand-3500").
# Fixed: the lookup is named geohash_dict; `geohashDict` was never defined.
keyName = "demand-{}".format(demandCutoff)
clusterDict[keyName] = deriveClusterIDs(geohash_dict, clusterList)

Number of clusters: 5



Method .as_matrix will be removed in a future version. Use .values instead.



### Visualize on 'map'

In [59]:
# Draw whatever is currently in `coords` on a geo scatter plot:
#   1. total demand across all locations, or
#   2. cluster representatives (what the clustering cell leaves in `coords`).
# To plot total demand instead, uncomment the next line and the "/ 50" scaling
# on the marker size below.
# coords = pd.concat([df_groupby['lat'], df_groupby['lon'], df_groupby['demand']], axis=1)

colors = ['rgb(239,243,255)','rgb(189,215,231)','rgb(107,174,214)','rgb(33,113,181)']

# One trace per colour, walking the palette from its last entry to its first
cases = []
for shade in reversed(colors):
    marker_style = dict(
        size = coords['demand'], # / 50, # uncomment if plotting total demand
        color = shade,
        opacity = .4,
        line = dict(width = 0)
    )
    cases.append(go.Scattergeo(
        lon = coords['lon'],
        lat = coords['lat'],
        marker = marker_style,
    ))

cases[0]['mode'] = 'markers'

# Map styling and the lon/lat window to display
geo_settings = dict(
    scope = 'asia',
    showframe = True,
    showcoastlines = True,
    showland = True,
    landcolor = "rgb(229, 229, 229)",
    countrycolor = "rgb(255, 255, 255)" ,
    coastlinecolor = "rgb(255, 255, 255)",
    projection=dict( type = 'natural earth'),
    lonaxis = dict( range= [ 90.5877685546875, 90.9722900390625 ] ),
    lataxis = dict( range= [ -5.48492431640625, -5.23773193359375 ] ),
)

layout = go.Layout(
    title = 'Total Traffic Demand across time period',
    geo = geo_settings,
    legend = dict(traceorder = 'reversed'),
)


fig = go.Figure(layout=layout, data=cases)
plotly.offline.iplot(fig, validate=False, filename='iantest')

### Save cluster IDs and geo features

In [58]:
# Attach the geohash keys (same order as the cluster-ID lists) and their coordinates
clusterDict['geohash6'] = list(geohash_dict.keys())
cluster_df = pd.DataFrame(clusterDict)
lat_values, lon_values = getLatLon(cluster_df)
cluster_df['lat'] = lat_values
cluster_df['lon'] = lon_values

# Label-encode each coordinate column (lat first, then lon, reusing one encoder)
le = LabelEncoder()
for axis in ('lat', 'lon'):
    cluster_df[axis] = le.fit_transform(cluster_df[axis].values)

# Unique id per location: number the grid along one axis first, then the other
max_lon = cluster_df['lon'].max() + 1 # 35 + 1
max_lat = cluster_df['lat'].max() + 1 # 45 + 1
cluster_df['grid_id_by_lat'] = cluster_df['lon']*max_lat + cluster_df['lat']
cluster_df['grid_id_by_lon'] = cluster_df['lat']*max_lon + cluster_df['lon']

# Round-trip through the index so geohash6 becomes the first column, then save as csv
cluster_df = cluster_df.set_index('geohash6').reset_index()
cluster_df.to_csv("./../data_temp/cluster_df.csv", index=False)