# Rideshare data

This notebook reproduces the clustering results on the Rideshare data from the paper [Stable and consistent density-based clustering](https://arxiv.org/abs/2005.09048).

In [1]:
import persistable
import numpy as np
import pandas as pd
import json
import plotly.express as px
from plotly.offline import plot

## Load data

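This file and other Uber datasets are available from [FiveThirtyEight](https://github.com/fivethirtyeight/uber-tlc-foil-response). The cell below reads the file from the relative path `uber-tlc-foil-response-master/uber-trip-data/`, so the repository must be unpacked under that name in the working directory.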

In [2]:
# Path to the April 2014 Uber pickups CSV, relative to the working directory.
file_name = 'uber-tlc-foil-response-master/uber-trip-data/uber-raw-data-apr14.csv'

# Keep only the pickup coordinates. Selecting the columns by name (instead of
# by position) lets pandas skip parsing the unused columns and guarantees the
# 'Lat'/'Lon' column names that the plotting cells rely on. The explicit
# re-indexing pins the column order to (Lat, Lon), which the positional
# filtering around Laguardia depends on.
rideshare_data = pd.read_csv(file_name, usecols=['Lat', 'Lon'])[['Lat', 'Lon']]

## Cluster data near Laguardia airport

As in the paper, we begin by examining a piece of the data, centered at Laguardia airport.

In [3]:
# coordinates (latitude, longitude) of Laguardia airport from openstreetmap
laguardia = [40.76933, -73.86738]

# half the side length, in degrees, of the square window around the airport
x = 0.01

# first column is compared against the latitude, second against the longitude
lat = rideshare_data.iloc[:, 0]
lon = rideshare_data.iloc[:, 1]

# strict inequalities: points lying exactly on the border are excluded
condition = ((lat > laguardia[0] - x) & (lat < laguardia[0] + x) &
             (lon > laguardia[1] - x) & (lon < laguardia[1] + x))

# take all data points in the square centered at Laguardia
laguardia_data = rideshare_data[condition]

### Optional: plot data

In [5]:
# optional: show the raw pickups around Laguardia on an OpenStreetMap
# background using plotly.express; the figure is written to an HTML file
# and opened automatically
fig = px.scatter_mapbox(
    laguardia_data,
    lat='Lat',
    lon='Lon',
    opacity=0.5,
    zoom=14,
    mapbox_style='open-street-map',
)
plot(fig, auto_open=True)

'temp-plot.html'

## Reproduce clusterings from the paper

In [4]:
# build the Persistable object from the Laguardia points, passed as a plain array
laguardia_points = np.asarray(laguardia_data)
p = persistable.Persistable(laguardia_points, n_neighbors=100)

In [6]:
# reproduce the result with 8 clusters

# 'start' and 'end' are presumably the endpoints of the line chosen in the
# interactive mode (TODO confirm against the persistable documentation)
params = dict(
    n_clusters=8,
    start=[0.00011, 0.003890201862691098],
    end=[0.0007106512662005413, 0.0003693533211186369],
)

cluster_labels = p.cluster(
    params['n_clusters'],
    params['start'],
    params['end'],
)

In [12]:
# plot clustering of the Laguardia data using plotly.express

# attach the labels as a string column so that plotly treats them as
# discrete categories rather than a continuous color scale
laguardia_data_clustered = (
    laguardia_data
    .assign(cluster=cluster_labels)
    .astype({'cluster': 'str'})
)

# pick color scheme
n_labels = int(np.amax(cluster_labels)) + 1
labels = [str(i) for i in range(-1, n_labels)]
category_orders = {'cluster': labels}
cluster_colors = px.colors.qualitative.Vivid[:-1]  # skip the last color in Vivid since it's grey

# Pin every label to its color explicitly. A color *sequence* is handed out
# in order to the labels actually present in the data, so when there are no
# noise points (no '-1' label) the first cluster would receive the noise grey
# and all cluster colors would shift by one.
color_map = {'-1': 'rgb(105, 105, 105)'}  # dark grey for noise points
color_map.update(
    {str(i): cluster_colors[i % len(cluster_colors)] for i in range(n_labels)}
)

fig = px.scatter_mapbox(laguardia_data_clustered, lat='Lat', lon='Lon',
                        color='cluster', category_orders=category_orders,
                        color_discrete_map=color_map,
                        zoom=14, mapbox_style='open-street-map')
plot(fig, auto_open=True)

'temp-plot.html'

In [8]:
# reproduce the result with 4 clusters

# same 'start'/'end' values as for the 8-cluster result; only the requested
# number of clusters differs
params = dict(
    n_clusters=4,
    start=[0.00011, 0.003890201862691098],
    end=[0.0007106512662005413, 0.0003693533211186369],
)

cluster_labels = p.cluster(
    params['n_clusters'],
    params['start'],
    params['end'],
)

## Reproduce the instance of the Persistable interactive mode from the paper

### Basic usage of the Persistable interactive mode

- Run the cell below to open the graphical user interface.
- To see the Component Counting Function, click "Compute".
- To see the Prominence Vineyard, in the box "Interactive inputs selection", choose "Family of lines". Now, one sees the two chosen lines that determine the Prominence Vineyard. Next, click "Compute" under "Prominence Vineyard".
- To get a clustering, in the box "Parameter selection", choose "On".
- To re-create the clusterings in the paper, select Line number 37, and Gap number 4 or 8. Then click "Choose parameter".
- To get the labels for this clustering, run the cell below the graphical user interface.

In [10]:
# To reproduce the instance of the Persistable interactive mode from the
# paper, read the saved state dictionary and hand it to start_ui.
with open('laguardia_state.json', 'r') as state_file:
    state = json.loads(state_file.read())

# start UI
pi = persistable.PersistableInteractive(p)
port = pi.start_ui(ui_state=state, jupyter_mode='inline')

In [11]:
# get clustering with parameters chosen via the interactive mode
# (run this after picking a line and gap and clicking "Choose parameter"
# in the graphical user interface)
cluster_labels = pi.cluster()

#### Optional: cluster with hdbscan

In [None]:
import hdbscan

In [None]:
# fit HDBSCAN on the Laguardia points and keep the resulting labels
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=50).fit(laguardia_data)
cluster_labels = clusterer.labels_

## Cluster the whole dataset



### Optional: plot data

In [13]:
# optional: show all pickups on an OpenStreetMap background using
# plotly.express; the low opacity keeps dense areas readable
fig = px.scatter_mapbox(
    rideshare_data,
    lat='Lat',
    lon='Lon',
    opacity=0.1,
    zoom=9,
    mapbox_style='open-street-map',
)
plot(fig, auto_open=True)

'temp-plot.html'

## Reproduce clusterings from the paper

In [13]:
# build the Persistable object for the whole dataset,
# using a subsample of 30 000 points to speed up computation
rideshare_points = np.asarray(rideshare_data)
p_rideshare = persistable.Persistable(
    rideshare_points,
    subsample=30000,
    n_neighbors=200,
)

In [14]:
# reproduce the clustering of the rideshare data from the paper,
# with persistence-based flattening

params = dict(
    n_clusters=10,
    start=[0, 0.0038],
    end=[0.005, 0],
)

cluster_labels = p_rideshare.cluster(
    params['n_clusters'],
    params['start'],
    params['end'],
)

In [16]:
# reproduce the clustering of the rideshare data from the paper,
# with exhaustive persistence-based flattening

params = dict(
    n_clusters=10,
    start=[0, 0.0038],
    end=[0.005, 0],
)

# same parameters as the previous clustering; only the flattening mode differs
cluster_labels = p_rideshare.cluster(
    params['n_clusters'],
    params['start'],
    params['end'],
    flattening_mode='exhaustive',
)

In [17]:
# plot clustering of the rideshare data using plotly.express

# attach the labels as a string column so that plotly treats them as
# discrete categories rather than a continuous color scale
rideshare_data_clustered = (
    rideshare_data
    .assign(cluster=cluster_labels)
    .astype({'cluster': 'str'})
)

# pick color scheme
n_labels = int(np.amax(cluster_labels)) + 1
labels = [str(i) for i in range(-1, n_labels)]
category_orders = {'cluster': labels}
cluster_colors = px.colors.qualitative.Vivid[:-1]  # skip the last color in Vivid since it's grey

# Pin every label to its color explicitly. A color *sequence* is handed out
# in order to the labels actually present in the data, so when there are no
# noise points (no '-1' label) the first cluster would receive the noise grey
# and all cluster colors would shift by one.
color_map = {'-1': 'rgb(105, 105, 105)'}  # dark grey for noise points
color_map.update(
    {str(i): cluster_colors[i % len(cluster_colors)] for i in range(n_labels)}
)

fig = px.scatter_mapbox(rideshare_data_clustered, lat='Lat', lon='Lon',
                        color='cluster', category_orders=category_orders,
                        color_discrete_map=color_map,
                        zoom=9, mapbox_style='open-street-map')
plot(fig, auto_open=True)

'temp-plot.html'

In [18]:
# reproduce the finer clustering of the rideshare data mentioned in the paper

params = dict(
    n_clusters=27,
    start=[0.0, 0.0034508474576271184],
    end=[0.003254237288135593, 0.0],
)

cluster_labels = p_rideshare.cluster(
    params['n_clusters'],
    params['start'],
    params['end'],
)

## Reproduce the instance of the Persistable interactive mode from the paper

### Basic usage of the Persistable interactive mode

- Run the cell below to open the graphical user interface.
- To see the Component Counting Function, click "Compute".
- To see the Prominence Vineyard, in the box "Interactive inputs selection", choose "Family of lines". Now, one sees the two chosen lines that determine the Prominence Vineyard. Next, click "Compute" under "Prominence Vineyard".
- To get a clustering, in the box "Parameter selection", choose "On".
- To re-create the clusterings in the paper, select Line number 40, and Gap number 10. Then click "Choose parameter". To get the finer result, select Line number 6 and Gap number 27, then click "Choose parameter" again.
- To get the labels for this clustering, run the cell below the graphical user interface.

In [18]:
# To reproduce the instance of the Persistable interactive mode from the
# paper, read the saved state dictionary and hand it to start_ui.
with open('rideshare_state.json', 'r') as state_file:
    state = json.loads(state_file.read())

# start UI
pi_rideshare = persistable.PersistableInteractive(p_rideshare)
port = pi_rideshare.start_ui(ui_state=state, jupyter_mode='inline')

In [19]:
# get clustering with parameters chosen via the interactive mode
# (run this after picking a line and gap and clicking "Choose parameter"
# in the graphical user interface)
cluster_labels = pi_rideshare.cluster()