## Modeling Both Clean Bus & Train Avg Datasets

### Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, accuracy_score, homogeneity_score, completeness_score, v_measure_score
from sklearn.cluster import DBSCAN 

import time

### Reading in Average Ridership for Buses and Trains

In [2]:
# Load the bus and train average-ridership CSVs into two DataFrames.
bus, train = (
    pd.read_csv('../brianna_folder/datasets/clean_bus_avg_ridership.csv'),
    pd.read_csv('../brianna_folder/datasets/clean_train_avg_ridership.csv'),
)

In [3]:
# Preview the first rows of the raw bus table (one row per route and month).
bus.head()

Unnamed: 0,route_id,route_name,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,month_total
0,1,Indiana/Hyde Park,2001-01-01,6982.6,0.0,0.0,153617
1,2,Hyde Park Express,2001-01-01,1000.0,0.0,0.0,22001
2,3,King Drive,2001-01-01,21406.5,13210.7,8725.3,567413
3,4,Cottage Grove,2001-01-01,22432.2,17994.0,10662.2,618796
4,6,Jackson Park Express,2001-01-01,18443.0,13088.2,7165.6,493926


In [4]:
# Preview the first rows of the raw train table (one row per station and month).
train.head()

Unnamed: 0,station_id,station_name,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,month_total
0,40900,Howard,2001-01-01,6233.9,3814.5,2408.6,164447
1,41190,Jarvis,2001-01-01,1489.1,1054.0,718.0,40567
2,40100,Morse,2001-01-01,4412.5,3064.5,2087.8,119772
3,41300,Loyola,2001-01-01,4664.5,3156.0,1952.8,125008
4,40760,Granville,2001-01-01,3109.8,2126.0,1453.8,84189


In [5]:
# Move month_beginning out of the columns and use it as the row index.
bus = bus.set_index('month_beginning')

In [6]:
# Confirm that month_beginning is now the index of the bus table.
bus.head()

Unnamed: 0_level_0,route_id,route_name,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,month_total
month_beginning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-01,1,Indiana/Hyde Park,6982.6,0.0,0.0,153617
2001-01-01,2,Hyde Park Express,1000.0,0.0,0.0,22001
2001-01-01,3,King Drive,21406.5,13210.7,8725.3,567413
2001-01-01,4,Cottage Grove,22432.2,17994.0,10662.2,618796
2001-01-01,6,Jackson Park Express,18443.0,13088.2,7165.6,493926


In [7]:
# Move month_beginning out of the columns and use it as the row index.
train = train.set_index('month_beginning')

In [8]:
# Confirm that month_beginning is now the index of the train table.
train.head()

Unnamed: 0_level_0,station_id,station_name,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,month_total
month_beginning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-01,40900,Howard,6233.9,3814.5,2408.6,164447
2001-01-01,41190,Jarvis,1489.1,1054.0,718.0,40567
2001-01-01,40100,Morse,4412.5,3064.5,2087.8,119772
2001-01-01,41300,Loyola,4664.5,3156.0,1952.8,125008
2001-01-01,40760,Granville,3109.8,2126.0,1453.8,84189


#### Handling Categorical Variables

In [9]:
# One-hot encode the route identifier and route name columns.
bus = bus.pipe(pd.get_dummies, columns=['route_id', 'route_name'])

In [10]:
# Number of distinct column labels after one-hot encoding.
bus.columns.nunique()

380

In [11]:
# Preview the bus table after one-hot encoding route_id and route_name.
bus.head()

Unnamed: 0_level_0,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,month_total,route_id_1,route_id_10,route_id_100,route_id_1001,route_id_1002,route_id_103,...,route_name_West 65th,route_name_West 95th,route_name_West Cermak,route_name_West Lawrence,route_name_West Loop/South Loop,route_name_Westchester,route_name_Western,route_name_Western Express,route_name_Wilson/Michigan Express,route_name_Wrigley Field Express
month_beginning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01-01,6982.6,0.0,0.0,153617,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2001-01-01,1000.0,0.0,0.0,22001,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2001-01-01,21406.5,13210.7,8725.3,567413,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2001-01-01,22432.2,17994.0,10662.2,618796,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2001-01-01,18443.0,13088.2,7165.6,493926,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Persist the one-hot encoded bus table; the month_beginning index is written too.
bus.to_csv(path_or_buf='./avg_bus_dummified.csv', index=True)

In [13]:
# One-hot encode the station identifier and station name columns.
train = train.pipe(pd.get_dummies, columns=['station_id', 'station_name'])

In [14]:
# Persist the one-hot encoded train table; the month_beginning index is written too.
train.to_csv(path_or_buf='./avg_train_dummified.csv', index=True)

## DBSCAN

### Implementing the DBSCAN Parameter-Search Function

In [15]:
def best_dbscan(df, cluster_list, min_samp_list, plot_vars=None):
    """Grid-search DBSCAN and keep the setting with the best silhouette score.

    Every column of ``df`` (except a ``'cluster'`` column left by an earlier
    call) is standardised with ``StandardScaler``. DBSCAN is then fitted once
    for each ``(eps, min_samples)`` pair and scored with ``silhouette_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is modified in place: the labels of the best
        clustering are stored in ``df['cluster']``.
    cluster_list : iterable of float
        Candidate values for DBSCAN's ``eps``. Despite the parameter name,
        these are neighbourhood radii, not cluster counts.
    min_samp_list : iterable of int
        Candidate values for DBSCAN's ``min_samples``.
    plot_vars : list of str, optional
        Columns to draw in the seaborn pair plot. ``None`` (the default) lets
        seaborn pick the columns itself, which is the original behaviour; pass
        a short list when ``df`` is wide.

    Returns
    -------
    tuple
        ``(eps, min_samples, silhouette_score)`` of the best combination.

    Raises
    ------
    ValueError
        If no combination yields a clustering that ``silhouette_score`` can
        evaluate (or if either candidate list is empty).
    """
    # Exclude labels written by a previous call so that re-running the search
    # does not feed the old clustering back in as a feature.
    features = df.drop(columns='cluster', errors='ignore')
    X_sc = StandardScaler().fit_transform(features)

    best_param = None
    best_labels = None

    for eps in cluster_list:
        for min_samples in min_samp_list:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit(X_sc).labels_

            try:
                sil_score = silhouette_score(X_sc, labels)
            except ValueError:
                # The labelling cannot be scored; skip this combination.
                continue

            # Strict comparison keeps the first combination on ties.
            if best_param is None or sil_score > best_param[2]:
                best_param = (eps, min_samples, sil_score)
                best_labels = labels

    if best_param is None:
        raise ValueError(
            'No (eps, min_samples) combination produced a clustering that '
            'silhouette_score could evaluate; widen the search grid.'
        )

    # Reuse the labels from the search instead of fitting the best model again.
    df['cluster'] = best_labels
    sns.pairplot(df, hue='cluster', vars=plot_vars)

    print(f'Best eps (neighborhood radius): {best_param[0]}')
    print(f'Best # of minimum samples : {best_param[1]}')
    print(f'Best silhouette score: {best_param[2]}')

    return best_param

## DBSCAN on Average Bus Ridership

In [None]:
best_dbscan(bus, [.5, 1, 2, 3, 4, 5], [2, 3, 4, 5, 7, 10])
%%time

### Plotting Bus Clusters

In [None]:
# Build a boolean mask with one entry per sample: True for DBSCAN core samples.
# NOTE(review): `labels` and `db` are not assigned anywhere in the visible
# notebook (best_dbscan only returns its parameter tuple) - presumably they
# are a fitted DBSCAN and its labels_; define them before running this cell.
core_samples = np.zeros_like(labels, dtype = bool)  
core_samples[db.core_sample_indices_] = True 
print(core_samples)

In [None]:
# Scatter the first two columns of X_sc, one colour per cluster label.
# NOTE(review): `labels`, `core_samples` and `X_sc` must exist at notebook
# level; the visible cells never define `labels` or `X_sc`. The .loc/.iloc
# calls below also require X_sc to be a pandas object - confirm.
unique_labels = np.unique(labels)
# One Spectral colormap entry per distinct label.
colors = plt.cm.Spectral(np.linspace(0,1, len(unique_labels)))

for (label, color) in zip(unique_labels, colors):
    # Boolean mask of the samples assigned to this label.
    class_member_mask = (labels == label)
    # Core samples of this label: larger markers.
    n = X_sc.loc[class_member_mask & core_samples, :]
    plt.plot(n.iloc[:,0],n.iloc[:,1], 'o', markerfacecolor = color, markersize = 10)
    
    # Remaining (non-core) samples of this label: smaller markers.
    n = X_sc.loc[class_member_mask & ~core_samples, :]
    plt.plot(n.iloc[:,0],n.iloc[:,1], 'o', markerfacecolor = color, markersize = 5)

### Evaluating Scores

In [None]:
# NOTE(review): `y` and `df_scan` are not defined at notebook level in the
# visible cells (`df_scan` is only a local inside best_dbscan) - confirm
# where the reference labels and the fitted model come from.
print(homogeneity_score(y, df_scan.labels_))

In [None]:
# NOTE(review): `y` and `df_scan` are not defined at notebook level in the
# visible cells (`df_scan` is only a local inside best_dbscan) - confirm
# where the reference labels and the fitted model come from.
print(completeness_score(y, df_scan.labels_))

In [None]:
# NOTE(review): `y` and `df_scan` are not defined at notebook level in the
# visible cells (`df_scan` is only a local inside best_dbscan) - confirm
# where the reference labels and the fitted model come from.
print(v_measure_score(y, df_scan.labels_))

## DBSCAN on Average Train Ridership

In [None]:
best_dbscan(train, [.5, 1, 2, 3, 4, 5, 7, 10, 15], [1, 2, 3, 4, 5, 9, 15])
%%time

### Plotting Train Clusters

In [None]:
# Build a boolean mask with one entry per sample: True for DBSCAN core samples.
# NOTE(review): `labels` and `db` are not assigned anywhere in the visible
# notebook (best_dbscan only returns its parameter tuple) - presumably they
# are the train model and its labels_; define them before running this cell.
core_samples = np.zeros_like(labels, dtype = bool)  
core_samples[db.core_sample_indices_] = True 
print(core_samples)

In [None]:
# Scatter the first two columns of X_sc, one colour per cluster label.
# NOTE(review): `labels`, `core_samples` and `X_sc` must exist at notebook
# level; the visible cells never define `labels` or `X_sc`. The .loc/.iloc
# calls below also require X_sc to be a pandas object - confirm.
unique_labels = np.unique(labels)
# One Spectral colormap entry per distinct label.
colors = plt.cm.Spectral(np.linspace(0,1, len(unique_labels)))

for (label, color) in zip(unique_labels, colors):
    # Boolean mask of the samples assigned to this label.
    class_member_mask = (labels == label)
    # Core samples of this label: larger markers.
    n = X_sc.loc[class_member_mask & core_samples, :]
    plt.plot(n.iloc[:,0],n.iloc[:,1], 'o', markerfacecolor = color, markersize = 10)
    
    # Remaining (non-core) samples of this label: smaller markers.
    n = X_sc.loc[class_member_mask & ~core_samples, :]
    plt.plot(n.iloc[:,0],n.iloc[:,1], 'o', markerfacecolor = color, markersize = 5)

### Evaluating Scores

In [None]:
# NOTE(review): `y` and `df_scan` are not defined at notebook level in the
# visible cells (`df_scan` is only a local inside best_dbscan) - confirm
# where the reference labels and the fitted model come from.
print(homogeneity_score(y, df_scan.labels_))

In [None]:
# NOTE(review): `y` and `df_scan` are not defined at notebook level in the
# visible cells (`df_scan` is only a local inside best_dbscan) - confirm
# where the reference labels and the fitted model come from.
print(completeness_score(y, df_scan.labels_))

In [None]:
# NOTE(review): `y` and `df_scan` are not defined at notebook level in the
# visible cells (`df_scan` is only a local inside best_dbscan) - confirm
# where the reference labels and the fitted model come from.
print(v_measure_score(y, df_scan.labels_))