In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score



import time

### Reading in Dummified CSVs

In [2]:
# Load the pre-dummified monthly ridership tables, one per transit mode.
bus, train = (
    pd.read_csv(csv_path)
    for csv_path in ('./avg_bus_dummified.csv', './avg_train_dummified.csv')
)

In [3]:
# Preview the first five bus rows to check the dummified column layout.
bus.head(n=5)

Unnamed: 0,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,month_total,route_id_1,route_id_10,route_id_100,route_id_1001,route_id_1002,...,route_name_West 65th,route_name_West 95th,route_name_West Cermak,route_name_West Lawrence,route_name_West Loop/South Loop,route_name_Westchester,route_name_Western,route_name_Western Express,route_name_Wilson/Michigan Express,route_name_Wrigley Field Express
0,2001-01-01,6982.6,0.0,0.0,153617,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2001-01-01,1000.0,0.0,0.0,22001,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001-01-01,21406.5,13210.7,8725.3,567413,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2001-01-01,22432.2,17994.0,10662.2,618796,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2001-01-01,18443.0,13088.2,7165.6,493926,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Keep only the year part of each 'YYYY-MM-DD' date string (still a string at this point).
bus['month_beginning'] = bus['month_beginning'].map(lambda date_str: date_str.split('-')[0])

In [5]:
# Cast the year strings to integers so the column can be used as a numeric feature.
bus['month_beginning'] = bus['month_beginning'].map(int)

In [6]:
# Preview the first five train rows to check the dummified column layout.
train.head(n=5)

Unnamed: 0,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,month_total,station_id_40010,station_id_40020,station_id_40030,station_id_40040,station_id_40050,...,station_name_Washington/State,station_name_Washington/Wabash,station_name_Washington/Wells,station_name_Wellington,station_name_Western-Brown,station_name_Western-Cermak,station_name_Western-Forest Park,station_name_Western-Orange,station_name_Western/Milwaukee,station_name_Wilson
0,2001-01-01,6233.9,3814.5,2408.6,164447,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2001-01-01,1489.1,1054.0,718.0,40567,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001-01-01,4412.5,3064.5,2087.8,119772,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2001-01-01,4664.5,3156.0,1952.8,125008,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2001-01-01,3109.8,2126.0,1453.8,84189,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Keep only the year part of each 'YYYY-MM-DD' date string (still a string at this point).
train['month_beginning'] = train['month_beginning'].map(lambda date_str: date_str.split('-')[0])

In [8]:
# Cast the year strings to integers so the column can be used as a numeric feature.
train['month_beginning'] = train['month_beginning'].map(int)

In [9]:
# Feature matrix for the bus clustering.
# - `month_total` is excluded, as in the original analysis.
# - `Unnamed: 0` is also excluded: in the preview it is just the running row
#   number (0, 1, 2, ...) that was saved into the CSV, and leaving it in would
#   let row order influence the distance-based clustering.
#   `errors='ignore'` keeps this cell working if the CSV is later re-exported
#   without that column.
X_bus = (
    bus
    .drop(columns='month_total')
    .drop(columns='Unnamed: 0', errors='ignore')
)

In [10]:
# Feature matrix for the train clustering.
# - `month_total` is excluded, as in the original analysis.
# - `Unnamed: 0` is also excluded: in the preview it is just the running row
#   number (0, 1, 2, ...) that was saved into the CSV, and leaving it in would
#   let row order influence the distance-based clustering.
#   `errors='ignore'` keeps this cell working if the CSV is later re-exported
#   without that column.
X_tr = (
    train
    .drop(columns='month_total')
    .drop(columns='Unnamed: 0', errors='ignore')
)

## KMeans on Bus

In [11]:
# Shared score table: cluster() appends one row per (k, preprocessing) run.
results = pd.DataFrame(columns = ['k','silhouette','type'])

def cluster(clusters, data, type, random_state=42):
    """Fit KMeans for every candidate k and log its silhouette score.

    Parameters
    ----------
    clusters : iterable of int
        Candidate values for ``n_clusters``.
    data : array-like
        Feature matrix handed to ``KMeans.fit`` and ``silhouette_score``.
    type : str
        Label describing how ``data`` was preprocessed; written to the
        ``type`` column of ``results``.
    random_state : int or None, default 42
        Seed forwarded to ``KMeans`` so that repeated runs produce the same
        labels and therefore the same scores. Pass ``None`` to get the
        previous unseeded behaviour.

    Returns
    -------
    None
        Rows are appended in place to the module-level ``results`` frame.
    """
    for k in clusters:
        km = KMeans(n_clusters=k, random_state=random_state)
        km.fit(data)
        score = silhouette_score(data, km.labels_)
        # k is stored as a string, matching the existing table format.
        results.loc[len(results)] = [str(k), score, type]

In [12]:
def best_cluster(clusters, data):
    """Score every candidate k under four preprocessings and return the best run.

    The data is clustered as-is ('default'), after row-wise ``normalize``
    ('normalized'), after ``StandardScaler`` ('standard_scaler') and after
    ``MinMaxScaler`` ('min_max_scaler'). Every run is appended to the shared
    ``results`` table by ``cluster``.

    Parameters
    ----------
    clusters : iterable of int
        Candidate values for ``n_clusters``.
    data : array-like
        Feature matrix to cluster.

    Returns
    -------
    pandas.Series
        The ``results`` row (``k``, ``silhouette``, ``type``) with the highest
        silhouette score among the rows added by *this* call.

    Raises
    ------
    ValueError
        If no rows were produced (e.g. ``clusters`` is empty).
    """
    # `results` is shared across calls, so remember where this call's rows
    # begin. Otherwise a later dataset could "win" with a row that was scored
    # on an earlier dataset (or left over from re-running the cell).
    first_new_row = len(results)

    cluster(clusters, data, 'default')
    cluster(clusters, normalize(data), 'normalized')
    cluster(clusters, StandardScaler().fit_transform(data), 'standard_scaler')
    cluster(clusters, MinMaxScaler().fit_transform(data), 'min_max_scaler')

    scored = results.iloc[first_new_row:]
    if scored.empty:
        raise ValueError('best_cluster needs at least one candidate in `clusters`')

    # Cast defensively: a frame grown row by row can end up with object dtype.
    best_label = scored['silhouette'].astype(float).idxmax()
    return scored.loc[best_label]

In [1]:
# Skipping k=2 and k=3: such small k typically gets the highest silhouette score, but that coarse a split oversimplifies the data.

In [13]:
# Candidate cluster counts for the bus data: k = 4 through 20.
clusters = [k for k in range(4, 21)]
best_cluster(clusters, X_bus)

k                    4
silhouette    0.583419
type           default
Name: 0, dtype: object

In [None]:
# Refit with the winning configuration reported by the search (k=4 on the
# unscaled features). The seed is fixed so the labels are repeatable.
km_bus = KMeans(n_clusters=4, random_state=42)
km_bus.fit(X_bus)
labels_bus = km_bus.labels_

# Attach the labels to a *new* frame. Plain assignment (`X_bus_best = X_bus`)
# only aliases X_bus, so adding the column would also change the feature
# matrix, and re-running this cell would then cluster on the old labels.
X_bus_best = X_bus.assign(clusters=labels_bus)

# Plot only the continuous columns. The frame also holds a 0/1 dummy column
# per route, and a pairplot over all of them would be an enormous grid.
bus_plot_vars = [
    'month_beginning',
    'avg_weekday_rides',
    'avg_saturday_rides',
    'avg_sunday-holiday_rides',
]
sns.pairplot(X_bus_best, vars=bus_plot_vars, hue='clusters');

## KMeans on Train

In [None]:
clusters = list(range(2, 21))
best_cluster(clusters, X_tr)

In [None]:
km_train = KMeans(n_clusters=?)
km_train.fit(X_tr)
labels_train = km_train.labels_
    
X_tr_best = X_tr

X_tr_best['clusters']=labels_train
sns.pairplot(X_tr, hue='clusters');