# Clustering & Eval

Which clustering solution — including the number of clusters and the algorithm used — is best for the marathon data?

Once an acceptable solution is found, write a data story, including visualizations, where you teach the reader something about the marathon based on the clusters.

In [1]:
# Import modules.
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics

# Clustering module imports.
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation
from itertools import cycle

# Aesthetics.
%matplotlib inline
sns.set_style('darkgrid')

In [2]:
# Read the marathon results CSV into a DataFrame.
raw_data = pd.read_csv('~/src/data/unit4/boston-results-2013.csv')

# `df` is a second name for the same frame (no copy is made).
df = raw_data

# Report dimensions and column labels, then preview the first rows.
for summary in (df.shape, df.columns):
    print(summary)
df.head()

(16164, 21)
Index(['25k', 'age', 'name', 'division', '10k', 'gender', 'half', 'official',
       'bib', 'ctz', 'country', 'overall', 'pace', 'state', '30k', '5k',
       'genderdiv', '20k', '35k', 'city', '40k'],
      dtype='object')


Unnamed: 0,25k,age,name,division,10k,gender,half,official,bib,ctz,...,overall,pace,state,30k,5k,genderdiv,20k,35k,city,40k
0,49.87,28,"Cassidy, Josh R.",9,18.18,M,40.93,90.9,W1,,...,9,3.47,ON,62.07,8.9,9,38.8,74.73,Toronto,85.55
1,77.27,30,"Korir, Wesley",5,30.9,M,64.9,132.5,1,,...,5,5.07,,92.97,15.9,5,61.52,108.78,Kenya,124.77
2,77.23,23,"Desisa, Lelisa",1,30.9,M,64.92,130.37,2,,...,1,4.98,,92.72,15.93,1,61.53,108.68,Ambo,123.78
3,50.5,32,"Fearnley, Kurt H.",5,18.73,M,42.0,88.43,W2,,...,5,3.38,,61.35,8.98,5,39.88,73.0,Hamilton,83.43
4,48.75,39,"Hokinoue, Kota",3,18.18,M,40.57,87.22,W3,,...,3,3.33,,59.92,8.92,3,38.55,71.68,Iizuka,81.88


# EDA

In [3]:
# Inspect column dtypes. The split-time columns (5k, 10k, ..., 40k, half) load
# as `object` rather than float, so the numeric-dtype selection further down
# does not pick them up.
df.dtypes

25k           object
age            int64
name          object
division       int64
10k           object
gender        object
half          object
official     float64
bib           object
ctz           object
country       object
overall        int64
pace         float64
state         object
30k           object
5k            object
genderdiv      int64
20k           object
35k           object
city          object
40k           object
dtype: object

In [4]:
# Countplot.
#g = sns.countplot(x='gender', data=df)
#plt.show()

# Countplot.
#g = sns.countplot(x='age', data=df)
#plt.show()

# boxplot.
#g = sns.boxplot(df['gender'], df['age'])
#plt.show()

# jointplot.
#g = sns.jointplot(df['age'], df['official'], kind='kde', height=7, space=0)
#plt.show()

# Settle on .....

In [26]:
# Keep only the int64 and float64 columns in a separate DataFrame.
numeric_dtypes = ['int64', 'float64']
numeric_df = df.select_dtypes(include=numeric_dtypes)

# Report dimensions and column labels, then preview the first rows.
for summary in (numeric_df.shape, numeric_df.columns):
    print(summary)
numeric_df.head()

(16164, 6)
Index(['age', 'division', 'official', 'overall', 'pace', 'genderdiv'], dtype='object')


Unnamed: 0,age,division,official,overall,pace,genderdiv
0,28,9,90.9,9,3.47,9
1,30,5,132.5,5,5.07,5
2,23,1,130.37,1,4.98,1
3,32,5,88.43,5,3.38,5
4,39,3,87.22,3,3.33,3


In [27]:
# Check data for Gaussian distributions.
def plotHist(df):
    """Draw one histogram per column of ``df`` to eyeball each distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. Every column is handed to
        ``Axes.hist``, so the columns are expected to be numeric.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()`` and then closed.
    """
    for col in df.columns:
        fig, ax = plt.subplots()
        # Drop missing values first: histogram binning cannot derive a
        # finite range from NaN.
        ax.hist(df[col].dropna())
        # Pass the label itself rather than a one-element list, so the
        # title reads "age" instead of "['age']".
        ax.set_title(col)
        ax.set_xlabel(col)
        ax.set_ylabel('count')
        plt.show()
        # Release the figure so looping over many columns does not pile up
        # open figures.
        plt.close(fig)
        
#plotHist(numeric_df)

In [28]:
# Second name for the numeric frame, used for modelling (no copy is made).
model_df = numeric_df

# Break into a set of features and a variable for the known outcome: "overall".
# The outcome column is removed by name rather than by position, so the split
# does not depend on the order of the columns in model_df.
X = model_df.drop(columns=['overall'])
Y = model_df['overall']

In [29]:
# Normalize the data: scale each row of X to unit L2 norm.
# `normalize` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): this rescales per sample (axis=1), not per feature — confirm
# that is the intended preprocessing before clustering.
X_norm = normalize(X, norm='l2', axis=1)

In [30]:
# Choose number of clusters.
# Read as a global by the KMeans-based functions defined later in the notebook.
num_clust = 2

In [31]:
# Create the 2 feature PCA: project the normalized features onto their first
# two principal components.
# A fixed random_state keeps the projection reproducible across kernel
# restarts when scikit-learn selects a randomized SVD solver; 42 matches the
# seed used for the splits and KMeans runs in this notebook.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_norm)

In [32]:
# Carve the data into 4 equally sized samples by halving twice. The
# normalized features and their PCA projection are split in the same call so
# the rows of each pair stay aligned.
halving = dict(test_size=0.5, random_state=42)

# First cut: two halves.
X_half1, X_half2, X_pcahalf1, X_pcahalf2 = train_test_split(X_norm, X_pca, **halving)

# Second cut: each half becomes two quarters.
X1, X2, X_pca1, X_pca2 = train_test_split(X_half1, X_pcahalf1, **halving)
X3, X4, X_pca3, X_pca4 = train_test_split(X_half2, X_pcahalf2, **halving)

## Clustering models

In [None]:
def runKMeans(sample=None, n_clusters=None):
    """Cluster ``sample`` with KMeans and return the predicted cluster labels.

    Parameters
    ----------
    sample : array-like, optional
        Data to cluster. Defaults to the notebook-level ``X_norm``.
    n_clusters : int, optional
        Number of clusters. Defaults to the notebook-level ``num_clust``.

    Returns
    -------
    numpy.ndarray
        Cluster index assigned to each row of ``sample``.
    """
    # Resolve defaults at call time so later changes to the notebook globals
    # are honoured.
    if sample is None:
        sample = X_norm
    if n_clusters is None:
        n_clusters = num_clust
    # Fixed seed, as in the other KMeans runs in this notebook.
    return KMeans(n_clusters=n_clusters, random_state=42).fit_predict(sample)
    
def runMeanShift():
    """Placeholder for a MeanShift clustering run.

    Currently only imports ``MeanShift`` and ``estimate_bandwidth`` into the
    function's local scope; no model is fitted and the function returns None.
    TODO: implement the clustering step.
    """
    from sklearn.cluster import MeanShift, estimate_bandwidth
    

In [57]:
# Silhouette Coefficient.
def _silhouetteReport(samples, label, n_clusters=None):
    """Fit KMeans on each sample and print the mean/std of the silhouette scores.

    Parameters
    ----------
    samples : iterable of array-like
        Data sets to cluster independently, one KMeans fit per sample.
    label : str
        Prefix for the two printed lines (``'<label> Mean:'`` and
        ``'<label> StdDev:'``).
    n_clusters : int, optional
        Number of clusters. Defaults to the notebook-level ``num_clust``.

    Returns
    -------
    None
        The summary is printed; nothing is returned.
    """
    if n_clusters is None:
        n_clusters = num_clust
    scores = []
    for sample in samples:
        # Fixed seed so repeated runs give the same clustering.
        model = KMeans(n_clusters=n_clusters, random_state=42).fit(sample)
        scores.append(
            metrics.silhouette_score(sample, model.labels_, metric='euclidean'))
    print(label + ' Mean:', np.mean(scores))
    print(label + ' StdDev:', np.std(scores))


def silhouetteCoeff(n_clusters=None):
    """Print the silhouette-score mean and std over the samples X1..X4.

    Parameters
    ----------
    n_clusters : int, optional
        Number of clusters. Defaults to the notebook-level ``num_clust``.
    """
    _silhouetteReport([X1, X2, X3, X4], 'Silhouette Score', n_clusters)


def silhouetteCoeffPCA(n_clusters=None):
    """Print the silhouette-score mean and std over the samples X_pca1..X_pca4.

    Parameters
    ----------
    n_clusters : int, optional
        Number of clusters. Defaults to the notebook-level ``num_clust``.
    """
    _silhouetteReport(
        [X_pca1, X_pca2, X_pca3, X_pca4], 'Silhouette Score w/ PCA', n_clusters)

In [58]:
# View results: print the silhouette summaries for the four normalized-feature
# samples, then for their PCA projections.
silhouetteCoeff()
silhouetteCoeffPCA()

Silhouette Score Mean: 0.7297151903860846
Silhouette Score StdDev: 0.004053962626368842
Silhouette Score w/ PCA Mean: 0.7349148238292054
Silhouette Score w/ PCA StdDev: 0.003906399936060118
