In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns = None

In [None]:
gl = pd.read_csv("./data/final_gl.csv")

In [None]:
season_end_performance = gl[(gl.year>=2019)&(gl.year<=2023)].groupby(['HomeTeam','year'])[['season_end_rank','season_end_w_l_ratio']].max().reset_index()

# Wide format results
season_end_performance = season_end_performance.pivot_table(index='HomeTeam',columns='year')

# flatten columns
flat_cols = [str(col[1])+"_"+col[0] for col in season_end_performance.columns]
season_end_performance.columns = flat_cols
season_end_performance = season_end_performance.reset_index()


In [None]:
attendance_stats = gl[(gl.year>=2018)&(gl.year<=2022)].groupby('HomeTeam')['Attendance'].agg(['mean','std']).reset_index()
attendance_stats.rename({'mean':'3yr_attedance_mean', 'std':'3yr_attedance_std'},inplace=True)

In [None]:
cluster_df = season_end_performance.merge(attendance_stats, on='HomeTeam')

In [None]:
from sklearn.cluster import KMeans

# Initialize the KMeans algorithm with 5 clusters
kmeans = KMeans(n_clusters=4)

cluster_df.fillna(0,inplace=True)
# Fit the KMeans model to the data
kmeans.fit(cluster_df[cluster_df.columns[1:]])

# Get the cluster labels for each point
labels = kmeans.labels_

# Get the centroids of each cluster
centroids = kmeans.cluster_centers_

In [None]:
# print cluster characteristics
pd.concat([pd.DataFrame([0,1,2,3]), pd.DataFrame(centroids)], axis=1)
# 3, 1, 2, 0

Unnamed: 0,0,0.1,1,2,3,4,5,6,7
0,0,3.7,3.1,3.6,0.445618,0.47716,0.448148,17668.289552,7121.655888
1,1,2.0,1.75,2.75,0.566566,0.482675,0.519645,34881.944634,5944.610448
2,2,3.444444,3.777778,2.888889,0.475995,0.458848,0.503429,27523.274942,8071.476088
3,3,2.0,2.0,1.0,0.57497,0.592593,0.623457,43584.761289,5371.714718


In [None]:
cluster_df.columns

Index(['HomeTeam', '2019_season_end_rank', '2022_season_end_rank',
       '2023_season_end_rank', '2019_season_end_w_l_ratio',
       '2022_season_end_w_l_ratio', '2023_season_end_w_l_ratio', 'mean',
       'std'],
      dtype='object')

In [None]:
cluster_df['label'] = labels

In [None]:
cluster_df.sort_values('label')

Unnamed: 0,HomeTeam,2019_season_end_rank,2022_season_end_rank,2023_season_end_rank,2019_season_end_w_l_ratio,2022_season_end_w_l_ratio,2023_season_end_w_l_ratio,mean,std,label
14,MIA,5.0,4.0,4.0,0.391304,0.41358,0.425926,10411.082305,4730.36516,0
26,TBA,3.0,1.0,3.0,0.555556,0.617284,0.530864,14246.234568,5067.201214,0
3,BAL,5.0,5.0,4.0,0.290123,0.320988,0.512346,17450.065844,8338.120071,0
21,PIT,4.0,5.0,4.0,0.509317,0.376543,0.382716,17414.103306,7502.056049,0
5,CHA,4.0,1.0,2.0,0.382716,0.574074,0.5,21768.392562,7485.5636,0
19,OAK,2.0,3.0,5.0,0.598765,0.530864,0.37037,16662.198347,10205.119787,0
7,CIN,5.0,3.0,4.0,0.41358,0.512346,0.382716,19893.831276,7756.593892,0
8,CLE,1.0,2.0,1.0,0.561728,0.493827,0.567901,20416.514403,7970.979422,0
10,DET,3.0,3.0,4.0,0.395062,0.475309,0.407407,20221.081967,6483.098224,0
12,KCA,5.0,4.0,5.0,0.358025,0.45679,0.401235,18199.390947,5677.461463,0
