# Batter K-Means Clustering
---
This notebook groups MLB batters into clusters with K-Means, using their current batting stats and salary as features.

## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import seaborn as sns
import matplotlib.pyplot as plt

## Import Dataset
---

In [2]:
# Load the batter table, then discard the leftover 'Unnamed: 0' column
# (a row index that was written into the CSV when it was saved).
df = pd.read_csv('../data/mlb_players_bat.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Pos,Age,G,AB,R,...,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary
0,547989,Jose,Abreu,Jose Abreu,CWS,1B,34,152,566,86,...,2,30,117,61,143,0.261,0.351,0.481,0.832,"$17,666,666"
1,642715,Willy,Adames,Willy Adames,TB,SS,26,41,132,16,...,1,5,15,10,51,0.197,0.254,0.371,0.625,"$590,000"
2,501303,Ehire,Adrianza,Ehire Adrianza,ATL,SS,32,109,182,32,...,2,5,28,21,42,0.247,0.327,0.401,0.728,"$1,500,000"
3,542583,Jesus,Aguilar,Jesus Aguilar,MIA,1B,31,131,449,49,...,0,22,93,46,93,0.261,0.329,0.459,0.788,"$4,500,000"
4,605113,Nick,Ahmed,Nick Ahmed,ARI,SS,31,129,434,46,...,3,5,38,34,104,0.221,0.28,0.339,0.619,"$8,125,000"


In [3]:
# Convert salary from a formatted string such as "$17,666,666" to int.
#
# regex=False makes both replacements literal. Without it the behavior of
# '$' depends on the pandas version: as a regular expression '$' is the
# end-of-string anchor, so the dollar sign would not be removed and the
# int cast would fail.
#
# astype(str) first keeps the cell re-runnable: on a second run the column is
# already int, and the .str accessor would otherwise raise AttributeError.
df['salary'] = (
    df['salary']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(int)
)

# Adapted from https://stackoverflow.com/questions/38516481/trying-to-remove-commas-and-dollars-signs-with-pandas-in-python

  df['salary'] = df['salary'].str.replace(',', '').str.replace('$', '').astype(int)


In [4]:
# Feature matrix for clustering: batting stats plus salary.
# The columns are selected explicitly rather than by dropping the identifier
# columns, so that re-running this cell after the 'cluster' column has been
# added to df does not leak the cluster label into the features.
# On a fresh run this yields exactly the columns the drop-based version did
# (everything except MLBID, FIRSTNAME, LASTNAME, Player, Team, Pos, G, Age).
FEATURE_COLS = ['AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO',
                'AVG', 'OBP', 'SLG', 'OPS', 'salary']
df_copy = df[FEATURE_COLS].copy()

## Preprocessing
---
### Standard Scaler

In [5]:
# Standardize the features so that large-valued columns (e.g. salary) do not
# dominate the distance computations used by PCA and K-Means.
sc = StandardScaler().fit(df_copy)
X_sc = sc.transform(df_copy)

### PCA

In [6]:
# Project the standardized features onto the first 5 principal components.
pca = PCA(n_components=5).fit(X_sc)
X_pca = pca.transform(X_sc)

## Model
---

In [7]:
# Fit K-Means with 6 clusters on the PCA-reduced features.
# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# across scikit-learn releases, so leaving it implicit can produce different
# cluster assignments for the same random_state depending on the installed
# version. 10 is the value the older default used.
km = KMeans(n_clusters=6, n_init=10, random_state=42)
km.fit(X_pca)

KMeans(n_clusters=6, random_state=42)

In [8]:
# Attach each player's cluster label to the full (unscaled) player table.
df = df.assign(cluster=km.labels_)
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Pos,Age,G,AB,R,...,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary,cluster
0,547989,Jose,Abreu,Jose Abreu,CWS,1B,34,152,566,86,...,30,117,61,143,0.261,0.351,0.481,0.832,17666666,0
1,642715,Willy,Adames,Willy Adames,TB,SS,26,41,132,16,...,5,15,10,51,0.197,0.254,0.371,0.625,590000,4
2,501303,Ehire,Adrianza,Ehire Adrianza,ATL,SS,32,109,182,32,...,5,28,21,42,0.247,0.327,0.401,0.728,1500000,5
3,542583,Jesus,Aguilar,Jesus Aguilar,MIA,1B,31,131,449,49,...,22,93,46,93,0.261,0.329,0.459,0.788,4500000,3
4,605113,Nick,Ahmed,Nick Ahmed,ARI,SS,31,129,434,46,...,5,38,34,104,0.221,0.28,0.339,0.619,8125000,2


In [9]:
# Per-cluster averages of the numeric columns.
# numeric_only=True is required because df still holds string columns
# (FIRSTNAME, LASTNAME, Player, Team, Pos); without it recent pandas versions
# raise TypeError instead of silently skipping them.
# Note: MLBID is an identifier, so its mean carries no meaning.
df.groupby('cluster').mean(numeric_only=True)

Unnamed: 0_level_0,MLBID,Age,G,AB,R,H,2B,3B,HR,RBI,BB,SO,AVG,OBP,SLG,OPS,salary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,588428.22449,29.102041,146.081633,536.918367,89.877551,147.285714,30.142857,1.530612,31.55102,92.734694,62.020408,128.346939,0.273959,0.354061,0.513,0.867061,11394480.0
1,594071.0,29.214286,76.178571,252.571429,43.964286,73.428571,15.392857,1.071429,12.321429,37.678571,33.642857,59.25,0.293429,0.381786,0.511964,0.89375,8757903.0
2,600327.074074,28.62963,110.246914,341.271605,42.271605,79.555556,15.493827,1.271605,11.271605,42.777778,31.938272,91.074074,0.231667,0.304605,0.388407,0.693012,3054138.0
3,600517.684932,28.561644,139.410959,497.410959,71.068493,127.671233,26.082192,2.835616,17.123288,64.739726,48.30137,117.808219,0.256041,0.327945,0.425219,0.753164,5820815.0
4,601945.383562,28.630137,56.917808,152.383562,16.205479,30.232877,5.739726,0.424658,3.876712,14.616438,12.164384,43.410959,0.195616,0.263849,0.314466,0.578315,2207520.0
5,600385.752809,28.516854,65.910112,187.05618,24.179775,47.516854,9.382022,0.988764,5.685393,22.932584,17.0,44.651685,0.252315,0.322562,0.408978,0.731539,2444367.0


In [10]:
# List the columns of df, which now includes the added 'cluster' column.
df.columns

Index(['MLBID', 'FIRSTNAME', 'LASTNAME', 'Player', 'Team', 'Pos', 'Age', 'G',
       'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'AVG', 'OBP',
       'SLG', 'OPS', 'salary', 'cluster'],
      dtype='object')

In [11]:
# Silhouette score of the clustering, computed in the same PCA space the
# model was fitted on (ranges from -1 to 1; higher means better-separated clusters).
silhouette_score(X_pca, labels=km.labels_)

0.22484264960836448

In [12]:
# Share of players assigned to each cluster, largest first.
df.loc[:, 'cluster'].value_counts(normalize=True)

5    0.226463
2    0.206107
3    0.185751
4    0.185751
0    0.124682
1    0.071247
Name: cluster, dtype: float64

## Save Dataset with Clusters
---

In [13]:
# Persist the player table with its cluster labels.
# NOTE(review): the row index is written as an unnamed first column (pandas
# default), so readers of this file will see an 'Unnamed: 0' column. Left
# unchanged here because downstream code may already expect and drop it.
df.to_csv(path_or_buf='../data/clusters_bat.csv')

## Recap
---
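K-Means clustering (6 clusters) was applied to the batters' current stats and salary after standardizing the features and reducing them to 5 principal components; the resulting silhouette score was about 0.22. The player table with its cluster labels was saved to `../data/clusters_bat.csv` and will be used for the recommender system.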