# Pitcher K-Means Clustering
---
This notebook groups MLB pitchers into clusters with K-Means, using their current stats and salary as features.

## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import seaborn as sns
import matplotlib.pyplot as plt

## Import Dataset
---

In [2]:
# Read the pitcher table, discard its 'Unnamed: 0' column, and preview the
# first rows.
df = (
    pd.read_csv('../data/mlb_players_pitch.csv')
    .drop(columns = ['Unnamed: 0'])
)
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Age,W,L,ERA,IP,H,ER,HR,BB,K,WHIP,salary
0,472551,Fernando,Abad,Fernando Abad,BAL,35,0,0,5.6,17.2,23,11,1,7,10,1.7,"$570,500"
1,676265,Cory,Abbott,Cory Abbott,CHC,26,0,0,6.75,17.1,20,13,7,11,12,1.79,"$570,500"
2,642758,Domingo,Acevedo,Domingo Acevedo,OAK,27,0,0,3.27,11.0,9,4,3,4,9,1.18,"$570,500"
3,613534,Austin,Adams,Austin Adams,SD,30,3,2,4.1,52.2,28,24,1,35,76,1.2,"$580,200"
4,669211,Keegan,Akin,Keegan Akin,BAL,26,2,10,6.63,95.0,110,70,17,40,82,1.58,"$570,500"


In [3]:
# Convert salary from strings such as "$570,500" to int.
# regex=False makes both replacements literal. '$' is a regex metacharacter
# (end-of-string anchor) and the pandas default for `regex` has changed across
# versions, so leaving it implicit either warns or fails to strip the sign.
# astype(str) keeps the cell re-runnable after the column has become int
# (the .str accessor is not available on an integer column).
df['salary'] = (
    df['salary']
    .astype(str)
    .str.replace(',', '', regex = False)
    .str.replace('$', '', regex = False)
    .astype(int)
)

#Copied from https://stackoverflow.com/questions/38516481/trying-to-remove-commas-and-dollars-signs-with-pandas-in-python

  df['salary'] = df['salary'].str.replace(',', '').str.replace('$', '').astype(int)


In [4]:
# Build the clustering input: leave out the identifier, name, team and age
# columns so only W, L, ERA, IP, H, ER, HR, BB, K, WHIP and salary remain.
non_feature_cols = ['MLBID', 'FIRSTNAME', 'LASTNAME', 'Player', 'Team', 'Age']
df_copy = df.drop(non_feature_cols, axis = 1)

## Preprocessing
---
### Standard Scaler

In [5]:
# Fit the scaler on the feature table, then apply it to the same table.
# The fitted scaler stays available as `sc`.
sc = StandardScaler().fit(df_copy)
X_sc = sc.transform(df_copy)

### PCA

In [6]:
# Fit a 5-component PCA on the scaled features, then project those same
# features onto the fitted components.
pca = PCA(n_components = 5).fit(X_sc)
X_pca = pca.transform(X_sc)

## Model
---

In [7]:
# Fit K-Means with 6 clusters on the PCA-reduced features.
# n_init is pinned to 10 because the scikit-learn default has changed between
# releases (10 in older versions, 'auto' in newer ones); leaving it implicit
# lets the saved cluster labels vary with the installed version even though
# random_state is fixed.
km = KMeans(n_clusters = 6, n_init = 10, random_state=42)
km.fit(X_pca)

KMeans(n_clusters=6, random_state=42)

In [8]:
# Attach each row's K-Means label to the full pitcher table as 'cluster',
# then preview the result.
df = df.assign(cluster = km.labels_)
df.head()

Unnamed: 0,MLBID,FIRSTNAME,LASTNAME,Player,Team,Age,W,L,ERA,IP,H,ER,HR,BB,K,WHIP,salary,cluster
0,472551,Fernando,Abad,Fernando Abad,BAL,35,0,0,5.6,17.2,23,11,1,7,10,1.7,570500,2
1,676265,Cory,Abbott,Cory Abbott,CHC,26,0,0,6.75,17.1,20,13,7,11,12,1.79,570500,2
2,642758,Domingo,Acevedo,Domingo Acevedo,OAK,27,0,0,3.27,11.0,9,4,3,4,9,1.18,570500,4
3,613534,Austin,Adams,Austin Adams,SD,30,3,2,4.1,52.2,28,24,1,35,76,1.2,580200,3
4,669211,Keegan,Akin,Keegan Akin,BAL,26,2,10,6.63,95.0,110,70,17,40,82,1.58,570500,1


In [9]:
# Average every numeric column within each cluster.
# numeric_only=True is stated explicitly: on pandas >= 2.0 the string columns
# (names, team) make a plain .mean() raise a TypeError instead of being
# dropped silently.
df.groupby('cluster').mean(numeric_only = True)

Unnamed: 0_level_0,MLBID,Age,W,L,ERA,IP,H,ER,HR,BB,K,WHIP,salary
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,511034.642857,33.428571,5.857143,3.357143,3.118571,73.835714,59.0,25.071429,8.5,21.857143,88.214286,1.119286,24765720.0
1,616556.603175,27.666667,6.349206,7.984127,4.61381,120.355556,118.968254,61.063492,17.873016,40.269841,108.634921,1.328254,2268407.0
2,604453.774194,28.596774,1.516129,2.419355,6.119677,33.958065,37.532258,23.096774,5.903226,16.387097,32.790323,1.585645,1440671.0
3,596382.838983,29.372881,4.542373,3.754237,3.492966,68.038983,56.737288,26.220339,8.228814,25.423729,72.737288,1.213729,2137334.0
4,579891.294118,30.529412,1.519608,1.245098,3.083824,28.309804,22.431373,9.352941,2.872549,9.647059,29.019608,1.149804,1703257.0
5,579516.117647,29.666667,11.254902,8.470588,3.755882,169.476471,150.372549,70.254902,21.882353,49.647059,175.862745,1.183529,9360466.0


In [10]:
# Show the column labels of df, which at this point include 'cluster'.
df.columns

Index(['MLBID', 'FIRSTNAME', 'LASTNAME', 'Player', 'Team', 'Age', 'W', 'L',
       'ERA', 'IP', 'H', 'ER', 'HR', 'BB', 'K', 'WHIP', 'salary', 'cluster'],
      dtype='object')

In [11]:
# Silhouette score of the K-Means labels, measured in the PCA-reduced space
# the model was fitted on.
silhouette_score(X = X_pca, labels = km.labels_)

0.2757783945908426

In [12]:
# Share of rows that fall into each cluster (fractions rather than counts).
cluster_labels = df['cluster']
cluster_labels.value_counts(normalize = True)

3    0.287805
4    0.248780
1    0.153659
2    0.151220
5    0.124390
0    0.034146
Name: cluster, dtype: float64

## Save Dataset with Clusters
---

In [13]:
# Write the table, including the 'cluster' column, to disk.
# index=True is the to_csv default, spelled out here: the row index is written
# as an unnamed first column, which pandas reads back as 'Unnamed: 0'.
# NOTE(review): consumers of this file may rely on that column being present;
# confirm before switching to index=False.
df.to_csv('../data/clusters_pitch.csv', index = True)

## Recap
---
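K-Means clustering (6 clusters, fitted on 5 PCA components of the standardized features) was applied to the pitchers' current stats and salary. The resulting dataset, with a cluster label for each pitcher, is saved to `../data/clusters_pitch.csv` and will be used for the recommender system.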