In [1]:
# import dependencies
# Initial imports.
from collections import Counter

import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the encoded, salary-binned CSV into a DataFrame and show it.
file_path = Path('csv-exports/encoded_data_binned.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,ID,stint,teamID,lgID,W,L,G,GS,CG,SHO,...,R,SH,SF,GIDP,weight,height,bats,throws,salary,salary-bin
0,2010aardsda01,1,24,0,0,6,53,0,0,0,...,19,7.0,1.0,5.0,215.0,75.0,2,1,2750000.0,mid
1,2010accarje01,1,29,0,0,1,5,0,0,0,...,6,0.0,0.0,2.0,195.0,72.0,2,1,1080000.0,mid
2,2010aceveal01,1,18,0,3,0,10,0,0,0,...,5,0.0,0.0,0.0,205.0,74.0,2,1,435650.0,low
3,2010adamsmi03,1,23,1,4,1,70,0,0,0,...,14,0.0,0.0,2.0,210.0,77.0,2,1,1000000.0,mid
4,2010affelje01,1,25,1,4,3,53,0,0,0,...,25,7.0,1.0,4.0,225.0,76.0,1,0,4000000.0,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3094,2016youngch03,1,12,0,3,9,34,13,0,0,...,63,0.0,4.0,3.0,255.0,82.0,2,1,4250000.0,high
3095,2016zieglbr01,1,0,1,2,3,36,0,0,0,...,13,1.0,1.0,10.0,220.0,76.0,2,1,5500000.0,high
3096,2016zieglbr01,2,3,0,2,4,33,0,0,0,...,8,1.0,0.0,6.0,220.0,76.0,2,1,5500000.0,high
3097,2016zimmejo02,1,9,0,9,7,19,18,0,0,...,63,1.0,5.0,8.0,225.0,74.0,2,1,18000000.0,high


In [4]:
# Create our features: every column except the player ID, the raw salary and
# the binned salary label.
excluded_cols = ('salary', 'ID', 'salary-bin')
x_cols = [col for col in df.columns if col not in excluded_cols]
X = df[x_cols]


# Create our target as a NumPy array of the 'salary-bin' labels.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() is the
# supported way to get the underlying array.
y = df['salary-bin'].to_numpy()

In [5]:
# Hold out a test set using scikit-learn's default split size; the fixed
# random_state makes the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [6]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform the training split first, then the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [7]:
# Build a random forest of 128 trees with a fixed seed for reproducibility.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [8]:
# Train the forest on the scaled training features and their salary-bin labels.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [9]:
# Predict a salary bin for every row of the scaled test split and show the result.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array(['low', 'high', 'low', 'mid', 'low', 'mid', 'high', 'high', 'high',
       'high', 'low', 'high', 'low', 'high', 'low', 'high', 'high', 'low',
       'high', 'high', 'low', 'high', 'high', 'high', 'high', 'low',
       'low', 'low', 'high', 'mid', 'low', 'high', 'high', 'mid', 'high',
       'low', 'high', 'high', 'high', 'high', 'low', 'mid', 'high', 'low',
       'high', 'high', 'high', 'low', 'mid', 'low', 'low', 'low', 'mid',
       'mid', 'mid', 'high', 'low', 'high', 'high', 'low', 'low', 'low',
       'low', 'mid', 'high', 'mid', 'high', 'high', 'mid', 'high', 'high',
       'low', 'high', 'low', 'mid', 'high', 'low', 'mid', 'high', 'low',
       'high', 'low', 'high', 'high', 'high', 'low', 'high', 'mid', 'mid',
       'high', 'low', 'high', 'high', 'mid', 'high', 'high', 'mid', 'mid',
       'low', 'mid', 'high', 'high', 'low', 'high', 'low', 'low', 'low',
       'high', 'low', 'mid', 'high', 'high', 'mid', 'high', 'high', 'mid',
       'mid', 'high', 'mid', 'low', 'high

In [10]:
# Balanced accuracy score: the average of the recall obtained on each class,
# so a large salary bin cannot dominate the metric.
# `balanced_accuracy_score` is imported with the other dependencies in the
# first cell.
balanced_accuracy_score(y_test, predictions)

0.5265411922506147

In [11]:
# Pull the per-feature importance scores out of the fitted random forest.
# The array is ordered like the columns the model was trained on.
importances = getattr(rf_model, 'feature_importances_')
importances

array([0.00962306, 0.04394046, 0.01031962, 0.02872445, 0.02958772,
       0.04359313, 0.02517952, 0.00587987, 0.00250023, 0.02899906,
       0.05339742, 0.04756748, 0.03966958, 0.03321103, 0.04547472,
       0.04853841, 0.0470756 , 0.04923914, 0.02442936, 0.0276129 ,
       0.02699897, 0.00999571, 0.05549944, 0.04923829, 0.03907554,
       0.02402312, 0.02219715, 0.03184441, 0.04332392, 0.0339808 ,
       0.01050286, 0.008757  ])

In [12]:
# Pair each importance score with its feature name and list the pairs from
# most to least important.
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

[(0.055499439739914835, 'BFP'),
 (0.0533974182870598, 'IPouts'),
 (0.049239143354684246, 'ERA'),
 (0.04923829497017141, 'GF'),
 (0.04853841216687347, 'SO'),
 (0.047567480519276895, 'H'),
 (0.04707560206771526, 'BAOpp'),
 (0.04547472287111616, 'BB'),
 (0.04394046213769958, 'teamID'),
 (0.04359312613908091, 'G'),
 (0.04332392160896062, 'weight'),
 (0.039669583685966986, 'ER'),
 (0.03907554196699713, 'R'),
 (0.03398079815317921, 'height'),
 (0.03321103000808662, 'HR'),
 (0.03184440975048976, 'GIDP'),
 (0.029587719163424696, 'L'),
 (0.028999057434079937, 'SV'),
 (0.028724446980108273, 'W'),
 (0.027612900617373273, 'WP'),
 (0.026998974700831947, 'HBP'),
 (0.025179515969098096, 'GS'),
 (0.024429355843061232, 'IBB'),
 (0.024023120381025005, 'SH'),
 (0.022197153375033248, 'SF'),
 (0.01050286021583964, 'bats'),
 (0.010319623104576337, 'lgID'),
 (0.009995713247167512, 'BK'),
 (0.00962306492751518, 'stint'),
 (0.008757001829345848, 'throws'),
 (0.005879871575117868, 'CG'),
 (0.0025002332091290623

In [13]:
# Keep the ranked (importance, feature name) pairs, highest importance first,
# so they can be turned into a table.
feature_rank = list(zip(rf_model.feature_importances_, X.columns))
feature_rank.sort(reverse=True)

In [15]:
# Turn the ranked (importance, name) pairs into a two-column table.
rank_columns = ['Feature Value', 'Feature Abbreviation']
feature_df = pd.DataFrame(data=feature_rank, columns=rank_columns)
feature_df

Unnamed: 0,Feature Value,Feature Abbreviation
0,0.055499,BFP
1,0.053397,IPouts
2,0.049239,ERA
3,0.049238,GF
4,0.048538,SO
5,0.047567,H
6,0.047076,BAOpp
7,0.045475,BB
8,0.04394,teamID
9,0.043593,G


In [16]:
# Human-readable definition for every feature abbreviation used by the model.
# Keying the definitions by abbreviation (instead of relying on a hand-ordered
# list) keeps each definition attached to the right feature even if the
# importance ranking changes after retraining.
stat_definitions = {
    'BFP': 'Batters Faced by Pitcher',
    'IPouts': 'Outs Pitched (innings pitched x 3)',
    'ERA': 'Earned Run Average',
    'GF': 'Games Finished',
    'SO': 'Strikeouts',
    'H': 'Hits',
    'BAOpp': 'Opponent Batting Average',
    'BB': 'Walks',
    'teamID': 'Team',
    'G': 'Games',
    'weight': 'Player Weight',
    'ER': 'Earned Runs',
    'R': 'Runs Allowed',
    'height': 'Player Height',
    'HR': 'Homeruns',
    'GIDP': 'Grounded into double plays by opposing batter',
    'L': 'Losses',
    'SV': 'Saves',
    'W': 'Wins',
    'WP': 'Wild Pitches',
    'HBP': 'Batters Hit by Pitch',
    'GS': 'Games Started',
    'IBB': 'Intentional Walks',
    'SH': 'Sacrifices by Opposing Batters',
    'SF': 'Sacrifice Flies by Opposing Batters',
    'bats': 'Player Batting (L/R)',
    'lgID': 'League ID',
    'BK': 'Balks',
    'stint': 'Player Stint- order of appearances in a season',
    'throws': 'Player Throwing (L/R)',
    'CG': 'Complete Games',
    'SHO': 'Shutouts',
}

# Definitions listed in the same order as the rows of `feature_df`.
# An abbreviation without a definition raises KeyError rather than being
# silently mislabelled.
stat_def = [stat_definitions[abbr] for abbr in feature_df['Feature Abbreviation']]
stat_def

['Batters Faced by Pitcher',
 'Outs Pitched (innings pitched x 3)',
 'Earned Run Average',
 'Games Finished',
 'Strikeouts',
 'Hits',
 'Opponent Batting Average',
 'Walks',
 'Team',
 'Games',
 'Player Weight',
 'Earned Runs',
 'Runs Allowed',
 'Player Height',
 'Homeruns',
 'Grounded into double plays by opposing batter',
 'Losses',
 'Saves',
 'Wins',
 'Wild Pitches',
 'Batters Hit by Pitch',
 'Games Started',
 'Intentional Walks',
 'Sacrifices by Opposing Batters',
 'Sacrifice Flies by Opposing Batters',
 'Player Batting (L/R)',
 'League ID',
 'Balks',
 'Player Stint- order of appearances in a season',
 'Player Throwing (L/R)',
 'Complete Games',
 'Shutouts']

In [17]:
# Attach the definitions row by row; `stat_def` must have one entry per row
# of `feature_df`, in the same order.
feature_df['Feature Definition'] = pd.Series(stat_def, index=feature_df.index)
feature_df

Unnamed: 0,Feature Value,Feature Abbreviation,Feature Definition
0,0.055499,BFP,Batters Faced by Pitcher
1,0.053397,IPouts,Outs Pitched (innings pitched x 3)
2,0.049239,ERA,Earned Run Average
3,0.049238,GF,Games Finished
4,0.048538,SO,Strikeouts
5,0.047567,H,Hits
6,0.047076,BAOpp,Opponent Batting Average
7,0.045475,BB,Walks
8,0.04394,teamID,Team
9,0.043593,G,Games


In [21]:
# Express each importance score as a percentage rounded to two decimals.
feature_df['Feature Value Percentage'] = feature_df['Feature Value'].mul(100).round(2)
feature_df

Unnamed: 0,Feature Value,Feature Abbreviation,Feature Definition,Feature Value Percentage
0,0.055499,BFP,Batters Faced by Pitcher,5.55
1,0.053397,IPouts,Outs Pitched (innings pitched x 3),5.34
2,0.049239,ERA,Earned Run Average,4.92
3,0.049238,GF,Games Finished,4.92
4,0.048538,SO,Strikeouts,4.85
5,0.047567,H,Hits,4.76
6,0.047076,BAOpp,Opponent Batting Average,4.71
7,0.045475,BB,Walks,4.55
8,0.04394,teamID,Team,4.39
9,0.043593,G,Games,4.36


In [22]:
# The percentage column replaces the raw importance score, so drop the latter.
feature_df = feature_df.drop(columns=['Feature Value'])
feature_df

Unnamed: 0,Feature Abbreviation,Feature Definition,Feature Value Percentage
0,BFP,Batters Faced by Pitcher,5.55
1,IPouts,Outs Pitched (innings pitched x 3),5.34
2,ERA,Earned Run Average,4.92
3,GF,Games Finished,4.92
4,SO,Strikeouts,4.85
5,H,Hits,4.76
6,BAOpp,Opponent Batting Average,4.71
7,BB,Walks,4.55
8,teamID,Team,4.39
9,G,Games,4.36


In [23]:
# Write the ranking table to disk as CSV, leaving out the row index.
feature_df.to_csv(path_or_buf='feature-ranking.csv', index=False)