In [1]:
# import dependencies
# Initial imports.
from collections import Counter

import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the binned-salary CSV (path relative to the notebook's working
# directory) into a DataFrame, then display it as the cell output.
file_path = Path("salaries-binned.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,SH,SF,GIDP,salary,birthYear,birthMonth,weight,height,throws,salary-bin
0,abadfe01,2016,1,16,0,1,4,39,0,0,...,0.0,1.0,6.0,1250000.0,1985.0,12.0,235.0,74.0,0,mid
1,alberma01,2016,1,4,0,2,6,58,1,0,...,3.0,2.0,4.0,2000000.0,1983.0,1.0,225.0,73.0,1,mid
2,allenco01,2016,1,7,0,3,5,67,0,0,...,3.0,2.0,7.0,4150000.0,1988.0,11.0,210.0,73.0,1,high
3,alvarjo02,2016,1,12,0,1,3,64,0,0,...,1.0,1.0,5.0,507500.0,1989.0,5.0,195.0,71.0,0,low
4,anderbr04,2016,1,13,1,1,2,4,3,0,...,1.0,1.0,0.0,15800000.0,1988.0,2.0,230.0,76.0,0,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,yateski01,2016,1,17,0,2,1,41,0,0,...,1.0,1.0,1.0,511900.0,1987.0,3.0,205.0,70.0,1,low
412,youngch03,2016,1,11,0,3,9,34,13,0,...,0.0,4.0,3.0,4250000.0,1979.0,5.0,255.0,82.0,1,high
413,zieglbr01,2016,1,0,1,2,3,36,0,0,...,1.0,1.0,10.0,5500000.0,1979.0,10.0,220.0,76.0,1,high
414,zimmejo02,2016,1,9,0,9,7,19,18,0,...,1.0,5.0,8.0,18000000.0,1986.0,5.0,225.0,74.0,1,high


In [7]:
# Create our features: every column of df except the raw salary, the player
# identifier, and the target label itself.
excluded_cols = ('salary', 'playerID', 'salary-bin')
x_cols = [col for col in df.columns if col not in excluded_cols]
X = df[x_cols]

# Create our target as a 1-D NumPy array of 'salary-bin' labels.
# Series.ravel() is deprecated since pandas 2.2; Series.to_numpy() yields the
# same array without the FutureWarning.
y = df['salary-bin'].to_numpy()

In [8]:
# Partition features and labels into train and test subsets. The fixed
# random_state makes the shuffle (and therefore the split) repeatable.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Build the scaler and learn its statistics from the training rows only, so
# nothing from the test rows influences the fitted scaler.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions (train first, then test).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [10]:
# Instantiate the random forest: 128 trees, seeded for repeatable results.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [11]:
# Train the forest on the scaled training features and their labels.
# The result is assigned back to rf_model, so the cell displays no output.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [12]:
# Predict a salary-bin label for every row of the scaled test set and
# display the resulting array.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array(['mid', 'low', 'mid', 'high', 'mid', 'mid', 'low', 'mid', 'low',
       'high', 'low', 'low', 'mid', 'mid', 'low', 'low', 'high', 'low',
       'low', 'high', 'low', 'low', 'low', 'mid', 'low', 'high', 'high',
       'mid', 'mid', 'low', 'mid', 'mid', 'mid', 'low', 'low', 'high',
       'high', 'high', 'mid', 'high', 'low', 'mid', 'mid', 'high', 'high',
       'low', 'low', 'mid', 'low', 'high', 'high', 'low', 'low', 'mid',
       'high', 'mid', 'low', 'mid', 'low', 'high', 'low', 'low', 'mid',
       'high', 'low', 'high', 'high', 'low', 'high', 'low', 'mid', 'low',
       'high', 'low', 'high', 'mid', 'low', 'low', 'mid', 'high', 'high',
       'low', 'mid', 'high', 'high', 'low', 'high', 'mid', 'low', 'mid',
       'low', 'high', 'low', 'mid', 'mid', 'low', 'low', 'low', 'high',
       'mid', 'high', 'mid', 'high', 'mid'], dtype=object)

In [13]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.6442307692307693

In [14]:
# Balanced accuracy of the test-set predictions (per scikit-learn: the average
# of the recall obtained on each class), shown as the cell output.
# balanced_accuracy_score is imported with the other sklearn.metrics names in
# the notebook's import cell rather than here, so all dependencies are
# declared in one place.
balanced_accuracy_score(y_test, predictions)

0.6318155025051576

In [15]:
# Pull the per-feature importance scores from the fitted forest; the array
# is ordered the same way as the feature columns the model was trained on.
importances = rf_model.feature_importances_
importances

array([0.        , 0.        , 0.03358292, 0.00603005, 0.02605039,
       0.02822168, 0.03594557, 0.02608003, 0.00377961, 0.00197911,
       0.02677958, 0.03531951, 0.03967346, 0.03955292, 0.0325292 ,
       0.0295293 , 0.04081251, 0.04358803, 0.04316096, 0.02126189,
       0.03039889, 0.01963267, 0.00807495, 0.03465792, 0.04043896,
       0.03579205, 0.01922835, 0.01385861, 0.02510801, 0.14535119,
       0.0318463 , 0.04453299, 0.03204062, 0.00516176])

In [17]:
# Pair each importance score with its feature name and rank the pairs from
# most to least important (ties fall back to reverse-alphabetical name order,
# because whole tuples are compared).
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.14535118631697233, 'birthYear'),
 (0.04453298588602458, 'weight'),
 (0.043588027452677196, 'BAOpp'),
 (0.0431609569868534, 'ERA'),
 (0.04081251144482249, 'SO'),
 (0.04043896105267377, 'GF'),
 (0.03967345802858526, 'H'),
 (0.039552921564781, 'ER'),
 (0.03594557173489704, 'G'),
 (0.03579204613366499, 'R'),
 (0.03531951387451324, 'IPouts'),
 (0.03465792262019832, 'BFP'),
 (0.033582924282034396, 'teamID'),
 (0.032529204621335364, 'HR'),
 (0.032040617681870213, 'height'),
 (0.031846298541353166, 'birthMonth'),
 (0.030398894940087504, 'WP'),
 (0.02952929648302648, 'BB'),
 (0.028221684907537344, 'L'),
 (0.026779580056518117, 'SV'),
 (0.026080025789570936, 'GS'),
 (0.026050390424475423, 'W'),
 (0.025108005446624476, 'GIDP'),
 (0.021261891204814, 'IBB'),
 (0.01963267425615192, 'HBP'),
 (0.019228353420288943, 'SH'),
 (0.013858608344986903, 'SF'),
 (0.008074953518590004, 'BK'),
 (0.006030054775529795, 'lgID'),
 (0.005161760781603341, 'throws'),
 (0.0037796099697049836, 'CG'),
 (0.001979107457