In [10]:
# Imports: standard library first, then third-party packages.
from collections import Counter

import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [15]:
# Load the encoded dataset from the CSV file one directory above the
# working directory, then show the resulting frame as the cell output.
file_path = Path('../encoded_data.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,ID,stint,teamID,lgID,W,L,G,GS,CG,SHO,...,GF,R,SH,SF,GIDP,weight,height,bats,throws,salary
0,2010aardsda01,1,24,0,0,6,53,0,0,0,...,43,19,7.0,1.0,5.0,215.0,75.0,2,1,2750000.0
1,2010accarje01,1,29,0,0,1,5,0,0,0,...,2,6,0.0,0.0,2.0,195.0,72.0,2,1,1080000.0
2,2010aceveal01,1,18,0,3,0,10,0,0,0,...,2,5,0.0,0.0,0.0,205.0,74.0,2,1,435650.0
3,2010adamsmi03,1,23,1,4,1,70,0,0,0,...,3,14,0.0,0.0,2.0,210.0,77.0,2,1,1000000.0
4,2010affelje01,1,25,1,4,3,53,0,0,0,...,14,25,7.0,1.0,4.0,225.0,76.0,1,0,4000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3094,2016youngch03,1,12,0,3,9,34,13,0,0,...,7,63,0.0,4.0,3.0,255.0,82.0,2,1,4250000.0
3095,2016zieglbr01,1,0,1,2,3,36,0,0,0,...,30,13,1.0,1.0,10.0,220.0,76.0,2,1,5500000.0
3096,2016zieglbr01,2,3,0,2,4,33,0,0,0,...,12,8,1.0,0.0,6.0,220.0,76.0,2,1,5500000.0
3097,2016zimmejo02,1,9,0,9,7,19,18,0,0,...,1,63,1.0,5.0,8.0,225.0,74.0,2,1,18000000.0


In [16]:
# Create our features
# Every column is used as a feature except the target ('salary') and the
# 'ID' column, which holds string identifiers rather than a numeric feature.
x_cols = [col for col in df.columns if col not in ('salary', 'ID')]
X = df[x_cols]


# Create our target
# Series.ravel() is deprecated since pandas 2.2 (it emits a FutureWarning);
# to_numpy() returns the same one-dimensional NumPy array.
y = df['salary'].to_numpy()

In [17]:
# Hold out part of the rows for evaluation. No test_size is given, so
# scikit-learn's default split proportion applies; random_state pins the
# shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [18]:
# Standardize the features. The scaler is fitted on the training split
# only and then applied to both splits.
scaler = StandardScaler()

# Fit on the training data and scale it in a single step.
X_train_scaled = scaler.fit_transform(X_train)

# fit() returns the estimator itself, so X_scaler is the same fitted object.
X_scaler = scaler

# Apply the training-set statistics to the test data.
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 

In [20]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [21]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([  512500.,  7000000.,   515500.,   875000.,  1500000.,   800000.,
        9000000.,  1075000., 15000000.,  1000000.,   500000.,  8500000.,
         517500.,  3250000., 12000000.,  1760000.,   414000.,  1500000.,
        4500000.,  7400000.,   426500.,   405000.,   515000.,  4600000.,
        4000000.,   518000.,  2000000.,  2450000.,  8000000.,   480000.,
        5875000.,   517500., 17000000.,   520000.,   546000.,  2850000.,
        8000000.,  7750000.,  2250000., 13000000.,   750000.,   508000.,
        3950000.,   480000.,  1000000.,   850000., 10000000.,  1200000.,
         508500.,   520700.,  3666667.,   800000.,   519000.,  2000000.,
        3000000.,  7000000.,   406500., 10000000.,  1000000.,   512500.,
         490000.,  1400000.,   490000.,  1250000.,  4500000.,  1350000.,
        4500000.,  6000000.,  1300000.,  5000000., 10000000.,   515000.,
         495000.,   510000.,   750000.,  5000000.,   427500.,  3500000.,
        4000000.,   504500.,  1000000.,  3950000., 

In [22]:
# balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, predictions)



0.01150358262427228

In [23]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.016774193548387096

In [24]:
# Calculate feature importance in the Random Forest model.
# `feature_importances_` is read from the fitted model; the array has one
# entry per feature column, in the same order as `X.columns`.
importances = rf_model.feature_importances_
importances

array([0.00602158, 0.04690225, 0.01349107, 0.03076697, 0.03057612,
       0.04194446, 0.0177727 , 0.00662662, 0.0046511 , 0.01662592,
       0.0439115 , 0.04224166, 0.0392944 , 0.03758027, 0.04579844,
       0.04574746, 0.04918551, 0.04861605, 0.02843406, 0.03337596,
       0.03120219, 0.01362568, 0.04386372, 0.03269578, 0.03950936,
       0.02994834, 0.02854083, 0.03742029, 0.04746413, 0.0405229 ,
       0.01453803, 0.01110468])

In [25]:
# Pair each importance with its column name and list the pairs from most to
# least important. Tuples compare by importance first, then by column name.
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

[(0.04918550729331832, 'BAOpp'),
 (0.0486160472546323, 'ERA'),
 (0.047464125558374586, 'weight'),
 (0.04690224814195206, 'teamID'),
 (0.04579843754661403, 'BB'),
 (0.045747456782583434, 'SO'),
 (0.04391149997272195, 'IPouts'),
 (0.043863717646994455, 'BFP'),
 (0.04224165879422784, 'H'),
 (0.041944464151434144, 'G'),
 (0.0405229018469237, 'height'),
 (0.03950935577130806, 'R'),
 (0.03929440407774342, 'ER'),
 (0.03758027042918364, 'HR'),
 (0.037420290355144624, 'GIDP'),
 (0.03337595776005236, 'WP'),
 (0.03269578351309893, 'GF'),
 (0.031202190668776712, 'HBP'),
 (0.03076696604302849, 'W'),
 (0.0305761179225366, 'L'),
 (0.029948341060881803, 'SH'),
 (0.028540830170696887, 'SF'),
 (0.028434056554171697, 'IBB'),
 (0.0177726952551777, 'GS'),
 (0.01662592429618595, 'SV'),
 (0.01453802730000197, 'bats'),
 (0.013625681822139931, 'BK'),
 (0.013491065975362265, 'lgID'),
 (0.011104678555766568, 'throws'),
 (0.006626618892987021, 'CG'),
 (0.00602158122097796, 'stint'),
 (0.004651097365000634, 'SHO')