In [294]:
import numpy as np
from sklearn.svm import SVR
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing


In [295]:
batting_data_path = 'bsb_ref.csv'
# INFO:
# 101,332 Players with up to 20 features each (exluding year, including team)
# if metric not reported for player, set to 0.0 by default

df_original = pd.read_csv(batting_data_path)
df = df_original.fillna(0) # fixing this could really help with results potentially
df_recent_players = df[df.year_ID >= 1975] 
df_probably_not_pitchers = df_recent_players[df_recent_players['PA'] >= 50]

# for the player specific SVR, I will choose players with at least 20 years of activity. 
# the data will be split as follows:
# x_train: year 0-5 data. the shape is: (num_samples, num_features) -> (6, num_features)
# y_train: year 6-12 OPS_plus. the shape is: (num_samples, 1) -> (6,1)

# x_test: year 13-14 data
# y_test: year 14-15 data
players = list(set(df_probably_not_pitchers['name_common'])) # just a set of all players

players_with_more_than_20_years = [player for player in players if len(df_probably_not_pitchers[df_probably_not_pitchers['name_common'] == player]) >= 20]
print(players_with_more_than_20_years)

['Tim Raines', 'Harold Baines', 'Dave Martinez', 'Adrian Beltre', 'Aramis Ramirez', 'Alex Rodriguez', 'Lenny Harris', 'Kenny Lofton', 'Jerry Hairston', 'Jose Cruz', 'Barry Bonds', 'Derek Jeter', 'Luis Gonzalez', 'Shawon Dunston', 'Andre Dawson', 'Carlos Beltran', 'Craig Biggio', 'Alex Gonzalez', 'Fred McGriff', 'Sandy Alomar', 'Julio Franco', 'Frank Thomas', 'Dave Winfield', 'Cal Ripken Jr.', 'Ivan Rodriguez', 'Tony Phillips', 'Ken Griffey', 'Jim Thome', 'Gary Matthews', 'Todd Zeile', 'Rickey Henderson', 'Omar Vizquel', 'Tony Pena', 'Jason Giambi', 'Tony Gwynn', 'Manny Ramirez', 'Steve Finley', 'B.J. Surhoff', 'Ruben Sierra', 'Tom Glavine', 'Ken Griffey Jr.', 'Gary Sheffield', 'Gary Gaetti', 'Paul Molitor', 'Eddie Murray', 'Rafael Palmeiro']


In [296]:
def prepare_test_train(player_x_data, player_y_data):
    i_0, i_1, i_2, i_3 = 6, 12, 14, 16 # hardcoded for now. these are just the ranges described above
    x_train = player_x_data[:i_0,:]
    y_train = player_y_data[i_0:i_1]
    x_test = player_x_data[i_1:i_2]
    y_test = player_y_data[i_2: i_3]
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    return (x_train, y_train, x_test, y_test)

In [297]:
def run_svr(data_tuple):
    x_train, y_train, x_test, y_test = data_tuple
    parameters = {
    "kernel": ["rbf"],
    "C": [1,10,10,100,1000],
    "gamma": [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    }

    svr = GridSearchCV(SVR(), parameters, cv=5)
    svr.fit(x_train, y_train.ravel())

    y_svr_pred = svr.predict(x_test)
    print(y_test.ravel())
    print(y_svr_pred)

    print('Score: %.3f' % svr.score(x_train, y_train))
    print("Mean squared error: %.3f" % mean_squared_error(y_test, y_svr_pred))
    print('Mean Absolute error: %.3f' % mean_absolute_error(y_test, y_svr_pred))
    print('Variance score: %.3f' % r2_score(y_test, y_svr_pred))


    

In [298]:
def prepare_rows(player_rows):
    player_x_data =player_rows[['PA', 'G', 'Inn', 'runs_bat', 'runs_br', 'runs_dp', 'runs_field',
                                       'WAA', 'WAA_off', 'WAA_def', 'WAR', 'WAR_def', 'WAR_off', 'WAR_rep',
                                         'teamRpG', 'oppRpG', 'oppRpPA_rep', 'oppRpG_rep', 'OPS_plus']].values
    player_y_data = player_rows[['OPS_plus']].values
    return (preprocessing.scale(sandy_rows_np), preprocessing.scale(player_y_data))
    

In [299]:
for player in players_with_more_than_20_years:
    print(player)
    player_rows = df_probably_not_pitchers[df_probably_not_pitchers['name_common'] == player]
    data = prepare_rows(player_rows)
    data_tuple = prepare_test_train(data[0], data[1])
    run_svr(data_tuple)
    
    

Tim Raines
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.23391573 -0.12815217]
[-0.01984985  0.16741433]
Score: 0.580
Mean squared error: 0.067
Mean Absolute error: 0.255
Variance score: -22.813
Harold Baines
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.17332996  0.71362622]
[0.06980729 0.06980753]
Score: -0.117
Mean squared error: 0.237
Mean Absolute error: 0.443
Variance score: -0.204
Dave Martinez
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.19147835 -2.43758064]
[0.48590961 0.48590904]
Score: -0.000
Mean squared error: 4.317
Mean Absolute error: 1.609
Variance score: -1.498
Adrian Beltre
(6, 19) (6, 1) (2, 19) (2, 1)




[1.10326561 1.04396506]
[-0.29381201 -0.29381213]
Score: -0.022
Mean squared error: 1.871
Mean Absolute error: 1.367
Variance score: -2126.920
Aramis Ramirez
(6, 19) (6, 1) (2, 19) (2, 1)




[1.00104395 1.00358912]
[0.77750362 0.77750384]
Score: -0.088
Mean squared error: 0.051
Mean Absolute error: 0.225
Variance score: -31208.016
Alex Rodriguez
(6, 19) (6, 1) (2, 19) (2, 1)




[0.56854228 0.27256929]
[0.54978037 0.80688913]
Score: 0.860
Mean squared error: 0.143
Mean Absolute error: 0.277
Variance score: -5.526
Lenny Harris
(6, 19) (6, 1) (2, 19) (2, 1)




[-2.03681418  1.27462578]
[0.17463597 0.25166571]
Score: 0.453
Mean squared error: 2.968
Mean Absolute error: 1.617
Variance score: -0.083
Kenny Lofton
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.31756858 -0.4576521 ]
[-0.11401393  0.00156705]
Score: 0.965
Mean squared error: 0.126
Mean Absolute error: 0.331
Variance score: -24.716
Jerry Hairston
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.66921073 -0.24539942]
[0.04373509 0.28345525]
Score: 0.039
Mean squared error: 0.335
Mean Absolute error: 0.577
Variance score: -0.604
Jose Cruz
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.45450362 -0.25942849]
[0.69484358 0.28851162]
Score: 0.929
Mean squared error: 0.179
Mean Absolute error: 0.394
Variance score: -0.405
Barry Bonds
(6, 19) (6, 1) (2, 19) (2, 1)




[0.16175928 1.80291353]
[0.09771477 0.09771498]
Score: -0.010
Mean squared error: 1.456
Mean Absolute error: 0.885
Variance score: -1.162
Derek Jeter
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.64287714 -0.88445124]
[0.56199437 0.56199386]
Score: -0.012
Mean squared error: 1.049
Mean Absolute error: 0.764
Variance score: -0.799
Luis Gonzalez
(6, 19) (6, 1) (2, 19) (2, 1)




[0.27904079 0.06980152]
[0.37383478 0.37383414]
Score: -0.043
Mean squared error: 0.051
Mean Absolute error: 0.199
Variance score: -3.633
Shawon Dunston
(6, 19) (6, 1) (2, 19) (2, 1)




[-1.17519791  0.38572055]
[0.21962054 0.21962068]
Score: -0.000
Mean squared error: 0.987
Mean Absolute error: 0.780
Variance score: -0.620
Andre Dawson
(6, 19) (6, 1) (2, 19) (2, 1)




[0.96302286 0.06203615]
[0.31947375 0.75421174]
Score: 0.763
Mean squared error: 0.447
Mean Absolute error: 0.668
Variance score: -1.201
Carlos Beltran
(6, 19) (6, 1) (2, 19) (2, 1)




[1.43047281 1.75615436]
[0.47506894 0.47791767]
Score: -0.023
Mean squared error: 1.273
Mean Absolute error: 1.117
Variance score: -47.020
Craig Biggio
(6, 19) (6, 1) (2, 19) (2, 1)




[-1.01865067 -0.63258345]
[1.35179938 1.35179947]
Score: -0.250
Mean squared error: 4.778
Mean Absolute error: 2.177
Variance score: -127.238
Alex Gonzalez
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.65935739 -1.53157903]
[-0.43801535  0.00402116]
Score: 0.601
Mean squared error: 1.781
Mean Absolute error: 1.316
Variance score: -0.484
Fred McGriff
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.78983221  0.48008111]
[-0.12557996 -0.46305758]
Score: 0.983
Mean squared error: 0.665
Mean Absolute error: 0.804
Variance score: -0.650
Sandy Alomar
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.72993664  1.11319367]
[0.80904996 0.80904985]
Score: -0.019
Mean squared error: 1.230
Mean Absolute error: 0.922
Variance score: -0.449
Julio Franco
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.82755266  0.13361604]
[2.25737706 0.71527294]
Score: 0.986
Mean squared error: 4.928
Mean Absolute error: 1.833
Variance score: -20.335
Frank Thomas
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.22041419 -0.49198749]
[-0.10167861 -0.10167883]
Score: -0.000
Mean squared error: 0.128
Mean Absolute error: 0.356
Variance score: -0.009
Dave Winfield
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.17391402 -1.83019545]
[-1.34331957  0.34582062]
Score: 0.923
Mean squared error: 3.519
Mean Absolute error: 1.847
Variance score: -2.504
Cal Ripken Jr.
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.45920525 -0.84193797]
[-0.64575007 -0.28119306]
Score: -0.041
Mean squared error: 0.175
Mean Absolute error: 0.374
Variance score: -3.768
Ivan Rodriguez
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.23174265 -0.12854735]
[0.98034091 0.95950353]
Score: 0.003
Mean squared error: 1.327
Mean Absolute error: 1.150
Variance score: -497.250
Tony Phillips
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.07508967 -0.43550875]
[ 0.13157536 -0.02190993]
Score: 0.546
Mean squared error: 0.087
Mean Absolute error: 0.235
Variance score: -0.337
Ken Griffey
(6, 19) (6, 1) (2, 19) (2, 1)




[-1.62264616 -0.40524994]
[-0.02815062 -0.02815034]
Score: -0.004
Mean squared error: 1.342
Mean Absolute error: 0.986
Variance score: -2.623
Jim Thome
(6, 19) (6, 1) (2, 19) (2, 1)




[-1.70438062  0.57668836]
[0.62604078 0.58183713]
Score: -0.002
Mean squared error: 2.715
Mean Absolute error: 1.168
Variance score: -1.087
Gary Matthews
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.7051764  -0.37452393]
[0.7697516  0.78091054]
Score: 0.016
Mean squared error: 1.755
Mean Absolute error: 1.315
Variance score: -63.217
Todd Zeile
(6, 19) (6, 1) (2, 19) (2, 1)




[0.56238941 0.58797388]
[0.59719277 0.70856346]
Score: 0.994
Mean squared error: 0.008
Mean Absolute error: 0.078
Variance score: -47.133
Rickey Henderson
(6, 19) (6, 1) (2, 19) (2, 1)




[1.06540553 1.89757387]
[0.32497527 0.50889165]
Score: 0.964
Mean squared error: 1.238
Mean Absolute error: 1.065
Variance score: -6.153
Omar Vizquel
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.008177    1.20919274]
[0.99174848 0.83760275]
Score: 0.968
Mean squared error: 0.569
Mean Absolute error: 0.686
Variance score: -0.536
Tony Pena
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.18556412 -1.07402416]
[0.19890345 0.19890403]
Score: -0.005
Mean squared error: 0.810
Mean Absolute error: 0.643
Variance score: -1.043
Jason Giambi
(6, 19) (6, 1) (2, 19) (2, 1)




[-1.0235063  -0.68166017]
[0.78089    0.78066696]
Score: -0.003
Mean squared error: 2.697
Mean Absolute error: 1.633
Variance score: -91.321
Tony Gwynn
(6, 19) (6, 1) (2, 19) (2, 1)




[0.46443889 1.29223254]
[0.59834726 0.62379283]
Score: 0.870
Mean squared error: 0.232
Mean Absolute error: 0.401
Variance score: -0.356
Manny Ramirez
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.63963037 -0.31569588]
[-0.64383591  0.81659147]
Score: 0.618
Mean squared error: 0.641
Mean Absolute error: 0.568
Variance score: -23.436
Steve Finley
(6, 19) (6, 1) (2, 19) (2, 1)




[0.68391859 0.3257673 ]
[0.46911638 0.81075487]
Score: 0.945
Mean squared error: 0.141
Mean Absolute error: 0.350
Variance score: -3.387
B.J. Surhoff
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.51876772 -0.88365849]
[0.77044363 0.77044319]
Score: -0.001
Mean squared error: 1.400
Mean Absolute error: 0.953
Variance score: -1.847
Ruben Sierra
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.83927019 -1.60012358]
[0.23675176 0.54933228]
Score: 0.152
Mean squared error: 2.889
Mean Absolute error: 1.613
Variance score: -18.962
Tom Glavine
(6, 19) (6, 1) (2, 19) (2, 1)




[-2.09854476 -0.77214274]
[0.66538485 0.66538509]
Score: -0.031
Mean squared error: 4.853
Mean Absolute error: 2.101
Variance score: -10.033
Ken Griffey Jr.
(6, 19) (6, 1) (2, 19) (2, 1)




[ 0.56689104 -0.14921553]
[0.54095522 0.54095579]
Score: -0.003
Mean squared error: 0.239
Mean Absolute error: 0.358
Variance score: -0.860
Gary Sheffield
(6, 19) (6, 1) (2, 19) (2, 1)




[1.35494526 0.98767145]
[0.23191505 0.2718354 ]
Score: -0.084
Mean squared error: 0.887
Mean Absolute error: 0.919
Variance score: -25.297
Gary Gaetti
(6, 19) (6, 1) (2, 19) (2, 1)




[0.7046304  0.47323649]
[-0.5162117  -0.51160108]
Score: -0.001
Mean squared error: 1.230
Mean Absolute error: 1.103
Variance score: -90.902
Paul Molitor
(6, 19) (6, 1) (2, 19) (2, 1)




[1.12248034 0.86503894]
[0.37667968 0.37667925]
Score: -0.002
Mean squared error: 0.397
Mean Absolute error: 0.617
Variance score: -22.982
Eddie Murray
(6, 19) (6, 1) (2, 19) (2, 1)




[-0.73270374 -0.34547299]
[-0.27138419  0.80539539]
Score: 0.850
Mean squared error: 0.769
Mean Absolute error: 0.806
Variance score: -19.505
Rafael Palmeiro
(6, 19) (6, 1) (2, 19) (2, 1)
[0.47556638 0.6543033 ]
[0.22151617 0.22727028]
Score: -0.000
Mean squared error: 0.123
Mean Absolute error: 0.341
Variance score: -14.457


