In [43]:
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [173]:
data_file_1 = pd.read_csv('../data/completed_df.csv')
data_file_2 = pd.read_csv('../data/completed_df_part2.csv')
data_file_1 = pd.get_dummies(data_file_1, columns=['position'], drop_first=True)
batter_data = data_file_1.append(data_file_2)
batter_data = batter_data.replace([np.inf, -np.inf], np.nan)
batter_data = batter_data.dropna()
batter_data.head()

Unnamed: 0.1,Unnamed: 0,playerID,BA,2B/AB,3B/AB,HR/AB,RBI/G,R/G,BB/G,SB/G,...,position_2.0,position_3.0,position_4.0,position_5.0,position_6.0,position_7.0,position_8.0,position_9.0,position_10.0,position_11.0
0,3083,tabatjo01,0.275126,0.050879,0.009422,0.010678,0.247544,0.408644,0.253438,0.092338,...,0,0,0,0,0,0,0,1,0,0
1,2539,preslal01,0.263383,0.035689,0.013562,0.0207,0.246667,0.348889,0.18,0.066667,...,0,0,0,0,0,1,0,0,0,0
2,2012,mccutan01,0.284697,0.056807,0.00795,0.040245,0.52752,0.60235,0.522573,0.11812,...,0,0,0,0,0,0,1,0,0,0
3,1174,gordode01,0.285937,0.032233,0.014037,0.004679,0.233533,0.530938,0.167665,0.332335,...,0,0,0,0,1,0,0,0,0,0
4,896,ellisma01,0.262458,0.050225,0.004886,0.02052,0.383275,0.480836,0.305226,0.057143,...,0,0,1,0,0,0,0,0,0,0


In [174]:
batter_data.shape

(498650, 54)

In [175]:
# Drop unused categories
batter_data = batter_data.drop(columns=['date', 'season', 'singles_', 'doubles_', 'triples_', 'home_runs_', 'rbis_', 'runs_', 'walks_', 'stolen_bases_', 'hbp_', 'ab_', 'Unnamed: 0', 'playerID', 'starting_pitcher_hand'])

X = batter_data.drop(columns=['fantasy_points'])
y = batter_data['fantasy_points']

# Normalize data so it can be handle by linear regression models
ss = StandardScaler()
ss.fit(X)
X = ss.transform(X)

# Split into a training and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y)

Index(['BA', '2B/AB', '3B/AB', 'HR/AB', 'RBI/G', 'R/G', 'BB/G', 'SB/G',
       'HBP/G', 'fantasy_ppg', 'ppg_vs_hand', 'fantasy_points', 'wind_factor',
       'temp', 'starting_p_rpip', 'hr_factor', 'ba_factor',
       'seasonal_singles_p_game', 'seasonal_doubles_p_game',
       'seasonal_triples_p_game', 'seasonal_hr_p_game', 'seasonal_rbis_p_game',
       'seasonal_runs_p_game', 'seasonal_walks_p_game', 'seasonal_sb_p_game',
       'seasonal_hbp_p_game', 'seasonal_ab_p_game', 'last_7_fantasy',
       'position_1.0', 'position_2.0', 'position_3.0', 'position_4.0',
       'position_5.0', 'position_6.0', 'position_7.0', 'position_8.0',
       'position_9.0', 'position_10.0', 'position_11.0'],
      dtype='object')

In [184]:
batter_data.shape

(498650, 39)

In [4]:
# Establish baseline to compare models to 
mean_fantasy = y_train.mean()
mean_ = np.full((y_test.shape[0],), mean_fantasy)
mean_squared_error(y_test, mean_)**.5


9.2521102791034

In [95]:
# Fit 1st model
xg_reg = xgb.XGBRegressor(n_estimators = 100, learning_rate =.1, max_depth=3)
xg_reg.fit(X_train, y_train)

y_hat_xg = xg_reg.predict(X_test)
mean_squared_error(y_test, y_hat_xg)**.5

8.773251599795252

In [8]:
# Fit 2nd model
xg_reg_2 = xgb.XGBRegressor(n_estimators = 1000, learning_rate =.1, max_depth=2)
xg_reg_2.fit(X_train, y_train)
y_hat_xg2 = xg_reg_2.predict(X_test)
mean_squared_error(y_test, y_hat_xg2)**.5

8.768181147612948

In [10]:
# Fit 3rd model
xg_reg_3 = xgb.XGBRegressor(n_estimators = 1000, learning_rate =.2, max_depth=1)
xg_reg_3.fit(X_train, y_train)
y_hat_xg3 = xg_reg_3.predict(X_test)
mean_squared_error(y_test, y_hat_xg3)**.5

8.779845452495143

In [12]:
# Fit 4th model
line = LinearRegression()
line.fit(X_train, y_train)
y_hat_line = line.predict(X_test)
mean_squared_error(y_test, y_hat_line)**.5

8.79087168263175

In [93]:
# Fit 5th model
features = [22,0]
line_2 = LinearRegression()
line_2.fit(X_train[:,features], y_train)
y_hat_line_2 = line_2.predict(X_test[:,features])
mean_squared_error(y_test, y_hat_line_2)**.5

9.077051025960968

In [65]:
# Fit 6th model
rfor = RandomForestRegressor(n_estimators=100, max_depth=3)
rfor.fit(X_train[:100000,[0,5]], y_train[:100000])
y_hat_rfor = rfor.predict(X_test[:,[0,5]])
mean_squared_error(y_test, y_hat_rfor)**.5

9.006970220591485

In [107]:
# Combine models into an ensemble and give a weight to each model
def ensemble(w1, w2,w3,w4,w5,w6):
    sum_w = w1+w2+w3+w4+w5+w6
    return (y_hat_xg * w1 + y_hat_xg2 * w2 + y_hat_xg3 * w3 + y_hat_line * w4 + y_hat_rfor * w5 + y_hat_rfor * w6) / sum_w



In [106]:
def find_best_weights(l1,l2,l3,l4,l5,l6):
    best = 100
    for w1 in l1:
        for w2 in l2:
            for w3 in l3:
                for w4 in l4:
                    for w5 in l5:
                        for w6 in l6:
                            y_hat = ensemble(w1,w2,w3,w4,w5,w6)
                            try:
                                mse = mean_squared_error(y_test, y_hat)**.5
                                if mse < best:
                                    best = mse
                                    weights = f'{w1}, {w2}, {w3}, {w4}, {w5}, {w6}'
                            except:
                                pass
    print(weights, best)
    
    
l1 = [.8,.9,1,1.1,1.2]
l2 = [4.8,4.9,5,5.1,5.2]
l3 = [0, .1,.2,.3]
l4 = [.8,.9,1,1.,1.2]
l5 = [0,.1,.2,.3]
l6 = [0,1,2,3,4,5]

find_best_weights(l1,l2,l3,l4,l5,l6)

1.1, 5.1, 0, 0.9, 0, 0 8.767049088901341


In [111]:
dump(xg_reg, 'fit_models/1_xgboost.joblib')

['fit_models/1_xgboost.joblib']

In [112]:
dump(xg_reg_2, 'fit_models/2_xgboost.joblib')


['fit_models/2_xgboost.joblib']

In [113]:
dump(line, 'fit_models/3_line.joblib')

['fit_models/3_line.joblib']

In [114]:
weight_dictionary = {'w1':1.1, 'w2':5.1, 'w3':.9}

In [180]:
file = open("fit_models/weights.txt", "w")
file.write("{'w1':1.1, 'w2':5.1, 'w3':.9}")
file.close()