In [5]:
import numpy as np
import pandas as pd
import csv
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import optuna
import time

def scrape(option: int, player_pos, player_name) -> pd.DataFrame:
    url = None

    if player_pos == 'D':
        # hacky way to get seasonal defensive stats
        url = 'https://www.fantasypros.com/nfl/stats/dst.php'
    else:
        url_head = r'https://www.nfl.com/players/'
        url_feet = '/stats/career'
        url = url_head + player_name + url_feet

    s_df = pd.read_html(url)
    return s_df[option]  # option of 0 = first table, 1 = second table

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.28),
        'n_estimators': trial.suggest_int('n_estimators', 40, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)  # ,'seed': trial.suggest_int('seed', 0, 10000)
    }
    m = xgb.XGBRegressor(**param)
    m.fit(data_train, label_train)
    y_pred = m.predict(data_test)
    return np.sqrt(mean_squared_error(label_test, y_pred))

In [6]:
best_sets = []

with open('ks.csv', mode ='r') as file:
    f = csv.reader(file)

    for row in f:
        fantasy_points = []
        player, pos = row[0], row[1]
        print("Optimizing: ", player)
        df = scrape(0, pos, player)  # get kicking

        my_dict = {'0': [], '1': [], '2': [], '3': [], '4': [], '5': [], '6': [], '7': []}
        final_df = pd.DataFrame(my_dict)

        for i in range(df.shape[0] - 1):
            points = 0
            contents = []

            # calculate points gained and penalty for 0-39 yd FGs
            data = df.at[i, '30-39']
            data = data.split('-')

            temp1 = int(data[0])
            other = int(data[1])
            penalty = other - temp1

            data = df.at[i, '20-29']
            data = data.split('-')
            temp2 = int(data[0])
            other = int(data[1])
            penalty += other - temp2

            data = df.at[i, '1-19']
            data = data.split('-')
            temp3 = int(data[0])
            other = int(data[1])
            penalty += other - temp3

            points += (temp1 + temp2) * 5
            points -= penalty * 2

            contents.append(temp1 + temp2)
            contents.append(penalty)

            # calculate points gained and penalty for 40-49 yard FGs
            data = df.at[i, '40-49']
            data = data.split('-')
            temp1 = int(data[0])
            other = int(data[1])
            penalty = other - temp1

            points += temp1 * 4
            points -= penalty

            contents.append(temp1)
            contents.append(penalty)

            # calculate points gained for 50+ yard FGs
            data = df.at[i, '60+']
            data = data.split('-')
            temp1 = int(data[0])

            data = df.at[i, '50-59']
            data = data.split('-')
            temp2 = int(data[0])

            points += (temp1 + temp2) * 5

            # formulate the data so xgboost can interpret it
            contents.append(temp1 + temp2)
            contents.append(df.at[i, 'FGM'])
            contents.append(df.at[i, 'FG ATT'])
            contents.append(df.at[i, 'PCT'])
            temp = pd.DataFrame(my_dict)
            temp.loc[len(df.index)] = contents
            final_df = pd.concat([final_df, temp], axis=0)

            fantasy_points.append(points)

        fantasy_points.append(sum(fantasy_points))  # this line calculates the total sum of all fantasy points on table
        df['Fantasy Points'] = fantasy_points

        df.drop('YEAR', axis=1, inplace=True)
        df.drop('TEAM', axis=1, inplace=True)
        df.drop('G', axis=1, inplace=True)

        df.drop(df.shape[0] - 1, axis=0, inplace=True)

        df = df[::-1]

        temp = pd.DataFrame()
        temp['0-39-good'] = final_df['0']
        temp['0-39-miss'] = final_df['1']
        temp['40-49-good'] = final_df['2']
        temp['40-49-miss'] = final_df['3']
        temp['50+good'] = final_df['4']
        temp['FGM'] = final_df['5']
        temp['FG ATT'] = final_df['6']
        temp['PCT'] = final_df['7']
        temp['Fantasy Points'] = df['Fantasy Points']
        target = df['Fantasy Points'].tolist()
        target.pop(0)
        target.append(0)
        temp['Target'] = target
        df = temp
        df = df[::-1]  # reverse the rows

        df = df.fillna(0)
        df = df.iloc[1:]

        #print(df.shape[0])

        if df.shape[0] > 1:
            data, label = df.iloc[:, :-1], df.iloc[:, -1]
            xgb.DMatrix(data=data, label=label, enable_categorical=True)

            data_train, data_test = data.iloc[1:], data.iloc[:1]  # most recent season
            label_train, label_test = label.iloc[1:], label.iloc[:1]

            scaler = StandardScaler()
            #print(data_train)
            data_train = scaler.fit_transform(data_train)
            data_test = scaler.transform(data_test)

            model = xgb.XGBRegressor()
            model.fit(data_train, label_train)
            preds = model.predict(data_test)

            # optimize parameters
            num_trials = 200

            start = time.time()
            optuna.logging.set_verbosity(optuna.logging.WARNING)
            study = optuna.create_study(direction='minimize', study_name='regression') # when first run
            #study = joblib.load('qb.pkl')
            study.optimize(objective, n_trials=num_trials, show_progress_bar=True, n_jobs=-1)

            #joblib.dump(study, "test.pkl")  # save study

            # result = study.best_params
            result = study.best_trial.params
            #print("Time: ", time.time() - start)
            #print(result)
            best_sets.append(result)

Optimizing:  justin-tucker


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  daniel-carlson


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  brett-maher


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  jason-myers


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  younghoe-koo


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  eddy-pineiro


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  nick-folk


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  tyler-bass


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  graham-gano


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  robbie-gould


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  riley-patterson
Optimizing:  greg-zuerlein


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  chase-mclaughlin


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  brandon-mcmanus


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  jason-sanders


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  ryan-succop


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  matt-gay


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  greg-joseph


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  evan-mcpherson
Optimizing:  ka-imi-fairbairn


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  jake-elliott


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  cade-york
Optimizing:  mason-crosby


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  mike-badgley


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  joey-slye


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  wil-lutz


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  cairo-santos


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  harrison-butker


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  chris-boswell


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  cameron-dicker
Optimizing:  randy-bullock


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  matthew-wright


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  dustin-hopkins


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  austin-seibert


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  matt-ammendola


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  rodrigo-blankenship


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

Optimizing:  taylor-bertolet
Optimizing:  caleb-shudak
Optimizing:  nick-sciba
Optimizing:  dominik-eberle
Optimizing:  josh-lambo


  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

In [7]:
best_sets

[{'max_depth': 6,
  'learning_rate': 0.25992024721458007,
  'n_estimators': 528,
  'min_child_weight': 2,
  'gamma': 0.9089477847041423,
  'subsample': 0.6878367822329426,
  'colsample_bytree': 0.03528554889841008,
  'reg_alpha': 0.8555519315237761,
  'reg_lambda': 0.8160520324365781,
  'random_state': 47},
 {'max_depth': 3,
  'learning_rate': 0.2457440549761218,
  'n_estimators': 747,
  'min_child_weight': 1,
  'gamma': 0.4781539669414922,
  'subsample': 0.9223137009622872,
  'colsample_bytree': 0.09180771862590394,
  'reg_alpha': 0.411798148001434,
  'reg_lambda': 0.8250177870660836,
  'random_state': 425},
 {'max_depth': 2,
  'learning_rate': 0.2633840104968418,
  'n_estimators': 672,
  'min_child_weight': 1,
  'gamma': 0.6530699576775191,
  'subsample': 0.7807009870865689,
  'colsample_bytree': 0.17083220570848856,
  'reg_alpha': 0.010433507864942328,
  'reg_lambda': 0.8822315451899957,
  'random_state': 659},
 {'max_depth': 5,
  'learning_rate': 0.1879799168627806,
  'n_estimators

In [8]:
max_depth_agg = []
lr_agg = []
n_est_agg = []
min_cw_agg = []
gamma_agg = []
subsam_agg = []
col_bytree_agg = []
alpha_agg = []
lambda_agg = []
random_agg = []

for node in best_sets:
    max_depth_agg.append(node['max_depth'])
    lr_agg.append(node['learning_rate'])
    n_est_agg.append(node['n_estimators'])
    min_cw_agg.append(node['min_child_weight'])
    gamma_agg.append(node['gamma'])
    subsam_agg.append(node['subsample'])
    col_bytree_agg.append(node['colsample_bytree'])
    alpha_agg.append(node['reg_alpha'])
    lambda_agg.append(node['reg_lambda'])
    random_agg.append(node['random_state'])

max_depth_agg = int(sum(max_depth_agg)/len(max_depth_agg))
lr_agg = sum(lr_agg)/len(lr_agg)
n_est_agg = int(sum(n_est_agg)/len(n_est_agg))
min_cw_agg = int(sum(min_cw_agg)/len(min_cw_agg))
gamma_agg = sum(gamma_agg)/len(gamma_agg)
subsam_agg = sum(subsam_agg)/len(subsam_agg)
col_bytree_agg = sum(col_bytree_agg)/len(col_bytree_agg)
alpha_agg = sum(alpha_agg)/len(alpha_agg)
lambda_agg = sum(lambda_agg)/len(lambda_agg)
random_agg = int(sum(random_agg)/len(random_agg))

best_set = {
    'max_depth': max_depth_agg,
    'learning_rate': lr_agg,
    'n_estimators': n_est_agg,
    'min_child_weight': min_cw_agg,
    'gamma': gamma_agg,
    'subsample': subsam_agg,
    'colsample_bytree': col_bytree_agg,
    'reg_alpha': alpha_agg,
    'reg_lambda': lambda_agg,
    'random_state': random_agg
}

best_set

{'max_depth': 5,
 'learning_rate': 0.16896843342614076,
 'n_estimators': 520,
 'min_child_weight': 2,
 'gamma': 0.3804131083440961,
 'subsample': 0.5317913069999278,
 'colsample_bytree': 0.3864875772698342,
 'reg_alpha': 0.4471409827902937,
 'reg_lambda': 0.515144918107477,
 'random_state': 511}