##  <span style='color:green '>Evaluate Models</span>

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import math
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')
import nfl_data_py as nfl
pd.get_option("display.max_columns")

20

###  <span style='color:green '> Set up variables to be used in Fit Determination Function</span>

In [2]:
index = []
scores = {'Train Score': [], "Test Score": [], 'Test Train Difference': [], 'Scaler': []}
scalers = ["none", StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler()]
models = [LinearRegression(),
          LogisticRegression(),
          KNeighborsRegressor(),
          RandomForestRegressor(),
          RandomForestClassifier(),
          ExtraTreesRegressor(),
          ExtraTreesClassifier(),
          AdaBoostRegressor(),
          AdaBoostClassifier()]

# Variables for year substitution 
C2021 = ['2021 Games', '2021 FantasyPoints','2021 GS','2021 Tgt', '2021 Rec', '2021 RushingTD',
        '2021 RushingYds', '2021 RushingAtt', '2021 ReceivingYds', '2021 ReceivingTD']
C2020 = ['2020 Games', '2020 FantasyPoints','2020 GS','2020 Tgt', '2020 Rec', '2020 RushingTD',
        '2020 RushingYds', '2020 RushingAtt', '2020 ReceivingYds', '2020 ReceivingTD']
C2019 = ['2019 Games', '2019 FantasyPoints','2019 GS','2020 Tgt', '2019 Rec', '2019 RushingTD',
        '2019 RushingYds', '2019 RushingAtt', '2019 ReceivingYds', '2019 ReceivingTD']

###  <span style='color:green '>Define Function to Test Models and Scalers</span>

In [3]:
def test_model(data):
    for scaler in scalers:
        global index
        global scores
        if scaler != "none":
            scaler.fit(X_train)
            X_train_scaled = scaler.transform(X_train)
            X_test_scaled = scaler.transform(X_test)
        else:
            X_train_scaled = X_train
            X_test_scaled = X_test
        data = X_train_scaled, X_test_scaled, y_train, y_test
        for model in models:
            reg = model.fit(X_train_scaled, y_train)
            y_pred = reg.predict(X_test_scaled)            
            scores["Train Score"].append(reg.score(X_train_scaled, y_train))
            scores["Test Score"].append(reg.score(X_test_scaled, y_test))
            scores["Test Train Difference"].append((reg.score(X_train_scaled, y_train)-(reg.score(X_test_scaled, y_test))))
            scores["Scaler"].append(scaler)
            index += [type(reg).__name__]

###  <span style='color:green '>Get data and evaluate readiness for model</span>

 <span style='color:green '>* Find features and target   
     * Test for imbalance </span>

In [4]:
nfl.see_pbp_cols()
pbp2021 = nfl.import_pbp_data(years=[2021], downcast=True, cache=False, alt_path=None)
pbp2021


# load additional team data 
pbpclean = pbp2021
teams = nfl.import_team_desc()
players = nfl.import_rosters([2021])
player_seasons = nfl.import_seasonal_data([2021])
player_seasons.head(5)

adp = pd.read_csv('Resources/ADPxFinal.csv') 

# When ineligible players are dropped before training the model scores suffer
# Nulls are preserved and dropped just before the Predicted file is created
# Drop players ineligible to be drafted 
# adp = adp.dropna(subset=['AVG'])

# Preserve label information for Output file 
adp_scope = adp[['Player',
                 '2019 FantasyPoints',
                 '2020 FantasyPoints',
                 '2021 FantasyPoints',
                 'Production21',
                 'Average Total Production',
                 '2021 Tm',
                 'Pos',
                 'AVG'
                ]].copy()

2021 done.
Downcasting floats.


In [5]:
# Correct "inf" and "-inf" values 
# Replace infinite updated data with 'drop'
# drop these rows from what will become our result set 
adp_scope.replace([np.inf, -np.inf], 'drop', inplace=True)
res = adp_scope[~adp_scope.eq('drop').any(1)]
adp_scope  = res
adp_scope

Unnamed: 0,Player,2019 FantasyPoints,2020 FantasyPoints,2021 FantasyPoints,Production21,Average Total Production,2021 Tm,Pos,AVG
0,Jonathan Taylor,0.0,217.0,333.0,97.250653,59.723157,IND,RB,1.0
1,Austin Ekeler,309.0,111.0,274.0,84.16,79.979791,LAC,RB,3.0
2,Cooper Kupp,270.5,117.0,295.0,100.229167,84.274732,LAR,WR,3.5
3,Derrick Henry,294.6,314.0,175.0,36.853556,67.397175,TEN,RB,3.8
4,Christian McCaffrey,469.2,73.0,91.0,39.35,47.962044,CAR,RB,4.0
...,...,...,...,...,...,...,...,...,...
509,Ethan Wolf,0.0,0.0,1.0,4.0,1.333333,NOR,TE,
510,Nick Boyle,75.1,23.0,0.0,3.333333,35.519423,BAL,TE,
511,Darrell Daniels,1.4,15.0,0.0,0.0,23.245614,ARI,TE,
512,Daniel Helm,0.0,0.0,0.0,-3.0,-1.0,LVR,TE,


In [6]:
adp

Unnamed: 0.1,Unnamed: 0,Player,Pos,2021 Tm,Age,2021 Games,2021 GS,2021 Tgt,2021 Rec,2021 RushingYds,...,TotYdsScrm20,Production20,Usage21,Touchdowns21,TotYdsScrm21,Production21,Average Total Usage,Average Total Yards,Average Total Production,AVG
0,227,Jonathan Taylor,RB,IND,22.0,17,17,51.0,40.0,1811.0,...,1468.0,81.918819,22.529412,20.0,2171.0,97.250653,13.532026,1213.000000,59.723157,1.0
1,228,Austin Ekeler,RB,LAC,26.0,16,16,94.0,70.0,911.0,...,933.0,51.712707,18.750000,20.0,1558.0,84.160000,17.283333,1347.000000,79.979791,3.0
2,0,Cooper Kupp,WR,LAR,28.0,17,17,191.0,145.0,18.0,...,1007.0,70.138889,19.764706,16.0,1965.0,100.229167,16.138235,1379.000000,84.274732,3.5
3,240,Derrick Henry,RB,TEN,27.0,8,8,20.0,18.0,937.0,...,2141.0,84.420538,29.875000,10.0,1091.0,36.853556,25.745833,1659.333333,67.397175,3.8
4,269,Christian McCaffrey,RB,CAR,25.0,7,7,41.0,37.0,442.0,...,374.0,14.615385,20.000000,2.0,785.0,39.350000,24.270833,1183.666667,47.962044,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,509,Ethan Wolf,TE,NOR,26.0,2,1,2.0,2.0,0.0,...,0.0,0.000000,2.000000,0.0,8.0,4.000000,0.666667,2.666667,1.333333,
510,510,Nick Boyle,TE,BAL,28.0,5,3,2.0,1.0,0.0,...,113.0,33.387097,0.600000,0.0,2.0,3.333333,2.889815,145.333333,35.519423,
511,511,Darrell Daniels,TE,ARI,27.0,15,4,4.0,1.0,0.0,...,92.0,58.736842,0.333333,0.0,0.0,0.000000,0.760101,32.000000,23.245614,
512,512,Daniel Helm,TE,LVR,26.0,9,1,2.0,1.0,0.0,...,0.0,0.000000,0.333333,0.0,-1.0,-3.000000,0.111111,-0.333333,-1.000000,


In [7]:
# Variables for year substitution 
CKS = ['Active2021', 'Active2020', 'Active2019']
C2021 = ['2021 Games', '2021 FantasyPoints','2021 GS','2021 Tgt', '2021 Rec', '2021 RushingTD',
        '2021 RushingYds', '2021 RushingAtt', '2021 ReceivingYds', '2021 ReceivingTD']
C2020 = ['2020 Games', '2020 FantasyPoints','2020 GS','2020 Tgt', '2020 Rec', '2020 RushingTD',
        '2020 RushingYds', '2020 RushingAtt', '2020 ReceivingYds', '2020 ReceivingTD']
C2019 = ['2019 Games', '2019 FantasyPoints','2019 GS','2019 Tgt', '2019 Rec', '2019 RushingTD',
        '2019 RushingYds', '2019 RushingAtt', '2019 ReceivingYds', '2019 ReceivingTD']

# Create Checksum for years 
adp['Active2021'] = adp[C2021].sum(axis=1)
adp['Active2020'] = adp[C2020].sum(axis=1)
adp['Active2019'] = adp[C2019].sum(axis=1)

# Replace 2020 stats with 2021 stats for learning model 
adp.loc[adp["Active2020"] == 0, '2020 Games'] = adp['2021 Games']
adp.loc[adp["Active2020"] == 0, '2020 FantasyPoints'] = adp['2021 FantasyPoints']
adp.loc[adp["Active2020"] == 0, '2020 GS'] = adp['2021 GS']
adp.loc[adp["Active2020"] == 0, '2020 Tgt'] = adp['2021 Tgt']
adp.loc[adp["Active2020"] == 0, '2020 Rec'] = adp['2021 Rec']
adp.loc[adp["Active2020"] == 0, '2020 RushingYds'] = adp['2021 RushingYds']
adp.loc[adp["Active2020"] == 0, '2020 RushingAtt'] = adp['2021 RushingAtt']
adp.loc[adp["Active2020"] == 0, '2020 ReceivingYds'] = adp['2021 ReceivingYds']
adp.loc[adp["Active2020"] == 0, '2020 ReceivingTD'] = adp['2021 ReceivingTD']

# Replace 2019 stats with 2020 stats for learning model 
adp.loc[adp["Active2019"] == 0, '2019 Games'] = adp['2020 Games']
adp.loc[adp["Active2019"] == 0, '2019 FantasyPoints'] = adp['2020 FantasyPoints']
adp.loc[adp["Active2019"] == 0, '2019 GS'] = adp['2020 GS']
adp.loc[adp["Active2019"] == 0, '2019 Tgt'] = adp['2020 Tgt']
adp.loc[adp["Active2019"] == 0, '2019 Rec'] = adp['2020 Rec']
adp.loc[adp["Active2019"] == 0, '2019 RushingTD'] = adp['2020 RushingTD']
adp.loc[adp["Active2019"] == 0, '2019 RushingAtt'] = adp['2020 RushingAtt']
adp.loc[adp["Active2019"] == 0, '2019 ReceivingYds'] = adp['2020 ReceivingYds']
adp.loc[adp["Active2019"] == 0, '2019 ReceivingTD'] = adp['2020 ReceivingTD']


# Verify all 2021 players are active 
notA21 = list(adp['Active2021']).count(0)
notA20 = list(adp['Active2020']).count(0)
notA19 = list(adp['Active2019']).count(0)

print(f'Not Active in 2019: {notA19} Not Active in 2020: {notA20} Not Active in 2021: {notA21}')

# Drop Check Sum columns from DataFrame 
# adp.drop(columns=CKS, inplace=True)
# adp

showme = adp[adp['Player'] == 'Najee Harris'] 
pd.set_option("display.max_columns", None)
showme

Not Active in 2019: 191 Not Active in 2020: 120 Not Active in 2021: 0


Unnamed: 0.1,Unnamed: 0,Player,Pos,2021 Tm,Age,2021 Games,2021 GS,2021 Tgt,2021 Rec,2021 RushingYds,2021 RushingTD,2021 RushingAtt,2021 ReceivingYds,2021 ReceivingTD,2021 FantasyPoints,2019 Games,2019 GS,2019 Tgt,2019 Rec,2019 RushingYds,2019 RushingTD,2019 RushingAtt,2019 ReceivingYds,2019 ReceivingTD,2019 FantasyPoints,2020 Games,2020 GS,2020 Tgt,2020 Rec,2020 RushingYds,2020 RushingTD,2020 RushingAtt,2020 ReceivingYds,2020 ReceivingTD,2020 FantasyPoints,Usage19,Touchdowns19,TotYdsScrm19,Production19,Usage20,Touchdowns20,TotYdsScrm20,Production20,Usage21,Touchdowns21,TotYdsScrm21,Production21,Average Total Usage,Average Total Yards,Average Total Production,AVG,Active2021,Active2020,Active2019
6,230,Najee Harris,RB,PIT,23.0,17,17,94.0,74.0,1200.0,7.0,307.0,467.0,3.0,227.0,17.0,17.0,94.0,74.0,0.0,0.0,307.0,467.0,3.0,227.0,17.0,17.0,94.0,74.0,1200.0,0.0,307.0,467.0,3.0,227.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.588235,10.0,1667.0,71.094763,7.862745,555.666667,23.698254,7.3,2413.0,0.0,0.0


In [8]:
adp_production = adp[['Player',
                     '2019 FantasyPoints',
                     '2020 FantasyPoints',
                     '2021 FantasyPoints',
                     'Production19',
                     'Production20',
                     'Production21',
                     '2021 Tm',
                     'Pos',
                     'AVG'
                    ]].copy()

In [9]:
# Verify that 2021 production has corrected values 
adp["Production21"].value_counts()

 28.000000    4
 12.000000    3
 64.000000    3
 68.000000    3
 0.000000     3
             ..
 98.227848    1
 28.157303    1
 69.062500    1
 74.666667    1
-3.000000     1
Name: Production21, Length: 473, dtype: int64

In [10]:
# Display known infinity value 
# adp.iloc[[341]]

In [11]:
# Check for invalid data
count_nan = adp.isna().sum().sum()
count_nan

292

In [12]:
# Identify invalid data 
# These columns are drop from the ML Dataset 
# They are added back to the results set 
# they will still need to be corrected in the 
# previous step. 
missing_average = adp['AVG'].isna().sum()
missing_pos = adp['Pos'].isna().sum()
missing_team = adp['2021 Tm'].isna().sum()
print(f'Column AVG has {missing_average} null values')
print(f'Column POS has {missing_pos} null values')
print(f'Column Team has {missing_team} null values')

Column AVG has 292 null values
Column POS has 0 null values
Column Team has 0 null values


In [13]:
# Remove invalid data 
# Remove unnamed column, Player, Pos and 2021 Team
col = [0,1,2,3]
adp.drop(adp.columns[col],axis=1,inplace=True)

In [14]:
adp

Unnamed: 0,Age,2021 Games,2021 GS,2021 Tgt,2021 Rec,2021 RushingYds,2021 RushingTD,2021 RushingAtt,2021 ReceivingYds,2021 ReceivingTD,2021 FantasyPoints,2019 Games,2019 GS,2019 Tgt,2019 Rec,2019 RushingYds,2019 RushingTD,2019 RushingAtt,2019 ReceivingYds,2019 ReceivingTD,2019 FantasyPoints,2020 Games,2020 GS,2020 Tgt,2020 Rec,2020 RushingYds,2020 RushingTD,2020 RushingAtt,2020 ReceivingYds,2020 ReceivingTD,2020 FantasyPoints,Usage19,Touchdowns19,TotYdsScrm19,Production19,Usage20,Touchdowns20,TotYdsScrm20,Production20,Usage21,Touchdowns21,TotYdsScrm21,Production21,Average Total Usage,Average Total Yards,Average Total Production,AVG,Active2021,Active2020,Active2019
0,22.0,17,17,51.0,40.0,1811.0,18.0,332.0,360.0,2.0,333.0,15.0,13.0,39.0,36.0,0.0,11.0,232.0,299.0,1.0,217.0,15.0,13.0,39.0,36.0,1169.0,11.0,232.0,299.0,1.0,217.0,0.000000,0.0,0.0,0.000000,18.066667,12.0,1468.0,81.918819,22.529412,20.0,2171.0,97.250653,13.532026,1213.000000,59.723157,1.0,2981.0,2032.0,0.0
1,26.0,16,16,94.0,70.0,911.0,12.0,206.0,647.0,8.0,274.0,16.0,8.0,108.0,92.0,557.0,3.0,132.0,993.0,8.0,309.0,10.0,10.0,65.0,54.0,530.0,1.0,116.0,403.0,2.0,111.0,15.000000,11.0,1550.0,104.066667,18.100000,3.0,933.0,51.712707,18.750000,20.0,1558.0,84.160000,17.283333,1347.000000,79.979791,3.0,2254.0,1302.0,2226.0
2,28.0,17,17,191.0,145.0,18.0,0.0,4.0,1947.0,16.0,295.0,16.0,14.0,134.0,94.0,4.0,0.0,2.0,1161.0,10.0,270.5,15.0,12.0,124.0,92.0,33.0,0.0,4.0,974.0,3.0,117.0,14.250000,10.0,1165.0,82.456140,14.400000,3.0,1007.0,70.138889,19.764706,16.0,1965.0,100.229167,16.138235,1379.000000,84.274732,3.5,2650.0,1374.0,1705.5
3,27.0,8,8,20.0,18.0,937.0,10.0,219.0,154.0,0.0,175.0,15.0,15.0,24.0,18.0,1540.0,16.0,303.0,206.0,2.0,294.6,16.0,16.0,31.0,19.0,2027.0,17.0,378.0,114.0,0.0,314.0,21.800000,18.0,1746.0,80.917431,25.562500,17.0,2141.0,84.420538,29.875000,10.0,1091.0,36.853556,25.745833,1659.333333,67.397175,3.8,1549.0,2932.0,2433.6
4,25.0,7,7,41.0,37.0,442.0,1.0,99.0,343.0,1.0,91.0,16.0,16.0,142.0,116.0,1387.0,15.0,287.0,1005.0,4.0,469.2,3.0,3.0,19.0,17.0,225.0,5.0,59.0,149.0,1.0,73.0,26.812500,19.0,2392.0,89.920746,26.000000,6.0,374.0,14.615385,20.000000,2.0,785.0,39.350000,24.270833,1183.666667,47.962044,4.0,1069.0,554.0,3457.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,26.0,2,1,2.0,2.0,0.0,0.0,0.0,8.0,0.0,1.0,2.0,1.0,2.0,2.0,0.0,0.0,0.0,8.0,0.0,1.0,2.0,1.0,2.0,2.0,0.0,0.0,0.0,8.0,0.0,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,2.000000,0.0,8.0,4.000000,0.666667,2.666667,1.333333,,16.0,0.0,0.0
510,28.0,5,3,2.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,16.0,15.0,43.0,31.0,0.0,0.0,0.0,321.0,2.0,75.1,9.0,9.0,17.0,14.0,0.0,0.0,0.0,113.0,2.0,23.0,4.625000,2.0,321.0,69.837838,3.444444,2.0,113.0,33.387097,0.600000,0.0,2.0,3.333333,2.889815,145.333333,35.519423,,13.0,187.0,503.1
511,27.0,15,4,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,3.0,1.0,0.0,0.0,0.0,4.0,0.0,1.4,12.0,8.0,11.0,8.0,0.0,0.0,0.0,92.0,1.0,15.0,0.363636,0.0,4.0,11.000000,1.583333,1.0,92.0,58.736842,0.333333,0.0,0.0,0.000000,0.760101,32.000000,23.245614,,24.0,147.0,20.4
512,26.0,9,1,2.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,9.0,1.0,2.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,9.0,1.0,2.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.333333,0.0,-1.0,-3.000000,0.111111,-0.333333,-1.000000,,12.0,0.0,0.0


In [15]:
# Get the mean of the Average of the Average Draft Position 
meanJoeGreen = adp["AVG"].mean()
adp["AVG"] = adp["AVG"].fillna(meanJoeGreen)

In [16]:
# Remove label columns from ML data set
# AVG removed for testing until corrected by earlier step *** Corrected above
# Subseqent versions of the input file removed these columns 
# adp = adp.drop(columns=['Team','POS'])
# adp

In [17]:
# Check for remaining nulls 
count_nan = adp.isna().sum().sum()
count_nan

0

In [18]:
# Look for non numerics 
adp.applymap(np.isreal)
invalidNumbers = adp[~adp.applymap(np.isreal).all(1)]
print(len(invalidNumbers))
if len(invalidNumbers) > 0:
    print((f'There are {len(invalidNumbers)} rows with invaid numeric data'))

0


In [19]:
# Correct "inf" and "-inf" values 
# Replace infinite updated data with nan
adp.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN that were just created by the infinity value replacements
adp.dropna(inplace=True)
adp

Unnamed: 0,Age,2021 Games,2021 GS,2021 Tgt,2021 Rec,2021 RushingYds,2021 RushingTD,2021 RushingAtt,2021 ReceivingYds,2021 ReceivingTD,2021 FantasyPoints,2019 Games,2019 GS,2019 Tgt,2019 Rec,2019 RushingYds,2019 RushingTD,2019 RushingAtt,2019 ReceivingYds,2019 ReceivingTD,2019 FantasyPoints,2020 Games,2020 GS,2020 Tgt,2020 Rec,2020 RushingYds,2020 RushingTD,2020 RushingAtt,2020 ReceivingYds,2020 ReceivingTD,2020 FantasyPoints,Usage19,Touchdowns19,TotYdsScrm19,Production19,Usage20,Touchdowns20,TotYdsScrm20,Production20,Usage21,Touchdowns21,TotYdsScrm21,Production21,Average Total Usage,Average Total Yards,Average Total Production,AVG,Active2021,Active2020,Active2019
0,22.0,17,17,51.0,40.0,1811.0,18.0,332.0,360.0,2.0,333.0,15.0,13.0,39.0,36.0,0.0,11.0,232.0,299.0,1.0,217.0,15.0,13.0,39.0,36.0,1169.0,11.0,232.0,299.0,1.0,217.0,0.000000,0.0,0.0,0.000000,18.066667,12.0,1468.0,81.918819,22.529412,20.0,2171.0,97.250653,13.532026,1213.000000,59.723157,1.000000,2981.0,2032.0,0.0
1,26.0,16,16,94.0,70.0,911.0,12.0,206.0,647.0,8.0,274.0,16.0,8.0,108.0,92.0,557.0,3.0,132.0,993.0,8.0,309.0,10.0,10.0,65.0,54.0,530.0,1.0,116.0,403.0,2.0,111.0,15.000000,11.0,1550.0,104.066667,18.100000,3.0,933.0,51.712707,18.750000,20.0,1558.0,84.160000,17.283333,1347.000000,79.979791,3.000000,2254.0,1302.0,2226.0
2,28.0,17,17,191.0,145.0,18.0,0.0,4.0,1947.0,16.0,295.0,16.0,14.0,134.0,94.0,4.0,0.0,2.0,1161.0,10.0,270.5,15.0,12.0,124.0,92.0,33.0,0.0,4.0,974.0,3.0,117.0,14.250000,10.0,1165.0,82.456140,14.400000,3.0,1007.0,70.138889,19.764706,16.0,1965.0,100.229167,16.138235,1379.000000,84.274732,3.500000,2650.0,1374.0,1705.5
3,27.0,8,8,20.0,18.0,937.0,10.0,219.0,154.0,0.0,175.0,15.0,15.0,24.0,18.0,1540.0,16.0,303.0,206.0,2.0,294.6,16.0,16.0,31.0,19.0,2027.0,17.0,378.0,114.0,0.0,314.0,21.800000,18.0,1746.0,80.917431,25.562500,17.0,2141.0,84.420538,29.875000,10.0,1091.0,36.853556,25.745833,1659.333333,67.397175,3.800000,1549.0,2932.0,2433.6
4,25.0,7,7,41.0,37.0,442.0,1.0,99.0,343.0,1.0,91.0,16.0,16.0,142.0,116.0,1387.0,15.0,287.0,1005.0,4.0,469.2,3.0,3.0,19.0,17.0,225.0,5.0,59.0,149.0,1.0,73.0,26.812500,19.0,2392.0,89.920746,26.000000,6.0,374.0,14.615385,20.000000,2.0,785.0,39.350000,24.270833,1183.666667,47.962044,4.000000,1069.0,554.0,3457.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,26.0,2,1,2.0,2.0,0.0,0.0,0.0,8.0,0.0,1.0,2.0,1.0,2.0,2.0,0.0,0.0,0.0,8.0,0.0,1.0,2.0,1.0,2.0,2.0,0.0,0.0,0.0,8.0,0.0,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,2.000000,0.0,8.0,4.000000,0.666667,2.666667,1.333333,181.646847,16.0,0.0,0.0
510,28.0,5,3,2.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,16.0,15.0,43.0,31.0,0.0,0.0,0.0,321.0,2.0,75.1,9.0,9.0,17.0,14.0,0.0,0.0,0.0,113.0,2.0,23.0,4.625000,2.0,321.0,69.837838,3.444444,2.0,113.0,33.387097,0.600000,0.0,2.0,3.333333,2.889815,145.333333,35.519423,181.646847,13.0,187.0,503.1
511,27.0,15,4,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,3.0,1.0,0.0,0.0,0.0,4.0,0.0,1.4,12.0,8.0,11.0,8.0,0.0,0.0,0.0,92.0,1.0,15.0,0.363636,0.0,4.0,11.000000,1.583333,1.0,92.0,58.736842,0.333333,0.0,0.0,0.000000,0.760101,32.000000,23.245614,181.646847,24.0,147.0,20.4
512,26.0,9,1,2.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,9.0,1.0,2.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,9.0,1.0,2.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.333333,0.0,-1.0,-3.000000,0.111111,-0.333333,-1.000000,181.646847,12.0,0.0,0.0


In [20]:
# Drop Target from our data set 
# Set Target Variable 
Target = adp["Production21"].values
# adp.drop('Production21', axis=1, inplace=True)

# Alternate target   
AltTarget = adp["AVG"].values


In [21]:
print(len(Target))

510


###  <span style='color:green '>Apply Scaling for Initial Model Test </span>

In [22]:
# Standarize data with StandardScaler
# adps = StandardScaler().fit_transform(adp)
# When best Model changed - the best scaler changed
adps = MinMaxScaler().fit_transform(adp)
print(adps[0:1])

[[0.0625     1.         1.         0.26701571 0.27586207 1.
  1.         1.         0.18615385 0.125      1.         0.875
  0.76470588 0.24840764 0.31034483 0.00773196 0.6875     0.75570033
  0.20604396 0.07692308 0.4647708  0.875      0.76470588 0.23493976
  0.28346457 0.57920549 0.64705882 0.61375661 0.19635891 0.05555556
  0.69303797 0.         0.         0.         0.         0.69107468
  0.57142857 0.68566091 0.48761202 0.75363748 1.         1.
  0.35774631 0.5246538  0.7310705  0.48222749 0.         1.
  0.69304229 0.        ]]


###  <span style='color:green '> PCA</span>
<span style='color:green '> Applying PCA to reduce dimensions while preserving 99% of the explained variance </span>

In [23]:
# Applying PCA to reduce dimensions while preserving 90% of the explained variance 
# Initialize PCA model by setting n-components to desired level
pca = PCA(n_components= .99)

# Fit our new Principal Component Analysis reduced Features to our Model
pfa = pca.fit_transform(adps)

# Transform PCA data to a DataFrame
pf = pd.DataFrame(data=pfa)
pf.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
0,2.015376,1.493647,1.623394,-0.760681,-0.302469,0.323591,0.05244,0.561642,0.211632,-0.070885,0.334292,0.609761,-0.092759,0.194964,-0.10137,0.234482,-0.049535,0.152392,-0.033121,0.136257,-0.327977,-0.231714,0.019501
1,2.376581,0.85363,0.20642,-0.408223,0.689747,-0.350882,-0.363725,0.161322,-0.065887,0.359099,0.236021,0.039666,-0.155222,0.34635,-0.266206,0.487087,0.114074,-0.149875,0.132977,0.060336,0.017681,-0.058366,-0.016382


###  <span style='color:green '>Create X and Y | Reshape the data</span>

In [24]:
# Create X (features) and y (target) sets
X = pf
y = Target

print("Shape: ", X.shape, y.shape)

Shape:  (510, 23) (510,)


###  <span style='color:green '>Split data into Training and Testing sets</span>

In [25]:
# Split the data into training and testing sets
data = X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42)

###  <span style='color:green '>Get to know our data</span>

In [26]:
%%time
# setting Random forest as sample to view data 
# classifier = ExtraTreesRegressor()
classifier = LinearRegression()
clf = classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

# Does not apply to linear Regression 
# features = clf.feature_importances_
# print(features)
# plt.bar(x = range(len(features)), height=features)
# plt.show()

Training Data Score: 0.9823376100978953
Testing Data Score: 0.9794784241406523
Wall time: 9.69 ms


In [27]:
# Does not apply to linear Regression 
# %%time
# features = sorted(zip(X.columns, clf.feature_importances_), key = lambda x: x[1])
# cols = [f[0] for f in features]
# width = [f[1] for f in features]

# fig, ax = plt.subplots()
# fig.set_size_inches(10,5)
# plt.margins(y=0.001)

# ax.barh(y=cols, width=width)

# plt.show()

In [28]:
# Does not apply to linear Regression 
# %%time
# features = sorted(zip(X.columns, clf.feature_importances_), key = lambda x: x[1])
# features = features[:10]
# cols = [f[0] for f in features]
# width = [f[1] for f in features]

# fig, ax = plt.subplots()
# fig.set_size_inches(10,5)
# plt.margins(y=0.001)

# ax.barh(y=cols, width=width)

# plt.show()

In [29]:
y_predicted = clf.predict(X_test)
scores1 = clf.score(X_test, y_test)

print(y_predicted)
print(scores1)

[ 42.16448636  38.78099111  57.26085869  68.55112712  12.62398008
  75.68561271  54.88064062  44.281016    62.42322288  49.26719751
   2.51218598  44.59362872  65.32826083  51.16028895  55.95613165
  64.41844146   8.12634876  53.37710868  -0.87087166  98.61163464
   3.28391261  84.69969529  59.78935134  70.3752606   11.18303222
  26.25065578  69.58341466   2.10245803  82.16232767  20.86932973
  33.35828064  63.13130099  53.60206199  58.11091774 116.4794307
   8.89504711  31.510115    61.29087829   9.24678847  36.01020622
  89.96402437  -5.70146909  69.0418275   99.38657419  80.44022759
  11.3802225   65.69598801  56.86430461  81.23896014  33.06106174
  77.55457458  63.61068515  41.95641835  76.96170602  35.71306975
   5.29806953  60.95566851  68.62345624  71.27176201  65.00672244
  86.05853204 102.20702969  70.99367633  76.90366107  97.20218485
  64.525843     5.98057764  80.95337518  77.50281218  64.31507324
  18.13030943  28.00409707  46.05473407  45.31249649  50.68215592
  72.595885

In [30]:
y_predicted = clf.predict(X)
print(y_predicted)

[102.20702969  83.5036513   99.38657419  27.29962613  43.27767269
 115.72837671  60.76429227  58.37592531 143.06382438  84.69969529
  89.13157515  76.90366107  91.84348017  79.36846425  50.44998529
  63.03897812 100.51698039 155.79058142  46.05473407  82.13310406
  51.93710761  82.09785685  64.525843    89.04343205  88.29840625
  72.33195299  65.2874114   71.27283235  91.07904073  65.84112202
   2.10245803  53.96404572  88.57378744  60.95566851  89.66142389
  57.46723576  81.80404881  62.49941315  65.36185116  52.02558715
  49.59293367  93.74902877  84.82612774  88.06796018  54.1016635
  78.657465    75.01056364  61.19226471  44.7448026   70.89240002
  42.44401303  72.91718444  69.68861862  74.08604768  91.91052519
  75.68561271  52.65748922  86.35288356  13.29633821  97.4701138
  41.094981    83.82649574  66.32793231  69.0418275   97.32538536
  50.7317473   49.07831781  50.67848359  86.05853204  62.46983343
  98.61163464  68.03105447  64.31507324  58.11091774 112.22294309
 110.0905668

In [31]:
adp_scope['Prediction'] = y_predicted
adp_scope.rename(columns={"2021 Tm": "Team"})

Unnamed: 0,Player,2019 FantasyPoints,2020 FantasyPoints,2021 FantasyPoints,Production21,Average Total Production,Team,Pos,AVG,Prediction
0,Jonathan Taylor,0.0,217.0,333.0,97.250653,59.723157,IND,RB,1.0,102.207030
1,Austin Ekeler,309.0,111.0,274.0,84.16,79.979791,LAC,RB,3.0,83.503651
2,Cooper Kupp,270.5,117.0,295.0,100.229167,84.274732,LAR,WR,3.5,99.386574
3,Derrick Henry,294.6,314.0,175.0,36.853556,67.397175,TEN,RB,3.8,27.299626
4,Christian McCaffrey,469.2,73.0,91.0,39.35,47.962044,CAR,RB,4.0,43.277673
...,...,...,...,...,...,...,...,...,...,...
509,Ethan Wolf,0.0,0.0,1.0,4.0,1.333333,NOR,TE,,3.283913
510,Nick Boyle,75.1,23.0,0.0,3.333333,35.519423,BAL,TE,,5.111986
511,Darrell Daniels,1.4,15.0,0.0,0.0,23.245614,ARI,TE,,8.982710
512,Daniel Helm,0.0,0.0,0.0,-3.0,-1.0,LVR,TE,,3.323570


In [32]:
adp_scope.sort_values('Prediction', ascending=False).head(50)

Unnamed: 0,Player,2019 FantasyPoints,2020 FantasyPoints,2021 FantasyPoints,Production21,Average Total Production,2021 Tm,Pos,AVG,Prediction
294,Andre Roberts,5.7,-1.0,7.0,288.0,124.922222,2TM,WR,,265.913888
412,Nick Bawden,5.7,0.0,2.0,180.0,69.444444,NYJ,RB,,164.191673
17,Deebo Samuel,187.1,48.0,262.0,144.161616,95.756771,SFO,WR,19.0,155.790581
8,Ja'Marr Chase,0.0,0.0,224.0,121.114833,40.371611,CIN,WR,9.3,143.063824
233,DeSean Jackson,36.9,31.0,56.0,136.296296,64.280783,2TM,WR,,141.741409
406,Jakob Johnson,1.5,10.0,4.0,146.2,73.4,NWE,RB,,140.313966
153,Kendrick Bourne,95.8,81.0,126.0,126.48,95.517284,NWE,WR,254.0,130.417485
298,Alex Erickson,96.24,11.0,6.0,133.571429,96.145734,CAR,WR,,129.237417
308,Gunner Olszewski,3.4,0.0,4.0,128.0,57.777778,NWE,WR,,127.31766
210,Brandon Bolden,50.9,0.0,83.0,115.892473,73.823132,NWE,RB,375.0,116.479431


In [33]:
# save
joblib.dump(classifier, "Resources/draft_position_no_QB.joblib")

# Temporary write file while we are working on data ***
adp_scope.dropna(subset=['AVG'])

adp_scope.to_csv('Resources/Draft_position_no_QB.csv', index=False)

### <span style='color:green '>The section below is to evaluate new potential Models </span>

In [34]:
# Create X (features) and y (target) sets
X = pf

#convert y values to categorical values(for model selection matrix)
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)
y = y_transformed

# Split the data into training and testing sets
data = X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42)
print(len(y))
print(len(X))

510
510


In [35]:
%%time
# call function test_model to test all models being considered
scores = {'Train Score': [], "Test Score": [], 'Test Train Difference': [], 'Scaler': []}
test_model(data)

Wall time: 32.1 s


In [36]:
# Print by Training Score
df_scores = pd.DataFrame(scores, index=index)
df_scores.sort_values('Train Score', ascending=False).head(10)

Unnamed: 0,Train Score,Test Score,Test Train Difference,Scaler
RandomForestClassifier,1.0,0.0,1.0,MinMaxScaler()
ExtraTreesRegressor,1.0,0.831264,0.168736,MinMaxScaler()
RandomForestClassifier,1.0,0.0,1.0,MaxAbsScaler()
ExtraTreesRegressor,1.0,0.843135,0.156865,MaxAbsScaler()
ExtraTreesClassifier,1.0,0.0,1.0,MaxAbsScaler()
ExtraTreesClassifier,1.0,0.0,1.0,StandardScaler()
ExtraTreesRegressor,1.0,0.836349,0.163651,StandardScaler()
RandomForestClassifier,1.0,0.0,1.0,StandardScaler()
ExtraTreesClassifier,1.0,0.0,1.0,MinMaxScaler()
RandomForestClassifier,1.0,0.0,1.0,RobustScaler()


In [37]:
# Print by Testing Score
df_scores.sort_values('Test Score', ascending=False).head(10)

Unnamed: 0,Train Score,Test Score,Test Train Difference,Scaler
LinearRegression,0.913295,0.954193,-0.040899,RobustScaler()
LinearRegression,0.913295,0.954193,-0.040899,StandardScaler()
LinearRegression,0.913295,0.954193,-0.040899,MinMaxScaler()
LinearRegression,0.913295,0.954193,-0.040899,MaxAbsScaler()
LinearRegression,0.913295,0.954193,-0.040899,none
ExtraTreesRegressor,1.0,0.843135,0.156865,MaxAbsScaler()
ExtraTreesRegressor,1.0,0.836349,0.163651,StandardScaler()
ExtraTreesRegressor,1.0,0.831264,0.168736,MinMaxScaler()
RandomForestRegressor,0.968879,0.829092,0.139787,StandardScaler()
RandomForestRegressor,0.969197,0.828762,0.140435,none


In [38]:
# Print by Testing Score
df_scores['Test Train Magnitude'] = abs(df_scores['Test Train Difference'])
df_scores.sort_values('Test Train Magnitude').head(10)

Unnamed: 0,Train Score,Test Score,Test Train Difference,Scaler,Test Train Magnitude
AdaBoostClassifier,0.026178,0.007812,0.018366,StandardScaler(),0.018366
AdaBoostClassifier,0.028796,0.007812,0.020983,RobustScaler(),0.020983
AdaBoostClassifier,0.028796,0.007812,0.020983,MinMaxScaler(),0.020983
AdaBoostClassifier,0.034031,0.007812,0.026219,MaxAbsScaler(),0.026219
LinearRegression,0.913295,0.954193,-0.040899,MaxAbsScaler(),0.040899
LinearRegression,0.913295,0.954193,-0.040899,none,0.040899
LinearRegression,0.913295,0.954193,-0.040899,RobustScaler(),0.040899
LinearRegression,0.913295,0.954193,-0.040899,StandardScaler(),0.040899
LinearRegression,0.913295,0.954193,-0.040899,MinMaxScaler(),0.040899
AdaBoostClassifier,0.049738,0.007812,0.041926,none,0.041926


While a number of different models performed better over the span of out testing. With the feature data in its final format, the linear regression model performed the best with a training score of .913 and a testing score of .954. While the magnitude between training and testing did perform the best; the over all testing and training scores for AdaBoostClassifier, took it out of consideration. 