In [7]:
#Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
pd.set_option('display.max_columns', 500)

%matplotlib inline


In [8]:

# Load the per-season NBA and NCAA stat tables; the first CSV column is used as the index.
# low_memory=False makes pandas infer every column's dtype from the whole file instead of
# chunk by chunk, which avoids mixed-type columns.
# NOTE(review): presumably the truncated warning recorded under this cell is a DtypeWarning - confirm.
nba = pd.read_csv('data/nba_stats_info2.csv', index_col=0, low_memory=False)
ncaa = pd.read_csv('data/ncaa_stats_info2.csv', index_col=0, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Encode the class-standing label as an ordinal: freshman labels -> 1, sophomore -> 2,
# junior -> 3 (redshirt variants included); every other value falls through to 4.
_grade_codes = {
    'Fr': 1, 'RS-Fr': 1,
    'So': 2, 'RS-So': 2,
    'Jr': 3, 'RS-Jr': 3,
}
grades = [_grade_codes.get(label, 4) for label in ncaa.grade]

ncaa.grade = grades
        

In [10]:
# Keep NBA seasons in which the player appeared in at least 20 games.
nba_played = nba.loc[nba.gp >= 20]

# Keep college seasons in which the player appeared in at least 15 games.
ncaa_played = ncaa.loc[(ncaa.gp >= 15)]

# Rookie NBA seasons: rows whose season_count is 1.
rookies = nba_played.loc[nba_played.season_count == 1]

# One row per player name, taken from the end of each name group.
# NOTE(review): GroupBy.last() takes the last non-null value of each column, so a row may
# blend several seasons; it also relies on the file's row order and on names being unique
# per player - confirm that is intended.
last_college_stats = ncaa_played.groupby('name').last().reset_index()

# Keep players whose highest level reached is the NBA or who are tagged '2017-18'.
preNBA = last_college_stats.loc[(last_college_stats.highest_level_reached.isin(['NBA','2017-18']))]

In [186]:
# Rookie-season columns to attach to each player's college row (join key first).
rookie_cols = ['realgm_summary_page','ts_pct','blk_pct','ast_pct','stl_pct','reb_pct',
               'ows','ortg','dws','drtg','ws','per']

# Left join keeps every college row; rookie columns that clash with college
# column names receive the "_nba" suffix.
ncaaClean = preNBA.merge(
    rookies[rookie_cols],
    how='left',
    on='realgm_summary_page',
    suffixes=(['','_nba']),
)
ncaaClean.head()

Unnamed: 0,name,realgm_summary_page,highest_level_reached,season,school,league,conference,teamid,year,grade,gp,gs,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,pf,tov,pts,mp_tot,fgm_tot,fga_tot,fg3m_tot,fg3a_tot,ftm_tot,fta_tot,oreb_tot,dreb_tot,reb_tot,ast_tot,stl_tot,blk_tot,pf_tot,tov_tot,pts_tot,dbl_dbl,tpl_dbl,pts40,pts20,ast20,techs,hob,ast_to,stl_to,ft_fga,w,l,win_pct,ows,dws,ws,ts_pct,efg_pct,oreb_pct,dreb_pct,reb_pct,ast_pct,tov_pct,stl_pct,blk_pct,usg_pct,total_s_pct,ppr,pps,ortg,drtg,per,ff,season_count,realgm_link,id,pos,height,weight,final_school,dob_code,added_birthdays,bday,hometown,highest_level,rsci_year,rsci_rank,year247,rank247,max_pred_wingspan,avg_pred_wingspan,max_wingspan,avg_wingspan,last_cbb_year,ncaa_seasons (d-i),nba_seasons,intl_seasons,gl_seasons,sl_seasons,intl_before_college,sports-reference_id_1,sports-reference_id_2,Unnamed: 29,draft_year,pick,age,draft_age,ts_pct_nba,blk_pct_nba,ast_pct_nba,stl_pct_nba,reb_pct_nba,ows_nba,ortg_nba,dws_nba,drtg_nba,ws_nba,per_nba
0,A.J. Brodeur,/player/AJ-Brodeur/Summary/86008,2017-18,2017-18,Pennsylvania,ncaa,Ivy League,160.0,2017,2,33,33,31.0,5.42,10.15,0.534,0.61,2.15,0.282,1.67,2.73,0.611,1.79,5.42,7.21,2.55,0.88,1.21,2.7,2.42,13.12,1024.0,179,335,20,71,55,90,59,179,238,84,29,40,89,80,433,5.0,0.0,0.0,0.0,0.0,0.0,0.295,1.05,0.36,0.27,24.0,9.0,0.727,1.72,2.34,4.05,0.573,0.564,6.55,19.13,12.96,16.94,17.48,1.61,4.01,22.87,142.71,-2.34,1.29,106.4,93.0,18.45,5.9461,2,/player/AJ-Brodeur/Summary/86008,86008,PF,80,220,Pennsylvania,1997104.0,,10/4/1997,Northfield (MA),2017-18,2016.0,232.0,,,83.69,83.24,,,,2,0,0,0,0,,/cbb/players/aj-brodeur-1.html,,,,99.0,20.005479,,,,,,,,,,,,
1,A.J. Hammons,/player/AJ-Hammons/Summary/24294,NBA,2015-16,Purdue,ncaa,Big Ten Conference,105.0,2015,4,33,20,24.6,6.03,10.18,0.592,0.18,0.33,0.545,2.73,3.85,0.709,2.33,5.85,8.18,1.12,0.27,2.55,2.36,1.97,14.97,813.0,199,336,6,11,90,127,77,193,270,37,9,84,78,65,494,10.0,0.0,0.0,0.0,0.0,0.0,0.262,0.57,0.14,0.38,24.0,9.0,0.727,2.83,2.18,5.01,0.623,0.601,11.56,24.86,18.71,10.6,14.09,0.66,10.4,28.64,184.64,-5.1,1.47,117.9,88.4,29.21,6.1319,4,/player/AJ-Hammons/Summary/24294,24294,C,84,250,Purdue,19920827.0,,8/27/1992,Carmel (IN),NBA,2012.0,74.0,2012.0,76.0,86.82,86.28,87.0,87.0,2016.0,4,1,0,2,2,,/cbb/players/aj-hammons-1.html,,,2016.0,46.0,23.109589,24.112329,0.472,7.49,3.87,0.32,13.01,-0.23,86.7,0.14,110.2,-0.08,8.41
2,A.J. Price,/player/AJ-Price/Summary/1656,NBA,2008-09,Connecticut,ncaa,Big East Conference,64.0,2008,4,35,35,31.8,4.86,11.91,0.408,2.34,5.83,0.402,2.66,3.69,0.721,0.57,2.91,3.49,4.69,0.69,0.0,1.26,2.69,14.71,1112.0,170,417,82,204,93,129,20,102,122,164,24,0,44,94,515,3.0,0.0,0.0,0.0,0.0,0.0,0.346,1.74,0.26,0.31,30.0,5.0,0.857,2.89,1.85,4.74,0.538,0.506,2.08,8.97,5.81,28.35,16.43,1.26,0.0,25.17,153.06,1.35,1.24,110.7,98.2,17.78,4.7724,3,/player/AJ-Price/Summary/1656,1656,G,74,195,Connecticut,19861007.0,,10/7/1986,Orange (NJ),NBA,2004.0,24.0,2004.0,6.0,77.02,76.71,75.75,75.75,2009.0,5,6,2,0,1,,,,,2009.0,52.0,22.0,23.0,0.53,0.25,20.41,2.01,5.58,0.32,101.7,0.74,109.1,1.06,13.92
3,Aaron Brooks,/player/Aaron-Brooks/Summary/53,NBA,2006-07,Oregon,ncaa,Pacific 10 Conference,238.0,2006,4,35,35,36.8,6.09,13.23,0.46,2.29,5.66,0.404,3.29,3.89,0.846,0.89,3.37,4.26,4.26,1.37,0.17,2.51,2.54,17.74,1289.0,213,463,80,198,115,136,31,118,149,149,48,6,88,89,621,1.0,0.0,0.0,0.0,0.0,0.0,0.396,1.67,0.54,0.29,28.0,7.0,0.8,4.7,2.14,6.84,0.589,0.546,2.89,10.92,6.92,23.97,14.43,2.24,0.51,24.38,170.97,0.8,1.34,120.9,99.0,21.11,4.4474,4,/player/Aaron-Brooks/Summary/53,53,G,72,161,Oregon,19850114.0,,1/14/1985,Seattle (WA),NBA,2003.0,32.0,2003.0,30.0,74.37,74.12,76.0,76.0,2007.0,4,10,1,1,2,,,,,2007.0,26.0,21.726027,22.726027,0.535,0.59,23.28,1.13,5.2,0.62,106.5,0.81,105.1,1.42,12.93
4,Aaron Gordon,/player/Aaron-Gordon/Summary/24308,NBA,2013-14,Arizona,ncaa,Pacific 12 Conference,235.0,2013,1,38,38,31.2,4.97,10.03,0.496,0.42,1.18,0.356,2.0,4.74,0.422,2.66,5.29,7.95,1.97,0.89,1.03,2.37,1.45,12.37,1187.0,189,381,16,45,76,180,101,201,302,75,34,39,90,55,470,8.0,0.0,0.0,0.0,0.0,0.0,0.265,1.36,0.62,0.47,33.0,5.0,0.868,2.25,3.35,5.6,0.504,0.517,10.25,19.33,14.91,12.96,10.55,1.78,3.37,23.15,127.38,-0.44,1.23,109.0,87.7,19.37,4.9648,1,/player/Aaron-Gordon/Summary/24308,24308,F,81,220,Arizona,19950916.0,,9/16/1995,San Jose (CA),NBA,2013.0,3.0,2013.0,3.0,83.77,83.77,83.75,83.31,2014.0,1,4,0,0,2,,/cbb/players/aaron-gordon-1.html,,,2014.0,4.0,18.054795,19.054795,0.517,2.23,6.15,1.34,11.86,0.36,102.0,0.65,107.4,1.02,11.31


In [12]:
# Coerce weight to a numeric dtype, then drop the rows that have no age.
ncaaClean['weight'] = pd.to_numeric(ncaaClean['weight'])
ncaaClean = ncaaClean.loc[ncaaClean.age.notnull()]

In [13]:
# College features fed to every model in this notebook.
training_cols = ['grade','mp','fgm','fga','fg_pct','fg3m','fg3a','fg3_pct','ftm','fta','ft_pct','oreb','dreb','reb','ast'
                 ,'stl','blk','pf','tov','pts','dbl_dbl','tpl_dbl','pts40','pts20','ast20','ast_to','stl_to','ft_fga','win_pct'
                 ,'ows','dws','ws','ts_pct','efg_pct','oreb_pct','dreb_pct','ast_pct','tov_pct','stl_pct','blk_pct'
                 ,'usg_pct','ortg','drtg','per','season_count','height','weight','age']

# Row masks, built once so each X/y pair is guaranteed to select the same rows.
train_rows = (ncaaClean.year < 2016) & ncaaClean.ts_pct_nba.notnull()
test_rows = ncaaClean.year == 2016
holdout_rows = ncaaClean.highest_level_reached == '2017-18'

# Train: seasons before 2016 that have a rookie TS% label. Missing features are
# zero-filled exactly like the test and holdout frames below.
Xtr = ncaaClean.loc[train_rows, training_cols].fillna(0)
ytr = ncaaClean.loc[train_rows, 'ts_pct_nba']

# Test: the 2016 season; ytest keeps missing labels so unlabelled players can still be listed.
Xtest = ncaaClean.loc[test_rows, training_cols].fillna(0)
ytest = ncaaClean.loc[test_rows, 'ts_pct_nba']

# Holdout: players whose highest level reached is tagged '2017-18'.
Xholdout = ncaaClean.loc[holdout_rows, training_cols].fillna(0)
yholdout = ncaaClean.loc[holdout_rows, 'ts_pct_nba']

In [106]:
def get_kbest(X,y,score_func = f_regression, k = 10):
    """Return the labels of the k columns of X selected by SelectKBest.

    X is standardised with StandardScaler, a SelectKBest(score_func, k=k)
    selector is fit on the scaled values and y, and the labels of the
    selected columns are returned as a list in X's column order.
    """
    selector = SelectKBest(score_func, k=k)
    selector.fit(StandardScaler().fit_transform(X), y)
    return list(X.columns[selector.get_support()])

In [15]:
# Rows without a rookie TS% value that are not tagged '2017-18'.
no_rookie_ts = ncaaClean.ts_pct_nba.isnull()
not_current = ncaaClean.highest_level_reached != '2017-18'
ncaaClean.loc[no_rookie_ts & not_current]

Unnamed: 0,name,realgm_summary_page,highest_level_reached,season,school,league,conference,teamid,year,grade,gp,gs,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,pf,tov,pts,mp_tot,fgm_tot,fga_tot,fg3m_tot,fg3a_tot,ftm_tot,fta_tot,oreb_tot,dreb_tot,reb_tot,ast_tot,stl_tot,blk_tot,pf_tot,tov_tot,pts_tot,dbl_dbl,tpl_dbl,pts40,pts20,ast20,techs,hob,ast_to,stl_to,ft_fga,w,l,win_pct,ows,dws,ws,ts_pct,efg_pct,oreb_pct,dreb_pct,reb_pct,ast_pct,tov_pct,stl_pct,blk_pct,usg_pct,total_s_pct,ppr,pps,ortg,drtg,per,ff,season_count,realgm_link,id,pos,height,weight,final_school,dob_code,added_birthdays,bday,hometown,highest_level,rsci_year,rsci_rank,year247,rank247,max_pred_wingspan,avg_pred_wingspan,max_wingspan,avg_wingspan,last_cbb_year,ncaa_seasons (d-i),nba_seasons,intl_seasons,gl_seasons,sl_seasons,intl_before_college,sports-reference_id_1,sports-reference_id_2,Unnamed: 29,draft_year,pick,age,draft_age,ts_pct_nba,blk_pct_nba,ast_pct_nba,stl_pct_nba,reb_pct_nba,ows_nba,dws_nba,ws_nba,per_nba
8,Aaron Jackson,/player/Aaron-Jackson/Summary/1723,NBA,2008-09,Duquesne,ncaa,Atlantic 10 Conference,17.0,2008,4,34,34,36.7,6.74,12.15,0.554,1.38,3.41,0.405,4.47,5.53,0.809,1.79,3.68,5.47,5.71,1.65,0.09,1.85,3.03,19.32,1249.0,229,413,47,116,152,188,61,125,186,194,56,3,63,103,657,3.0,0.0,1.0,0.0,0.0,1.0,0.452,1.88,0.54,0.46,21.0,13.0,0.618,5.82,1.18,7.01,0.654,0.611,5.71,11.77,8.73,31.43,17.02,2.56,0.25,23.89,176.82,2.00,1.59,127.5,105.4,25.32,5.7104,4,/player/Aaron-Jackson/Summary/1723,1723,G,76,185,Duquesne,19860506.0,,5/6/1986,Hartford (CT),NBA,,,,,78.1,77.88,75.5,75.5,2009.0,4,1,9,0,2,,,,,,99.0,22.421918,,,,,,,,,,
10,Aaron Miles,/player/Aaron-Miles/Summary/1106,NBA,2004-05,Kansas,ncaa,Big 12 Conference,54.0,2004,4,30,30,33.1,2.87,6.27,0.457,1.33,2.67,0.500,2.23,2.83,0.788,0.60,2.93,3.53,7.17,1.67,0.33,1.90,3.07,9.30,992.0,86,188,40,80,67,85,18,88,106,215,50,10,57,92,279,1.0,0.0,0.0,0.0,0.0,0.0,0.376,2.34,0.54,0.45,23.0,7.0,0.767,2.00,2.18,4.18,0.611,0.564,2.14,8.99,5.83,38.10,28.72,3.04,1.10,16.27,174.57,5.03,1.48,114.7,96.2,16.46,7.9011,3,/player/Aaron-Miles/Summary/1106,1106,G,73,175,Kansas,19830413.0,,4/13/1983,Portland (OR),NBA,,,,,75.5,75.22,75.5,75.5,2005.0,4,1,7,1,6,,,,,,99.0,21.484932,,,,,,,,,,
16,Adonis Thomas,/player/Adonis-Thomas/Summary/24150,NBA,2012-13,Memphis,ncaa,Conference USA,301.0,2012,2,36,36,29.3,4.31,10.64,0.405,0.72,2.47,0.292,2.36,3.14,0.752,1.53,2.94,4.47,1.92,0.67,0.69,1.39,1.61,11.69,1056.0,155,383,26,89,85,113,55,106,161,69,24,25,50,58,421,1.0,0.0,0.0,0.0,0.0,0.0,0.230,1.19,0.41,0.30,31.0,5.0,0.861,1.80,1.64,3.45,0.482,0.439,6.15,11.00,8.67,12.41,11.72,1.31,2.42,22.95,144.90,-1.08,1.10,104.0,97.3,14.95,4.3806,2,/player/Adonis-Thomas/Summary/24150,24150,PF,79,240,Memphis,19930325.0,,3/25/1993,Memphis (TN),NBA,2011.0,8.0,2011.0,9.0,82.29,82,85,84.38,2013.0,2,1,2,3,3,,/cbb/players/adonis-thomas-1.html,,,,99.0,19.534247,,,,,,,,,,
26,Alan Williams,/player/Alan-Williams/Summary/31153,NBA,2014-15,UC Santa Barbara,ncaa,Big West Conference,115.0,2014,4,26,26,32.6,6.15,13.42,0.458,0.00,0.12,0.000,4.96,6.46,0.768,3.31,8.54,11.85,1.73,1.19,1.81,2.69,1.96,17.27,847.0,160,349,0,3,129,168,86,222,308,45,31,47,70,51,449,17.0,0.0,0.0,1.0,0.0,0.0,0.320,0.88,0.61,0.48,14.0,12.0,0.538,2.70,2.18,4.88,0.524,0.458,12.15,31.61,21.84,12.86,10.63,2.34,6.47,30.82,122.63,-2.58,1.29,111.2,88.2,28.21,5.3427,4,/player/Alan-Williams/Summary/31153,31153,FC,80,260,UC Santa Barbara,19930128.0,,1/28/1993,Phoenix (AZ),NBA,,,,,84.96,84.59,85.75,85.67,2015.0,4,3,1,2,2,,/cbb/players/alan-williams-2.html,,,,99.0,21.687671,,,,,,,,,,
27,Alando Tucker,/player/Alando-Tucker/Summary/55,NBA,2006-07,Wisconsin,ncaa,Big Ten Conference,106.0,2006,4,36,36,32.8,7.11,15.14,0.470,1.11,3.42,0.325,4.56,6.92,0.659,2.17,3.19,5.36,2.03,0.89,0.28,1.25,1.67,19.89,1180.0,256,545,40,123,164,249,78,115,193,73,32,10,45,60,716,1.0,0.0,0.0,0.0,0.0,0.0,0.379,1.22,0.53,0.46,30.0,6.0,0.833,4.68,2.67,7.34,0.540,0.506,8.22,11.86,10.06,16.15,8.30,1.71,0.95,33.00,145.36,-1.01,1.31,116.2,93.0,25.70,3.9904,5,/player/Alando-Tucker/Summary/55,55,F,78,205,Wisconsin,19840211.0,,2/11/1984,Lockport (IL),NBA,,,,,80.96,80.7,Not Available,Not Available,2007.0,5,3,7,3,3,,,,,2007.0,29.0,22.652055,23.652055,,,,,,,,,
30,Alex Acker,/player/Alex-Acker/Summary/158,NBA,2004-05,Pepperdine,ncaa,West Coast Conference,334.0,2004,3,31,31,37.2,5.90,13.13,0.450,1.90,4.42,0.431,2.94,3.42,0.858,1.42,5.06,6.48,3.68,1.29,0.48,1.87,2.65,16.65,1152.0,183,407,59,137,91,106,44,157,201,114,40,15,58,82,516,2.0,0.0,0.0,0.0,0.0,0.0,0.364,1.39,0.49,0.26,17.0,14.0,0.548,2.89,0.50,3.39,0.564,0.522,2.50,9.18,5.79,19.90,15.20,2.43,1.41,22.15,173.88,-0.58,1.27,113.6,114.4,22.68,4.5478,3,/player/Alex-Acker/Summary/158,158,SG,77,185,Pepperdine,19830121.0,,1/21/1983,Compton (CA),NBA,,,,,79.12,78.95,84,84,2005.0,3,2,10,1,3,,,,,2005.0,60.0,21.709589,22.709589,,,,,,,,,
32,Alex Kirk,/player/Alex-Kirk/Summary/9391,NBA,2013-14,New Mexico,ncaa,Mountain West Conference,210.0,2013,3,32,31,32.0,5.16,10.53,0.490,0.38,1.75,0.214,2.56,3.84,0.667,2.28,6.38,8.66,1.09,0.38,2.66,2.56,1.78,13.25,1025.0,165,337,12,56,82,123,73,204,277,35,12,85,82,57,424,14.0,0.0,0.0,0.0,0.0,0.0,0.251,0.61,0.21,0.36,25.0,7.0,0.781,1.74,2.23,3.97,0.536,0.507,8.83,21.07,15.43,7.55,12.60,0.71,8.86,23.16,137.06,-3.35,1.26,107.7,94.5,19.55,5.1728,3,/player/Alex-Kirk/Summary/9391,9391,C,84,245,New Mexico,19911114.0,,11/14/1991,Los Alamos (NM),NBA,2010.0,103.0,2010.0,90.0,85.98,85.56,87.5,87.25,2014.0,4,1,3,1,3,,/cbb/players/alex-kirk-2.html,,,,99.0,21.895890,,,,,,,,,,
35,Alex Poythress,/player/Alex-Poythress/Summary/24266,NBA,2015-16,Kentucky,ncaa,Southeastern Conference,258.0,2015,4,31,23,23.6,3.84,6.39,0.601,0.23,0.74,0.304,2.32,3.29,0.706,1.77,4.26,6.03,0.32,0.61,0.68,3.61,1.19,10.23,732.0,119,198,7,23,72,102,55,132,187,10,19,21,112,37,317,4.0,0.0,0.0,0.0,0.0,1.0,0.146,0.27,0.51,0.52,23.0,8.0,0.742,2.19,1.39,3.58,0.643,0.619,8.75,19.38,14.28,2.50,13.05,1.50,2.85,18.99,161.12,-4.18,1.60,125.4,96.6,19.99,5.3381,4,/player/Alex-Poythress/Summary/24266,24266,F,79,238,Kentucky,19930906.0,,9/6/1993,Clarksville (TN),NBA,2012.0,8.0,2012.0,8.0,82.89,82.76,85,83.79,2016.0,4,2,0,2,2,,/cbb/players/alex-poythress-1.html,,,,99.0,22.082192,,,,,,,,,,
36,Alex Stepheson,/player/Alex-Stepheson/Summary/2389,NBA,2010-11,USC,ncaa,Pacific 10 Conference,242.0,2010,4,34,34,32.9,4.03,7.29,0.552,0.00,0.00,0.000,1.74,3.38,0.513,3.24,5.97,9.21,0.18,0.38,1.15,2.26,1.53,9.79,1118.0,137,248,0,0,59,115,110,203,313,6,13,39,77,52,333,12.0,0.0,0.0,0.0,0.0,0.0,0.176,0.12,0.25,0.46,19.0,15.0,0.559,1.74,2.18,3.92,0.550,0.552,11.64,22.17,16.82,1.13,14.66,0.72,3.84,17.18,106.55,-4.46,1.34,108.1,96.0,16.36,6.2828,4,/player/Alex-Stepheson/Summary/2389,2389,PF,82,249,North Carolina,19870807.0,,8/7/1987,Los Angeles (CA),NBA,2006.0,40.0,2006.0,39.0,85.26,85.15,84,84,2011.0,4,1,4,1,3,,/cbb/players/alex-stepheson-1.html,,,,99.0,23.167123,,,,,,,,,,
38,Alfonzo McKinnie,/player/Alfonzo-McKinnie/Summary/22478,NBA,2014-15,Green Bay,ncaa,Horizon League,148.0,2014,4,33,20,21.6,3.18,7.03,0.453,0.64,1.94,0.328,1.00,1.97,0.508,1.97,3.36,5.33,0.18,0.48,0.48,3.39,1.39,8.00,713.0,105,232,21,64,33,65,65,111,176,6,16,16,112,46,264,4.0,0.0,0.0,0.0,0.0,1.0,0.134,0.13,0.35,0.28,24.0,9.0,0.727,0.62,1.49,2.11,0.502,0.498,10.73,18.09,14.43,1.76,14.89,1.39,2.49,22.65,128.84,-5.92,1.14,97.4,93.1,13.61,6.1097,4,/player/Alfonzo-McKinnie/Summary/22478,22478,F,80,215,Eastern Illinois,19920917.0,,9/17/1992,Chicago (IL),NBA,,,,,83.67,83.28,Not Available,Not Available,2015.0,4,1,1,2,1,,/cbb/players/alfonzo-mckinnie-1.html,,,,99.0,22.052055,,,,,,,,,,


In [16]:
# get_kbest returns a flat list of column labels, so display the list itself;
# indexing it with [0] would show only the first label.
kbest = get_kbest(Xtr, ytr, k=15)
kbest

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


['fg_pct',
 'oreb',
 'reb',
 'ast',
 'blk',
 'ft_fga',
 'ts_pct',
 'efg_pct',
 'oreb_pct',
 'dreb_pct',
 'ast_pct',
 'blk_pct',
 'per',
 'height',
 'weight']

In [17]:
def avg_score(X,y, model,score="neg_mean_squared_error", cv=5):
    """Cross-validate ``model`` on (X, y) and return the mean fold score.

    ``score`` is passed to cross_val_score as the scoring name and ``cv`` as
    the fold specification. For the default "neg_mean_squared_error" each
    fold score is negated and square-rooted first, so the value returned is
    the mean of the per-fold RMSEs; any other score is averaged as-is.
    """
    fold_scores = cross_val_score(model, X, y, scoring=score, cv=cv)
    if score != "neg_mean_squared_error":
        return fold_scores.mean()
    # sklearn reports MSE negated; flip the sign before taking the root.
    return np.sqrt(-fold_scores).mean()

def scaled_ols():
    """Build an unfitted pipeline: StandardScaler -> PolynomialFeatures -> LinearRegression.

    All three steps use their default settings; the step names are
    "scaler", "poly" and "ols".
    """
    steps = [
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures()),
        ("ols", LinearRegression()),
    ]
    return Pipeline(steps)

def scaled_ridge():
    """Build an unfitted pipeline: StandardScaler -> PolynomialFeatures -> Ridge.

    All three steps use their default settings; the step names are
    "scaler", "poly" and "ridge".
    """
    steps = [
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures()),
        ("ridge", Ridge()),
    ]
    return Pipeline(steps)

def scaled_lasso():
    """Build an unfitted pipeline: StandardScaler -> PolynomialFeatures -> Lasso.

    All three steps use their default settings; the step names are
    "scaler", "poly" and "lasso".
    """
    steps = [
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures()),
        ("lasso", Lasso()),
    ]
    return Pipeline(steps)

In [18]:
# 10-fold cross-validated RMSE of the untuned scaled ridge pipeline on the training data.
untuned_ridge = scaled_ridge()
rmse_train = avg_score(Xtr, ytr, untuned_ridge, cv=10)

rmse_train

0.13342497865187092

In [19]:
from sklearn.model_selection import GridSearchCV

# Build the ridge regression pipeline to tune. It is bound to `ridge_pipeline`
# rather than `ridge` so that re-running this cell cannot overwrite the
# ridge(X, y) helper function that this notebook also defines.
ridge_pipeline = scaled_ridge()

# Hyper-parameter grid: nine alphas from 1e-3 to 1e5, crossed with three solvers.
alphas = np.logspace(-3,5,9)
solvers = ['svd', 'cholesky', 'lsqr']

param_grid = [
        {'ridge__alpha':alphas,'ridge__solver':solvers}
        ]

# Exhaustive search scored by negated mean squared error.
grid_search = GridSearchCV(ridge_pipeline, param_grid, scoring = 'neg_mean_squared_error')

grid_search.fit(Xtr, ytr)
grid_search.best_params_

{'ridge__alpha': 10000.0, 'ridge__solver': 'svd'}

In [20]:
# Take the best estimator found by the grid search and report its 10-fold CV score
# (avg_score converts neg_mean_squared_error into an RMSE).
ridge_best = grid_search.best_estimator_
avg_score(Xtr, ytr, ridge_best, score='neg_mean_squared_error', cv=10)

0.05849593083583454

In [21]:
# Fit the tuned model on the training data, then predict the test rows.
ridge_best.fit(Xtr, ytr)
y_pred = ridge_best.predict(Xtest)

In [22]:

# Predicted vs. actual values for the test rows, indexed like ytest.
# Built from whole columns rather than zip(): zip silently truncates to the shorter
# input, whereas this raises if y_pred and ytest differ in length.
real_pred = pd.DataFrame({'pred': y_pred, 'test': ytest.values},
                         index = ytest.index, columns = ['pred','test'])

In [23]:
# Attach player names by index and list the 20 highest predictions.
named_pred = ncaaClean[['name']].merge(real_pred, left_index=True, right_index=True)
named_pred.sort_values('pred', ascending=False).head(20)

Unnamed: 0,name,pred,test
99,Austin Wiley,0.554668,
532,Ike Anigbogu,0.553626,
517,Harry Giles,0.553508,
1453,Zach Collins,0.540033,0.475
1344,Tony Bradley,0.539714,
635,Jarrett Allen,0.539528,0.636
104,Bam Adebayo,0.534994,0.57
739,Jordan Bell,0.530134,0.641
956,Mangok Mathiang,0.524703,
712,John Collins,0.523407,0.62


In [24]:
# Predictions for the holdout rows, indexed like yholdout.
y_pred_h = pd.DataFrame(ridge_best.predict(Xholdout), index=yholdout.index, columns=['pred'])

# Attach player names by index and list the 20 highest predictions.
named_holdout = ncaaClean[['name']].merge(y_pred_h, left_index=True, right_index=True)
named_holdout.sort_values('pred', ascending=False).head(20)

Unnamed: 0,name,pred
534,Ike Obiagu,0.579775
831,Kenny Wooten,0.57712
1308,Tacko Fall,0.574945
1120,Paschal Chukwu,0.573499
488,Gavin Schilling,0.573301
1272,Silvio De Sousa,0.571877
407,Doral Moore,0.570136
401,Donta Hall,0.566088
1362,Trayvon Reed,0.56579
1406,Udoka Azubuike,0.560022


In [25]:
# Rebuild the TS% train / test / holdout frames from ncaaClean.
# Row masks are built once so each X/y pair is guaranteed to select the same rows.
train_rows = (ncaaClean.year < 2016) & ncaaClean.ts_pct_nba.notnull()
test_rows = ncaaClean.year == 2016
holdout_rows = ncaaClean.highest_level_reached == '2017-18'

# Train: seasons before 2016 that have a rookie TS% label. Missing features are
# zero-filled exactly like the test and holdout frames below.
Xtr = ncaaClean.loc[train_rows, training_cols].fillna(0)
ytr = ncaaClean.loc[train_rows, 'ts_pct_nba']

# Test: the 2016 season; ytest keeps missing labels.
Xtest = ncaaClean.loc[test_rows, training_cols].fillna(0)
ytest = ncaaClean.loc[test_rows, 'ts_pct_nba']

# Holdout: players whose highest level reached is tagged '2017-18'.
Xholdout = ncaaClean.loc[holdout_rows, training_cols].fillna(0)
yholdout = ncaaClean.loc[holdout_rows, 'ts_pct_nba']

In [200]:
def target_stat(stat):
    """Split ncaaClean into train / test / holdout sets for the target column ``stat``.

    Parameters
    ----------
    stat : str
        Name of the ncaaClean column to predict.

    Returns
    -------
    tuple
        (Xtr, ytr, Xtest, ytest, Xholdout, yholdout). The X frames hold
        ``training_cols`` with missing values replaced by 0.
        Train: year < 2016 with a non-null ``stat``.
        Test: year == 2016 with a non-null ``stat``.
        Holdout: highest_level_reached == '2017-18'; yholdout may contain
        missing values.
    """
    has_target = ncaaClean[stat].notnull()
    train_rows = (ncaaClean.year < 2016) & has_target
    # The parentheses around the comparison are required: & binds tighter than ==,
    # so `year == 2016 & mask` would compare year against (2016 & mask) and select nothing.
    test_rows = (ncaaClean.year == 2016) & has_target
    holdout_rows = ncaaClean.highest_level_reached == '2017-18'

    Xtr = ncaaClean.loc[train_rows, training_cols].fillna(0)
    ytr = ncaaClean.loc[train_rows, stat]
    Xtest = ncaaClean.loc[test_rows, training_cols].fillna(0)
    ytest = ncaaClean.loc[test_rows, stat]
    Xholdout = ncaaClean.loc[holdout_rows, training_cols].fillna(0)
    yholdout = ncaaClean.loc[holdout_rows, stat]
    return Xtr, ytr, Xtest, ytest, Xholdout, yholdout


def ridge(X,y):
    """Grid-search a StandardScaler -> Ridge pipeline on (X, y) and return the best estimator.

    The search covers nine alphas (np.logspace(-3, 5, 9)) crossed with the
    'svd', 'cholesky' and 'lsqr' solvers, scored by neg_mean_squared_error.
    """
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge()),
    ])
    search_space = [{
        'ridge__alpha': np.logspace(-3,5,9),
        'ridge__solver': ['svd', 'cholesky', 'lsqr'],
    }]
    search = GridSearchCV(pipeline, search_space, scoring='neg_mean_squared_error')
    search.fit(X, y)
    return search.best_estimator_

def lasso(X,y):
    """Grid-search a StandardScaler -> Lasso pipeline on (X, y) and return the best estimator.

    The search covers six alphas (np.logspace(-1, 5, 6)), scored by
    neg_mean_squared_error.
    """
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("lasso", Lasso()),
    ])
    # NOTE(review): six points over [1e-1, 1e5] are not whole decades; seven may
    # have been intended - confirm.
    search_space = [{'lasso__alpha': np.logspace(-1,5,6)}]
    search = GridSearchCV(pipeline, search_space, scoring='neg_mean_squared_error')
    search.fit(X, y)
    return search.best_estimator_

def testdf(model):
    """Fit ``model`` on the global Xtr/ytr and tabulate its test-set predictions.

    Reads the notebook globals Xtr, ytr, Xtest, ytest and ncaaClean.
    Predictions are labelled with Xtest's own index and joined to ytest by
    index, so each prediction is paired with the right player even when
    Xtest and ytest do not cover exactly the same rows (pairing them
    positionally would silently mislabel predictions in that case).

    Returns a DataFrame with columns name, pred and test, sorted by pred in
    descending order. ``model`` is refit in place.
    """
    model.fit(Xtr,ytr)
    pred = pd.Series(model.predict(Xtest), index = Xtest.index, name = 'pred')
    # Inner join: keep only rows that have both a prediction and a ytest entry.
    real_pred = pred.to_frame().join(ytest.rename('test'), how = 'inner')
    df = ncaaClean[['name']].merge(real_pred, left_index=True, right_index = True)
    return df.sort_values('pred', ascending=False)

def holddf(model):
    """Fit ``model`` on the global Xtr/ytr and tabulate its holdout predictions.

    Reads the notebook globals Xtr, ytr, Xholdout, yholdout and ncaaClean.
    Returns a DataFrame with columns name and pred, indexed like yholdout
    and sorted by pred in descending order. ``model`` is refit in place.
    """
    model.fit(Xtr, ytr)
    holdout_pred = pd.DataFrame(model.predict(Xholdout), columns=['pred'], index=yholdout.index)
    named = ncaaClean[['name']].merge(holdout_pred, left_index=True, right_index=True)
    return named.sort_values('pred', ascending=False)
    

In [27]:
# target_stat returns six objects (train, test and holdout X/y). Unpack all of them:
# four names would raise ValueError, and testdf/holddf read Xtest/Xholdout as globals,
# so they must come from the same split as ytest/yholdout.
Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat('ast_pct_nba')
model = ridge(Xtr,ytr)


In [28]:
# Ten highest test-set predictions next to the actual values.
ast_pct_test = testdf(model)
ast_pct_test.head(10)

Unnamed: 0,name,pred,test
1453,Zach Collins,26.556304,6.95
651,Jawun Evans,21.304452,18.09
929,Luke Kornet,21.165451,11.24
712,John Collins,21.147137,8.88
731,Jonathan Isaac,20.855294,4.88
904,Lauri Markkanen,19.968781,6.36
657,Jayson Tatum,18.800012,8.38
739,Jordan Bell,17.479343,16.92
927,Luke Kennard,13.404727,12.68
765,Josh Jackson,12.953359,10.16


In [29]:
# target_stat returns six objects; unpacking into four names would raise ValueError.
Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat('ast_pct_nba')
# 10-fold cross-validated R^2 of the tuned ridge pipeline on the training data.
avg_score(Xtr,ytr,ridge(Xtr,ytr),cv=10, score = 'r2')

0.653643251576363

In [30]:
# Inspect the full row for one player.
is_zach_collins = ncaaClean.name == 'Zach Collins'
ncaaClean.loc[is_zach_collins]

Unnamed: 0,name,realgm_summary_page,highest_level_reached,season,school,league,conference,teamid,year,grade,gp,gs,mp,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,pf,tov,pts,mp_tot,fgm_tot,fga_tot,fg3m_tot,fg3a_tot,ftm_tot,fta_tot,oreb_tot,dreb_tot,reb_tot,ast_tot,stl_tot,blk_tot,pf_tot,tov_tot,pts_tot,dbl_dbl,tpl_dbl,pts40,pts20,ast20,techs,hob,ast_to,stl_to,ft_fga,w,l,win_pct,ows,dws,ws,ts_pct,efg_pct,oreb_pct,dreb_pct,reb_pct,ast_pct,tov_pct,stl_pct,blk_pct,usg_pct,total_s_pct,ppr,pps,ortg,drtg,per,ff,season_count,realgm_link,id,pos,height,weight,final_school,dob_code,added_birthdays,bday,hometown,highest_level,rsci_year,rsci_rank,year247,rank247,max_pred_wingspan,avg_pred_wingspan,max_wingspan,avg_wingspan,last_cbb_year,ncaa_seasons (d-i),nba_seasons,intl_seasons,gl_seasons,sl_seasons,intl_before_college,sports-reference_id_1,sports-reference_id_2,Unnamed: 29,draft_year,pick,age,draft_age,ts_pct_nba,blk_pct_nba,ast_pct_nba,stl_pct_nba,reb_pct_nba,ows_nba,dws_nba,ws_nba,per_nba
1453,Zach Collins,/player/Zach-Collins/Summary/85659,NBA,2016-17,Gonzaga,ncaa,West Coast Conference,332.0,2016,1,39,0,17.2,3.46,5.31,0.652,0.26,0.54,0.476,2.82,3.79,0.743,1.67,4.21,5.87,0.41,0.46,1.77,2.69,1.54,10.0,670.0,135,207,10,21,110,148,65,164,229,16,18,69,105,60,390,1.0,0.0,0.0,0.0,0.0,0.0,0.131,0.27,0.3,0.71,37.0,2.0,0.949,2.69,2.4,5.09,0.703,0.676,12.02,23.3,18.4,4.46,17.79,1.53,9.86,24.93,187.16,-7.27,1.88,125.2,79.0,29.0,7.2284,1,/player/Zach-Collins/Summary/85659,85659,C,84,230,Gonzaga,19971119.0,,11/19/1997,Las Vegas (NV),NBA,2016.0,36.0,2016.0,31.0,86.03,86.03,85,85,2017.0,1,1,0,0,1,,/cbb/players/zach-collins-1.html,,,2017.0,10.0,18.879452,19.879452,0.475,2.41,6.95,0.81,11.54,-0.66,1.18,0.52,7.53


In [31]:
# target_stat returns six objects; unpacking into four names would raise ValueError.
Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat('ast_pct_nba')
# get_kbest returns a flat list of labels, so show the list itself rather than element [0].
get_kbest(Xtr,ytr,score_func = f_regression, k = 20)

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


['mp',
 'fg_pct',
 'fg3m',
 'fg3a',
 'oreb',
 'dreb',
 'reb',
 'ast',
 'stl',
 'blk',
 'tov',
 'dbl_dbl',
 'ast_to',
 'oreb_pct',
 'dreb_pct',
 'ast_pct',
 'stl_pct',
 'blk_pct',
 'height',
 'weight']

In [196]:
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

def bayes(X,y):
    """Fit a StandardScaler -> BayesianRidge pipeline on (X, y) and return it."""
    steps = [
        ("scaler", StandardScaler()),
        ("bayes", BayesianRidge()),
    ]
    return Pipeline(steps).fit(X, y)

def tree(X,y):
    """Grid-search a StandardScaler -> DecisionTreeRegressor pipeline and return the best estimator.

    The search covers min_samples_split in 2..9 crossed with three
    max_features settings, scored by neg_mean_squared_error.
    """
    tree = Pipeline([
        ("scaler", StandardScaler()),
        ("tree", DecisionTreeRegressor()),
        ])

    min_samples_split = range(2, 10)
    # None (consider every feature) replaces 'auto': for a regression tree the two
    # are equivalent, but 'auto' is rejected by newer scikit-learn releases.
    n_features = [None,'sqrt','log2']

    param_grid = [
            {'tree__min_samples_split': min_samples_split,'tree__max_features':n_features}
            ]
    grid_search = GridSearchCV(tree, param_grid, scoring = 'neg_mean_squared_error')
    grid_search.fit(X, y)
    return grid_search.best_estimator_

def forest(X,y):
    """Grid-search a StandardScaler -> RandomForestRegressor pipeline and return the best estimator.

    The search covers n_estimators in {10, 100} crossed with
    min_samples_split in 2..4, scored by neg_mean_squared_error.
    """
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("forest", RandomForestRegressor()),
    ])
    search_space = [{
        'forest__n_estimators': [10,100],
        'forest__min_samples_split': range(2,5),
    }]
    search = GridSearchCV(pipeline, search_space, scoring='neg_mean_squared_error')
    search.fit(X, y)
    return search.best_estimator_


In [146]:
def target_stat(stat):
    """Split ncaaClean into train / test / holdout sets for the target column ``stat``.

    Returns (Xtr, ytr, Xtest, ytest, Xholdout, yholdout). The X frames hold
    ``training_cols`` with missing values replaced by 0.
    Train: year < 2016 with a non-null ``stat``.
    Test: every year == 2016 row, so ytest may contain missing values.
    Holdout: highest_level_reached == '2017-18'.

    NOTE(review): this notebook defines target_stat more than once; whichever
    definition ran last is the one in effect - confirm which is intended.
    """
    train_rows = (ncaaClean.year < 2016) & ncaaClean[stat].notnull()
    test_rows = ncaaClean.year == 2016
    holdout_rows = ncaaClean.highest_level_reached == '2017-18'

    train, test, holdout = (ncaaClean.loc[rows] for rows in (train_rows, test_rows, holdout_rows))
    return (
        train[training_cols].fillna(0), train[stat],
        test[training_cols].fillna(0), test[stat],
        holdout[training_cols].fillna(0), holdout[stat],
    )


In [189]:
# Tune a random forest that predicts the 'pick' column from college stats.
Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat('pick')
x = forest(Xtr, ytr)

# 10-fold CV scores: first the RMSE (avg_score converts neg_mean_squared_error), then R^2.
for metric in ('neg_mean_squared_error', 'r2'):
    print(avg_score(Xtr, ytr, x, cv=10, score=metric))

27.07380690920807
0.4532358805531887


In [191]:
# The 30 lowest holdout predictions, with rows containing missing values dropped.
pick_holdout = holddf(x).sort_values('pred')
pick_holdout.head(30).dropna()

Unnamed: 0,name,pred
1424,"Wendell Carter, Jr.",4.464224
994,Marvin Bagley III,4.814969
326,DeAndre Ayton,6.718524
561,Ja Morant,9.549967
1251,Shai Gilgeous-Alexander,10.773133
1463,Zhaire Smith,12.35359
736,Jontay Porter,16.593824
1044,Miles Bridges,17.222917
846,Kevin Knox,17.772043
1354,Trae Young,19.115981


In [204]:
def scoredf(stat):
    """Tune four model families for ``stat`` and summarise their 10-fold CV scores.

    Builds the train split with target_stat(stat), tunes ridge, lasso, tree
    and forest on it, and scores each tuned estimator with avg_score.

    Returns (models, df): ``models`` is the list of tuned estimators in the
    order ridge, lasso, tree, forest; ``df`` has one row per model with the
    columns model, mse and r2. The 'mse' column holds what avg_score returns
    for neg_mean_squared_error, i.e. the mean per-fold RMSE.
    """
    Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat(stat)
    m_names = ['ridge','lasso','tree','forest']
    models = [ridge(Xtr, ytr), lasso(Xtr, ytr), tree(Xtr, ytr), forest(Xtr, ytr)]
    rows = [
        (
            label,
            avg_score(Xtr, ytr, fitted, cv=10, score='neg_mean_squared_error'),
            avg_score(Xtr, ytr, fitted, cv=10, score='r2'),
        )
        for label, fitted in zip(m_names, models)
    ]
    return models, pd.DataFrame(rows, columns=['model','mse','r2'])

In [205]:
# Tune and score all four model families for the ows_nba and dws_nba targets.
# Each result is a (models, score table) pair.
ows, dws = (scoredf(stat) for stat in ('ows_nba', 'dws_nba'))

In [207]:
# Second element of scoredf's result: the per-model score table.
dws_models, dws_scores = dws
dws_scores

Unnamed: 0,model,mse,r2
0,ridge,0.653664,0.221328
1,lasso,0.678848,0.173025
2,tree,0.908744,-0.414766
3,forest,0.668322,0.20046


In [209]:
# Tune and score all four model families for the ortg_nba, drtg_nba and per_nba targets.
ortg, drtg, per = (scoredf(stat) for stat in ('ortg_nba', 'drtg_nba', 'per_nba'))

In [None]:
# Tuned ridge, lasso, tree and forest estimators for the ortg_nba target.
Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat('ortg_nba')
ridge_ortg, lasso_ortg, tree_ortg, forest_ortg = (
    tune(Xtr, ytr) for tune in (ridge, lasso, tree, forest)
)


In [None]:
# Tuned ridge, lasso, tree and forest estimators for the drtg_nba target.
Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat('drtg_nba')
ridge_drtg, lasso_drtg, tree_drtg, forest_drtg = (
    tune(Xtr, ytr) for tune in (ridge, lasso, tree, forest)
)


In [None]:
# Tuned ridge, lasso, tree and forest estimators for the per_nba target.
Xtr, ytr, Xtest, ytest, Xholdout, yholdout = target_stat('per_nba')
ridge_per, lasso_per, tree_per, forest_per = (
    tune(Xtr, ytr) for tune in (ridge, lasso, tree, forest)
)