# Fantasy Projection Model Using NBA Per Game Stats

This model takes in a player season and finds the 10 most similar player seasons across the decades. Then using weights based on how similar the player seasons are, takes the averages of each of those players following seasons to predict our current players next season.

In [1]:
# import modules
import numpy as np
import pandas as pd
from sklearn import preprocessing
from math import sqrt
from IPython.display import display
from sklearn.metrics import mean_squared_error
import pprint

In [None]:
# import custom functions
from nba_functions import normalize, vorp, calc_distance, find_player, player_comparison_tool

In [2]:
# read in per game data from csv folder
original_df = pd.read_csv('nba-csv/player_general_traditional_per_game_data.csv', header=0)

In [3]:
# check to see what data looks like
original_df.tail()

Unnamed: 0,player_id,season_id,gp,age,min,fgm,fga,fg_pct,fg3m,fg3a,...,ftm,fta,ft_pct,oreb,dreb,ast,tov,stl,blk,pts
9558,201163,2016-17,71.0,30.0,30.9,6.1,13.2,0.461,1.5,4.6,...,2.0,2.7,0.727,1.5,5.0,2.0,1.6,0.7,0.4,15.7
9559,1627812,2016-17,46.0,24.0,26.0,3.5,8.6,0.406,1.5,3.8,...,1.6,1.9,0.831,0.4,2.0,3.7,1.5,0.9,0.2,10.0
9560,203897,2016-17,47.0,22.0,37.2,6.9,15.1,0.459,2.6,6.6,...,2.5,3.0,0.836,0.4,3.0,3.0,1.8,0.9,0.2,18.9
9561,2216,2016-17,73.0,35.0,24.5,5.9,13.2,0.449,0.3,1.3,...,1.9,2.6,0.731,2.5,5.7,1.7,1.4,0.5,0.1,14.1
9562,2585,2016-17,70.0,33.0,18.1,2.3,4.4,0.534,0.0,0.0,...,1.4,1.8,0.778,2.0,3.9,1.9,1.2,0.8,0.5,6.1


In [4]:
# filter for players who played atleast 9 games
gp_filter = original_df['gp'] > 9
df1 = original_df[gp_filter]

In [5]:
# function to normalize data
def normalize(col):
    return (col - col.min()) / (col.max() - col.min())

In [6]:
# apply normalize function for each column
def vorp(df):
    for col_name in cols_to_norm:
        df['{}_norm'.format(col_name)] = normalize(df[col_name])
    return df

In [7]:
cols_to_norm = ['pts',
                'min',
               'fgm',
               'fga',
               'fg3m',
               'fg3a',
               'ftm',
               'fta',
               'oreb',
               'dreb',
               'ast',
               'stl',
               'tov',
               'blk']

In [8]:
# normalize data be season averages and league totals
df = df1.groupby(['season_id']).apply(vorp)

In [9]:
# season_list for NBA players
season_list = [
    '1996-97',
    '1997-98',
    '1998-99',
    '1999-00',
    '2000-01',
    '2001-02',
    '2002-03',
    '2003-04',
    '2004-05',
    '2005-06',
    '2006-07',
    '2007-08',
    '2008-09',
    '2009-10',
    '2010-11',
    '2011-12',
    '2012-13',
    '2013-14',
    '2014-15',
    '2015-16',
    '2016-17'
]

In [10]:
# function to calculate distance between two points
def calc_distance(u, v):
    dist = np.sqrt(np.sum((u - v)**2))
    return dist

In [11]:
# create a function to find the player and the next season
def find_player(player_id, season):
    # replaces for loop
    for row in df.itertuples():
        if season == row.season_id and player_id == row.player_id:
            return row

In [12]:
def player_comparison_tool(current_player_season, current_player_id):
    for row in df.itertuples():
        if current_player_season == row.season_id and current_player_id == row.player_id:
            current_player_id = row.player_id
            break
            
    if (current_player_id == None):
        print('Can\'t find player with id {0} and season {1}'.format(current_player_id, current_player_season))
        return

    current_player_vector = np.array([
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'pts_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'min_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'fgm_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'fga_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'fg3m_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'fg3a_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'ftm_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'fta_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'oreb_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'dreb_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'ast_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'stl_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'tov_norm']).item(),
        (df.loc[(df['player_id'] == current_player_id) & (df['season_id'] == current_player_season), 'blk_norm']).item()
    ])

    print('Projecting player_id {0} for season {1}'.format(current_player_id, season_list[(season_list.index(row.season_id) + 1)]))

    # create a list to store the data
    player_distance = []

    # loop over every row in the dataframe to calculate percent error
    weighted_numbers = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    for row in df.itertuples():
        compared_player_vector = np.array([
        row.pts_norm,
        row.min_norm,
        row.fgm_norm,
        row.fga_norm,
        row.fg3m_norm,
        row.fg3a_norm,
        row.ftm_norm,
        row.fta_norm,
        row.oreb_norm,
        row.dreb_norm,
        row.ast_norm,
        row.stl_norm,
        row.tov_norm,
        row.blk_norm
        ])
        
        vfunc = np.vectorize(calc_distance)
        distance_vect = vfunc(current_player_vector, compared_player_vector)
        weighted_distance = distance_vect * weighted_numbers
        number = np.sum(weighted_distance)
        player_distance.append(number)
        
    # create a new column with error 
    df['distance'] = player_distance

    # sort dataframe by smallest distance
    ranked_df = df.sort_values('distance')
    
    stats = ['pts',
             'min',
             'fgm',
             'fga',
             'fg3m',
             'fg3a',
             'ftm',
             'fta',
             'oreb',
             'dreb',
             'ast',
             'stl',
             'tov',
             'blk'
             ]
    
    # create empty dictionary to put in projected stats
    projected_stats = {}

    for col in stats:
        sum_stat = 0
        sum_weight = 0
        for index, row in ranked_df.iloc[1:11].iterrows():
            # skip over the row if it was 2016-17 season because we can't take the next
            if row.season_id == '2016-17':
                continue
            # get the players next season
            weight = (1 / row.distance)
            next_season = season_list[(season_list.index(row.season_id) + 1)]
            # find the player row with the id and the next season
            player_next_season = find_player(row.player_id, next_season)
            
            # it's not grabbing the column weight
            
            sum_stat += getattr(player_next_season, col) * weight
            sum_weight += weight
        projected_stats['player_id'] = current_player_id
        projected_stats['proj_season_id'] = season_list[(season_list.index(current_player_season) + 1)]
        projected_stats['proj_' + col] = (sum_stat / sum_weight)
    return projected_stats

In [13]:
# small sample of player_ids
player_ids = [
    201939,
    201935,
    201142,
    202326,
    2544,
    203081,
    203076,
    201566,
    1626164,
    101150,
    200768,
    202710,
    202689,
    101108,
    203114
]

### Run for all players when ready

In [None]:
#player_df = pd.read_csv('nba-csv/player_name_player_id_all_seasons_final.csv')

In [None]:
#player_filter = player_df[player_df['season_id'] == '2015-16']

In [None]:
#all_player_ids = player_filter['player_id'].tolist()

In [14]:
# run for loop for each player
final_projections = []
#for baller_id in all_player_ids:
for baller_id in player_ids:
    current_player_id = baller_id
    current_player_season = '2015-16'
    # if function to catch if player is not in player dataframe, if not then don't even try the function
    try:
        projections = player_comparison_tool(current_player_season, current_player_id)
        if (projections == None):
            continue
    except:
        continue
    final_projections.append(projections)

Projecting player_id 201939 for season 2016-17
Projecting player_id 201935 for season 2016-17
Projecting player_id 201142 for season 2016-17
Projecting player_id 202326 for season 2016-17
Projecting player_id 2544 for season 2016-17


In [15]:
proj_columns = [
    'player_id',
    'proj_season_id',
    'proj_pts',
    'proj_min',
    'proj_fgm',
    'proj_fga',
    'proj_fg3m',
    'proj_fg3a',
    'proj_ftm',
    'proj_fta',
    'proj_oreb',
    'proj_dreb',
    'proj_ast',
    'proj_stl',
    'proj_tov',
    'proj_blk'  
]

In [16]:
# convert final projections into a dataframe to split
proj_df = pd.DataFrame(columns=proj_columns, data=final_projections)

In [17]:
# merge dataframes on player_id column and season_ids
final_df = pd.merge(proj_df, df,  how='left', left_on=['player_id','proj_season_id'], right_on = ['player_id','season_id'])

In [18]:
final_df.head(5)

Unnamed: 0,player_id,proj_season_id,proj_pts,proj_min,proj_fgm,proj_fga,proj_fg3m,proj_fg3a,proj_ftm,proj_fta,...,fg3a_norm,ftm_norm,fta_norm,oreb_norm,dreb_norm,ast_norm,stl_norm,tov_norm,blk_norm,distance
0,201939,2016-17,22.470789,36.006073,7.861761,17.56226,2.619288,6.771099,4.147314,5.059809,...,1.0,0.445652,0.422018,0.186047,0.346535,0.589286,0.9,0.517857,0.076923,3.000985
1,201142,2016-17,26.929838,37.070342,9.520661,19.963398,1.534833,4.406998,6.373186,7.782445,...,0.5,0.586957,0.568807,0.139535,0.732673,0.428571,0.55,0.375,0.615385,2.350943
2,202326,2016-17,24.864381,35.642253,8.889318,18.407167,0.749824,2.09087,6.327119,8.169426,...,0.5,0.782609,0.853211,0.488372,0.861386,0.410714,0.7,0.642857,0.5,2.15424
3,2544,2016-17,26.094191,38.914926,9.516962,19.644168,1.298522,3.74163,5.7645,7.666296,...,0.46,0.521739,0.66055,0.302326,0.70297,0.776786,0.6,0.714286,0.230769,1.077072


In [19]:
columns_to_drop = [
    'pts_norm',
    'min_norm',
    'fgm_norm',
    'fga_norm',
    'fg3m_norm',
    'fg3a_norm',
    'ftm_norm',
    'fta_norm',
    'oreb_norm',
    'dreb_norm',
    'ast_norm',
    'stl_norm',
    'tov_norm',
    'blk_norm',
    'distance'
]

In [20]:
final_df.drop(columns = columns_to_drop, inplace = True)

In [21]:
final_df.head(10)

Unnamed: 0,player_id,proj_season_id,proj_pts,proj_min,proj_fgm,proj_fga,proj_fg3m,proj_fg3a,proj_ftm,proj_fta,...,ftm,fta,ft_pct,oreb,dreb,ast,tov,stl,blk,pts
0,201939,2016-17,22.470789,36.006073,7.861761,17.56226,2.619288,6.771099,4.147314,5.059809,...,4.1,4.6,0.898,0.8,3.7,6.6,3.0,1.8,0.2,25.3
1,201142,2016-17,26.929838,37.070342,9.520661,19.963398,1.534833,4.406998,6.373186,7.782445,...,5.4,6.2,0.875,0.6,7.6,4.8,2.2,1.1,1.6,25.1
2,202326,2016-17,24.864381,35.642253,8.889318,18.407167,0.749824,2.09087,6.327119,8.169426,...,7.2,9.3,0.772,2.1,8.9,4.6,3.7,1.4,1.3,27.0
3,2544,2016-17,26.094191,38.914926,9.516962,19.644168,1.298522,3.74163,5.7645,7.666296,...,4.8,7.2,0.674,1.3,7.3,8.7,4.1,1.2,0.6,26.4


In [22]:
# get player name from csv to merge with player id
player_df = pd.read_csv('nba-csv/player_name_player_id_all_seasons_final.csv')
season = player_df['season_id'] == '2016-17'
player_df = player_df[season]

In [23]:
player_proj = pd.merge(final_df, player_df[['player_name', 'player_id']], how = 'left', on = 'player_id').drop_duplicates().reset_index(drop=True)

In [24]:
player_proj.head(10)

Unnamed: 0,player_id,proj_season_id,proj_pts,proj_min,proj_fgm,proj_fga,proj_fg3m,proj_fg3a,proj_ftm,proj_fta,...,fta,ft_pct,oreb,dreb,ast,tov,stl,blk,pts,player_name
0,201939,2016-17,22.470789,36.006073,7.861761,17.56226,2.619288,6.771099,4.147314,5.059809,...,4.6,0.898,0.8,3.7,6.6,3.0,1.8,0.2,25.3,Stephen Curry
1,201142,2016-17,26.929838,37.070342,9.520661,19.963398,1.534833,4.406998,6.373186,7.782445,...,6.2,0.875,0.6,7.6,4.8,2.2,1.1,1.6,25.1,Kevin Durant
2,202326,2016-17,24.864381,35.642253,8.889318,18.407167,0.749824,2.09087,6.327119,8.169426,...,9.3,0.772,2.1,8.9,4.6,3.7,1.4,1.3,27.0,DeMarcus Cousins
3,2544,2016-17,26.094191,38.914926,9.516962,19.644168,1.298522,3.74163,5.7645,7.666296,...,7.2,0.674,1.3,7.3,8.7,4.1,1.2,0.6,26.4,LeBron James


In [25]:
player_info_columns = ['player_name',
                      'player_id',
                      'proj_season_id']

In [26]:
player_info = player_proj[player_info_columns]

In [27]:
player_proj.to_csv('nba-csv/player_proj_df.csv', index=False)
player_info.to_csv('nba-csv/player_info_df.csv', index=False)

In [None]:
print('Done.')