Implementing a basic predictor that estimates player performance using the average of their last n games played

# Importing Necessary Libraries, Setup


In [1]:
import numpy as np
import pandas as pd

In [2]:
# stop the pandas indexing/slicing warning from appearing
# NOTE: filterwarnings('ignore') is called without a category, so from this
# point on it silences every Python warning, not only the pandas one.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the game-log CSV into df; the trailing expression displays (rows, columns).
# Disabled alternative input:
# df = pd.read_csv('dataset2.csv')
df = pd.read_csv('dataset_2020-21.csv')
df.shape

(22309, 27)

In [4]:
# Display the frame exactly as loaded from the CSV (all columns).
df

Unnamed: 0,Date,Name,Team,Starter,Position,Salary,FPTS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+-
0,20201222,Reggie Perry,Bkn,0,PF/C,3000,6.00,7.77,1,3,...,1,1,2,1,0,0,0,0,2,-8
1,20201222,Mfiondu Kabengele,LAC,0,PF/C,3000,0.00,0.93,0,0,...,0,0,0,0,0,0,0,0,0,-2
2,20201222,Amir Coffey,LAC,0,SF,3100,0.00,0.93,0,0,...,0,0,0,0,0,0,0,0,0,-2
3,20201222,Quinn Cook,LAL,0,PG,3200,0.00,1.38,0,0,...,0,0,0,0,0,0,0,0,0,2
4,20201222,Brad Wanamaker,GSW,0,PG,3300,8.25,21.67,0,2,...,0,1,1,3,0,0,1,2,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22304,20210701,Kevin Huerter,Atl,1,SG/SF,5800,28.25,37.73,3,12,...,1,2,3,7,2,1,2,4,8,-12
22305,20210701,Clint Capela,Atl,1,C,6300,23.00,21.13,2,3,...,4,4,8,1,0,3,1,2,6,-11
22306,20210701,John Collins,Atl,1,PF/C,6700,35.00,34.13,7,16,...,3,5,8,3,0,0,0,3,19,-16
22307,20210701,Jrue Holiday,Mil,1,PG/SG,8300,56.00,41.92,9,20,...,2,4,6,13,1,0,2,1,25,8


In [5]:
# Keep the identifying columns (Date, Name, Team, Position, Salary), the actual
# fantasy score (FPTS) and the box-score stats that the scoring dict weights.
# reset_index(drop=True) is chained instead of called with inplace=True so df is
# rebound to a new frame rather than mutating a column slice in place, which is
# the pattern that triggers pandas' SettingWithCopyWarning.
df = df[['Date', 'Name', 'Team', 'Position', 'Salary', 'FPTS', 'PTS', 'TRB', '3P', 'AST', 'STL', 'BLK', 'TOV']].reset_index(drop=True)
# Disabled filter: df = df[df.FPTS > 0] would keep only rows with a positive FPTS.
# Disabled sort:   df.sort_values(by=['Date'], ascending=True, inplace=True)
df


Unnamed: 0,Date,Name,Team,Position,Salary,FPTS,PTS,TRB,3P,AST,STL,BLK,TOV
0,20201222,Reggie Perry,Bkn,PF/C,3000,6.00,2,2,0,1,0,0,0
1,20201222,Mfiondu Kabengele,LAC,PF/C,3000,0.00,0,0,0,0,0,0,0
2,20201222,Amir Coffey,LAC,SF,3100,0.00,0,0,0,0,0,0,0
3,20201222,Quinn Cook,LAL,PG,3200,0.00,0,0,0,0,0,0,0
4,20201222,Brad Wanamaker,GSW,PG,3300,8.25,3,1,0,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22304,20210701,Kevin Huerter,Atl,SG/SF,5800,28.25,8,3,2,7,2,1,2
22305,20210701,Clint Capela,Atl,C,6300,23.00,6,8,0,1,0,3,1
22306,20210701,John Collins,Atl,PF/C,6700,35.00,19,8,3,3,0,0,0
22307,20210701,Jrue Holiday,Mil,PG/SG,8300,56.00,25,6,3,13,1,0,2


# Helper Functions

In [7]:
# Fantasy points awarded per unit of each box-score stat.
scoring = {'PTS':1, '3P':0.5, 'TRB':1.25, 'AST':1.5, 'STL':2, 'BLK':2, 'TOV':-.5}
# note double double +1.5, triple double +3

def rollingAveragePrediction(df1, player, date, n):
  """Predict a player's fantasy score from the mean of their last n games.

  Parameters:
    df1: DataFrame with 'Name', 'Date' and one column per key of `scoring`.
    player: value matched against the 'Name' column.
    date: only games whose 'Date' is strictly less than this value are used.
    n: number of most recent prior games to average.

  Returns:
    The predicted score, or np.nan when n < 1 or when the player has fewer
    than n games before `date`.
  """
  if n < 1:
    # np.nan, not np.NAN: the upper-case alias was removed in NumPy 2.0.
    return np.nan

  player_df = getSinglePlayer(df1, player)
  player_df = player_df[player_df['Date'] < date]
  # Order chronologically before taking the tail so that "last n games" does
  # not depend on the row order of df1. The stable sort keeps an already
  # date-ordered frame in its existing order.
  player_df = player_df.sort_values('Date', kind='stable').tail(n)

  if player_df.shape[0] < n:
    return np.nan

  # Per-stat mean over the window. The means are used as-is (NOT rounded).
  # NOTE(review): the previous comment claimed rounding to the nearest integer;
  # confirm which behaviour is intended.
  stat_order = ('PTS', 'TRB', '3P', 'AST', 'STL', 'BLK', 'TOV')
  pred = {stat: player_df[stat].mean() for stat in stat_order}

  # Weighted sum of the averaged stats.
  total_score = sum(pred[stat] * scoring[stat] for stat in stat_order)

  # Double/triple-double bonus: count the categories averaging 10 or more.
  # Only points, rebounds, assists, steals and blocks count; 3P and TOV do not.
  bonus_stats = ('PTS', 'TRB', 'AST', 'STL', 'BLK')
  count = sum(1 for stat in bonus_stats if pred[stat] >= 10)

  # The bonuses stack: a triple double earns both the +1.5 and the +3.
  if count >= 2:
    total_score += 1.5
  if count >= 3:
    total_score += 3

  return total_score

In [None]:
def getSinglePlayer(df1, player):
  """Return the rows of df1 whose 'Name' column equals player."""
  is_player = df1['Name'].eq(player)
  return df1[is_player]

# Test: Rolling-Average Predictions for Every Game

In [None]:
def predictAllRollingAvg(df1, n):
  """Return one rolling-average prediction per row of df1, in row order.

  Parameters:
    df1: DataFrame with at least 'Name' and 'Date' columns plus whatever
      rollingAveragePrediction reads.
    n: window size passed through to rollingAveragePrediction.

  Returns:
    A list with len(df1) entries; entry i is the prediction for the player
    and date found in row i.
  """
  # Split the frame once per player so each prediction only scans that
  # player's games instead of re-filtering the whole frame for every row.
  games_by_player = {name: games for name, games in df1.groupby('Name', sort=False)}

  predictions = []
  for name, date in zip(df1['Name'], df1['Date']):
    # Names that groupby does not key on (e.g. missing values) fall back to
    # the full frame, which rollingAveragePrediction filters by name itself.
    player_games = games_by_player.get(name, df1)
    predictions.append(rollingAveragePrediction(player_games, name, date, n))
  return predictions

In [None]:
# One list of per-row predictions for each window size: 3, 5, 7 and 10 prior games.
res3, res5, res7, res10 = (
  predictAllRollingAvg(df, window) for window in (3, 5, 7, 10)
)

In [None]:
# Attach each prediction list to df as its own column, in window-size order.
(
  df['RollingAvg3'],
  df['RollingAvg5'],
  df['RollingAvg7'],
  df['RollingAvg10'],
) = (res3, res5, res7, res10)

In [None]:
# Display df with the four RollingAvg columns attached.
df

Unnamed: 0,Date,Name,Team,Position,Salary,FPTS,PTS,TRB,3P,AST,STL,BLK,TOV,RollingAvg3,RollingAvg5,RollingAvg7,RollingAvg10
0,20201222,Reggie Perry,Bkn,PF/C,3000,6.00,2,2,0,1,0,0,0,,,,
1,20201222,Mfiondu Kabengele,LAC,PF/C,3000,0.00,0,0,0,0,0,0,0,,,,
2,20201222,Amir Coffey,LAC,SF,3100,0.00,0,0,0,0,0,0,0,,,,
3,20201222,Quinn Cook,LAL,PG,3200,0.00,0,0,0,0,0,0,0,,,,
4,20201222,Brad Wanamaker,GSW,PG,3300,8.25,3,1,0,3,0,0,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22304,20210701,Kevin Huerter,Atl,SG/SF,5800,28.25,8,3,2,7,2,1,2,26.00,28.50,26.25,24.00
22305,20210701,Clint Capela,Atl,C,6300,23.00,6,8,0,1,0,3,1,22.25,27.00,27.00,28.25
22306,20210701,John Collins,Atl,PF/C,6700,35.00,19,8,3,3,0,0,0,20.50,29.75,31.75,31.00
22307,20210701,Jrue Holiday,Mil,PG/SG,8300,56.00,25,6,3,13,1,0,2,38.00,41.50,38.00,36.50


In [None]:
# Season-to-date baseline: for every row, average ALL of that player's games
# played strictly before the row's date (a rolling average whose window is the
# number of prior games).
seasonAvgs = []
for name, date in zip(df['Name'], df['Date']):
  # Every game this player has played before the current one.
  player_df = getSinglePlayer(df, name)
  player_df = player_df[player_df['Date'] < date]
  numGames = player_df.shape[0]

  # A player with no prior games has numGames == 0, for which
  # rollingAveragePrediction returns NaN. The already-filtered frame is passed
  # on so the whole of df is not scanned a second time. No per-row print: it
  # would emit one line for every row of df.
  seasonAvgs.append(rollingAveragePrediction(player_df, name, date, numGames))

In [None]:
# Attach the season-to-date predictions; seasonAvgs holds one entry per row of df.
df['SeasonAvg'] = seasonAvgs

In [None]:
# Display df with the SeasonAvg column added next to the rolling averages.
df

Unnamed: 0,Date,Name,Team,Position,Salary,FPTS,PTS,TRB,3P,AST,STL,BLK,TOV,RollingAvg3,RollingAvg5,RollingAvg7,RollingAvg10,SeasonAvg
0,20201222,Reggie Perry,Bkn,PF/C,3000,6.00,2,2,0,1,0,0,0,,,,,
1,20201222,Mfiondu Kabengele,LAC,PF/C,3000,0.00,0,0,0,0,0,0,0,,,,,
2,20201222,Amir Coffey,LAC,SF,3100,0.00,0,0,0,0,0,0,0,,,,,
3,20201222,Quinn Cook,LAL,PG,3200,0.00,0,0,0,0,0,0,0,,,,,
4,20201222,Brad Wanamaker,GSW,PG,3300,8.25,3,1,0,3,0,0,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22304,20210701,Kevin Huerter,Atl,SG/SF,5800,28.25,8,3,2,7,2,1,2,26.00,28.50,26.25,24.00,22.75
22305,20210701,Clint Capela,Atl,C,6300,23.00,6,8,0,1,0,3,1,22.25,27.00,27.00,28.25,40.00
22306,20210701,John Collins,Atl,PF/C,6700,35.00,19,8,3,3,0,0,0,20.50,29.75,31.75,31.00,30.50
22307,20210701,Jrue Holiday,Mil,PG/SG,8300,56.00,25,6,3,13,1,0,2,38.00,41.50,38.00,36.50,39.25


In [None]:
# Persist the frame with its prediction columns. `lineterminator` replaces the
# `line_terminator` keyword, which was deprecated in pandas 1.5 and removed in 2.0.
df.to_csv('2020-21RollingAvgs.csv', lineterminator='\n', index=False)

# Calculating RMSE, MAE for Rolling Avgs

In [None]:
# Empty placeholder; comb is reassigned by the pd.concat call further down.
comb = pd.DataFrame()

In [None]:
# Load the per-season prediction files. Relative paths are used so they match
# the relative path this notebook writes '2020-21RollingAvgs.csv' to, instead
# of being tied to an absolute '/content/' directory.
# NOTE(review): '2019-20RollingAvgs.csv' is not produced in this notebook;
# presumably it comes from the same steps run on the 2019-20 dataset - confirm.
soln1 = pd.read_csv('2019-20RollingAvgs.csv')
soln2 = pd.read_csv('2020-21RollingAvgs.csv')

In [None]:
# Stack the two frames row-wise; ignore_index=True renumbers the rows 0..n-1.
comb = pd.concat([soln1,soln2], ignore_index=True)

In [None]:
# Persist the combined frame. `lineterminator` replaces the `line_terminator`
# keyword, which was deprecated in pandas 1.5 and removed in 2.0.
comb.to_csv('2019-21RollingAvgs.csv', lineterminator='\n', index=False)

In [None]:
# Display the combined frame built by the concat above.
comb

Unnamed: 0,Date,Name,Team,Position,Salary,FPTS,PTS,TRB,3P,AST,STL,BLK,TOV,RollingAvg3,RollingAvg5,RollingAvg7,RollingAvg10,SeasonAvg
0,20191022,Nicolo Melli,Nor,PF/C,3000,24.25,14,5,4,2,0,0,2,,,,,
1,20191022,Kenrich Williams,Nor,SF/PF,3100,20.50,3,6,0,3,1,2,1,,,,,
2,20191022,Jared Dudley,LAL,SF/PF,3100,7.00,6,0,2,0,0,0,0,,,,,
3,20191022,Josh Hart,Nor,SG/SF,3200,33.50,15,10,3,1,0,1,1,,,,,
4,20191022,Jahlil Okafor,Nor,C,3200,12.00,8,2,0,0,0,1,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43477,20210701,Kevin Huerter,Atl,SG/SF,5800,28.25,8,3,2,7,2,1,2,26.00,28.50,26.25,24.00,22.75
43478,20210701,Clint Capela,Atl,C,6300,23.00,6,8,0,1,0,3,1,22.25,27.00,27.00,28.25,40.00
43479,20210701,John Collins,Atl,PF/C,6700,35.00,19,8,3,3,0,0,0,20.50,29.75,31.75,31.00,30.50
43480,20210701,Jrue Holiday,Mil,PG/SG,8300,56.00,25,6,3,13,1,0,2,38.00,41.50,38.00,36.50,39.25


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

In [None]:
# Score every baseline column against the actual fantasy points (FPTS).
data = []

for colName in ['RollingAvg3', 'RollingAvg5', 'RollingAvg7', 'RollingAvg10', 'SeasonAvg']:
  print(colName)
  # Rows with no prediction in this column (NaN) are excluded. The filtered
  # frame gets its own name so the notebook-level `df` is not overwritten.
  scored = comb.dropna(subset=[colName])
  meansqre_err = mean_squared_error(scored['FPTS'], scored[colName])
  rmse = math.sqrt(meansqre_err)
  mae = mean_absolute_error(scored['FPTS'], scored[colName])

  # 'Number of Games' holds the predictor's column name (e.g. 'RollingAvg3').
  data.append({
    'Number of Games': colName,
    'RMSE': rmse,
    'MAE': mae,
  })

# One row per predictor, in the order evaluated above.
soln_df = pd.DataFrame(data)

RollingAvg3
RollingAvg5
RollingAvg7
RollingAvg10
SeasonAvg


In [None]:
# Display the RMSE / MAE summary, one row per predictor column.
soln_df

Unnamed: 0,Number of Games,RMSE,MAE
0,RollingAvg3,10.603127,8.197535
1,RollingAvg5,10.237318,7.934736
2,RollingAvg7,10.122215,7.856673
3,RollingAvg10,10.059993,7.820629
4,SeasonAvg,10.043953,7.764716
