In [2]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

In [3]:
df = pd.read_csv('csvs/player_stats.csv')
del df['Unnamed: 0']
df.head()

Unnamed: 0,player,pos,age,tm,g,gs,mp,fg,fga,fg%,...,pts_max,share,team,w,l,w/l%,gb,ps/g,pa/g,srs
0,a.c. green,PF,27,lal,82,21,26.4,3.1,6.6,0.476,...,0.0,0.0,los angeles lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,byron scott,SG,29,lal,82,82,32.1,6.1,12.8,0.477,...,0.0,0.0,los angeles lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,james worthy,SF,29,lal,78,74,38.6,9.2,18.7,0.492,...,0.0,0.0,los angeles lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,larry drew,PG,32,lal,48,2,10.3,1.1,2.6,0.432,...,0.0,0.0,los angeles lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,magic johnson,PG,31,lal,79,79,37.1,5.9,12.4,0.477,...,960.0,0.518,los angeles lakers,58,24,0.707,5.0,106.3,99.6,6.73


In [17]:
predictors = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg%', '3p',
       '3pa', '3p%', '2p', '2pa', '2p%', 'efg%', 'ft', 'fta', 'ft%', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'year', 'w', 'l', 'w/l%', 'gb', 'ps/g',
       'pa/g', 'srs']

df['gb'] = pd.to_numeric(df['gb'].replace('—', 0))

train = df[df['year'] < 2020]
test = df[df['year'] == 2020]

rdg = Ridge(alpha=0.1)
rdg.fit(train[predictors], train['share'])

predictions = rdg.predict(test[predictors])

predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)

combination = pd.concat([test[['player', 'share', 'year']], predictions], axis=1)
combination = combination.sort_values('share', ascending=False)
combination['rk'] = list(range(1, len(combination) + 1))

combination = combination.sort_values('predictions', ascending=False)
combination['predicted_rk'] = list(range(1, len(combination) + 1))

In [1]:
def find_ap(df):
    actual = combination.sort_values('share', ascending=False).head(5)
    predicted = combination.sort_values('predictions', ascending=False)
    ps = []
    found = 0
    seen = 1
    
    for index, row in predicted.iterrows():
        if row['player'] in actual['player'].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [19]:
find_ap(combination)

0.9428571428571428

In [20]:
years = list(range(1991, 2021))

In [48]:
aps = []
all_prediction = []

for year in years[5:]:
    train = df[df['year'] < year ]
    test = df[df['year'] == year]
    rdg.fit(train[predictors], train['share'])
    y_pred = rdg.predict(test[predictors])
    y_pred = pd.DataFrame(y_pred, columns=['predictions'], index=test.index)
    combination = pd.concat([test[['player', 'share', 'year']], y_pred], axis=1)
    all_prediction.append(combination)
    aps.append(find_ap(combination))

In [49]:
sum(aps) / len(aps)

0.7110422205931348

In [50]:
combination 

Unnamed: 0,player,share,year,predictions
596,aaron gordon,0.0,2020,0.019887
597,al-farouq aminu,0.0,2020,-0.028127
598,b.j. johnson,0.0,2020,-0.014142
599,d.j. augustin,0.0,2020,0.008575
600,evan fournier,0.0,2020,0.029770
...,...,...,...,...
12627,rondae hollis-jefferson,0.0,2020,0.004633
12628,serge ibaka,0.0,2020,0.038976
12629,shamorie ponds,0.0,2020,0.007462
12630,stanley johnson,0.0,2020,0.009776


In [45]:
aps

[0.39584675975653416,
 0.7733333333333333,
 0.386993006993007,
 0.8392857142857142,
 0.9266666666666665,
 0.6815270935960591,
 0.9666666666666666,
 0.6928571428571428,
 0.5644444444444445,
 0.4003208556149732,
 0.6676190476190477,
 0.9428571428571428,
 0.75,
 0.8253968253968254,
 0.7253968253968253,
 0.4323717948717949,
 0.6976190476190476,
 0.6327838827838828,
 0.9266666666666665,
 0.8333333333333333,
 0.7611111111111111,
 0.7909090909090909,
 0.6375757575757576,
 0.9428571428571428]

In [36]:
def add_rank(combination):
    combination = combination.sort_values('predictions', ascending=False)
    combination['predicted_rk'] = list(range(1, len(combination) + 1))
    combination = combination.sort_values('share', ascending=False)
    combination['rk'] = list(range(1, len(combination) + 1))
    combination['diff'] = combination['rk'] - combination['predicted_rk']
    return combination

In [47]:
ranking = add_rank(all_prediction[1])
ranking[ranking['rk'] < 6].sort_values('diff', ascending=False)

Unnamed: 0,player,share,predictions,predicted_rk,rk,diff
1475,karl malone,0.857,0.195454,2,1,-1
10041,michael jordan,0.832,0.172254,3,2,-1
851,grant hill,0.327,0.13056,6,3,-3
4379,tim hardaway,0.207,0.060021,19,4,-15
7496,glen rice,0.117,0.035304,49,5,-44


In [145]:
def backtest(df, model, year, predictors):
    aps = []
    all_predictions = []

    for year in years[5:]:
        train = df[df['year'] < year ]
        test = df[df['year'] == year]
        model.fit(train[predictors], train['share'])
        y_pred = model.predict(test[predictors])
        y_pred = pd.DataFrame(y_pred, columns=['predictions'], index=test.index)
        new_df = pd.concat([test[['player', 'share']], y_pred], axis=1)
        new_df = add_rank(new_df)
        all_predictions.append(new_df)
        aps.append(find_ap(new_df))
        
    return sum(aps)/len(aps), aps, pd.concat(all_prediction)
#     return pd.concat(all_predictions)

In [150]:
mean_ap, aps, all_pred = backtest(df, rdg, years[5:], predictors)

In [152]:
aps

[0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428,
 0.9428571428571428]

In [172]:
def backtest(df, model, year, predictors):
    aps = []
    all_prediction = []

    for year in years[5:]:
        train = df[df['year'] < year ]
        test = df[df['year'] == year]
        rdg.fit(train[predictors], train['share'])
        y_pred = rdg.predict(test[predictors])
        y_pred = pd.DataFrame(y_pred, columns=['predictions'], index=test.index)
        combination = pd.concat([test[['player', 'share']], y_pred], axis=1)
        all_prediction.append(combination)
        aps.append(find_ap(combination))

#     return sum(aps)/len(aps), aps, pd.concat(all_prediction)
    return train

In [173]:
aps = backtest(df, rdg, years[5:], predictors)

In [176]:
aps['year'].unique()

array([1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2017, 2010, 2011, 2012, 2013, 2014, 2015, 2008, 2009,
       2016, 2018, 2019, 2004, 2005, 2006, 2007], dtype=int64)

In [157]:
aps = []
all_prediction = []

for year in years[5:]:
    train = df[df['year'] < year ]
    test = df[df['year'] == year]
    rdg.fit(train[predictors], train['share'])
    y_pred = rdg.predict(test[predictors])
    y_pred = pd.DataFrame(y_pred, columns=['predictions'], index=test.index)
    combination = pd.concat([test[['player', 'share']], y_pred], axis=1)
    all_prediction.append(combination)
    aps.append(find_ap(combination))

In [138]:
sum(aps)/len(aps)

0.7110422205931348