In [1]:
import pandas as pd

from spforge.transformers import LagTransformer, RollingMeanTransformer

df = pd.read_parquet("data/game_player_subsample.parquet")
df  = df[df['minutes']>0]
df['points_per_minute'] = df['points'] / df['minutes']

df.head()

Unnamed: 0,team_id,start_date,game_id,player_id,player_name,start_position,team_id_opponent,points,game_minutes,minutes,won,plus_minus,location,score,score_opponent,points_per_minute
38956,1610612755,2022-10-18,22200001,202699,Tobias Harris,F,1610612738,18.0,48.0,34.233,0,-1.0,away,117,126,0.525808
38957,1610612755,2022-10-18,22200001,200782,P.J. Tucker,F,1610612738,6.0,48.0,33.017,0,-6.0,away,117,126,0.181725
38958,1610612755,2022-10-18,22200001,203954,Joel Embiid,C,1610612738,26.0,48.0,37.267,0,-13.0,away,117,126,0.697668
38959,1610612755,2022-10-18,22200001,1630178,Tyrese Maxey,G,1610612738,21.0,48.0,38.2,0,-6.0,away,117,126,0.549738
38960,1610612755,2022-10-18,22200001,201935,James Harden,G,1610612738,35.0,48.0,37.267,0,1.0,away,117,126,0.939169


In [2]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from spforge.predictor import SklearnPredictor

from spforge import ColumnNames, Pipeline
from spforge.cross_validator import MatchKFoldCrossValidator
from spforge.scorer import SklearnScorer

rm_transformer_window20 = RollingMeanTransformer(
    features=['points', 'points_per_minute', 'minutes'],
    granularity=['player_id'],
    window=20,
    match_id_update_column='game_id'
)
rm_transformer_window10 = RollingMeanTransformer(
    features=['points', 'points_per_minute', 'minutes'],
    granularity=['player_id'],
    window=10,
    match_id_update_column='game_id'
)
rm_transformer_window5 = RollingMeanTransformer(
    features=['points', 'points_per_minute', 'minutes'],
    granularity=['player_id'],
    window=5,
    match_id_update_column='game_id'
)
rm_transformer_window40 = RollingMeanTransformer(
    features=['points', 'points_per_minute', 'minutes'],
    granularity=['player_id'],
    window=40,
    match_id_update_column='game_id'
)

lag_transformer = LagTransformer(
    features=['points', 'points_per_minute', 'minutes'],
    granularity=['player_id'],
    lag_length=5,
    match_id_update_column='game_id'
)


column_names = ColumnNames(
    team_id="team_id",
    match_id="game_id",
    start_date="start_date",
    player_id="player_id"    
)
predictor = SklearnPredictor(estimator=LGBMRegressor(max_depth=4,verbose=-100,alpha=0.5, objective='quantile'), target='points', features = ['team_id_opponent'],
                     
                                       convert_cat_features_to_cat_dtype=True)
pipeline = Pipeline(
    lag_transformers = [lag_transformer, rm_transformer_window5, rm_transformer_window10, rm_transformer_window20, rm_transformer_window40],
    predictor = predictor,
    column_names=column_names
)

cross_validator_cat_feats = MatchKFoldCrossValidator(
    match_id_column_name='game_id',
    date_column_name='start_date',
    predictor = pipeline,
)
mean_absolute_scorer = SklearnScorer(pred_column=pipeline.pred_column, scorer_function=mean_absolute_error, target='points')
df = cross_validator_cat_feats.generate_validation_df(df, add_train_prediction=True)
cross_validator_cat_feats.cross_validation_score(df, scorer=mean_absolute_scorer )


4.4154947108970735

In [None]:
from spforge.transformers import OpponentTransformer

df['net_over_predicted_per_minute'] = (df['points']- df[pipeline.pred_column])/df['minutes']

opp_transformer = OpponentTransformer(
    granularity=['start_position'],
    features=['net_over_predicted_per_minute'],
    window=20,    
    match_id_update_column = 'game_id',
    team_column='team_id',
    prefix='opponent_start_position'
)
df = opp_transformer.transform_historical(df)
df.tail()

In [11]:
df['went_over'] = (df['points']>df[pipeline.pred_column]).astype(int)
df[df[opp_transformer.features_out[0]]>0.1]['went_over'].mean(), df[df[opp_transformer.features_out[0]]<0]['went_over'].mean()

(np.float64(0.496098104793757), np.float64(0.4113162487370258))

In [5]:
pipeline = Pipeline(
    lag_transformers = [lag_transformer, rm_transformer_window5, rm_transformer_window10, rm_transformer_window20, rm_transformer_window40, opp_transformer],
    predictor = predictor,
    column_names=column_names
)

cross_validator = MatchKFoldCrossValidator(
    match_id_column_name='game_id',
    date_column_name='start_date',
    predictor = pipeline,
)
mean_absolute_scorer = SklearnScorer(pred_column=pipeline.pred_column, scorer_function=mean_absolute_error, target='points')
df = cross_validator.generate_validation_df(df, add_train_prediction=True, return_features=True)
cross_validator.cross_validation_score(df, scorer=mean_absolute_scorer )

4.361778527317093

In [6]:
df['is_starting'] = (df['start_position'] != '').astype(int)
opp_transformer_is_starting = OpponentTransformer(
    granularity=['is_starting'],
    features=['points_per_minute'],
    window=20,    
    match_id_update_column = 'game_id',
    team_column='team_id',
    prefix='opponent_is_starting'
)
df = opp_transformer_is_starting.transform_historical(df)
df[['opponent_is_starting_points_per_minute', 'opponent_position_points_per_minute']] = df.groupby(['game_id','team_id_opponent', 'is_starting', 'start_position'])[[*opp_transformer_is_starting.features_out, *opp_transformer.features_out]].transform('mean')
df.tail()

Unnamed: 0,lag_points_per_minute3,lag_points_per_minute2,lag_points4,lag_minutes4,player_id,lag_points5,opponent_is_starting_points_per_minute20,lag_points_per_minute4,lag_minutes3,team_id_opponent,...,lag_minutes2,rolling_mean_points5,lag_minutes5,game_id,start_date,points_per_minute,player_name,rolling_mean_points40,opponent_is_starting_points_per_minute,opponent_position_points_per_minute
16398,0.0,0.508462,2.0,20.933,1630240,2.0,0.422541,0.095543,4.4,1610612737,...,13.767,2.6,9.633,22200777,2023-02-01,0.279291,Saben Lee,6.333333,0.422541,-0.071235
16399,0.419595,0.499469,6.0,31.433,202687,4.0,0.422541,0.190882,9.533,1610612737,...,16.017,4.4,15.817,22200777,2023-02-01,0.2,Bismack Biyombo,4.1,0.422541,-0.071235
16400,0.0,0.170213,9.0,24.167,1629006,4.0,0.422541,0.372409,9.167,1610612737,...,11.75,3.6,13.367,22200777,2023-02-01,0.130149,Josh Okogie,5.55,0.422541,-0.071235
16401,0.34382,0.224165,0.0,2.917,1630688,6.0,0.422541,0.0,5.817,1610612737,...,13.383,3.6,19.467,22200777,2023-02-01,0.211082,Ish Wainright,3.764706,0.422541,-0.071235
16402,1.032134,0.444444,12.0,23.083,1629111,10.0,0.422541,0.519863,14.533,1610612737,...,4.5,8.6,17.117,22200777,2023-02-01,0.570505,Jock Landale,6.8,0.422541,-0.071235


In [7]:
df['opponent_diff_position'] = df['opponent_position_points_per_minute'] - df['opponent_is_starting_points_per_minute']

features = ['opponent_diff_position'] + [c for c in pipeline.features if c != opp_transformer.features_out[0]]

predict_diff_position = SklearnPredictor(estimator=LGBMRegressor(max_depth=4,verbose=-100), target='points', features = features, 
                                       convert_cat_features_to_cat_dtype=True)

cross_validator_diff_position = MatchKFoldCrossValidator(
    match_id_column_name='game_id',
    date_column_name='start_date',
    predictor = predict_diff_position,
)
mean_absolute_scorer = SklearnScorer(pred_column=pipeline.pred_column, scorer_function=mean_absolute_error, target='points')
df = cross_validator_diff_position.generate_validation_df(df, add_train_prediction=True)
cross_validator_diff_position.cross_validation_score(df, scorer=mean_absolute_scorer )



4.432825527731211