In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sqlalchemy import create_engine


In [2]:
engine = create_engine("postgresql://postgres:postgres@127.0.0.1:5432/armagedon")

In [3]:
data_stats = pd.read_csv('stats.csv')

query = """
SELECT 
    g.id, 
    g.date, 
    g.home_team, 
    g.away_team, 
    g.season,
    MAX(CASE WHEN s.team_id = g.home_team THEN s.game_id END) AS last_home_game,
    MAX(CASE WHEN s.team_id = g.away_team THEN s.game_id END) AS last_away_game
FROM 
    games g
LEFT JOIN 
    stats s ON g.id > s.game_id
GROUP BY 
    g.id, g.date, g.home_team, g.away_team, g.season
ORDER BY 
    g.date, g.id;
"""
data_games = pd.read_sql(query,engine)


In [4]:
data_games = data_games.dropna()
data_games['last_home_game'] = data_games['last_home_game'].astype(int)
data_games['last_away_game'] = data_games['last_away_game'].astype(int)
data_games

Unnamed: 0,id,date,home_team,away_team,season,last_home_game,last_away_game
17,127,2015-10-29,15,19,2015,118,120
18,128,2015-10-30,24,1,2015,121,110
19,129,2015-10-30,16,8,2015,125,123
20,130,2015-10-30,7,20,2015,120,117
21,131,2015-10-30,26,25,2015,113,122
...,...,...,...,...,...,...,...
11531,13744,2024-04-14,25,8,2023,13426,13419
11532,13745,2024-04-14,31,10,2023,13425,13422
11533,13746,2024-04-14,11,40,2023,13420,13415
11534,13747,2024-04-14,16,14,2023,13421,13418


In [5]:
data_stats['linked_tag'] = data_stats['game_id'].astype(str) + '/' + data_stats['team_id'].astype(str)
data_games['home_tag'] = data_games['id'].astype(str) + '/' + data_games['home_team'].astype(str)
data_games = data_games.merge(data_stats[['linked_tag','win']], how='left', left_on="home_tag", right_on="linked_tag")
data_games = data_games.drop(columns=['linked_tag','home_tag'])
data_games = data_games.dropna()
data_games

Unnamed: 0,id,date,home_team,away_team,season,last_home_game,last_away_game,win
0,127,2015-10-29,15,19,2015,118,120,False
1,128,2015-10-30,24,1,2015,121,110,False
2,129,2015-10-30,16,8,2015,125,123,True
3,130,2015-10-30,7,20,2015,120,117,True
4,131,2015-10-30,26,25,2015,113,122,False
...,...,...,...,...,...,...,...,...
11231,13419,2024-03-03,8,27,2023,13407,13406,False
11232,13420,2024-03-03,2,11,2023,13407,13408,True
11233,13421,2024-03-03,22,16,2023,13410,13413,False
11234,13422,2024-03-03,26,10,2023,13398,13405,True


In [6]:
print(data_stats.columns[2:27])
data_stats = data_stats.drop(columns=data_stats.columns[2:27])

Index(['points', 'fgm', 'fga', 'fgp', 'ftm', 'fta', 'ftp', 'tpm', 'tpa', 'tpp',
       'offReb', 'defReb', 'totReb', 'assists', 'pFouls', 'steals',
       'turnovers', 'blocks', 'plusMinus', 'home_team', 'date', 'season',
       'win', 'win_home', 'win_away'],
      dtype='object')


In [7]:
data_stats.loc[:,data_stats.columns.str.endswith('cumul')].columns

Index(['points_cumul', 'fgm_cumul', 'fga_cumul', 'fgp_cumul', 'ftm_cumul',
       'fta_cumul', 'ftp_cumul', 'tpm_cumul', 'tpa_cumul', 'tpp_cumul',
       'offReb_cumul', 'defReb_cumul', 'totReb_cumul', 'assists_cumul',
       'pFouls_cumul', 'steals_cumul', 'turnovers_cumul', 'blocks_cumul',
       'plusMinus_cumul', 'home_team_cumul', 'win_cumul', 'win_home_cumul',
       'win_away_cumul'],
      dtype='object')

In [8]:
 data_stats = data_stats.loc[:,~data_stats.columns.str.endswith('cumul')]

In [9]:
data_stats.shape

(22354, 29)

In [10]:
data_games['last_home_game_tag'] = data_games['last_home_game'].astype(str) + '/' + data_games['home_team'].astype(str)
data_games['last_away_game_tag'] = data_games['last_away_game'].astype(str) + '/' + data_games['away_team'].astype(str)

In [11]:
data_stats.head()

Unnamed: 0,game_id,team_id,nb_games,points_avg,fgm_avg,fga_avg,fgp_avg,ftm_avg,fta_avg,ftp_avg,tpm_avg,tpa_avg,tpp_avg,offReb_avg,defReb_avg,totReb_avg,assists_avg,pFouls_avg,steals_avg,turnovers_avg,blocks_avg,plusMinus_avg,home_team_avg,win_avg,win_home_avg,win_away_avg,last_10_games_wins,serie,linked_tag
0,110,1,1,94.0,37.0,82.0,45.1,12.0,15.0,80.0,8.0,27.0,29.6,7.0,33.0,40.0,22.0,25.0,9.0,15.0,4.0,-12.0,1.0,0.0,0.0,,0.0,-1,110/1
1,128,1,2,103.0,39.5,82.5,47.85,15.0,20.5,74.6,9.0,25.5,35.65,7.0,32.5,39.5,24.0,21.5,10.0,15.0,4.0,-0.5,0.5,0.5,0.0,1.0,1.0,1,128/1
2,135,1,3,101.0,38.333333,82.666667,46.366667,15.666667,21.0,75.5,8.666667,24.666667,35.366667,7.333333,34.0,41.333333,23.666667,20.0,9.666667,15.0,4.0,0.666667,0.666667,0.666667,0.5,1.0,2.0,2,135/1
3,148,1,4,99.25,38.0,84.0,45.275,15.0,19.25,79.85,8.25,25.75,32.55,7.75,35.25,43.0,23.25,19.0,9.5,14.0,4.5,1.0,0.5,0.75,0.5,1.0,3.0,3,148/1
4,164,1,5,99.0,37.8,85.2,44.44,15.4,19.8,79.34,8.0,24.8,32.7,9.4,35.2,44.6,23.6,18.0,9.6,14.2,3.8,2.0,0.4,0.8,0.5,1.0,4.0,4,164/1


In [12]:
data_games.head()

Unnamed: 0,id,date,home_team,away_team,season,last_home_game,last_away_game,win,last_home_game_tag,last_away_game_tag
0,127,2015-10-29,15,19,2015,118,120,False,118/15,120/19
1,128,2015-10-30,24,1,2015,121,110,False,121/24,110/1
2,129,2015-10-30,16,8,2015,125,123,True,125/16,123/8
3,130,2015-10-30,7,20,2015,120,117,True,120/7,117/20
4,131,2015-10-30,26,25,2015,113,122,False,113/26,122/25


In [13]:
df = pd.merge(data_games,data_stats, how='inner', left_on='last_home_game_tag', right_on= 'linked_tag', suffixes=('',''))
df = pd.merge(df,data_stats, how='inner', left_on='last_away_game_tag', right_on= 'linked_tag', suffixes=('_home','_away'))

In [14]:
df = df.drop(columns= ['game_id_home', 'team_id_home', 'game_id_away', 'team_id_away', 'last_home_game_tag', 'last_away_game_tag', 'linked_tag_home', 'linked_tag_away'])

In [15]:
x_columns = df.columns.to_list()[8:]
x_columns

['nb_games_home',
 'points_avg_home',
 'fgm_avg_home',
 'fga_avg_home',
 'fgp_avg_home',
 'ftm_avg_home',
 'fta_avg_home',
 'ftp_avg_home',
 'tpm_avg_home',
 'tpa_avg_home',
 'tpp_avg_home',
 'offReb_avg_home',
 'defReb_avg_home',
 'totReb_avg_home',
 'assists_avg_home',
 'pFouls_avg_home',
 'steals_avg_home',
 'turnovers_avg_home',
 'blocks_avg_home',
 'plusMinus_avg_home',
 'home_team_avg_home',
 'win_avg_home',
 'win_home_avg_home',
 'win_away_avg_home',
 'last_10_games_wins_home',
 'serie_home',
 'nb_games_away',
 'points_avg_away',
 'fgm_avg_away',
 'fga_avg_away',
 'fgp_avg_away',
 'ftm_avg_away',
 'fta_avg_away',
 'ftp_avg_away',
 'tpm_avg_away',
 'tpa_avg_away',
 'tpp_avg_away',
 'offReb_avg_away',
 'defReb_avg_away',
 'totReb_avg_away',
 'assists_avg_away',
 'pFouls_avg_away',
 'steals_avg_away',
 'turnovers_avg_away',
 'blocks_avg_away',
 'plusMinus_avg_away',
 'home_team_avg_away',
 'win_avg_away',
 'win_home_avg_away',
 'win_away_avg_away',
 'last_10_games_wins_away',
 'ser

In [16]:
df.shape

(11364, 60)

In [17]:
df = df.reset_index(drop=True)

In [18]:
df.isnull().sum()

id                           0
date                         0
home_team                    0
away_team                    0
season                       0
last_home_game               0
last_away_game               0
win                          0
nb_games_home                0
points_avg_home              0
fgm_avg_home                 0
fga_avg_home                 0
fgp_avg_home                 0
ftm_avg_home                 0
fta_avg_home                 0
ftp_avg_home                 0
tpm_avg_home                 0
tpa_avg_home                 0
tpp_avg_home                 0
offReb_avg_home              0
defReb_avg_home              0
totReb_avg_home              0
assists_avg_home             0
pFouls_avg_home              0
steals_avg_home              0
turnovers_avg_home           0
blocks_avg_home              0
plusMinus_avg_home           0
home_team_avg_home           0
win_avg_home                 0
win_home_avg_home          128
win_away_avg_home           79
last_10_

In [19]:
df = df.fillna(0)
df.isnull().sum()

  df = df.fillna(0)


id                         0
date                       0
home_team                  0
away_team                  0
season                     0
last_home_game             0
last_away_game             0
win                        0
nb_games_home              0
points_avg_home            0
fgm_avg_home               0
fga_avg_home               0
fgp_avg_home               0
ftm_avg_home               0
fta_avg_home               0
ftp_avg_home               0
tpm_avg_home               0
tpa_avg_home               0
tpp_avg_home               0
offReb_avg_home            0
defReb_avg_home            0
totReb_avg_home            0
assists_avg_home           0
pFouls_avg_home            0
steals_avg_home            0
turnovers_avg_home         0
blocks_avg_home            0
plusMinus_avg_home         0
home_team_avg_home         0
win_avg_home               0
win_home_avg_home          0
win_away_avg_home          0
last_10_games_wins_home    0
serie_home                 0
nb_games_away 

In [20]:
X,Y = df[x_columns],df['win'].astype(int)

In [21]:
X.shape, Y.shape

((11364, 52), (11364,))

In [22]:
from sklearn.model_selection import train_test_split

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.1)
X_train, X_test, y_train, y_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [66]:
from lazypredict.Supervised import LazyClassifier

In [67]:

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)


 97%|██████████████████████████████████████████████████████████████████████████████▏  | 28/29 [02:26<00:06,  6.83s/it]

[LightGBM] [Info] Number of positive: 5826, number of negative: 4401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12009
[LightGBM] [Info] Number of data points in the train set: 10227, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.569669 -> initscore=0.280499
[LightGBM] [Info] Start training from score 0.280499


100%|█████████████████████████████████████████████████████████████████████████████████| 29/29 [02:27<00:00,  5.09s/it]


In [68]:
print(models)

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
BernoulliNB                        0.65               0.64     0.64      0.65   
RandomForestClassifier             0.66               0.64     0.64      0.65   
NearestCentroid                    0.64               0.64     0.64      0.64   
GaussianNB                         0.65               0.64     0.64      0.65   
LGBMClassifier                     0.65               0.64     0.64      0.65   
ExtraTreesClassifier               0.65               0.63     0.63      0.64   
SVC                                0.66               0.63     0.63      0.64   
XGBClassifier                      0.63               0.62     0.62      0.63   
NuSVC                              0.63               0.62     0.62      0.63   
AdaBoostClassifier                 0.63               0.62     0.62      0.63   
LinearDiscriminantAnalysis  

In [53]:
from xgboost import XGBClassifier
xg_params = {'colsample_bytree': 0.2,
 'learning_rate': 0.5,
 'max_depth': 20,
 'n_estimators': 100}
xg = XGBClassifier(**xg_params,
    random_state= 42,
    tree_method= 'hist', 
    eval_metric= 'mlogloss', 
    verbosity= 2,
)

In [54]:
xg.fit(X_train,y_train)

In [49]:
from sklearn.metrics import log_loss, accuracy_score
y_val_pred_prob = xg.predict_proba(X_test)
y_val_pred= xg.predict(X_test)
logloss = log_loss(y_test, y_val_pred_prob)
acc = accuracy_score(y_test,y_val_pred)
print(f'Logarithmic Loss: {logloss}')
print(f'Accuracy: {logloss}')

Logarithmic Loss: 0.6902169801567836
Accuracy: 0.6902169801567836


In [69]:
xg_params = {'colsample_bytree': 0.2,
 'learning_rate': 0.5,
 'max_depth': 20,
 'n_estimators': 200}

xg = XGBClassifier(**xg_params,
    tree_method= 'hist', 
    eval_metric= 'mlogloss', 
    verbosity= 2,
)
xg.fit(X_train,y_train)
y_val_pred_prob = xg.predict_proba(X_test)
y_val_pred= xg.predict(X_test)
logloss = log_loss(y_test, y_val_pred_prob)
acc = accuracy_score(y_test,y_val_pred)
print(f'Logarithmic Loss: {logloss}')
print(f'Accuracy: {logloss}')

Logarithmic Loss: 0.9567612739819676
Accuracy: 0.9567612739819676


In [70]:
xg.save_model('XGBmodel_name.json')


In [71]:
xg2 = XGBClassifier()
xg2.load_model('XGBmodel_name.json')
y_val_pred_prob = xg2.predict_proba(X_test)
y_val_pred= xg2.predict(X_test)
logloss = log_loss(y_test, y_val_pred_prob)
acc = accuracy_score(y_test,y_val_pred)
print(f'Logarithmic Loss: {logloss}')
print(f'Accuracy: {logloss}')

Logarithmic Loss: 0.9567612739819676
Accuracy: 0.9567612739819676
