In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [2]:
# utils function
def softmax(scores):
    """Turn raw scores into probabilities, column by column.

    Parameters
    ----------
    scores : array-like of shape (n_classes, n_samples)
        Raw scores (logits), one row per class and one column per sample.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``scores``. When there is more than one
        row, every column is a softmax distribution that sums to 1. When there
        is a single row, every entry is the logistic sigmoid of its score.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.shape[0] > 1:
        # Subtracting the per-column maximum does not change the softmax but
        # keeps exp() from overflowing to inf (which would yield NaN).
        shifted = scores - np.max(scores, axis=0, keepdims=True)
        exp_scores = np.exp(shifted)
        probs = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    else:
        # exp(x) / (1 + exp(x)) == exp(-log(1 + exp(-x))); logaddexp evaluates
        # the logarithm without overflow for large |x|.
        probs = np.exp(-np.logaddexp(0.0, -scores))
    return probs

def predict_proba(model, X, mode = None):
    """Compute class probabilities from a fitted linear classifier's weights.

    Parameters
    ----------
    model : object
        Fitted linear classifier exposing a 2-D ``coef_`` of shape
        (n_rows, n_features) and an ``intercept_`` with one entry per row of
        ``coef_`` (e.g. a scikit-learn ``LogisticRegression``).
    X : array-like of shape (n_samples, n_features)
        Feature matrix; must support ``.T``.
    mode : None or 'bin'
        ``None``: one logit row per class, softmax across the rows.
        ``'bin'``: a single logit row passed through the sigmoid.

    Returns
    -------
    numpy.ndarray
        Probabilities with one row per sample and one column per row of
        ``coef_``.

    Raises
    ------
    ValueError
        If ``model.coef_`` is not 2-D (for instance a single-target regressor).
    NotImplementedError
        If ``mode`` is neither ``None`` nor ``'bin'``.

    Notes
    -----
    NOTE(review): this presumably matches ``model.predict_proba`` only when the
    classifier was trained with a multinomial (softmax) loss - confirm against
    the scikit-learn version in use.
    """
    if np.ndim(model.coef_) != 2:
        # Fail loudly instead of broadcasting a regressor's 1-D weights into
        # meaningless "probabilities".
        raise ValueError("predict_proba expects a classifier with a 2-D coef_")

    if mode is None:
        # One intercept per class, shaped as a column so it broadcasts over
        # the samples; -1 lets this work for any number of classes.
        logits = model.coef_.dot(X.T) + model.intercept_.reshape((-1, 1))
        Y_probs = softmax(logits).T

    elif mode == 'bin':
        logits = model.coef_.dot(X.T) + model.intercept_
        Y_probs = softmax(logits).T

    else:
        raise NotImplementedError(f"unsupported mode: {mode!r}")

    return Y_probs

In [13]:
# Input CSV locations, relative to the notebook's working directory.
# Aggregated team statistics for the home side and the away side of each match.
path_home_team = "data/Train_Data/train_home_team_statistics_df.csv"
path_away_team = "data/Train_Data/train_away_team_statistics_df.csv"

# Match outcome target (its HOME_WINS / AWAY_WINS columns are read further down).
path_target_wdl = "data/Y_train_1rknArQ.csv" 

# Supplementary target holding the home-minus-away goal difference.
path_target_diffgoal="data/Y_train_supp.csv"

In [24]:
# Team statistics: one table for the home side, one for the away side
data_home_team, data_away_team = (
    pd.read_csv(csv_path) for csv_path in (path_home_team, path_away_team)
)

# Win/draw/loss target keeps the default integer index
target_wdl = pd.read_csv(path_target_wdl)

# Goal-difference target is indexed by the match ID column
target_diff_score = pd.read_csv(path_target_diffgoal, index_col="ID")

In [5]:
# Replace missing numeric statistics by the mean of the same column,
# computed separately for the home table and for the away table.
home_column_means = data_home_team.mean(numeric_only=True)
away_column_means = data_away_team.mean(numeric_only=True)
data_home_team = data_home_team.fillna(home_column_means)
data_away_team = data_away_team.fillna(away_column_means)

In [6]:
"""
wdl vars
TEAM_GAME_WON_season_average (no std)
TEAM_GAME_DRAW_season_average (no std)
TEAM_GAME_LOST_season_average (no std)

off vars 
TEAM_SHOTS_TOTAL_season_average
TEAM_SHOTS_INSIDEBOX_season_average
TEAM_SHOTS_OFF_TARGET_season_average
TEAM_SHOTS_ON_TARGET_season_average
TEAM_SHOTS_OUTSIDEBOX_season_average
TEAM_ATTACKS_season_average
TEAM_PENALTIES_season_average
TEAM_DANGEROUS_ATTACKS_season_average
TEAM_GOALS_season_average

def vars
TEAM_SAVES_season_average
TEAM_FOULS_season_average
TEAM_YELLOWCARDS_season_average
TEAM_REDCARDS_season_average
TEAM_OFFSIDES_season_average
TEAM_BALL_SAFE_season_average
TEAM_INJURIES_season_average

other vars
TEAM_PASSES_season_average
TEAM_SUCCESSFUL_PASSES_season_average
TEAM_SUCCESSFUL_PASSES_PERCENTAGE_season_average (no sum)
TEAM_SUBSTITUTIONS_season_average
TEAM_CORNERS_season_average
TEAM_BALL_POSSESSION_season_average (no sum)
"""

'\nwdl vars\nTEAM_GAME_WON_season_average (no std)\nTEAM_GAME_DRAW_season_average (no std)\nTEAM_GAME_LOST_season_average (no std)\n\noff vars \nTEAM_SHOTS_TOTAL_season_average\nTEAM_SHOTS_INSIDEBOX_season_average\nTEAM_SHOTS_OFF_TARGET_season_average\nTEAM_SHOTS_ON_TARGET_season_average\nTEAM_SHOTS_OUTSIDEBOX_season_average\nTEAM_ATTACKS_season_average\nTEAM_PENALTIES_season_average\nTEAM_DANGEROUS_ATTACKS_season_average\nTEAM_GOALS_season_average\n\ndef vars\nTEAM_SAVES_season_average\nTEAM_FOULS_season_average\nTEAM_YELLOWCARDS_season_average\nTEAM_REDCARDS_season_average\nTEAM_OFFSIDES_season_average\nTEAM_BALL_SAFE_season_average\nTEAM_INJURIES_season_average\n\nother vars\nTEAM_PASSES_season_average\nTEAM_SUCCESSFUL_PASSES_season_average\nTEAM_SUCCESSFUL_PASSES_PERCENTAGE_season_average (no sum)\nTEAM_SUBSTITUTIONS_season_average\nTEAM_CORNERS_season_average\nTEAM_BALL_POSSESSION_season_average (no sum)\n'

## Step 1: RegLog on WDL vars

In [7]:
# Win / draw / loss counters, over the whole season and over the last 5 matches
base_columns_of_interest = [
    'TEAM_GAME_WON_season',
    'TEAM_GAME_DRAW_season',
    'TEAM_GAME_LOST_season',
    'TEAM_GAME_WON_5_last_match',
    'TEAM_GAME_DRAW_5_last_match',
    'TEAM_GAME_LOST_5_last_match',
]

# Only the "_sum" aggregate is used here; LEAGUE is kept last for stratification
columns_of_interest = [
    *(col + '_sum' for col in base_columns_of_interest),
    'LEAGUE',
]

In [8]:
# Keep only the selected columns, for the home and for the away table
data_home_team_of_interest = data_home_team.loc[:, columns_of_interest]
data_away_team_of_interest = data_away_team.loc[:, columns_of_interest]

In [9]:
# Features are home-minus-away differences of every column except the last one (LEAGUE)
wdl_feature_diff = (
    data_home_team_of_interest.iloc[:, :-1] - data_away_team_of_interest.iloc[:, :-1]
)

# LEAGUE is carried along for stratified splitting; TARGET is HOME_WINS - AWAY_WINS
data_home_away_team_of_interest = wdl_feature_diff.assign(
    LEAGUE=data_home_team_of_interest['LEAGUE'],
    TARGET=target_wdl.HOME_WINS - target_wdl.AWAY_WINS,
)

In [10]:
# First 6 columns are the features, column 7 is TARGET (column 6 is LEAGUE)
X = data_home_away_team_of_interest.iloc[:, :6]
Y = data_home_away_team_of_interest.iloc[:, 7]

# 50/50 split, keeping the league proportions identical in both halves
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    stratify=data_home_away_team_of_interest['LEAGUE'],
    random_state=10,
)

In [11]:
# Sanity check: shapes of the four pieces produced by the split
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(6151, 6)
(6152, 6)
(6151,)
(6152,)


In [12]:
# TARGET takes the three values -1 / 0 / 1, so this step needs a classifier:
# later cells call model_wdl.predict_proba and read one intercept per class,
# neither of which a LinearRegression provides.
model_wdl = LogisticRegression(fit_intercept=True)
model_wdl.fit(X_train, Y_train)

# Mean accuracy on the training half, then on the held-out half
print(model_wdl.score(X_train, Y_train))
print(model_wdl.score(X_test, Y_test))

0.4706551780198342
0.4869960988296489


In [40]:
# Per-class probabilities for the training rows. This method only exists on
# classifiers, so model_wdl must be a fitted classifier when this cell runs.
model_wdl.predict_proba(X_train)

array([[0.33543039, 0.26748922, 0.39708039],
       [0.17692949, 0.22368332, 0.59938719],
       [0.30051016, 0.24965227, 0.44983757],
       ...,
       [0.18252353, 0.22906848, 0.58840799],
       [0.23600404, 0.25738595, 0.50661001],
       [0.22505004, 0.24502458, 0.52992538]])

In [26]:
# Same six W/D/L difference features, but the target is now the goal difference
Xd = data_home_away_team_of_interest.iloc[:, :6]
Yd = target_diff_score

# Same split settings as for the W/D/L target (stratified by league, same seed)
X_traind, X_testd, Y_traind, Y_testd = train_test_split(
    Xd,
    Yd,
    test_size=0.5,
    stratify=data_home_away_team_of_interest['LEAGUE'],
    random_state=10,
)

In [29]:
# Ordinary least squares on the goal difference (fit returns the estimator itself)
model_wdld = LinearRegression(fit_intercept=True).fit(X_traind, Y_traind)

# R^2 on the training half, then on the held-out half
for features, goal_diff in ((X_traind, Y_traind), (X_testd, Y_testd)):
    print(model_wdld.score(features, goal_diff))

0.09609012720142807
0.10237211243231303


In [42]:
# Sign (-1, 0 or 1) of the predicted goal difference for every training row
np.sign(model_wdld.predict(X_traind))

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [43]:
# Display the goal-difference targets of the training half
Y_traind

Unnamed: 0_level_0,GOAL_DIFF_HOME_AWAY
ID,Unnamed: 1_level_1
11966,1.0
11620,1.0
9050,1.0
10614,2.0
6599,1.0
...,...
7262,0.0
9882,-4.0
9304,2.0
9449,0.0


In [32]:
# Display the goal-difference targets of the held-out half
Y_testd

Unnamed: 0_level_0,GOAL_DIFF_HOME_AWAY
ID,Unnamed: 1_level_1
8339,-1.0
1974,0.0
4523,-1.0
198,2.0
10030,1.0
...,...
10760,-2.0
8426,1.0
7638,2.0
6837,-4.0


In [29]:
# Class probabilities on the held-out half, via the hand-written helper
Y_probs = predict_proba(model_wdl, X_test)

# A prediction counts as "good" when its top class probability reaches this value
good_pred_prob_threshold = 0.54
pred_probs = Y_probs.max(axis=1)

good_pred_probs_mask = pred_probs >= good_pred_prob_threshold
# Share of test rows predicted with enough confidence
print(good_pred_probs_mask.sum() / len(Y_probs))

# Column index 0/1/2 -> label -1/0/1 (TARGET = HOME_WINS - AWAY_WINS, classes sorted)
preds_wdl = Y_probs.argmax(axis=1) - 1

# Accuracy restricted to the confident rows
confident_hits = preds_wdl[good_pred_probs_mask] == Y_test[good_pred_probs_mask]
print(confident_hits.sum() / good_pred_probs_mask.sum())

0.21326397919375814
0.6341463414634146


### Conclusion on performance for WDL vars

L'efficacité de la reglog sur toutes les données de test est de 48%.
En moyenne, sur les données de test, la régression logistique entraînée uniquement sur les données de W/D/L (season + 5 last matches) est sûre à plus de 54% de sa prédiction pour seulement 21% des données. Sur ces 21% de données, la précision de la prédiction du résultat passe de 48% à 63%.

#### Save the low-confidence rows for the next step

In [30]:
# Refit on every row (no hold-out) before filtering the data set
model_wdl = LogisticRegression(fit_intercept=True).fit(X, Y)

# Rows whose top class probability stays below the threshold go to the next step
Y_probs = predict_proba(model_wdl, X)
bad_pred_probs_mask = Y_probs.max(axis=1) < good_pred_prob_threshold

In [31]:
# Persist the low-confidence mask, then the matching rows of each table
np.save("data/add/train/steps/mask_step1.npy", bad_pred_probs_mask)
for frame, csv_path in (
    (data_home_team, 'data/add/train/steps/data_home_team_step1.csv'),
    (data_away_team, 'data/add/train/steps/data_away_team_step1.csv'),
    (target_wdl, 'data/add/train/steps/target_step1.csv'),
):
    frame[bad_pred_probs_mask].to_csv(csv_path, index=False)

# Step 1: RegLog on offensive vars

In [32]:
# Reload the rows left over from the previous step (index restarts at 0)
data_home_team_step1, data_away_team_step1, target_wdl_step1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/add/train/steps/data_home_team_step1.csv',
        'data/add/train/steps/data_away_team_step1.csv',
        'data/add/train/steps/target_step1.csv',
    )
)

In [33]:
# Offensive base statistics used in this step
base_columns_of_interest = [
    "TEAM_SHOTS_TOTAL",
    "TEAM_SHOTS_INSIDEBOX",
    "TEAM_SHOTS_ON_TARGET",
    "TEAM_ATTACKS",
    "TEAM_PENALTIES",
    "TEAM_DANGEROUS_ATTACKS",
    "TEAM_GOALS",
]

# Aggregation suffixes: season-wide first, then the last five matches
suffix = [
    '_season_sum',
    '_season_average',
    '_season_std',
    '_5_last_match_sum',
    '_5_last_match_average',
    '_5_last_match_std',
]

In [34]:
N_suffix = 6
# l[i] = number of base statistics that exist with suffix[i] in the home table
l = np.array(
    [
        sum(
            base_column + suffix[i] in data_home_team.columns
            for base_column in base_columns_of_interest
        )
        for i in range(N_suffix)
    ],
    dtype=float,
)
l

array([7., 7., 7., 7., 7., 7.])

In [35]:
# Every base statistic combined with every suffix, grouped suffix by suffix
columns_of_interest = []
for suf in suffix:
    columns_of_interest.extend(
        base_column + suf for base_column in base_columns_of_interest
    )
len(columns_of_interest)

42

In [36]:
# Home-minus-away difference of every selected offensive column
offensive_feature_diff = (
    data_home_team_step1[columns_of_interest] - data_away_team_step1[columns_of_interest]
)

# Append LEAGUE (for stratification) and the -1 / 0 / 1 outcome target
data_home_away_team_of_interest_step1 = offensive_feature_diff.assign(
    LEAGUE=data_home_team_step1['LEAGUE'],
    TARGET_wdl=target_wdl_step1.HOME_WINS - target_wdl_step1.AWAY_WINS,
)

In [37]:
# Display the feature-difference table built for this step
data_home_away_team_of_interest_step1 

Unnamed: 0,TEAM_SHOTS_TOTAL_season_sum,TEAM_SHOTS_INSIDEBOX_season_sum,TEAM_SHOTS_ON_TARGET_season_sum,TEAM_ATTACKS_season_sum,TEAM_PENALTIES_season_sum,TEAM_DANGEROUS_ATTACKS_season_sum,TEAM_GOALS_season_sum,TEAM_SHOTS_TOTAL_season_average,TEAM_SHOTS_INSIDEBOX_season_average,TEAM_SHOTS_ON_TARGET_season_average,...,TEAM_GOALS_5_last_match_average,TEAM_SHOTS_TOTAL_5_last_match_std,TEAM_SHOTS_INSIDEBOX_5_last_match_std,TEAM_SHOTS_ON_TARGET_5_last_match_std,TEAM_ATTACKS_5_last_match_std,TEAM_PENALTIES_5_last_match_std,TEAM_DANGEROUS_ATTACKS_5_last_match_std,TEAM_GOALS_5_last_match_std,LEAGUE,TARGET_wdl
0,0.0,-1.000000,0.0,-1.0,2.000000,-4.0,-2.0,0.0,-1.000000,0.0,...,2.0,2.0000,-1.000000,4.0,-1.0,6.000000,3.0,-2.0,Serie A,-1
1,-2.0,-1.000000,-3.0,-2.0,-3.000000,-1.0,-4.0,-2.0,-1.000000,-4.0,...,-5.0,-4.0000,-1.000000,-7.0,1.0,-1.000000,2.0,-1.0,Premier League,0
2,6.0,7.000000,2.0,0.0,-2.000000,4.0,2.0,6.0,6.000000,2.0,...,0.0,-5.0000,-4.000000,-4.0,4.0,5.000000,-6.0,-4.0,La Liga,1
3,-1.0,-2.000000,0.0,0.0,-2.000000,-3.0,-2.0,-2.0,-2.000000,0.0,...,-4.0,1.0000,1.000000,-4.0,-3.0,0.000000,7.0,-2.0,Ligue 1,0
4,-10.0,-10.000000,-10.0,10.0,-0.061935,-10.0,-10.0,-10.0,-10.000000,-10.0,...,-10.0,-10.0000,-10.000000,-10.0,-10.0,-0.142229,-10.0,-10.0,Superliga,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9371,-2.0,-4.000000,-3.0,3.0,-10.000000,-1.0,0.0,-3.0,-5.000000,-4.0,...,2.0,-3.0000,-2.000000,-1.0,4.0,-6.000000,-1.0,2.0,League One,-1
9372,1.0,0.000000,0.0,-2.0,-1.000000,-1.0,0.0,1.0,0.000000,0.0,...,0.0,6.0000,2.000000,0.0,0.0,0.000000,4.0,0.0,Liga Portugal,-1
9373,-3.0,-4.000000,-4.0,-5.0,5.000000,-4.0,-4.0,-3.0,-4.000000,-3.0,...,-2.0,0.0000,-4.000000,3.0,5.0,0.000000,-2.0,-2.0,Bundesliga,-1
9374,1.0,-0.078071,-6.0,-2.0,3.000000,-4.0,-3.0,-4.0,-0.076016,-6.0,...,-4.0,-0.0555,-0.064535,-1.0,6.0,0.000000,1.0,-1.0,League One,1


In [38]:
# Everything except the two trailing columns (LEAGUE, TARGET_wdl) is a feature
X = data_home_away_team_of_interest_step1.iloc[:, :-2]
Y = data_home_away_team_of_interest_step1['TARGET_wdl']

print(X.shape, Y.shape)

(9376, 42) (9376,)


In [39]:
# 50/50 split stratified by league (stratifying on Y was the alternative tried)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    stratify=data_home_away_team_of_interest_step1['LEAGUE'],
    random_state=0,
)

In [40]:
# Sanity check: shapes of the four pieces produced by the split
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(4688, 42)
(4688, 42)
(4688,)
(4688,)


In [41]:
# Logistic regression on the offensive features; fit returns the estimator itself
model1 = LogisticRegression(max_iter=1000).fit(X_train, Y_train)

# Mean accuracy on the training half, then on the held-out half
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(model1.score(features, labels))

0.44880546075085326
0.44859215017064846


In [42]:
# Class probabilities on the held-out half
Y_probs1 = model1.predict_proba(X_test)
# Confident rows: top class probability of at least 0.52
good_pred_probs_mask = Y_probs1.max(axis=1) >= 0.52

# Share of test rows predicted with enough confidence
print(good_pred_probs_mask.sum() / len(Y_probs1))

# Column index 0/1/2 -> label -1/0/1
preds_wdl = Y_probs1.argmax(axis=1) - 1

# Accuracy restricted to the confident rows
confident_hits = preds_wdl[good_pred_probs_mask] == Y_test[good_pred_probs_mask]
print(confident_hits.sum() / good_pred_probs_mask.sum())

0.17811433447098976
0.6011976047904192


### Conclusion on performance for off vars

L'efficacité de la reglog sur toutes les données de test est de ~45%.
En moyenne, sur les données de test, la régression logistique est sûre à plus de 52% de sa prédiction pour seulement 17% des données. Sur ces 17% de données, la précision de la prédiction du résultat passe de 45% à 60%.

In [43]:
# Refit on every row of this step before filtering the data set
model1 = LogisticRegression(fit_intercept=True, max_iter=1000).fit(X, Y)

# Probabilities for all rows, via the hand-written helper
Y_probs1 = predict_proba(model1, X)

In [44]:
# Confident rows over the full step data: top class probability of at least 0.52
good_pred_probs_mask = Y_probs1.max(axis=1) >= 0.52

# Share of rows predicted with enough confidence
print(good_pred_probs_mask.sum() / len(Y_probs1))

# Column index 0/1/2 -> label -1/0/1
preds_wdl = Y_probs1.argmax(axis=1) - 1

# Accuracy restricted to the confident rows (measured on the rows the model was fit on)
confident_hits = preds_wdl[good_pred_probs_mask] == Y[good_pred_probs_mask]
print(confident_hits.sum() / good_pred_probs_mask.sum())

0.19549914675767918
0.607746863066012


In [45]:
# Rows whose top class probability stays under 0.52 are passed on to the next step
bad_pred_probs_mask = Y_probs1.max(axis=1) < 0.52

In [46]:
# Persist the low-confidence mask, then the matching rows of each step-1 table
np.save("data/add/train/steps/mask_step2.npy", bad_pred_probs_mask)
for frame, csv_path in (
    (data_home_team_step1, 'data/add/train/steps/data_home_team_step2.csv'),
    (data_away_team_step1, 'data/add/train/steps/data_away_team_step2.csv'),
    (target_wdl_step1, 'data/add/train/steps/target_step2.csv'),
):
    frame[bad_pred_probs_mask].to_csv(csv_path, index=False)

# Step 2: RegLog on def and other vars

In [47]:
# Reload the rows left over from the previous step (index restarts at 0)
data_home_team_step2, data_away_team_step2, target_wdl_step2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/add/train/steps/data_home_team_step2.csv',
        'data/add/train/steps/data_away_team_step2.csv',
        'data/add/train/steps/target_step2.csv',
    )
)

In [48]:
# Defensive and "other" base statistics used in this step; the commented-out
# names are not part of the feature set.
base_columns_of_interest = [
    "TEAM_SAVES",
    # "TEAM_FOULS",
    # "TEAM_YELLOWCARDS",
    "TEAM_REDCARDS",
    # "TEAM_OFFSIDES",
    "TEAM_BALL_SAFE",
    "TEAM_INJURIES",
    "TEAM_PASSES",
    "TEAM_SUCCESSFUL_PASSES",
    # "TEAM_SUCCESSFUL_PASSES_PERCENTAGE",
    "TEAM_SUBSTITUTIONS",
    "TEAM_CORNERS",
    "TEAM_BALL_POSSESSION",
]

In [49]:
N_suffix = 6

# Keep only the base+suffix combinations that really exist in the home table,
# ordered base statistic by base statistic.
columns_of_interest = [
    base_column + suffix[i]
    for base_column in base_columns_of_interest
    for i in range(N_suffix)
    if base_column + suffix[i] in data_home_team.columns
]

# l[i] = number of base statistics available with suffix[i]
l = np.array(
    [
        sum(
            base_column + suffix[i] in data_home_team.columns
            for base_column in base_columns_of_interest
        )
        for i in range(N_suffix)
    ],
    dtype=float,
)

print(l)
print(len(columns_of_interest))

[8. 9. 9. 8. 9. 9.]
52


In [50]:
# Home-minus-away difference of every selected column
step2_feature_diff = (
    data_home_team_step2[columns_of_interest] - data_away_team_step2[columns_of_interest]
)

# Append LEAGUE (for stratification) and the -1 / 0 / 1 outcome target
data_home_away_team_of_interest_step2 = step2_feature_diff.assign(
    LEAGUE=data_home_team_step2['LEAGUE'],
    TARGET_wdl=target_wdl_step2.HOME_WINS - target_wdl_step2.AWAY_WINS,
)

In [51]:
# Display the feature-difference table built for this step
data_home_away_team_of_interest_step2

Unnamed: 0,TEAM_SAVES_season_sum,TEAM_SAVES_season_average,TEAM_SAVES_season_std,TEAM_SAVES_5_last_match_sum,TEAM_SAVES_5_last_match_average,TEAM_SAVES_5_last_match_std,TEAM_REDCARDS_season_sum,TEAM_REDCARDS_season_average,TEAM_REDCARDS_season_std,TEAM_REDCARDS_5_last_match_sum,...,TEAM_CORNERS_season_std,TEAM_CORNERS_5_last_match_sum,TEAM_CORNERS_5_last_match_average,TEAM_CORNERS_5_last_match_std,TEAM_BALL_POSSESSION_season_average,TEAM_BALL_POSSESSION_season_std,TEAM_BALL_POSSESSION_5_last_match_average,TEAM_BALL_POSSESSION_5_last_match_std,LEAGUE,TARGET_wdl
0,1.000000,1.000000,-1.000000,1.000000,1.000000,-1.000000,-1.0,-1.0,-2.0,-3.0,...,-7.0,-3.0,-3.0,3.0,-5.0,-4.0,-4.0,1.0,Serie A,-1
1,0.000000,1.000000,-4.000000,0.000000,0.000000,-3.000000,6.0,6.0,8.0,0.0,...,0.0,0.0,0.0,0.0,-3.0,2.0,-3.0,1.0,Premier League,0
2,8.000000,8.000000,2.000000,10.000000,10.000000,6.000000,1.0,1.0,2.0,0.0,...,0.0,1.0,1.0,9.0,0.0,0.0,0.0,3.0,Ligue 1,0
3,3.000000,3.000000,0.000000,-1.000000,-1.000000,-8.000000,0.0,0.0,0.0,0.0,...,-3.0,3.0,3.0,-5.0,-5.0,-5.0,-1.0,-3.0,Serie A,0
4,3.000000,2.000000,-4.000000,0.000000,0.000000,-4.000000,5.0,4.0,3.0,0.0,...,-1.0,5.0,5.0,0.0,0.0,7.0,1.0,5.0,Ligue 1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7538,9.000000,8.000000,3.000000,9.000000,9.000000,3.000000,6.0,5.0,8.0,5.0,...,0.0,3.0,3.0,-3.0,6.0,-3.0,3.0,-3.0,Pro League,1
7539,-1.000000,0.000000,3.000000,7.000000,7.000000,6.000000,-10.0,-10.0,-7.0,-5.0,...,2.0,-7.0,-7.0,0.0,-1.0,5.0,-5.0,1.0,League One,-1
7540,2.000000,2.000000,2.000000,1.000000,1.000000,-1.000000,7.0,6.0,4.0,8.0,...,5.0,0.0,0.0,0.0,-1.0,6.0,4.0,-1.0,Liga Portugal,-1
7541,2.000000,1.000000,-1.000000,0.000000,0.000000,0.000000,8.0,7.0,4.0,5.0,...,0.0,-6.0,-6.0,0.0,-1.0,0.0,-3.0,2.0,Bundesliga,-1


In [52]:
# Everything except the two trailing columns (LEAGUE, TARGET_wdl) is a feature
X = data_home_away_team_of_interest_step2.iloc[:, :-2]
Y = data_home_away_team_of_interest_step2['TARGET_wdl']

print(X.shape, Y.shape)

(7543, 52) (7543,)


In [53]:
# 50/50 split stratified by league (stratifying on Y was the alternative tried)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    stratify=data_home_away_team_of_interest_step2['LEAGUE'],
    random_state=0,
)

# Sanity check: shapes of the four pieces produced by the split
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(3771, 52)
(3772, 52)
(3771,)
(3772,)


In [54]:
# Logistic regression on this step's features; fit returns the estimator itself
model2 = LogisticRegression(max_iter=1000).fit(X_train, Y_train)

# Mean accuracy on the training half, then on the held-out half
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(model2.score(features, labels))

0.42137364094404667
0.4019088016967126


In [55]:
# Gaussian naive Bayes on the same split, as a point of comparison
model2prime = GaussianNB().fit(X_train, Y_train)

# Mean accuracy on the training half, then on the held-out half
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(model2prime.score(features, labels))

0.4054627419782551
0.39740190880169673


In [56]:
# Class probabilities on the held-out half
Y_probs2 = model2.predict_proba(X_test)
# Confident rows: top class probability of at least 0.45
good_pred_probs_mask = Y_probs2.max(axis=1) >= 0.45

# Share of test rows predicted with enough confidence
print(good_pred_probs_mask.sum() / len(Y_probs2))

# Column index 0/1/2 -> label -1/0/1
preds_wdl = Y_probs2.argmax(axis=1) - 1

# Accuracy restricted to the confident rows
confident_hits = preds_wdl[good_pred_probs_mask] == Y_test[good_pred_probs_mask]
print(confident_hits.sum() / good_pred_probs_mask.sum())

0.3162778366914104
0.44677284157585917
