In [1]:
import pandas as pd
import numpy as np
import random
import os

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

import sys
sys.path.append("..")
from utils import *
from preprocessing_utils import *
import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Stores the seed in the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Value handed to every generator (also written, as a string,
            to ``PYTHONHASHSEED``).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The two generators are independent, so seed them in one pass.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Seed the generators up front, before any data is loaded or any model is fitted.
seed_everything(42)

In [2]:
# Load the train/validation features and targets through the project helper.
train_x, train_y, valid_x, valid_y = load_train_valid()

# Encode the target labels as integers.
# NOTE(review): LabelEncoder is not imported explicitly in this notebook;
# presumably it arrives through one of the star imports — confirm.
lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
# Encode the validation targets with the SAME fitted encoder so the scored
# labels line up with the classes the models were trained on. Kept under a new
# name so the raw `valid_y` stays available.
valid_y_encoded = lec.transform(valid_y)

train_x, valid_x = preprocessing(train_x, valid_x)

# statsmodels fit: used for the coefficient summary (logits, p-values).
# NOTE(review): statsmodels' Logit does not add an intercept on its own and no
# constant column is added here — confirm that fitting without one is intended.
logit_spec = sm.Logit(train_y, train_x)
logis_model = logit_spec.fit(method='newton')
print(logis_model.summary())

# scikit-learn fit: used for the predicted probabilities that are scored below.
model = LogisticRegression(max_iter=100,
                        penalty='l2',
                        C=1.0)
model.fit(train_x, train_y)

prediction = model.predict_proba(valid_x)
# `predict_proba` columns follow `model.classes_`; passing those labels
# explicitly keeps the column/label mapping correct even when the validation
# split does not contain every class.
multi_loloss = log_loss(valid_y_encoded, prediction, labels=model.classes_)
print(multi_loloss)

In [19]:
# Exponentiate the fitted coefficients (log-odds) to obtain odds ratios.
odds = np.exp(logis_model.params)

# Walk the column names, coefficients and odds ratios together by position.
# Iterating the values avoids integer `[]` lookups on the label-indexed
# Series, a positional fallback that pandas has deprecated.
# NOTE(review): the last two messages say the logit grows "배" (times) by the
# coefficient and that the win *probability* is multiplied by the odds ratio.
# A coefficient is an additive change in the logit, and the odds ratio scales
# the odds rather than the probability — consider rewording the messages.
for column, logit, odds_ratio in zip(train_x.columns, logis_model.params, odds):
    print(f'변수 {column}의 logit : {logit : .3f}')
    print(f'변수 {column}의 odds ratio : {odds_ratio : .3f}')
    print(f'변수 {column}가 1단위 증가할 때, Home 팀이 승리할 로짓이{logit : .3f}배 증가한다.')
    print(f'변수 {column}가 1단위 증가할 때, Home 팀이 승리할 확률이 Away 팀에 비해서 {odds_ratio : .3f}배 증가한다.\n')


변수 season의 logit : -0.001
변수 season의 odds ratio :  0.999
변수 season가 1단위 증가할 때, Home 팀이 승리할 로짓이-0.001배 증가한다.
변수 season가 1단위 증가할 때, Home 팀이 승리할 확률이 Away 팀에 비해서  0.999배 증가한다.

변수 homeTeam의 logit :  0.016
변수 homeTeam의 odds ratio :  1.016
변수 homeTeam가 1단위 증가할 때, Home 팀이 승리할 로짓이 0.016배 증가한다.
변수 homeTeam가 1단위 증가할 때, Home 팀이 승리할 확률이 Away 팀에 비해서  1.016배 증가한다.

변수 awayTeam의 logit :  0.007
변수 awayTeam의 odds ratio :  1.007
변수 awayTeam가 1단위 증가할 때, Home 팀이 승리할 로짓이 0.007배 증가한다.
변수 awayTeam가 1단위 증가할 때, Home 팀이 승리할 확률이 Away 팀에 비해서  1.007배 증가한다.

변수 halfTimeGoals(homeTeam)의 logit :  1.915
변수 halfTimeGoals(homeTeam)의 odds ratio :  6.788
변수 halfTimeGoals(homeTeam)가 1단위 증가할 때, Home 팀이 승리할 로짓이 1.915배 증가한다.
변수 halfTimeGoals(homeTeam)가 1단위 증가할 때, Home 팀이 승리할 확률이 Away 팀에 비해서  6.788배 증가한다.

변수 halfTimeGoals(awayTeam)의 logit : -1.834
변수 halfTimeGoals(awayTeam)의 odds ratio :  0.160
변수 halfTimeGoals(awayTeam)가 1단위 증가할 때, Home 팀이 승리할 로짓이-1.834배 증가한다.
변수 halfTimeGoals(awayTeam)가 1단위 증가할 때, Home 팀이 승리할 확률이 Away 팀에 비해

In [25]:
# Export the per-variable logit coefficients and odds ratios to CSV.
# The table is built from the fitted statsmodels result instead of numbers
# re-typed by hand from printed output, so the file cannot drift from the
# model. Values are rounded to three decimals, the precision the hand-written
# table used. (`pd` and `np` are already imported in the first cell.)
coefficients = logis_model.params
data = {
    '변수': coefficients.index.tolist(),
    'Logit': coefficients.round(3).tolist(),
    'Odds Ratio': np.exp(coefficients).round(3).tolist(),
}

df = pd.DataFrame(data)
df.to_csv("variables_logit_odds.csv", index=False)


In [27]:
# Summarise each coefficient: logit, odds ratio, p-value and a significance flag.
sig_level = .05
# Header text kept as-is for compatibility with existing output/consumers.
# NOTE(review): despite the word "above", 'Yes' marks p-value < sig_level
# (i.e. statistically significant) — consider renaming the column.
flag_column = f'above {sig_level*100}%'

p_values = logis_model.pvalues
logistic_variable = pd.concat(
    [logis_model.params, np.exp(logis_model.params), np.round(p_values, 6)],
    axis=1,
)
logistic_variable.columns = ['logit', 'odds ratio', 'p-value']

# A single comparison on the unrounded p-values labels every row: a p-value
# equal to the threshold is 'No' rather than being left as NaN, and rounding
# for display can no longer flip a borderline decision.
logistic_variable[flag_column] = (p_values < sig_level).map({True: 'Yes', False: 'No'})
print(logistic_variable)

                            logit  odds ratio   p-value above 5.0%
season                  -0.001061    0.998940  0.594052         No
homeTeam                 0.015521    1.015642  0.816857         No
awayTeam                 0.007287    1.007314  0.022992        Yes
halfTimeGoals(homeTeam)  1.915145    6.787925  0.000000        Yes
halfTimeGoals(awayTeam) -1.834137    0.159751  0.000000        Yes
shots(homeTeam)         -0.001735    0.998266  0.874529         No
shots(awayTeam)         -0.018722    0.981452  0.134213         No
shotsOnTarget(homeTeam)  0.258833    1.295418  0.000000        Yes
shotsOnTarget(awayTeam) -0.293336    0.745772  0.000000        Yes
corners(homeTeam)       -0.053835    0.947589  0.000144        Yes
corners(awayTeam)        0.066536    1.068799  0.000028        Yes
fouls(homeTeam)         -0.004762    0.995249  0.664376         No
fouls(awayTeam)          0.007208    1.007234  0.490139         No
yellowCards(homeTeam)   -0.076266    0.926570  0.023251       

In [9]:
# date_list = [200001, 200102, 202223, 202122, 202021, 201920, 201819, 201718, 201617,
#        201516, 201415, 201314, 201213, 201112, 201011, 200910, 200809, 200708,
#        200607, 200506, 200405, 200304, 200203, 202324]
# for date in date_list:
#     train_x, train_y, valid_x, valid_y = load_train_valid()
#     train_x, valid_x= preprocessing(train_x, valid_x, date)

#     model = LogisticRegression(max_iter=100,
#                             penalty='l2',
#                             C=1.0)

#     model.fit(train_x, train_y) 
#     prediction = model.predict_proba(valid_x)
#     multi_loloss = log_loss(valid_y, prediction)
#     print(multi_loloss)

#     # sample_submission = pd.read_csv('/home/workspace/DACON/soccer/Data/sample_submission.csv')
#     # sample_submission.iloc[:,1:] = prediction


1.0428031102985622
1.0431914624098098
1.0650379825862843
1.0575858978597097
1.0593347427234419
1.0565014537491515
1.0538346631589712
1.0545490140694609
1.0523365350188418
1.0502191657600557
1.0494087969462564
1.0470706078805796
1.0455536157004404
1.0446862880361203
1.0440733168967582
1.0423921575362405
1.0432448393872098
1.0443252161583338
1.0426716664308613
1.0434118070669376
1.042762210305061
1.0422306634499494
1.0427947313051262
1.0386930397361838
