In [1]:
import pandas as pd
import numpy as np
import random
import os

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
from sklearn.ensemble import RandomForestClassifier

import sys
sys.path.append("..")
from utils import *
from preprocessing_utils import *
import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [2]:
# ema
train, valid = load_train_valid()
train_x, train_y, valid_x, valid_y = preprocessing(train, valid, is_test=False)

100%|██████████| 47/47 [00:00<00:00, 892.55it/s]
100%|██████████| 47/47 [00:00<00:00, 867.82it/s]


In [3]:
train, valid = load_train_valid()
train_x, train_y, valid_x, valid_y = preprocessing(train, valid, is_test=False)

lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
valid_y = lec.transform(valid_y)
# RF
rf = RandomForestClassifier()
rf.fit(train_x, train_y)
prediction_rf = rf.predict_proba(valid_x)
 
# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_lgbm = lgbm.predict_proba(valid_x)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_xgbm = xgbm.predict_proba(valid_x)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(valid_x)

# loss 
multi_loloss_rf = log_loss(valid_y, prediction_rf)
multi_loloss_lgbm = log_loss(valid_y, prediction_lgbm)
multi_loloss_xgbm = log_loss(valid_y, prediction_xgbm)
multi_loloss_cat = log_loss(valid_y, prediction_catboost)

print(multi_loloss_rf)
print(multi_loloss_lgbm)
print(multi_loloss_xgbm)
print(multi_loloss_cat)

100%|██████████| 47/47 [00:00<00:00, 893.99it/s]
100%|██████████| 47/47 [00:00<00:00, 882.43it/s]


Learning rate set to 0.088469
0:	learn: 1.0365947	total: 56.7ms	remaining: 56.7s
1:	learn: 0.9890949	total: 65.6ms	remaining: 32.7s
2:	learn: 0.9484499	total: 107ms	remaining: 35.7s
3:	learn: 0.9153343	total: 148ms	remaining: 36.7s
4:	learn: 0.8854259	total: 168ms	remaining: 33.5s
5:	learn: 0.8612518	total: 172ms	remaining: 28.5s
6:	learn: 0.8388068	total: 174ms	remaining: 24.7s
7:	learn: 0.8184339	total: 176ms	remaining: 21.9s
8:	learn: 0.8024571	total: 181ms	remaining: 20s
9:	learn: 0.7879970	total: 184ms	remaining: 18.2s
10:	learn: 0.7745045	total: 186ms	remaining: 16.7s
11:	learn: 0.7628611	total: 188ms	remaining: 15.5s
12:	learn: 0.7516027	total: 190ms	remaining: 14.4s
13:	learn: 0.7401009	total: 192ms	remaining: 13.5s
14:	learn: 0.7306376	total: 194ms	remaining: 12.8s
15:	learn: 0.7240629	total: 197ms	remaining: 12.1s
16:	learn: 0.7159645	total: 199ms	remaining: 11.5s
17:	learn: 0.7095163	total: 201ms	remaining: 11s
18:	learn: 0.7022618	total: 203ms	remaining: 10.5s
19:	learn: 0.

In [4]:
pd.Series(rf.feature_importances_, index=rf.feature_names_in_).sort_values(ascending=False)

home_winRate_5mean         0.097234
away_winRate_5mean         0.091704
halfTimeGoals(homeTeam)    0.079971
halfTimeGoals(awayTeam)    0.067212
shotsOnTarget(homeTeam)    0.049695
match                      0.049042
awayTeam                   0.048590
homeTeam                   0.046606
shotsOnTarget(awayTeam)    0.046000
shots(awayTeam)            0.040785
day                        0.040626
shots(homeTeam)            0.040557
corners(homeTeam)          0.036616
fouls(awayTeam)            0.036601
fouls(homeTeam)            0.036076
season                     0.034724
year                       0.034007
corners(awayTeam)          0.033693
month                      0.031031
yellowCards(awayTeam)      0.024492
yellowCards(homeTeam)      0.023118
redCards(homeTeam)         0.005956
redCards(awayTeam)         0.005665
dtype: float64

In [2]:
train, valid = load_train_valid()
train_x, train_y, valid_x, valid_y = preprocessing(train, valid, is_test=False)

lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
valid_y = lec.transform(valid_y)
# RF
rf = RandomForestClassifier()
rf.fit(train_x, train_y)
prediction_rf = rf.predict_proba(valid_x)
 
# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_lgbm = lgbm.predict_proba(valid_x)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_xgbm = xgbm.predict_proba(valid_x)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(valid_x)

# loss 
multi_loloss_rf = log_loss(valid_y, prediction_rf)
multi_loloss_lgbm = log_loss(valid_y, prediction_lgbm)
multi_loloss_xgbm = log_loss(valid_y, prediction_xgbm)
multi_loloss_cat = log_loss(valid_y, prediction_catboost)

print(multi_loloss_rf)
print(multi_loloss_lgbm)
print(multi_loloss_xgbm)
print(multi_loloss_cat)

Learning rate set to 0.088469
0:	learn: 1.0547341	total: 75.5ms	remaining: 1m 15s
1:	learn: 1.0145674	total: 126ms	remaining: 1m 2s
2:	learn: 0.9818046	total: 140ms	remaining: 46.6s
3:	learn: 0.9557680	total: 145ms	remaining: 36.1s
4:	learn: 0.9332163	total: 179ms	remaining: 35.5s
5:	learn: 0.9142838	total: 181ms	remaining: 30s
6:	learn: 0.8975192	total: 183ms	remaining: 26s
7:	learn: 0.8821395	total: 186ms	remaining: 23s
8:	learn: 0.8700030	total: 188ms	remaining: 20.7s
9:	learn: 0.8591677	total: 190ms	remaining: 18.8s
10:	learn: 0.8471437	total: 192ms	remaining: 17.3s
11:	learn: 0.8370154	total: 194ms	remaining: 16s
12:	learn: 0.8277292	total: 197ms	remaining: 14.9s
13:	learn: 0.8203734	total: 199ms	remaining: 14s
14:	learn: 0.8172199	total: 200ms	remaining: 13.1s
15:	learn: 0.8111770	total: 202ms	remaining: 12.4s
16:	learn: 0.8048098	total: 204ms	remaining: 11.8s
17:	learn: 0.8002178	total: 206ms	remaining: 11.2s
18:	learn: 0.7951288	total: 208ms	remaining: 10.7s
19:	learn: 0.790293

In [2]:
train, valid = load_train_valid()
train_x, train_y, valid_x, valid_y = preprocessing(train, valid, is_test=False)

lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
valid_y = lec.transform(valid_y)

# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_lgbm = lgbm.predict_proba(valid_x)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_xgbm = xgbm.predict_proba(valid_x)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(valid_x)

# loss 
multi_loloss_lgbm = log_loss(valid_y, prediction_lgbm)
multi_loloss_xgbm = log_loss(valid_y, prediction_xgbm)
multi_loloss_cat = log_loss(valid_y, prediction_catboost)

print(multi_loloss_lgbm)
print(multi_loloss_xgbm)
print(multi_loloss_cat)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 8920/8920 [00:12<00:00, 730.13it/s]
100%|██████████| 1/1 [00:12<00:00, 12.22s/it]
100%|██████████| 8920/8920 [00:12<00:00, 709.07it/s]
100%|██████████| 1/1 [00:12<00:00, 12.58s/it]


Learning rate set to 0.088469
0:	learn: 1.0504128	total: 71ms	remaining: 1m 10s
1:	learn: 1.0148807	total: 75.1ms	remaining: 37.5s
2:	learn: 0.9839577	total: 77.9ms	remaining: 25.9s
3:	learn: 0.9582677	total: 80.7ms	remaining: 20.1s
4:	learn: 0.9369158	total: 114ms	remaining: 22.7s
5:	learn: 0.9172892	total: 134ms	remaining: 22.2s
6:	learn: 0.8999240	total: 153ms	remaining: 21.7s
7:	learn: 0.8851381	total: 176ms	remaining: 21.8s
8:	learn: 0.8735008	total: 182ms	remaining: 20.1s
9:	learn: 0.8618871	total: 185ms	remaining: 18.3s
10:	learn: 0.8517521	total: 187ms	remaining: 16.8s
11:	learn: 0.8407861	total: 189ms	remaining: 15.6s
12:	learn: 0.8326954	total: 191ms	remaining: 14.5s
13:	learn: 0.8242422	total: 193ms	remaining: 13.6s
14:	learn: 0.8184281	total: 195ms	remaining: 12.8s
15:	learn: 0.8126716	total: 197ms	remaining: 12.1s
16:	learn: 0.8065308	total: 199ms	remaining: 11.5s
17:	learn: 0.8014741	total: 201ms	remaining: 11s
18:	learn: 0.7973877	total: 203ms	remaining: 10.5s
19:	learn:

## Test

In [None]:
train_x, train_y, valid_x, valid_y = load_train_valid()
lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
valid_y = lec.transform(valid_y)
train_x, valid_x= preprocessing(train_x, valid_x)

# feature selection
train_x = train_x.drop(columns=['awayTeam', "day"])
valid_x = valid_x.drop(columns=["awayTeam", "day"])

# 함수 사용해서 이상치 값 삭제

oulier_idx_shotsAwayTeam = get_outlier(df=train_x, column='shots(awayTeam)', weight=1.5)
train_x.drop(oulier_idx_shotsAwayTeam, axis=0, inplace=True)
train_y.drop(oulier_idx_shotsAwayTeam, axis=0, inplace=True)
train_x.reset_index(drop= True, inplace= True)
train_y.reset_index(drop= True, inplace= True)

# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y)
prediction_lgbm = lgbm.predict_proba(valid_x)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y)
prediction_xgbm = xgbm.predict_proba(valid_x)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(valid_x)

# loss 
multi_loloss_lgbm = log_loss(valid_y, prediction_lgbm)
multi_loloss_xgbm = log_loss(valid_y, prediction_xgbm)
multi_loloss_cat = log_loss(valid_y, prediction_catboost)

print(multi_loloss_lgbm)
print(multi_loloss_xgbm)
print(multi_loloss_cat)

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [None]:
train_x, train_y, test = load_train_test()

lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
train_x, test= preprocessing(train_x, test)
train_x = train_x.drop(columns=['awayTeam', "day"])
test = test.drop(columns=["awayTeam", "day"])

#
# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y)
prediction_lgbm = lgbm.predict_proba(test)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y)
prediction_xgbm = xgbm.predict_proba(test)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(test)

prediction_voting = (prediction_lgbm+prediction_xgbm+prediction_catboost)/3

sample_submission = pd.read_csv('/home/workspace/DACON/soccer/Data/sample_submission.csv')
sample_submission.iloc[:,1:] = prediction_xgbm
sample_submission.to_csv('sample_submission_drop2col_voting.csv', index= False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000539 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 613
[LightGBM] [Info] Number of data points in the train set: 9008, number of used features: 19
[LightGBM] [Info] Start training from score -1.226955
[LightGBM] [Info] Start training from score -1.398356
[LightGBM] [Info] Start training from score -0.776934
Learning rate set to 0.088513
0:	learn: 1.0502294	total: 2.37ms	remaining: 2.37s
1:	learn: 1.0143502	total: 4.62ms	remaining: 2.3s
2:	learn: 0.9816337	total: 6.9ms	remaining: 2.29s
3:	learn: 0.9549788	total: 9.28ms	remaining: 2.31s
4:	learn: 0.9325846	total: 12ms	remaining: 2.38s
5:	learn: 0.9121935	total: 22.6ms	remaining: 3.75s
6:	learn: 0.8948884	total: 25ms	remaining: 3.55s
7:	learn: 0.8814173	total: 27.3ms	remaining: 3.38s
8:	learn: 0.8682488	total: 31.6ms	remaining: 3.48s
9:	l

In [2]:
train, valid = load_train_valid()
train_x, train_y, valid_x, valid_y = preprocessing(train, valid, is_test=False)

AttributeError: 'DataFrame' object has no attribute 'append'

In [4]:
stats_columns = [
    'halfTimeGoals(homeTeam)',
    'halfTimeGoals(awayTeam)',
    'shots(homeTeam)',
    'shots(awayTeam)',
    'shotsOnTarget(homeTeam)',
    'shotsOnTarget(awayTeam)',
    'corners(homeTeam)',
    'corners(awayTeam)',
    'fouls(homeTeam)',
    'fouls(awayTeam)',
    'yellowCards(homeTeam)',
    'yellowCards(awayTeam)',
    'redCards(homeTeam)',
    'redCards(awayTeam)'
    ]
pair_stats = train.groupby('match')[stats_columns].mean().reset_index() 

In [7]:
train.value_counts

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f02025135b0>

In [11]:
train[stats_columns].ewm(alpha=0.3).mean().values[-1]

array([8.13776866e-01, 3.16171825e-01, 1.55607715e+01, 1.27378383e+01,
       5.12417375e+00, 4.33170304e+00, 7.07937428e+00, 4.76062739e+00,
       1.02141220e+01, 1.12110880e+01, 1.62928895e+00, 3.13807611e+00,
       5.33373956e-02, 1.01776226e-02])

In [12]:
len(stats_columns)

14

In [17]:
train['match'].nunique()

1590

In [32]:
temp = train[train['match'] == "Man United-Aston Villa"]
temp['halfTimeGoals(homeTeam)'].ewm(alpha=0.4).mean().values[-1]

0.8109635150721491

In [29]:
temp = train[train['match'] == "Man United-Aston Villa"]
temp['halfTimeGoals(homeTeam)']

231     0
644     0
863     0
1284    2
1754    1
1914    0
2502    3
2973    2
3348    1
3574    0
4035    2
4513    2
4896    3
5248    2
5623    1
6026    1
7359    1
7756    1
8031    0
8689    1
Name: halfTimeGoals(homeTeam), dtype: int64