In [1]:
import pandas as pd
import numpy as np
import random
import os

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import lightgbm as lgb
import xgboost as xgb
import catboost as cat

import sys
sys.path.append("..")
from utils import *
from preprocessing_utils import *
import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [10]:
train, valid = load_train_valid()
train_x, train_y, valid_x, valid_y = preprocessing(train, valid, is_test=False)

lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
valid_y = lec.transform(valid_y)

# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_lgbm = lgbm.predict_proba(valid_x)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y, eval_metric='logloss')
prediction_xgbm = xgbm.predict_proba(valid_x)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(valid_x)

# loss 
multi_loloss_lgbm = log_loss(valid_y, prediction_lgbm)
multi_loloss_xgbm = log_loss(valid_y, prediction_xgbm)
multi_loloss_cat = log_loss(valid_y, prediction_catboost)

print(multi_loloss_lgbm)
print(multi_loloss_xgbm)
print(multi_loloss_cat)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 8744, number of used features: 7
[LightGBM] [Info] Start training from score -1.230316
[LightGBM] [Info] Start training from score -1.394102
[LightGBM] [Info] Start training from score -0.777086
Learning rate set to 0.088379
0:	learn: 1.0530430	total: 2.25ms	remaining: 2.25s
1:	learn: 1.0186810	total: 4.49ms	remaining: 2.24s
2:	learn: 0.9879921	total: 14.7ms	remaining: 4.88s
3:	learn: 0.9597381	total: 21.4ms	remaining: 5.34s
4:	learn: 0.9377740	total: 33.3ms	remaining: 6.62s
5:	learn: 0.9178977	total: 38.1ms	remaining: 6.31s
6:	learn: 0.9020458	total: 64.4ms	remaining: 9.14s
7:	learn: 0.8881409	total: 80.9ms	remaining: 10s
8:	learn: 0.8769999	total: 85ms	remaining: 9.36s
9:	

## Test

In [None]:
train_x, train_y, valid_x, valid_y = load_train_valid()
lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
valid_y = lec.transform(valid_y)
train_x, valid_x= preprocessing(train_x, valid_x)

# feature selection
train_x = train_x.drop(columns=['awayTeam', "day"])
valid_x = valid_x.drop(columns=["awayTeam", "day"])

# 함수 사용해서 이상치 값 삭제

oulier_idx_shotsAwayTeam = get_outlier(df=train_x, column='shots(awayTeam)', weight=1.5)
train_x.drop(oulier_idx_shotsAwayTeam, axis=0, inplace=True)
train_y.drop(oulier_idx_shotsAwayTeam, axis=0, inplace=True)
train_x.reset_index(drop= True, inplace= True)
train_y.reset_index(drop= True, inplace= True)

# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y)
prediction_lgbm = lgbm.predict_proba(valid_x)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y)
prediction_xgbm = xgbm.predict_proba(valid_x)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(valid_x)

# loss 
multi_loloss_lgbm = log_loss(valid_y, prediction_lgbm)
multi_loloss_xgbm = log_loss(valid_y, prediction_xgbm)
multi_loloss_cat = log_loss(valid_y, prediction_catboost)

print(multi_loloss_lgbm)
print(multi_loloss_xgbm)
print(multi_loloss_cat)

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [None]:
train_x, train_y, test = load_train_test()

lec = LabelEncoder()
train_y = lec.fit_transform(train_y)
train_x, test= preprocessing(train_x, test)
train_x = train_x.drop(columns=['awayTeam', "day"])
test = test.drop(columns=["awayTeam", "day"])

#
# lightGBM
lgbm = lgb.LGBMClassifier()
lgbm.fit(train_x, train_y)
prediction_lgbm = lgbm.predict_proba(test)

# XGBoost 
xgbm = xgb.XGBClassifier()
xgbm.fit(train_x, train_y)
prediction_xgbm = xgbm.predict_proba(test)

# Catboost 
catboost = cat.CatBoostClassifier()
catboost.fit(train_x, train_y)
prediction_catboost = catboost.predict_proba(test)

prediction_voting = (prediction_lgbm+prediction_xgbm+prediction_catboost)/3

sample_submission = pd.read_csv('/home/workspace/DACON/soccer/Data/sample_submission.csv')
sample_submission.iloc[:,1:] = prediction_xgbm
sample_submission.to_csv('sample_submission_drop2col_voting.csv', index= False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000539 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 613
[LightGBM] [Info] Number of data points in the train set: 9008, number of used features: 19
[LightGBM] [Info] Start training from score -1.226955
[LightGBM] [Info] Start training from score -1.398356
[LightGBM] [Info] Start training from score -0.776934
Learning rate set to 0.088513
0:	learn: 1.0502294	total: 2.37ms	remaining: 2.37s
1:	learn: 1.0143502	total: 4.62ms	remaining: 2.3s
2:	learn: 0.9816337	total: 6.9ms	remaining: 2.29s
3:	learn: 0.9549788	total: 9.28ms	remaining: 2.31s
4:	learn: 0.9325846	total: 12ms	remaining: 2.38s
5:	learn: 0.9121935	total: 22.6ms	remaining: 3.75s
6:	learn: 0.8948884	total: 25ms	remaining: 3.55s
7:	learn: 0.8814173	total: 27.3ms	remaining: 3.38s
8:	learn: 0.8682488	total: 31.6ms	remaining: 3.48s
9:	l