In [1]:
# Library
import os
import random
import pickle
import gc
import warnings
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib import font_manager, rc
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.cluster import KMeans, MiniBatchKMeans
# from category_encoders import TargetEncoder
from sklearn.preprocessing import (
    StandardScaler, PowerTransformer, OrdinalEncoder,
    OneHotEncoder, FunctionTransformer, PolynomialFeatures, LabelEncoder,
)
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.feature_selection import SelectKBest, SelectPercentile, RFE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import (
    LogisticRegression, LinearRegression, Ridge, Lasso,
    SGDRegressor, ElasticNet
)
from sklearn.model_selection import (
    train_test_split, cross_val_score, cross_validate,
    GridSearchCV, KFold, cross_val_predict
)
from sklearn.metrics import (
    roc_auc_score, mean_squared_error, make_scorer, accuracy_score, log_loss
)
from sklearn import set_config, datasets
# from catboost import (
#     CatBoostRegressor, CatBoostClassifier,
# )
# import category_encoders as ce
# from sklearn.pipeline import (
#     Pipeline, FeatureUnion, make_pipeline
# )
from sklearn.ensemble import (
    RandomForestClassifier, StackingClassifier, StackingRegressor,
    GradientBoostingRegressor, VotingClassifier, VotingRegressor,
    HistGradientBoostingRegressor, GradientBoostingClassifier,
    BaggingClassifier, AdaBoostClassifier, RandomForestRegressor,ExtraTreesRegressor
)
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.svm import SVC, SVR, LinearSVC
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons

import xgboost as xgb
import lightgbm as lgb
import re
import math
import optuna

from scipy.stats import zscore

%matplotlib inline

warnings.filterwarnings("ignore")

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
df = pd.read_pickle('matches_df.pkl')

In [3]:
# train_sample = pd.read_csv('train_sample.csv')

## Preprocessing

In [4]:
X = df
y = df['home_team_result']
train, test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 팀 인코딩 함수  해결?
def team_encoding(train):
    # 홈팀이 이긴 경기 수를 계산하여 인코딩
    train['home_win'] = train['home_team_result'].apply(lambda x: 1 if x=='승' else 0) 
    dic = {}
    # 각 홈팀별 이긴 경기 수를 딕셔너리에 저장
    for team in train['home_team_name'].unique():
        value = train[train['home_team_name'] == team]['home_win'].sum()
        dic[team] = value

    # 팀별 승리 횟수를 기준으로 라벨링
    label_dic={}
    for idx, (team, _) in enumerate(sorted(dic.items(), key= lambda x: x[1])):
        label_dic[team] = idx
    
    return label_dic

# 홈팀 득점 평균 계산 함수
def homeGoal_day_mean(train, test, day):
    # 초기값 설정
    train[f'home_Goal_{day}mean'] = -1
    test[f'home_Goal_{day}mean'] = -1
    
    # 각 팀별로 진행
    teams = train['home_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['home_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['home_team_goal_count'].rolling(ch_day).mean().index.values
        val = team_df['home_team_goal_count'].rolling(ch_day).mean().values
        train[f'home_Goal_{day}mean'].loc[idx] = val
        test_idx = test[test['home_team_name'] == team].index
        test[f'home_Goal_{day}mean'].loc[test_idx] = val[-1]
    # 결측값 처리
    train[f'home_Goal_{day}mean'] = train[f'home_Goal_{day}mean'].fillna(0)

# 원정팀 득점 평균 계산 함수
def awayGoal_day_mean(train, test, day):
    # 초기값 설정
    train[f'away_Goal_{day}mean'] = -1
    test[f'away_Goal_{day}mean'] = -1
    
    # 각 팀별로 진행
    teams = train['away_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['away_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['away_team_goal_count'].rolling(ch_day).mean().index.values
        val = team_df['away_team_goal_count'].rolling(ch_day).mean().values
        train[f'away_Goal_{day}mean'].loc[idx] = val
        test_idx = test[test['away_team_name'] == team].index
        test[f'away_Goal_{day}mean'].loc[test_idx] = val[-1]
    # 결측값 처리
    train[f'away_Goal_{day}mean'] = train[f'away_Goal_{day}mean'].fillna(0)

# 홈팀 승리율 평균 계산 함수
def homeWin_day_mean(train, test, day):
    # 초기값 설정
    train[f'home_winRate_{day}mean'] = -1
    test[f'home_winRate_{day}mean'] = -1
    train['win'] = train['home_team_result'].apply(lambda x: 1 if x == '승' else 0)
    
    # 각 팀별로 진행
    teams = train['home_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['home_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['win'].rolling(ch_day).mean().index.values
        val = team_df['win'].rolling(ch_day).mean().values
        train[f'home_winRate_{day}mean'].loc[idx] = val
        test_idx = test[test['home_team_name'] == team].index
        test[f'home_winRate_{day}mean'].loc[test_idx] = val[-1]
    # 임시 컬럼 제거
    train.drop(columns=['win'], inplace=True)
    # 결측값 처리
    train[f'home_winRate_{day}mean'] = train[f'home_winRate_{day}mean'].fillna(0)

# 원정팀 승리율 평균 계산 함수
def awayWin_day_mean(train, test, day):
    # 초기값 설정
    train[f'away_winRate_{day}mean'] = -1
    test[f'away_winRate_{day}mean'] = -1
    train['win'] = train['home_team_result'].apply(lambda x: 1 if x == '패' else 0)
    
    # 각 팀별로 진행
    teams = train['away_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['away_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['win'].rolling(ch_day).mean().index.values
        val = team_df['win'].rolling(ch_day).mean().values
        train[f'away_winRate_{day}mean'].loc[idx] = val
        test_idx = test[test['away_team_name'] == team].index
        test[f'away_winRate_{day}mean'].loc[test_idx] = val[-1]
    # 임시 컬럼 제거
    train.drop(columns=['win'], inplace=True)
    # 결측값 처리
    train[f'away_winRate_{day}mean'] = train[f'away_winRate_{day}mean'].fillna(0)

# 홈팀 평균 계산 함수
def home_day_mean(train, test, columns, day):
    for column in tqdm(columns):
        teams = train['home_team_name'].values
        train[f'home_{column}_{day}mean'] = -1
        test[f'home_{column}_{day}mean'] = -1

        for team in tqdm(teams):
            team_df = train[train['home_team_name'] == team]
            idx = team_df[column].rolling(day).mean().index.values
            val = team_df[column].rolling(day).mean().values
            train[f'home_{column}_{day}mean'].loc[idx] = val
            test_idx = test[test['home_team_name'] == team].index
            test[f'home_{column}_{day}mean'].loc[test_idx] = val[-1]
        # 결측값 처리
        train[f'home_{column}_{day}mean'] = train[f'home_{column}_{day}mean'].fillna(0)
        test[f'home_{column}_{day}mean'] = test[f'home_{column}_{day}mean'].fillna(0)

# 원정팀 평균 계산 함수
def away_day_mean(train, test, columns, day):
    for column in tqdm(columns):
        teams = train['away_team_name'].values
        train[f'away_{column}_{day}mean'] = -1
        test[f'away_{column}_{day}mean'] = -1

        for team in tqdm(teams):
            team_df = train[train['away_team_name'] == team]
            idx = team_df[column].rolling(day).mean().index.values
            val = team_df[column].rolling(day).mean().values
            train[f'away_{column}_{day}mean'].loc[idx] = val
            test_idx = test[test['away_team_name'] == team].index
            test[f'away_{column}_{day}mean'].loc[test_idx] = val[-1]
        # 결측값 처리
        train[f'away_{column}_{day}mean'] = train[f'away_{column}_{day}mean'].fillna(0)
        test[f'away_{column}_{day}mean'] = test[f'away_{column}_{day}mean'].fillna(0)

# 전처리 함수
def preprocessing(train, test, dic):
    # 년과 월일로 나누기
    train['date_GMT'] = train['date_GMT'].dt.strftime('%Y%m%d')
    test['date_GMT'] = test['date_GMT'].dt.strftime('%Y%m%d')
    train['year'] = train['date_GMT'].apply(lambda x : int(x[0:4]))
    train['month'] = train['date_GMT'].apply(lambda x : int(x[5:7]))
    # train['day'] = train['date_GMT'].apply(lambda x : int(x[8:10]))

    test['year'] = test['date_GMT'].apply(lambda x : int(x[0:4]))
    test['month'] = test['date_GMT'].apply(lambda x : int(x[5:7]))
    # test['day'] = test['date_GMT'].apply(lambda x : int(x[8:10]))
    train.drop(columns=['date_GMT'], inplace=True)
    test.drop(columns=['date_GMT'], inplace=True)

    # 팀 인코딩 적용
    label_dic = dic
    train['home_team_name'] = train['home_team_name'].apply(lambda x: label_dic[x])
    train['away_team_name'] = train['away_team_name'].apply(lambda x: label_dic[x])
    test['home_team_name'] = test['home_team_name'].apply(lambda x: label_dic[x])
    test['away_team_name'] = test['away_team_name'].apply(lambda x: label_dic[x])

    # 5일간 홈팀 승리 비율 계산
    homeWin_day_mean(train, test, 5)
    # 5일간 원정팀 승리 비율 계산
    awayWin_day_mean(train, test, 5)

    # 6일간 홈팀 평균 득점 계산
    homeGoal_day_mean(train, test, 6)
    # 6일간 원정팀 평균 득점 계산
    awayGoal_day_mean(train, test, 6)

    # 불필요한 컬럼 제거
    train = train.drop(columns=['home_win', 'index','home_team_goal_count','away_team_goal_count','home_team_result','game_points'])
    test = test.drop(columns=['index','home_team_goal_count','away_team_goal_count','home_team_result','game_points'])

    return train, test


## Seed

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

## Train / Test

In [6]:
hometeam_list = list(train['home_team_name'].unique())
dic = team_encoding(train)
train, test= preprocessing(train, test, dic)
test_idx = test.index.values

# name encoding
cat = ['home_team_name','away_team_name']


# 승무패 인코딩
lec = LabelEncoder()
lec.fit(df['home_team_result'])
y_train = lec.transform(y_train)
y_test = lec.transform(y_test)

num_features = list(set(train.columns) - set(cat))
scaler = StandardScaler()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.fit_transform(test[num_features])

100%|██████████| 17/17 [00:00<00:00, 677.09it/s]
100%|██████████| 17/17 [00:00<00:00, 696.84it/s]
100%|██████████| 17/17 [00:00<00:00, 638.36it/s]
100%|██████████| 17/17 [00:00<00:00, 665.15it/s]


In [7]:
model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
model.fit(train, y_train) 
y_pred = model.predict(test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 552
[LightGBM] [Info] Number of data points in the train set: 1984, number of used features: 29
[LightGBM] [Info] Start training from score -1.250749
[LightGBM] [Info] Start training from score -0.937430
[LightGBM] [Info] Start training from score -1.132966
Accuracy: 0.5705645161290323
