## Library

In [130]:
# Library
import os
import random
import pickle
import gc
import warnings
import seaborn as sns
import numpy as np
import pandas as pd
import shap
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib import font_manager, rc
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.cluster import KMeans, MiniBatchKMeans
# from category_encoders import TargetEncoder
from sklearn.preprocessing import (
    StandardScaler, PowerTransformer, OrdinalEncoder,
    OneHotEncoder, FunctionTransformer, PolynomialFeatures, LabelEncoder, MinMaxScaler
)
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.feature_selection import SelectKBest, SelectPercentile, RFE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import (
    LogisticRegression, LinearRegression, Ridge, Lasso,
    SGDRegressor, ElasticNet
)
from sklearn.model_selection import (
    train_test_split, cross_val_score, cross_validate,
    GridSearchCV, KFold, cross_val_predict
)
from sklearn.metrics import (
    roc_auc_score, mean_squared_error, make_scorer, accuracy_score, log_loss
)
from sklearn import set_config, datasets
from catboost import (
    CatBoostRegressor, CatBoostClassifier,
)
# import category_encoders as ce
# from sklearn.pipeline import (
#     Pipeline, FeatureUnion, make_pipeline
# )
from sklearn.ensemble import (
    RandomForestClassifier, StackingClassifier, StackingRegressor,
    GradientBoostingRegressor, VotingClassifier, VotingRegressor,
    HistGradientBoostingRegressor, GradientBoostingClassifier,
    BaggingClassifier, AdaBoostClassifier, RandomForestRegressor,ExtraTreesRegressor
)
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.svm import SVC, SVR, LinearSVC
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons

# from tpot import TPOTClassifier
import xgboost as xgb
import lightgbm as lgb
import re
import math
import optuna

from scipy.stats import zscore

%matplotlib inline

warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)
pd.set_option('display.max_columns', None)

## data load

In [131]:
# df = pd.read_pickle('adidas_match_df.pkl')

In [132]:
df = pd.read_pickle('matches_adidas_ver2_df.pkl')

In [133]:
df.head()

Unnamed: 0,index,date_GMT,home_team_name,away_team_name,home_team_corner_count,away_team_corner_count,home_team_shots,away_team_shots,home_team_shots_on_target,away_team_shots_on_target,home_team_fouls,away_team_fouls,home_team_possession,away_team_possession,home_team_free_kick,away_team_free_kick,home_team_offside,away_team_offside,home_team_goal_count,away_team_goal_count,home_team_goal_count_half_time,away_team_goal_count_half_time,home_team_yellow_cards,home_team_red_cards,away_team_yellow_cards,away_team_red_cards,home_team_result,adidas_point_home,adidas_point_away,home_ppg,away_ppg,team_a_xg,team_b_xg,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,odds_btts_yes,odds_btts_no,홈_드리블 성공%,홈_패스 시도,홈_패스 성공,홈_패스 성공%,홈_키패스,홈_전방패스 시도,홈_전방패스 성공,홈_전방패스 성공%,홈_횡패스 시도,홈_횡패스 성공,홈_횡패스 성공%,홈_공격진영 패스 시도,홈_공격진영 패스 성공,홈_공격진영 패스 성공%,홈_수비진영 패스 시도,홈_수비진영 패스 성공,홈_수비진영 패스 성공%,홈_중앙진영 패스 시도,홈_중앙진영 패스 성공,홈_중앙진영 패스 성공%,홈_롱패스 시도,홈_롱패스 성공,홈_롱패스 성공%,홈_중거리패스 시도,홈_중거리패스 성공,홈_중거리패스 성공%,홈_단거리패스 시도,홈_단거리패스 성공,홈_단거리패스 성공%,홈_크로스 시도,홈_크로스 성공,홈_크로스 성공%,홈_경합 지상 시도,홈_경합 지상 성공,홈_경합 지상 성공%,홈_경합 공중 시도,홈_경합 공중 성공,홈_경합 공중 성공%,홈_태클 시도,홈_태클 성공,홈_태클 성공%,홈_클리어링,홈_인터셉트,홈_차단,홈_획득,홈_블락,홈_볼미스,어웨이_드리블 성공%,어웨이_패스 시도,어웨이_패스 성공,어웨이_패스 성공%,어웨이_키패스,어웨이_전방패스 시도,어웨이_전방패스 성공,어웨이_전방패스 성공%,어웨이_횡패스 시도,어웨이_횡패스 성공,어웨이_횡패스 성공%,어웨이_공격진영 패스 시도,어웨이_공격진영 패스 성공,어웨이_공격진영 패스 성공%,어웨이_수비진영 패스 시도,어웨이_수비진영 패스 성공,어웨이_수비진영 패스 성공%,어웨이_중앙진영 패스 시도,어웨이_중앙진영 패스 성공,어웨이_중앙진영 패스 성공%,어웨이_롱패스 시도,어웨이_롱패스 성공,어웨이_롱패스 성공%,어웨이_중거리패스 시도,어웨이_중거리패스 성공,어웨이_중거리패스 성공%,어웨이_단거리패스 시도,어웨이_단거리패스 성공,어웨이_단거리패스 성공%,어웨이_크로스 시도,어웨이_크로스 성공,어웨이_크로스 성공%,어웨이_경합 지상 시도,어웨이_경합 지상 성공,어웨이_경합 지상 성공%,어웨이_경합 공중 시도,어웨이_경합 공중 성공,어웨이_경합 공중 성공%,어웨이_태클 시도,어웨이_태클 성공,어웨이_태클 성공%,어웨이_클리어링,어웨이_인터셉트,어웨이_차단,어웨이_획득,어웨이_블락,어웨이_볼미스
0,44,2021-04-06 10:30:00,Daegu,Seongnam,3,4,12,9,5,5,11,20,49,51,18,25,6,2,0,0,0,0,1,0,0,0,무,14156,7702,1.58,0.79,1.65,1.54,2.1,3.2,3.9,2.0,1.75,37.5,544,428,78.7,7,245,163,66.5,192,161,83.9,116,72,62.1,148,121,81.8,280,235,83.9,58,29,50.0,205,171,83.4,281,228,81.1,22,3,13.6,104,53,51.0,46,24,52.2,5,2,40.0,7,29,48,87,3,9,33.3,732,638,87.2,12,290,222,76.6,276,252,91.3,163,128,78.5,118,103,87.3,451,407,90.2,88,55,62.5,257,225,87.5,387,358,92.5,19,3,15.8,69,28,40.6,39,28,71.8,4,2,50.0,17,16,33,81,1,5
1,65,2021-04-21 10:30:00,Daegu,Suwon Bluewings,6,5,10,5,4,0,21,21,32,68,20,23,0,0,1,0,0,0,3,0,4,1,승,13149,15574,1.58,1.32,1.33,0.76,2.55,3.1,2.85,2.0,1.74,50.0,527,419,79.5,11,250,182,72.8,163,133,81.6,188,137,72.9,124,99,79.8,215,183,85.1,61,39,63.9,170,143,84.1,296,237,80.1,33,11,33.3,109,52,47.7,55,39,70.9,7,2,28.6,19,21,42,96,2,12,77.8,646,551,85.3,1,249,184,73.9,271,244,90.0,175,128,73.1,124,103,83.1,347,320,92.2,67,45,67.2,231,200,86.6,348,306,87.9,26,2,7.7,70,41,58.6,35,16,45.7,9,8,88.9,17,19,45,66,4,5
2,122,2021-08-01 11:00:00,Daegu,Pohang Steelers,6,5,6,10,6,8,14,13,28,72,15,16,1,4,1,1,0,0,1,0,2,0,무,13651,8936,1.58,1.28,1.15,1.64,2.46,3.4,2.9,1.8,1.95,0.0,414,291,70.3,8,195,105,53.8,141,117,83.0,94,65,69.1,106,67,63.2,214,159,74.3,78,36,46.2,143,104,72.7,193,151,78.2,20,5,25.0,87,39,44.8,67,35,52.2,8,1,12.5,8,21,42,102,2,14,66.7,559,451,80.7,4,238,148,62.2,190,176,92.6,67,44,65.7,131,94,71.8,361,313,86.7,84,38,45.2,177,156,88.1,298,257,86.2,17,3,17.6,78,37,47.4,52,19,36.5,8,4,50.0,14,7,44,85,2,3
3,142,2021-08-20 10:30:00,Daegu,Gwangju,7,5,10,5,5,4,14,5,59,41,16,6,1,0,1,2,0,0,1,0,2,0,패,6015,15422,1.58,0.89,1.38,0.8,1.91,3.2,3.8,2.1,1.71,33.3,343,240,70.0,6,189,106,56.1,91,74,81.3,68,42,61.8,108,60,55.6,167,138,82.6,61,25,41.0,104,79,76.0,178,136,76.4,10,4,40.0,69,41,59.4,39,11,28.2,10,5,50.0,6,15,34,86,1,8,33.3,429,324,75.5,3,211,131,62.1,140,119,85.0,87,64,73.6,120,78,65.0,222,182,82.0,69,25,36.2,124,100,80.6,236,199,84.3,14,1,7.1,74,38,51.4,57,27,47.4,11,7,63.6,21,25,36,79,3,7
4,154,2021-08-28 10:00:00,Daegu,Seongnam,4,4,11,9,6,4,13,6,39,61,14,8,2,2,3,1,2,0,0,0,2,0,승,14654,6154,1.58,0.79,1.43,1.42,2.08,3.01,3.15,2.2,1.65,0.0,404,272,67.3,7,220,121,55.0,129,102,79.1,84,52,61.9,121,69,57.0,199,151,75.9,70,27,38.6,137,102,74.5,197,143,72.6,24,5,20.8,86,48,55.8,80,43,53.8,4,1,25.0,17,19,57,98,1,1,66.7,554,434,78.3,6,257,167,65.0,188,163,86.7,124,82,66.1,83,54,65.1,347,298,85.9,55,25,45.5,210,163,77.6,289,246,85.1,27,6,22.2,91,49,53.8,74,30,40.5,5,3,60.0,14,20,50,112,4,4


In [134]:
# # 'home_team_result' 무 -> 승,패로 업데이트, 추가적인 기준 필요함
# df.loc[(df['home_team_result'] == '무') & (df['home_team_shots_on_target'] >= df['away_team_shots_on_target']), 'home_team_result'] = '승'
# df.loc[(df['home_team_result'] == '무') & (df['home_team_shots_on_target'] < df['away_team_shots_on_target']), 'home_team_result'] = '패'

## Feature function

In [135]:
# feature fun
def team_encoding(train):
    train['home_win'] = train['home_team_result'].apply(lambda x: 1 if x=='승' else 0) # home_win 열 추가, 승리인 경우 1, 아닌 경우 0
    dic = {}
    # 각 홈팀별 이긴 경기 수를 딕셔너리에 저장
    for team in train['home_team_name'].unique():
        value = train[train['home_team_name'] == team]['home_win'].sum() 
        #home_team_name  열에서 고유한 팀 이름을 가져와 각 팀이 홈에서 이긴 경기 수 계산, 이 값을 dic에 저장
        dic[team] = value

    label_dic={}
    # 승리 횧수를 기준으로 오름차순 정렬, 각 팀에 대해 라벨 부여, 승리 횟수가 적은 팀부터 0,1,2 의 라벨을 부여
    for idx, (team, _) in enumerate(sorted(dic.items(), key= lambda x: x[1])):
        label_dic[team] = idx
    
    return label_dic


''' 홈팀 득점 이동평균 계산 함수 '''

def homeGoal_day_mean(train, test, day):
    train[f'home_Goal_{day}_mean'] = -1  # 초기값 -1로 설정
    test[f'home_Goal_{day}_mean'] = -1
    
    teams = train['home_team_name'].unique()
    for team in tqdm(teams): # train에서 고유 팀 이름을 가져오고 이를 시각적으로 표시해줌 : tqdm
        team_df = train[train['home_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day # 팀의 경기 수가 주어진 day 보다 적으면, 경기 수 만큼의 윈도우 크기 사용
        idx = team_df['home_team_goal_count'].rolling(ch_day).mean().index.values # 롤링 윈도우 평균 계산
        val = team_df['home_team_goal_count'].rolling(ch_day).mean().values
        train[f'home_Goal_{day}_mean'].loc[idx] = val
        test_idx = test[test['home_team_name'] == team].index
        test[f'home_Goal_{day}_mean'].loc[test_idx] = val[-1]
    # 결측값 처리
    train[f'home_Goal_{day}_mean'] = train[f'home_Goal_{day}_mean'].fillna(0)


''' 원정팀 득점 이동평균 계산 함수 '''

def awayGoal_day_mean(train, test, day):
    # 초기값 설정
    train[f'away_Goal_{day}_mean'] = -1
    test[f'away_Goal_{day}_mean'] = -1
    
    teams = train['away_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['away_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['away_team_goal_count'].rolling(ch_day).mean().index.values
        val = team_df['away_team_goal_count'].rolling(ch_day).mean().values
        train[f'away_Goal_{day}_mean'].loc[idx] = val
        test_idx = test[test['away_team_name'] == team].index
        test[f'away_Goal_{day}_mean'].loc[test_idx] = val[-1]
    # 결측값 처리
    train[f'away_Goal_{day}_mean'] = train[f'away_Goal_{day}_mean'].fillna(0)


# '''홈팀 승리율 평균 계산 함수'''

# def homeWin_day_mean(train, test, day):
#     train[f'home_winRate_{day}_mean'] = -1
#     test[f'home_winRate_{day}_mean'] = -1
#     train['win'] = train['home_team_result'].apply(lambda x: 1 if x == '승' else 0)

#     teams = train['home_team_name'].unique()
#     for team in tqdm(teams):
#         team_df = train[train['home_team_name'] == team]
#         ch_day = len(team_df) if len(team_df) < day else day
#         idx = team_df['win'].rolling(ch_day).mean().index.values
#         val = team_df['win'].rolling(ch_day).mean().values
#         train[f'home_winRate_{day}_mean'].loc[idx] = val
#         test_idx = test[test['home_team_name'] == team].index
#         test[f'home_winRate_{day}_mean'].loc[test_idx] = val[-1]

#     train.drop(columns=['win'], inplace=True)

#     train[f'home_winRate_{day}_mean'] = train[f'home_winRate_{day}_mean'].fillna(0)


# '''원정팀 승리율 평균 계산 함수'''

# def awayWin_day_mean(train, test, day):

#     train[f'away_winRate_{day}_mean'] = -1
#     test[f'away_winRate_{day}_mean'] = -1
#     train['win'] = train['home_team_result'].apply(lambda x: 1 if x == '패' else 0)
    
#     teams = train['away_team_name'].unique()
#     for team in tqdm(teams):
#         team_df = train[train['away_team_name'] == team]

#         ch_day = len(team_df) if len(team_df) < day else day
#         idx = team_df['win'].rolling(ch_day).mean().index.values
#         val = team_df['win'].rolling(ch_day).mean().values
#         train[f'away_winRate_{day}_mean'].loc[idx] = val
#         test_idx = test[test['away_team_name'] == team].index
#         test[f'away_winRate_{day}_mean'].loc[test_idx] = val[-1]

#     train.drop(columns=['win'], inplace=True)

#     train[f'away_winRate_{day}_mean'] = train[f'away_winRate_{day}_mean'].fillna(0)


'''홈팀 평균 계산 함수'''

def home_day_mean(train, test, columns, day):
    for column in tqdm(columns):
        teams = train['home_team_name'].values
        train[f'home_{column}_{day}_mean'] = -1
        test[f'home_{column}_{day}_mean'] = -1

        for team in tqdm(teams):
            team_df = train[train['home_team_name'] == team]
            idx = team_df[column].rolling(day).mean().index.values
            val = team_df[column].rolling(day).mean().values
            train[f'home_{column}_{day}_mean'].loc[idx] = val
            test_idx = test[test['home_team_name'] == team].index
            test[f'home_{column}_{day}_mean'].loc[test_idx] = val[-1]

        train[f'home_{column}_{day}_mean'] = train[f'home_{column}_{day}_mean'].fillna(0)
        test[f'home_{column}_{day}_mean'] = test[f'home_{column}_{day}_mean'].fillna(0)


'''원정팀 평균 계산 함수'''

def away_day_mean(train, test, columns, day):
    for column in tqdm(columns):
        teams = train['away_team_name'].values
        train[f'away_{column}_{day}_mean'] = -1
        test[f'away_{column}_{day}_mean'] = -1

        for team in tqdm(teams):
            team_df = train[train['away_team_name'] == team]
            idx = team_df[column].rolling(day).mean().index.values
            val = team_df[column].rolling(day).mean().values
            train[f'away_{column}_{day}_mean'].loc[idx] = val
            test_idx = test[test['away_team_name'] == team].index
            test[f'away_{column}_{day}_mean'].loc[test_idx] = val[-1]

        train[f'away_{column}_{day}_mean'] = train[f'away_{column}_{day}_mean'].fillna(0)
        test[f'away_{column}_{day}_mean'] = test[f'away_{column}_{day}_mean'].fillna(0)


'''전처리 함수'''

def preprocessing(train, test):
    # 년과 월일로 나누기
    train['date_GMT'] = train['date_GMT'].dt.strftime('%Y%m%d')
    train['year'] = train['date_GMT'].apply(lambda x : int(x[0:4]))
    train['date'] = train['date_GMT'].apply(lambda x : int(x[4:10]))
    
    test['date_GMT'] = test['date_GMT'].dt.strftime('%Y%m%d')
    test['year'] = test['date_GMT'].apply(lambda x : int(x[0:4]))
    test['date'] = test['date_GMT'].apply(lambda x : int(x[4:10]))

    # train.drop(columns=['date_GMT'], inplace=True)
    # test.drop(columns=['date_GMT'], inplace=True)

    # # 팀 인코딩 적용   # 위에서 적용 했음
    # label_dic = dic
    # train['home_team_name'] = train['home_team_name'].apply(lambda x: label_dic[x])
    # train['away_team_name'] = train['away_team_name'].apply(lambda x: label_dic[x])
    # test['home_team_name'] = test['home_team_name'].apply(lambda x: label_dic[x])
    # test['away_team_name'] = test['away_team_name'].apply(lambda x: label_dic[x])

    # # 5일간 홈팀 승리 비율 계산    ### 이거 쓰레기인듯
    # homeWin_day_mean(train, test, 5)
    # # 5일간 원정팀 승리 비율 계산
    # awayWin_day_mean(train, test, 5)

    # 5일간 홈팀 평균 득점 계산
    homeGoal_day_mean(train, test, 5)

    # 5일간 원정팀 평균 득점 계산
    awayGoal_day_mean(train, test, 5)

    # 불필요한 컬럼 제거
    # train = train.drop(columns=['index','home_team_goal_count','away_team_goal_count','game_points'])
    # test = test.drop(columns=['index','home_team_goal_count','away_team_goal_count','home_team_result','game_points'])

    train = train.drop(columns=['index','home_team_goal_count','away_team_goal_count'])
    test = test.drop(columns=['index','home_team_goal_count','away_team_goal_count','home_team_result'])

    return train, test


## Preprocessing

In [136]:
result_label = {
    '승' : 0,
    '패' : 1, 
    '무' : 2,} 
    
X = df.drop(columns=['away_team_possession'])
X['home_team_result'] = X['home_team_result'].map(result_label)
y = X['home_team_result']
# split_index = 1796 # 2021년도까지의 index
# train = X.iloc[:split_index]
# test = X.iloc[split_index:]
# y_train = y.iloc[:split_index]
# y_test = y.iloc[split_index:]

# team_name 인코딩
cat = ['home_team_name','away_team_name']
le = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=int)
X[cat] = le.fit_transform(X[cat])

# # 승무패 인코딩
# lec = LabelEncoder()
# lec.fit(X['home_team_result'])
# y = lec.transform(y)

## Train / Test

In [137]:
# Train,Test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) ## 이 방법에 의문을 품는 바

X_train, X_val= preprocessing(X_train, X_val)
X_val_idx = X_val.index.values
print(X_train.shape, X_val.shape)

100%|██████████| 14/14 [00:00<00:00, 629.65it/s]
100%|██████████| 14/14 [00:00<00:00, 574.70it/s]

(547, 132) (137, 131)





## Feature add

In [138]:
# 홈팀과 원정팀의 공격 효율성을 계산한 피쳐 생성
X_train['home_attack_efficiency'] = X_train['home_Goal_5_mean'] * X_train['home_team_shots_on_target']
X_train['away_attack_efficiency'] = X_train['away_Goal_5_mean'] * X_train['away_team_shots_on_target']

# 홈팀과 원정팀의 공격 효율성 차이를 나타내는 피쳐 생성
X_train['attack_efficiency_difference'] = X_train['home_attack_efficiency'] - X_train['away_attack_efficiency']

# 홈팀과 원정팀의 전반 골 수 차이를 나타내는 피쳐 생성
X_train['goal_count_diff'] = X_train['home_team_goal_count_half_time'] - X_train['away_team_goal_count_half_time']

# 최근 5경기 평균 득점의 표준 편차를 나타내는 피쳐 생성
X_train['home_Goal_5_std'] = X_train['home_Goal_5_mean'].rolling(window=5).std()
X_train['away_Goal_5_std'] = X_train['away_Goal_5_mean'].rolling(window=5).std()

# 결측값을 0으로 대체
X_train = X_train.fillna(0)

# 테스트 데이터에도 동일한 피쳐 생성
X_val['home_attack_efficiency'] = X_val['home_Goal_5_mean'] * X_val['home_team_shots_on_target']
X_val['away_attack_efficiency'] = X_val['away_Goal_5_mean'] * X_val['away_team_shots_on_target']
X_val['attack_efficiency_difference'] = X_val['home_attack_efficiency'] - X_val['away_attack_efficiency']
X_val['goal_count_diff'] = X_val['home_team_goal_count_half_time'] - X_val['away_team_goal_count_half_time']
X_val['home_Goal_5_std'] = X_val['home_Goal_5_mean'].rolling(window=5).std()
X_val['away_Goal_5_std'] = X_val['away_Goal_5_mean'].rolling(window=5).std()
X_val = X_val.fillna(0)

# 학습 데이터에서 목표 변수 'home_team_result' 컬럼 제거
X_train.drop(columns = ['home_team_result'], inplace = True)

## Feature scaler

In [139]:
# scaler
cat = ['home_team_name','away_team_name']

num_features = list(set(X_train.columns) - set(cat))
# scaler = MinMaxScaler()
scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_val[num_features] = scaler.transform(X_val[num_features])

## Over-Sampling

In [140]:
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
import pandas as pd
from collections import Counter

# SMOTE 객체 생성 (각 클래스의 샘플 수를 1000개로 설정)
smote = SMOTE(sampling_strategy={0: 10000, 1: 10000, 2 : 10000}, random_state=42)

# SMOTE-Tomek 객체 생성
smote_tomek = SMOTETomek(smote=smote, random_state=42)

# 오버샘플링 및 언더샘플링 적용
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

# 각 클래스 비율 확인
print(f"Resampled class distribution: {Counter(y_train_resampled)}")
print(f"Original training set shape: {X_train.shape}")
print(f"Resampled training set shape: {X_train_resampled.shape}")

Resampled class distribution: Counter({0: 10000, 2: 10000, 1: 10000})
Original training set shape: (547, 137)
Resampled training set shape: (30000, 137)


In [141]:
X_train_resampled['home_team_result'] = y_train_resampled
X_val['home_team_result'] = y_val

In [142]:
# 'home_team_result' 무 -> 승,패로 업데이트, 추가적인 기준 필요함
X_train_resampled.loc[(X_train_resampled['home_team_result'] == 2) & (X_train_resampled['home_team_shots_on_target'] >= X_train_resampled['away_team_shots_on_target']), 'home_team_result'] = 0
X_train_resampled.loc[(X_train_resampled['home_team_result'] == 2) & (X_train_resampled['home_team_shots_on_target'] < X_train_resampled['away_team_shots_on_target']), 'home_team_result'] = 1
# 'home_team_result' 무 -> 승,패로 업데이트, 추가적인 기준 필요함
X_val.loc[(X_val['home_team_result'] == 2) & (X_val['home_team_shots_on_target'] >= X_val['away_team_shots_on_target']), 'home_team_result'] = 0
X_val.loc[(X_val['home_team_result'] == 2) & (X_val['home_team_shots_on_target'] < X_val['away_team_shots_on_target']), 'home_team_result'] = 1

In [143]:
# target 값 배정
y_train_resampled = X_train_resampled['home_team_result']
y_val = X_val['home_team_result']
# target 값 다시 삭제
X_train_resampled.drop(columns='home_team_result', inplace = True)
X_val.drop(columns='home_team_result', inplace = True)

---

## automl

In [144]:
# from pycaret.classification import *

# setup_clf = setup(data = X_train, target = y_train, session_id = 42)
# model = compare_models(sort = 'Accuracy', fold = 5)

In [145]:
# best_tune = tune_model(model)

In [146]:
# evaluate_model(best_tune)

---

## Shap

### xgb

In [163]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
import shap
SHAP_THRESHOLD = 0.2

# feature_names dimension 조정
X_train_col = X_train.columns
feature_names = X_train_col.to_numpy()

# 모델 학습
model = xgb.XGBClassifier().fit(X_train_resampled, y_train_resampled)

# 모델 예측 및 평가
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

# SHAP 값 계산
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val)

# SHAP 값 요약
if isinstance(shap_values, list):  # shap_values가 리스트일 경우 (XGBoost >= 1.0.0)
    shap_values = shap_values[1]

shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({'column_name': feature_names, 'shap_importance': shap_sum})
importance_df = importance_df.sort_values('shap_importance', ascending=False)

# 중요도 임계값 적용 (선택 사항)
importance_df_filtered = importance_df[importance_df['shap_importance'] > SHAP_THRESHOLD]
print("Filtered SHAP Importances:\n", importance_df_filtered)

Validation Accuracy: 0.8248175182481752
Filtered SHAP Importances:
                         column_name  shap_importance
7         home_team_shots_on_target         1.767909
133    attack_efficiency_difference         1.758972
8         away_team_shots_on_target         1.490812
134                 goal_count_diff         1.302669
22                adidas_point_home         1.129522
23                adidas_point_away         1.096887
132          away_attack_efficiency         0.273138
17   away_team_goal_count_half_time         0.272126
67                      홈_경합 지상 성공%         0.265457
9                   home_team_fouls         0.225077


In [164]:
# 지정된(SHAP_THRESHOLD) Shap feature 중요도 이상인 것만 선택
features_selected = importance_df.query('shap_importance > @SHAP_THRESHOLD').column_name.tolist()
shap_xgb_X_train_resampled = X_train_resampled[features_selected]
shap_xgb_X_val = X_val[features_selected]

---

### MLP

In [186]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# 클래스 수 (이진 분류의 경우 1로 설정)
num_classes = 1

# 모델 정의
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='PReLU', input_shape=(shap_xgb_X_train_resampled.shape[1],)),
    tf.keras.layers.Dropout(0.6),  # 드롭아웃 비율을 60%로 증가
    tf.keras.layers.Dense(128, activation='PReLU'),
    tf.keras.layers.Dropout(0.6),  # 드롭아웃 추가
    tf.keras.layers.Dense(64, activation='PReLU'),
    tf.keras.layers.Dropout(0.6),  # 드롭아웃 추가
    tf.keras.layers.Dense(32, activation='PReLU'),
    tf.keras.layers.Dense(num_classes, activation='sigmoid')  # 이진 분류를 위한 시그모이드 출력
])

# 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 조기 종료 콜백
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 레이블을 원-핫 인코딩에서 이진 레이블로 변경
y_train_resampled_binary = y_train_resampled  # 이진 분류이므로 원-핫 인코딩 필요 없음
y_val_binary = y_val  # 이진 분류이므로 원-핫 인코딩 필요 없음

# 모델 학습
model.fit(shap_xgb_X_train_resampled, y_train_resampled_binary, epochs=50, validation_data=(shap_xgb_X_val, y_val_binary),  callbacks=[early_stopping])

# 모델 평가
test_loss, test_acc = model.evaluate(shap_xgb_X_val, y_val_binary)
print(f"테스트 정확도: {test_acc}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
테스트 정확도: 0.8540145754814148


---

### lgbm

In [None]:
# # LightGBM 모델 학습
# model = lgb.LGBMClassifier().fit(X_train_resampled, y_train_resampled)
# SHAP_THRESHOLD = 0.1

# # 모델 예측 및 평가
# y_pred = model.predict(X_val)
# print("Validation Accuracy:", accuracy_score(y_val, y_pred))

# # SHAP 값 계산
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(X_val)

# shap_sum = np.abs(shap_values).mean(axis=1)[1,:]
# importance_df = pd.DataFrame([X_val.columns.tolist(), shap_sum.tolist()]).T
# importance_df.columns = ['column_name', 'shap_importance']
# importance_df = importance_df.sort_values('shap_importance', ascending=False);
# importance_df

# # SHAP 값 데이터프레임 생성 (각 피쳐별 SHAP 값)
# shap_values_df = pd.DataFrame(shap_values[1], columns=X_val.columns)
# shap_values_df

# # SHAP 값의 평균 절대값 계산
# shap_abs_mean = pd.DataFrame(shap_values[1], columns=X_val.columns).abs().mean().sort_values(ascending=False)
# # 중요도 임계값 적용 (선택 사항)
# importance_df_filtered = importance_df[importance_df['shap_importance'] > SHAP_THRESHOLD]
# print("Filtered SHAP Importances:\n", importance_df_filtered)
# # # SHAP 값 평균 절대값 시각화
# # plt.figure(figsize=(10, 8))
# # shap_abs_mean.plot(kind='barh')
# # plt.title("Mean Absolute SHAP Values for Features")
# # plt.xlabel("Mean Absolute SHAP Value")
# # plt.ylabel("Features")
# # plt.gca().invert_yaxis()
# # plt.show()

In [None]:
# # 지정된(SHAP_THRESHOLD) Shap feature 중요도 이상인 것만 선택
# features_selected = importance_df.query('shap_importance > @SHAP_THRESHOLD').column_name.tolist()
# shap_lgbm_X_train_resampled = X_train_resampled[features_selected]
# shap_lgbm_X_val = X_val[features_selected]

## Oputna

### lgbm

In [None]:
# import optuna
# from sklearn.model_selection import cross_val_score
# from lightgbm import LGBMClassifier

# def lgbm_objective(trial):
#     # 하이퍼파라미터 범위 설정
#     max_depth = trial.suggest_int('max_depth', 3, 7)
#     learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
#     n_estimators = trial.suggest_int('n_estimators', 50, 200)
#     subsample = trial.suggest_float('subsample', 0.5, 0.9)
#     colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 0.9)
#     min_child_weight = trial.suggest_int('min_child_weight', 4, 10)
#     reg_alpha = trial.suggest_float('reg_alpha', 0, 10)  # 추가: L2 정규화
#     reg_lambda = trial.suggest_float('reg_lambda', 0, 10)  # 추가: L1 정규화
    
#     # LGBMClassifier 모델 정의
#     clf = LGBMClassifier(
#         max_depth=max_depth,
#         learning_rate=learning_rate,
#         n_estimators=n_estimators,
#         subsample=subsample,
#         colsample_bytree=colsample_bytree,
#         min_child_weight=min_child_weight,
#         reg_alpha=reg_alpha,  # 추가
#         reg_lambda=reg_lambda,  # 추가
#         force_col_wise=True
#     )
    
#     # 교차 검증 점수 계산
#     scores = cross_val_score(clf, shap_lgbm_X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
    
#     return scores.mean()

# # Optuna 스터디 생성 및 최적화
# lgbm_study = optuna.create_study(direction='maximize')
# lgbm_study.optimize(lgbm_objective, n_trials=5)

# # 최적 하이퍼파라미터 출력
# lgbm_best_params = lgbm_study.best_params
# print('최적화된 하이퍼파라미터:', lgbm_best_params)
# print('최적 교차 검증 점수:', lgbm_study.best_value)


In [None]:
# # LGBMClassifier 모델 정의 및 학습
# model_logis = LGBMClassifier(**lgbm_best_params)
# model_logis.fit(shap_lgbm_X_train_resampled, y_train_resampled)

# # 예측 수행
# y_pred = model_logis.predict(shap_lgbm_X_val)

# # 정확도 계산
# accuracy = accuracy_score(y_val, y_pred)
# print("Accuracy =", accuracy)

### xgb

In [None]:
# xgboostclassifier
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

def xgb_objective(trial):
    # 하이퍼파라미터 범위 설정
    max_depth = trial.suggest_int('max_depth', 3, 7)  # max_depth의 범위를 줄임
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)  # n_estimators의 범위를 줄임
    subsample = trial.suggest_float('subsample', 0.5, 0.9)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 0.9)
    min_child_weight = trial.suggest_int('min_child_weight', 4, 10)  # 추가
    gamma = trial.suggest_float('gamma', 0, 5)  # 추가
    
    # XGBClassifier 모델 정의
    clf = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,  # 추가
        gamma=gamma,  # 추가
        use_label_encoder=False,
        eval_metric='logloss'
    )
    
    # 교차 검증 점수 계산
    scores = cross_val_score(clf, shap_xgb_X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
    
    return scores.mean()

# Optuna 스터디 생성 및 최적화
xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(xgb_objective, n_trials=50)  # 최적화 반복 횟수는 필요에 따라 조절

# 최적 하이퍼파라미터 출력
xgb_best_params = xgb_study.best_params
print(' ')
print(xgb_study.best_value)
print(xgb_best_params)

In [None]:
model_logis=xgb.XGBClassifier(**xgb_best_params)
model_logis.fit(shap_xgb_X_train_resampled, y_train_resampled)
y_pred = model_logis.predict(shap_xgb_X_val)

accuracy = accuracy_score(y_val, y_pred)
print("Accuracy=", accuracy)

### Logistic

In [None]:
# # LogisticRegression
# def logreg_objective(trial):

#     r = trial.suggest_float('l1_ratio', 0.3, 0.8, log=False)  # 범위를 0.1에서 0.9로 좁힘
#     c = trial.suggest_float('C', 1e-4, 1e2, log=True)
#     max_iter = trial.suggest_int('max_iter', 500, 2000, step=500)  # max_iter 튜닝 추가
    
#     clf =  LogisticRegression(max_iter=max_iter, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)
#     scores = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
    
#     return scores.mean()
    
# logreg_study = optuna.create_study(direction='maximize')
# logreg_study.optimize(logreg_objective, n_trials=20)

# logreg_best_params = logreg_study.best_params

In [None]:
# model_logis=LogisticRegression(**logreg_best_params)
# model_logis.fit(X_train_resampled,y_train_resampled)
# y_pred = model_logis.predict(X_val)

# accuracy = accuracy_score(y_val, y_pred)
# print("Accuracy:", accuracy)

## Ridge

In [None]:
# from sklearn.linear_model import RidgeClassifier
# from sklearn.preprocessing import label_binarize

# # Ridge Classifier의 목적 함수
# def ridge_classifier_objective(trial):

#     alpha = trial.suggest_float('alpha', 1e-5, 1e3, log=True)
#     clf = RidgeClassifier(alpha=alpha)
    
#     # 교차 검증을 통한 모델 평가 (AUC 스코어)
#     # 다중 클래스의 경우, 'ovr' 또는 'ovo' 스키마를 사용
#     # if len(set(y_train_resampled)) > 2:
#     #     scoring = 'roc_auc_ovr'
#     # else:
#     #     scoring = 'roc_auc'
    
#     scores = cross_val_score(clf, shap_xgb_X_train_resampled, y_train_resampled, cv=6, scoring='roc_auc')
#     return scores.mean()

# ridge_classifier_study = optuna.create_study(direction='maximize')
# ridge_classifier_study.optimize(ridge_classifier_objective, n_trials=200)
# ridge_classifier_best_params = ridge_classifier_study.best_params

In [None]:
# # 최적의 하이퍼파라미터로 최종 모델 학습 및 평가
# # model_ridge_classifier = RidgeClassifier(**ridge_classifier_best_params)
# model_ridge_classifier = RidgeClassifier()
# model_ridge_classifier.fit(X_train_resampled, y_train_resampled)

# # 검증 데이터로 예측
# y_pred = model_ridge_classifier.predict(X_val)

# # 정확도 계산
# accuracy = accuracy_score(y_val, y_pred)
# print("Accuracy:", accuracy)

In [None]:
# # 최적의 하이퍼파라미터로 최종 모델 학습 및 평가
# model_ridge_classifier = RidgeClassifier(**ridge_classifier_best_params)
# model_ridge_classifier.fit(shap_xgb_X_train_resampled, y_train_resampled)

# # 검증 데이터로 예측
# y_pred = model_ridge_classifier.predict(shap_xgb_X_val)
# y_pred_proba = model_ridge_classifier.decision_function(shap_xgb_X_val)

# # 다중 클래스의 경우 ROC AUC 스코어 계산
# if len(set(y_val)) > 2:
#     y_val_bin = label_binarize(y_val, classes=list(set(y_val)))
#     auc_score = roc_auc_score(y_val_bin, y_pred_proba, multi_class='ovr')
# else:
#     auc_score = roc_auc_score(y_val, y_pred_proba)

# print("AUC Score:", auc_score)

---

## Randomforest feature selection  큰 효과가 없는 듯 함

In [None]:
# model = RandomForestClassifier()
# model.fit(train_resampled, y_train_resampled)
# y_pred = model.predict(test)

# rn_features = []
# importances = model.feature_importances_
# feature_names = train.columns

# # 피처 중요도를 기준으로 정렬하여 상위 피처 선택
# indices = np.argsort(importances)[::-1]

# # 중요도가 0.01 이상인 피처만 선택
# # top_number = 40
# top_num_indices = [idx for idx in indices if importances[idx] >= 0.005] #[:top_number]
# top_features = feature_names[top_num_indices]

# for i, feature in enumerate(top_features):
#     print(f"{i+1}. {feature} (중요도: {importances[top_num_indices[i]]})")
#     rn_features.append(feature)

# rn_train_resampled = train_resampled[rn_features]
# rn_test = test[rn_features]

# model_logis=LogisticRegression(**logreg_best_params)
# model_logis.fit(rn_train_resampled,y_train_resampled)
# y_pred = model_logis.predict(rn_test)

# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

## 상관계수 계산

In [None]:
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
corr_number = 0.9
to_drop = [column for column in upper.columns if any(upper[column] > corr_number)]
to_drop
# # 특징 제거
corr_X_train_resampled = X_train_resampled.drop(columns=to_drop)  
corr_X_val = X_val.drop(columns=to_drop)

In [None]:
def logreg_objective(trial):
    
    r = trial.suggest_float('l1_ratio', 0, 1, log=False)
    c = trial.suggest_float('C', 1e-4, 1e2, log=True)
     
    clf =  LogisticRegression(max_iter=5000, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)
    scores = cross_val_score(clf, corr_X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
    
    return scores.mean()
    
logreg_study = optuna.create_study(direction='maximize')
logreg_study.optimize(logreg_objective, n_trials=5)

logreg_best_params = logreg_study.best_params


In [None]:
model_logis=LogisticRegression(**logreg_best_params)
model_logis.fit(corr_X_train_resampled,y_train_resampled)
y_pred = model_logis.predict(corr_X_val)

accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

## L1 규제(Lasso)

In [None]:
# alpha 값 후보군 설정
alpha_values = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 1, 10, 100]
param_grid = {'alpha': alpha_values}

# Lasso 모델과 GridSearchCV 설정
lasso = Lasso()
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')

# 최적의 alpha 값 찾기
grid_search.fit(X_train_resampled, y_train_resampled)
best_alpha = grid_search.best_params_['alpha']
best_alpha

In [None]:
lasso = Lasso(alpha=best_alpha)  # 위에서 나온 alpha 값으로 조정한 거임
lasso.fit(X_train_resampled, y_train_resampled)

# 가중치가 0이 아닌 특징 선택
selected_features = X_train_resampled.columns[lasso.coef_ != 0]
selected_features

In [None]:
L1_X_train_resampled = X_train_resampled[selected_features]
L1_X_val = X_val[selected_features]

In [None]:
def logreg_objective(trial):
    
    r = trial.suggest_float('l1_ratio', 0, 1, log=False)
    c = trial.suggest_float('C', 1e-4, 1e2, log=True)
     
    clf =  LogisticRegression(max_iter=5000, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)
    scores = cross_val_score(clf, L1_X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
    
    return scores.mean()
    
logreg_study = optuna.create_study(direction='maximize')
logreg_study.optimize(logreg_objective, n_trials=5)

logreg_best_params = logreg_study.best_params

## model select

# 2013~2023  
### 많긴 하지만 정확도 떨어질 것으로 예상됨

# 2020~2023
### 데이터 수가 급격히 줄어듬