In [1]:
# Library
import os
import random
import pickle
import gc
import warnings
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib import font_manager, rc
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.cluster import KMeans, MiniBatchKMeans
# from category_encoders import TargetEncoder
from sklearn.preprocessing import (
    StandardScaler, PowerTransformer, OrdinalEncoder,
    OneHotEncoder, FunctionTransformer, PolynomialFeatures, LabelEncoder, MinMaxScaler
)
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.feature_selection import SelectKBest, SelectPercentile, RFE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import (
    LogisticRegression, LinearRegression, Ridge, Lasso,
    SGDRegressor, ElasticNet
)
from sklearn.model_selection import (
    train_test_split, cross_val_score, cross_validate,
    GridSearchCV, KFold, cross_val_predict
)
from sklearn.metrics import (
    roc_auc_score, mean_squared_error, make_scorer, accuracy_score, log_loss
)
from sklearn import set_config, datasets
from catboost import (
    CatBoostRegressor, CatBoostClassifier,
)
# import category_encoders as ce
# from sklearn.pipeline import (
#     Pipeline, FeatureUnion, make_pipeline
# )
from sklearn.ensemble import (
    RandomForestClassifier, StackingClassifier, StackingRegressor,
    GradientBoostingRegressor, VotingClassifier, VotingRegressor,
    HistGradientBoostingRegressor, GradientBoostingClassifier,
    BaggingClassifier, AdaBoostClassifier, RandomForestRegressor,ExtraTreesRegressor
)
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.svm import SVC, SVR, LinearSVC
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons

import xgboost as xgb
import lightgbm as lgb
import re
import math
import optuna

from scipy.stats import zscore

%matplotlib inline

warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)
pd.set_option('display.max_columns', None)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
df = pd.read_pickle('matches_df.pkl')

## Preprocessing

In [3]:
X = df
y = df['home_team_result']

split_index = 2024 # 2021년도까지의 index

train = X.iloc[:split_index]
test = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]


# train, test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) ## 이 방법에 의문을 품는 바이요!!

In [4]:
def team_encoding(train):
    train['home_win'] = train['home_team_result'].apply(lambda x: 1 if x=='승' else 0) # home_win 열 추가, 승리인 경우 1, 아닌 경우 0
    dic = {}
    # 각 홈팀별 이긴 경기 수를 딕셔너리에 저장
    for team in train['home_team_name'].unique():
        value = train[train['home_team_name'] == team]['home_win'].sum() 
        #home_team_name  열에서 고유한 팀 이름을 가져와 각 팀이 홈에서 이긴 경기 수 계산, 이 값을 dic에 저장
        dic[team] = value

    label_dic={}
    # 승리 횧수를 기준으로 오름차순 정렬, 각 팀에 대해 라벨 부여, 승리 횟수가 적은 팀부터 0,1,2 의 라벨을 부여
    for idx, (team, _) in enumerate(sorted(dic.items(), key= lambda x: x[1])):
        label_dic[team] = idx
    
    return label_dic


''' 홈팀 득점 이동평균 계산 함수 '''

def homeGoal_day_mean(train, test, day):
    train[f'home_Goal_{day}_mean'] = -1  # 초기값 -1로 설정
    test[f'home_Goal_{day}_mean'] = -1
    
    teams = train['home_team_name'].unique()
    for team in tqdm(teams): # train에서 고유 팀 이름을 가져오고 이를 시각적으로 표시해줌 : tqdm
        team_df = train[train['home_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day # 팀의 경기 수가 주어진 day 보다 적으면, 경기 수 만큼의 윈도우 크기 사용
        idx = team_df['home_team_goal_count'].rolling(ch_day).mean().index.values # 롤링 윈도우 평균 계산
        val = team_df['home_team_goal_count'].rolling(ch_day).mean().values
        train[f'home_Goal_{day}_mean'].loc[idx] = val
        test_idx = test[test['home_team_name'] == team].index
        test[f'home_Goal_{day}_mean'].loc[test_idx] = val[-1]
    # 결측값 처리
    train[f'home_Goal_{day}_mean'] = train[f'home_Goal_{day}_mean'].fillna(0)


''' 원정팀 득점 이동평균 계산 함수 '''

def awayGoal_day_mean(train, test, day):
    # 초기값 설정
    train[f'away_Goal_{day}_mean'] = -1
    test[f'away_Goal_{day}_mean'] = -1
    
    teams = train['away_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['away_team_name'] == team]
        # 롤링 윈도우 크기 설정
        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['away_team_goal_count'].rolling(ch_day).mean().index.values
        val = team_df['away_team_goal_count'].rolling(ch_day).mean().values
        train[f'away_Goal_{day}_mean'].loc[idx] = val
        test_idx = test[test['away_team_name'] == team].index
        test[f'away_Goal_{day}_mean'].loc[test_idx] = val[-1]
    # 결측값 처리
    train[f'away_Goal_{day}_mean'] = train[f'away_Goal_{day}_mean'].fillna(0)


'''홈팀 승리율 평균 계산 함수'''

def homeWin_day_mean(train, test, day):
    train[f'home_winRate_{day}_mean'] = -1
    test[f'home_winRate_{day}_mean'] = -1
    train['win'] = train['home_team_result'].apply(lambda x: 1 if x == '승' else 0)

    teams = train['home_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['home_team_name'] == team]
        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['win'].rolling(ch_day).mean().index.values
        val = team_df['win'].rolling(ch_day).mean().values
        train[f'home_winRate_{day}_mean'].loc[idx] = val
        test_idx = test[test['home_team_name'] == team].index
        test[f'home_winRate_{day}_mean'].loc[test_idx] = val[-1]

    train.drop(columns=['win'], inplace=True)

    train[f'home_winRate_{day}_mean'] = train[f'home_winRate_{day}_mean'].fillna(0)


'''원정팀 승리율 평균 계산 함수'''

def awayWin_day_mean(train, test, day):

    train[f'away_winRate_{day}_mean'] = -1
    test[f'away_winRate_{day}_mean'] = -1
    train['win'] = train['home_team_result'].apply(lambda x: 1 if x == '패' else 0)
    
    teams = train['away_team_name'].unique()
    for team in tqdm(teams):
        team_df = train[train['away_team_name'] == team]

        ch_day = len(team_df) if len(team_df) < day else day
        idx = team_df['win'].rolling(ch_day).mean().index.values
        val = team_df['win'].rolling(ch_day).mean().values
        train[f'away_winRate_{day}_mean'].loc[idx] = val
        test_idx = test[test['away_team_name'] == team].index
        test[f'away_winRate_{day}_mean'].loc[test_idx] = val[-1]

    train.drop(columns=['win'], inplace=True)

    train[f'away_winRate_{day}_mean'] = train[f'away_winRate_{day}_mean'].fillna(0)


'''홈팀 평균 계산 함수'''

def home_day_mean(train, test, columns, day):
    for column in tqdm(columns):
        teams = train['home_team_name'].values
        train[f'home_{column}_{day}_mean'] = -1
        test[f'home_{column}_{day}_mean'] = -1

        for team in tqdm(teams):
            team_df = train[train['home_team_name'] == team]
            idx = team_df[column].rolling(day).mean().index.values
            val = team_df[column].rolling(day).mean().values
            train[f'home_{column}_{day}_mean'].loc[idx] = val
            test_idx = test[test['home_team_name'] == team].index
            test[f'home_{column}_{day}_mean'].loc[test_idx] = val[-1]

        train[f'home_{column}_{day}_mean'] = train[f'home_{column}_{day}_mean'].fillna(0)
        test[f'home_{column}_{day}_mean'] = test[f'home_{column}_{day}_mean'].fillna(0)


'''원정팀 평균 계산 함수'''

def away_day_mean(train, test, columns, day):
    for column in tqdm(columns):
        teams = train['away_team_name'].values
        train[f'away_{column}_{day}_mean'] = -1
        test[f'away_{column}_{day}_mean'] = -1

        for team in tqdm(teams):
            team_df = train[train['away_team_name'] == team]
            idx = team_df[column].rolling(day).mean().index.values
            val = team_df[column].rolling(day).mean().values
            train[f'away_{column}_{day}_mean'].loc[idx] = val
            test_idx = test[test['away_team_name'] == team].index
            test[f'away_{column}_{day}_mean'].loc[test_idx] = val[-1]

        train[f'away_{column}_{day}_mean'] = train[f'away_{column}_{day}_mean'].fillna(0)
        test[f'away_{column}_{day}_mean'] = test[f'away_{column}_{day}_mean'].fillna(0)


'''전처리 함수'''

def preprocessing(train, test, dic):
    # 년과 월일로 나누기
    train['date_GMT'] = train['date_GMT'].dt.strftime('%Y%m%d')
    test['date_GMT'] = test['date_GMT'].dt.strftime('%Y%m%d')
    train['year'] = train['date_GMT'].apply(lambda x : int(x[0:4]))
    train['month'] = train['date_GMT'].apply(lambda x : int(x[5:7]))
    # train['day'] = train['date_GMT'].apply(lambda x : int(x[8:10]))

    test['year'] = test['date_GMT'].apply(lambda x : int(x[0:4]))
    test['month'] = test['date_GMT'].apply(lambda x : int(x[5:7]))
    # test['day'] = test['date_GMT'].apply(lambda x : int(x[8:10]))
    train.drop(columns=['date_GMT'], inplace=True)
    test.drop(columns=['date_GMT'], inplace=True)

    # 팀 인코딩 적용
    label_dic = dic
    train['home_team_name'] = train['home_team_name'].apply(lambda x: label_dic[x])
    train['away_team_name'] = train['away_team_name'].apply(lambda x: label_dic[x])
    test['home_team_name'] = test['home_team_name'].apply(lambda x: label_dic[x])
    test['away_team_name'] = test['away_team_name'].apply(lambda x: label_dic[x])

    # 5일간 홈팀 승리 비율 계산
    homeWin_day_mean(train, test, 5)
    # 5일간 원정팀 승리 비율 계산
    awayWin_day_mean(train, test, 5)

    # 5일간 홈팀 평균 득점 계산
    homeGoal_day_mean(train, test, 5)
    # 5일간 원정팀 평균 득점 계산
    awayGoal_day_mean(train, test, 5)

    # 불필요한 컬럼 제거
    # train = train.drop(columns=['home_win', 'index','home_team_goal_count','away_team_goal_count','home_team_result','game_points'])
    train = train.drop(columns=['home_win', 'index','home_team_goal_count','away_team_goal_count','game_points'])
    test = test.drop(columns=['index','home_team_goal_count','away_team_goal_count','home_team_result','game_points'])

    return train, test


## Train / Test

In [5]:
hometeam_list = list(train['home_team_name'].unique())
dic = team_encoding(train)
train, test= preprocessing(train, test, dic)
test_idx = test.index.values


# 승무패 인코딩
lec = LabelEncoder()
lec.fit(df['home_team_result'])
y_train = lec.transform(y_train)
y_test = lec.transform(y_test)

100%|██████████| 17/17 [00:00<00:00, 663.39it/s]
100%|██████████| 17/17 [00:00<00:00, 617.12it/s]
100%|██████████| 17/17 [00:00<00:00, 671.82it/s]
100%|██████████| 17/17 [00:00<00:00, 686.04it/s]


In [6]:
# 홈팀과 원정팀의 공격 효율성을 계산한 피쳐 생성
train['home_attack_efficiency'] = train['home_Goal_5_mean'] * train['home_team_shots_on_target']
train['away_attack_efficiency'] = train['away_Goal_5_mean'] * train['away_team_shots_on_target']
# 홈팀과 원정팀의 공격 효율성 차이를 나타내는 피쳐 생성
train['attack_efficiency_difference'] = train['home_attack_efficiency'] - train['away_attack_efficiency']

# 홈팀과 원정팀의 점유율 비율을 나타내는 피쳐 생성
train['possession_ratio'] = train['home_team_possession'] / train['away_team_possession']

# 최근 5경기 평균 득점의 표준 편차를 나타내는 피쳐 생성
train['home_Goal_5_std'] = train['home_Goal_5_mean'].rolling(window=5).std()
train['away_Goal_5_std'] = train['away_Goal_5_mean'].rolling(window=5).std()

# 결측값을 0으로 대체
train = train.fillna(0)

# 테스트 데이터에도 동일한 피쳐 생성
test['home_attack_efficiency'] = test['home_Goal_5_mean'] * test['home_team_shots_on_target']
test['away_attack_efficiency'] = test['away_Goal_5_mean'] * test['away_team_shots_on_target']
test['attack_efficiency_difference'] = test['home_attack_efficiency'] - test['away_attack_efficiency']
test['possession_ratio'] = test['home_team_possession'] / test['away_team_possession']
test['home_Goal_5_std'] = test['home_Goal_5_mean'].rolling(window=5).std()
test['away_Goal_5_std'] = test['away_Goal_5_mean'].rolling(window=5).std()
test = test.fillna(0)

# 학습 데이터에서 목표 변수 'home_team_result' 컬럼 제거
train.drop(columns = ['home_team_result'], inplace = True)


In [7]:
# import sweetviz as sv

# report = sv.analyze(train)
# report.show_html("new_features.html")

## Feature scaler

In [8]:
cat = ['home_team_name','away_team_name','year','month']

num_features = list(set(train.columns) - set(cat))
# scaler = MinMaxScaler()
scaler = StandardScaler()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.fit_transform(test[num_features])

## model train

In [9]:
def logreg_objective(trial):
    
    r = trial.suggest_float('l1_ratio', 0, 1, log=False)
    c = trial.suggest_float('C', 1e-4, 1e2, log=True)
     
    clf =  LogisticRegression(max_iter=5000, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)
    scores = cross_val_score(clf, train, y_train, cv=5, scoring='accuracy')
    
    return scores.mean()
    
logreg_study = optuna.create_study(direction='maximize')
logreg_study.optimize(logreg_objective, n_trials=20)

logreg_best_params = logreg_study.best_params

[I 2024-06-26 19:46:21,402] A new study created in memory with name: no-name-8954265a-264d-4ede-97c0-e8f527f7c969
[I 2024-06-26 19:46:50,275] Trial 0 finished with value: 0.6669991443588803 and parameters: {'l1_ratio': 0.9493245743449642, 'C': 0.14653781842432864}. Best is trial 0 with value: 0.6669991443588803.
[I 2024-06-26 19:47:20,317] Trial 1 finished with value: 0.6630509717638431 and parameters: {'l1_ratio': 0.9015387152063188, 'C': 1.2553922223118843}. Best is trial 0 with value: 0.6669991443588803.
[I 2024-06-26 19:47:51,128] Trial 2 finished with value: 0.6640386260848308 and parameters: {'l1_ratio': 0.7234999102627218, 'C': 26.678012580016507}. Best is trial 0 with value: 0.6669991443588803.
[I 2024-06-26 19:48:20,486] Trial 3 finished with value: 0.6660114900378927 and parameters: {'l1_ratio': 0.23650168966301022, 'C': 0.06754834723441912}. Best is trial 0 with value: 0.6669991443588803.
[I 2024-06-26 19:48:42,666] Trial 4 finished with value: 0.5800354479892433 and paramet

In [17]:
model_logis=LogisticRegression(**logreg_best_params)
model_logis=LogisticRegression()
model_logis.fit(train,y_train)
y_pred = model_logis.predict(test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5153508771929824


In [18]:
model = RandomForestClassifier()
model.fit(train, y_train)
# y_pred = model.predict(test)

## Randomforest feature seletion

In [12]:
rn_features = []
importances = model.feature_importances_
feature_names = train.columns

# 피처 중요도를 기준으로 정렬하여 상위 피처 선택
indices = np.argsort(importances)[::-1]

# 중요도가 0.01 이상인 피처만 선택
# top_number = 40            # 33개 안에서 고르면 될 듯
top_num_indices = [idx for idx in indices if importances[idx] >= 0.015] #[:top_number]
top_features = feature_names[top_num_indices]

for i, feature in enumerate(top_features):
    print(f"{i+1}. {feature} (중요도: {importances[top_num_indices[i]]})")
    rn_features.append(feature)

# 이 피쳐로 모델 돌려보기

1. attack_efficiency_difference (중요도: 0.07679128897986834)
2. away_winRate_5_mean (중요도: 0.05019924986932356)
3. home_winRate_5_mean (중요도: 0.04679307994708357)
4. 5_games_result (중요도: 0.04591759857746539)
5. home_attack_efficiency (중요도: 0.04551382741253214)
6. away_attack_efficiency (중요도: 0.04193029469524437)
7. away_team_goal_count_half_time (중요도: 0.03764227929481992)
8. home_team_goal_count_half_time (중요도: 0.033371662999508785)
9. home_team_shots_on_target (중요도: 0.030542073731065815)
10. away_Goal_5_std (중요도: 0.029906640239556196)
11. home_Goal_5_std (중요도: 0.029389806082744965)
12. home_team_possession (중요도: 0.028547386737770374)
13. month (중요도: 0.027859325396847733)
14. possession_ratio (중요도: 0.02777269516636719)
15. away_team_possession (중요도: 0.027206135857921927)
16. away_team_free_kick (중요도: 0.027019250538931887)
17. away_team_fouls (중요도: 0.026330997513591615)
18. away_team_shots_on_target (중요도: 0.026293516202548654)
19. away_Goal_5_mean (중요도: 0.026157329993515662)
20. home_team_n

In [21]:
rn_train = train[rn_features]
rn_test = test[rn_features]

In [22]:
model_logis=LogisticRegression(**logreg_best_params)
model_logis.fit(rn_train,y_train)
y_pred = model_logis.predict(rn_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5109649122807017


In [None]:
# def logreg_objective(trial):
    
#     r = trial.suggest_float('l1_ratio', 0, 1, log=False)
#     c = trial.suggest_float('C', 1e-4, 1e2, log=True)
     
#     clf =  LogisticRegression(max_iter=5000, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)
#     scores = cross_val_score(clf, rn_train, y_train, cv=5, scoring='accuracy')
    
#     return scores.mean()
    
# logreg_study = optuna.create_study(direction='maximize')
# logreg_study.optimize(logreg_objective, n_trials=20)

# logreg_best_params = logreg_study.best_params


## 상관계수 계산

In [24]:
corr_matrix = train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
corr_number = 0.9
to_drop = [column for column in upper.columns if any(upper[column] > corr_number)]
to_drop
# # 특징 제거
rn_train = rn_train.drop(columns=to_drop)  
rn_test = rn_test.drop(columns=to_drop)

KeyError: "['away_team_possession', 'away_team_free_kick', 'possession_ratio'] not found in axis"

In [25]:
def logreg_objective(trial):
    
    r = trial.suggest_float('l1_ratio', 0, 1, log=False)
    c = trial.suggest_float('C', 1e-4, 1e2, log=True)
     
    clf =  LogisticRegression(max_iter=5000, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)
    scores = cross_val_score(clf, rn_train, y_train, cv=5, scoring='accuracy')
    
    return scores.mean()
    
logreg_study = optuna.create_study(direction='maximize')
logreg_study.optimize(logreg_objective, n_trials=20)

logreg_best_params = logreg_study.best_params


[I 2024-06-26 19:56:58,918] A new study created in memory with name: no-name-f770dac9-9dec-4a44-b5ae-bd15a6f41afc
[I 2024-06-26 19:57:21,649] Trial 0 finished with value: 0.6768891333577802 and parameters: {'l1_ratio': 0.5710772829440637, 'C': 0.16750989695982324}. Best is trial 0 with value: 0.6768891333577802.
[I 2024-06-26 19:57:44,042] Trial 1 finished with value: 0.6754125412541254 and parameters: {'l1_ratio': 0.9152599264694679, 'C': 0.11754076619775668}. Best is trial 0 with value: 0.6768891333577802.
[I 2024-06-26 19:58:03,408] Trial 2 finished with value: 0.5899168805769466 and parameters: {'l1_ratio': 0.5580026202023411, 'C': 0.0033542880894964714}. Best is trial 0 with value: 0.6768891333577802.
[I 2024-06-26 19:58:07,739] Trial 3 finished with value: 0.4258770321476592 and parameters: {'l1_ratio': 0.9803406103788604, 'C': 0.0023039935957463106}. Best is trial 0 with value: 0.6768891333577802.
[I 2024-06-26 19:58:30,755] Trial 4 finished with value: 0.6754052071873854 and pa

In [27]:
model_logis=LogisticRegression(**logreg_best_params)
model_logis.fit(rn_train,y_train)
y_pred = model_logis.predict(rn_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.49122807017543857


## L1 규제(Lasso)

In [28]:
# alpha 값 후보군 설정
alpha_values = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 1, 10, 100]
param_grid = {'alpha': alpha_values}

# Lasso 모델과 GridSearchCV 설정
lasso = Lasso()
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')

# 최적의 alpha 값 찾기
grid_search.fit(rn_train, y_train)
best_alpha = grid_search.best_params_['alpha']
best_alpha

0.005

In [29]:
lasso = Lasso(alpha=best_alpha)  # 위에서 나온 alpha 값으로 조정한 거임
lasso.fit(rn_train, y_train)

# 가중치가 0이 아닌 특징 선택
selected_features = rn_train.columns[lasso.coef_ != 0]
selected_features

Index(['away_winRate_5_mean', 'home_winRate_5_mean', '5_games_result',
       'away_team_goal_count_half_time', 'home_team_shots_on_target',
       'away_Goal_5_std', 'home_Goal_5_std', 'home_team_possession', 'month',
       'away_team_shots_on_target', 'away_Goal_5_mean', 'home_team_name',
       'home_team_shots', 'away_team_name', 'home_team_free_kick',
       'home_Goal_5_mean', 'away_team_shots', 'home_team_corner_count',
       'away_team_corner_count', 'year', 'away_team_yellow_cards'],
      dtype='object')

In [31]:
rn_train = rn_train[selected_features]
rn_test = rn_test[selected_features]

In [32]:
def logreg_objective(trial):
    
    r = trial.suggest_float('l1_ratio', 0, 1, log=False)
    c = trial.suggest_float('C', 1e-4, 1e2, log=True)
     
    clf =  LogisticRegression(max_iter=5000, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)
    scores = cross_val_score(clf, rn_train, y_train, cv=5, scoring='accuracy')
    
    return scores.mean()
    
logreg_study = optuna.create_study(direction='maximize')
logreg_study.optimize(logreg_objective, n_trials=20)

logreg_best_params = logreg_study.best_params

[I 2024-06-26 20:03:47,867] A new study created in memory with name: no-name-d178cbf4-6834-4287-8157-0991915ef894
[I 2024-06-26 20:04:05,594] Trial 0 finished with value: 0.6546620217577314 and parameters: {'l1_ratio': 0.3073118236939879, 'C': 0.05707887487016698}. Best is trial 0 with value: 0.6546620217577314.
[I 2024-06-26 20:04:16,880] Trial 1 finished with value: 0.6220388705537221 and parameters: {'l1_ratio': 0.05110685210328936, 'C': 0.0016827113094853119}. Best is trial 0 with value: 0.6546620217577314.
[I 2024-06-26 20:04:31,672] Trial 2 finished with value: 0.48812370125901483 and parameters: {'l1_ratio': 0.5454458500490001, 'C': 0.002068708243429115}. Best is trial 0 with value: 0.6546620217577314.
[I 2024-06-26 20:04:49,563] Trial 3 finished with value: 0.6551570712626817 and parameters: {'l1_ratio': 0.06290667873304856, 'C': 8.236099117783326}. Best is trial 3 with value: 0.6551570712626817.
[I 2024-06-26 20:05:05,153] Trial 4 finished with value: 0.5568084586236403 and pa

## model select

In [33]:
model_logis=LogisticRegression(**logreg_best_params)
model_logis.fit(rn_train,y_train)

y_pred = model_logis.predict(rn_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4956140350877193


In [None]:
# model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
# model.fit(train, y_train) 
# y_pred = model.predict(test)

# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

# 2013~2023  
### 많긴 하지만 정확도 떨어질 것으로 예상됨

# 2020~2023
### 데이터 수가 급격히 줄어듬