In [20]:
# Library
import os
import random
import pickle
import gc
import warnings
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib import font_manager, rc
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.cluster import KMeans, MiniBatchKMeans
# from category_encoders import TargetEncoder
from sklearn.preprocessing import (
    StandardScaler, PowerTransformer, OrdinalEncoder,
    OneHotEncoder, FunctionTransformer, PolynomialFeatures, LabelEncoder,
)
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import (
    LogisticRegression, LinearRegression, Ridge, Lasso,
    SGDRegressor, ElasticNet
)
from sklearn.model_selection import (
    train_test_split, cross_val_score, cross_validate,
    GridSearchCV, KFold, cross_val_predict
)
from sklearn.metrics import (
    roc_auc_score, mean_squared_error, make_scorer, accuracy_score, log_loss
)
from sklearn import set_config, datasets
# from catboost import (
#     CatBoostRegressor, CatBoostClassifier,
# )
# import category_encoders as ce
# from sklearn.pipeline import (
#     Pipeline, FeatureUnion, make_pipeline
# )
from sklearn.ensemble import (
    RandomForestClassifier, StackingClassifier, StackingRegressor,
    GradientBoostingRegressor, VotingClassifier, VotingRegressor,
    HistGradientBoostingRegressor, GradientBoostingClassifier,
    BaggingClassifier, AdaBoostClassifier, RandomForestRegressor,ExtraTreesRegressor
)
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.svm import SVC, SVR, LinearSVC
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons

import xgboost as xgb
import lightgbm as lgb
import re
import math
import optuna

from scipy.stats import zscore

%matplotlib inline

warnings.filterwarnings("ignore")

In [21]:
def seed_everything(seed: int = 42):
    """Seed the global random number generators used by this notebook.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed`` and
            written (as a string) to the ``PYTHONHASHSEED`` environment
            variable.

    Note:
        ``PYTHONHASHSEED`` is only written to ``os.environ`` here.
        NOTE(review): per the Python docs, hash randomisation is fixed at
        interpreter start-up, so this presumably only affects child
        processes; confirm if hash stability matters.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Fix the seeds once, right after the helper is defined.
seed_everything(42)

In [22]:
# Load one raw match table per season (2013-2023). Each file is named
# "<year>.csv" and is read with the cp949 encoding.
(
    df_2013, df_2014, df_2015, df_2016, df_2017, df_2018,
    df_2019, df_2020, df_2021, df_2022, df_2023,
) = (
    pd.read_csv(f"{season}.csv", encoding='cp949')
    for season in range(2013, 2024)
)

In [23]:
def calculate_5_games_result(df, date_column, team_column, result_column, points_column):
    """Add a per-team rolling 5-game points total to a match table.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      1. Parse ``date_column`` with ``pd.to_datetime``.
      2. Sort rows by ``team_column`` and then ``date_column``.
      3. Map ``result_column`` to points ('승' -> 3, '무' -> 0, '패' -> -3)
         and store them in ``points_column``. Any other value becomes NaN.
      4. For each team, sum ``points_column`` over a window of 5 consecutive
         rows (the current row and the 4 before it) into ``'5_games_result'``.
         Rows without a full window of valid points get 0.

    Args:
        df: Match table containing ``date_column``, ``team_column`` and
            ``result_column``.
        date_column: Name of the column holding the match date/time.
        team_column: Name of the column used to group matches by team.
        result_column: Name of the column holding '승' / '무' / '패'.
        points_column: Name of the column to create for the mapped points.

    Returns:
        A new DataFrame sorted by team and date, with ``points_column`` and
        ``'5_games_result'`` added. The original row index labels are kept.

    NOTE(review): the window includes the current match's own result. If this
    is meant to be a pre-match form feature, it may need shifting by one row;
    confirm the intent.
    """
    # Work on a copy so the caller's frame is not modified as a side effect
    # (previously the parsed dates were written back into the argument).
    df = df.copy()

    df[date_column] = pd.to_datetime(df[date_column])
    df = df.sort_values(by=[team_column, date_column])

    df[points_column] = df[result_column].map({'승': 3, '무': 0, '패': -3})

    # groupby().rolling() returns a (team, original index) MultiIndex; dropping
    # the team level lets the result align back onto df by its row index.
    rolling_points = (
        df.groupby(team_column)[points_column]
        .rolling(window=5)
        .sum()
        .reset_index(level=0, drop=True)
    )

    # The first four rows of each team have no full window, so they become 0.
    df['5_games_result'] = rolling_points.fillna(0)

    return df



In [24]:
# Add the rolling 5-game home-result score to every season's frame.
season_frames = [
    calculate_5_games_result(
        raw_season, 'date_GMT', 'home_team_name', 'home_team_result', 'game_points'
    )
    for raw_season in (
        df_2013, df_2014, df_2015, df_2016, df_2017, df_2018,
        df_2019, df_2020, df_2021, df_2022, df_2023,
    )
]
(
    df2013, df2014, df2015, df2016, df2017, df2018,
    df2019, df2020, df2021, df2022, df2023,
) = season_frames

# Stack the seasons vertically (row-wise) and renumber the index from 0.
df = pd.concat(season_frames, axis=0, ignore_index=True)

# Persist the combined table next to the notebook.
df.to_pickle("./matches_df.pkl")

In [26]:
# Display the `date_GMT` column of the 2020 frame returned by
# calculate_5_games_result (which parses it with pd.to_datetime).
df2020['date_GMT']

8     2020-05-16 10:00:00
21    2020-05-30 10:00:00
27    2020-06-06 10:00:00
41    2020-06-17 10:00:00
50    2020-06-27 10:00:00
              ...        
113   2020-09-06 10:00:00
115   2020-09-12 10:00:00
138   2020-10-02 08:00:00
155   2020-10-25 07:30:00
159   2020-11-01 06:00:00
Name: date_GMT, Length: 162, dtype: datetime64[ns]

In [28]:
# Print the combined frame's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2480 entries, 0 to 2479
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   index                           2480 non-null   int64         
 1   date_GMT                        2480 non-null   datetime64[ns]
 2   home_team_name                  2480 non-null   object        
 3   away_team_name                  2480 non-null   object        
 4   home_team_corner_count          2480 non-null   int64         
 5   away_team_corner_count          2480 non-null   int64         
 6   home_team_shots                 2480 non-null   int64         
 7   away_team_shots                 2480 non-null   int64         
 8   home_team_shots_on_target       2480 non-null   int64         
 9   away_team_shots_on_target       2480 non-null   int64         
 10  home_team_fouls                 2480 non-null   int64         
 11  away

In [None]:
# 궁금한 거
# pre-match-ppg랑 home-ppg의 차이점
# 저장되지 않는 값이 -1로 되어 있는데 이걸 어떻게 처리할지

# 피처 이름
- timestamp - 경기가 시작되는 타임스탬프
- date_GMT  - 경기 날짜 및 시간
- status - 경기 상태
- attendance - 관중 수
- home_team_name - 홈팀 이름
- away_team_name - 원정팀 이름
- referee - 심판 이름
- Game_week - 경기 주차
- Pre-match_ppg(home) - 홈팀의 경기 전 평균 포인트
- Pre-match_ppg(away) - 원정팀의 경기 전 평균 포인트
- home_ppg
- away_ppg
- average_goals_per_match_pre_match - 양팀 합친 경기당 평균 득점
- btts_percentage_pre_match - 양팀의 평균 BTTS %. 경기 전에 계산됨 (BTTS: 양팀이 모두 득점하는 상황)
- odds_ft_home_team_win - 풀타임 홈팀 승리 배당률
- odds_ft_draw - 풀타임 무승부 배당률
- odds_ft_away_team_win - 풀타임 원정팀 승리 배당률
- odds_ft_over1.5 - 1.5골 이상 배당률
- odds_ft_over2.5 - 2.5골 이상 배당률
- odds_ft_over3.5 - 3.5골 이상 배당률
- odds_ft_over4.5 - 4.5골 이상 배당률
- odds_btts_yes - 양팀 모두 득점 배당률
- odds_btts_no - 양팀 모두 득점하지는 않는 경우(적어도 한 팀 무득점)의 배당률
- stadium_name - 경기장 이름

In [None]:
# Print the dtype / non-null overview, then show the number of missing
# values in each column as the cell output.
df.info()
df.isna().sum()

In [None]:

# Reduce `date_GMT` to plain "YYYY-MM-DD" strings.
#
# calculate_5_games_result has already parsed this column with pd.to_datetime,
# so by the time this cell runs the values are datetime64 rather than the raw
# "%b %d %Y - %I:%M%p" text. Calling datetime.strptime on them raises
# TypeError, so the raw format is only parsed when the column is still text.
match_dates = df['date_GMT']
if not pd.api.types.is_datetime64_any_dtype(match_dates):
    match_dates = pd.to_datetime(match_dates, format="%b %d %Y - %I:%M%p")
df['date_GMT'] = match_dates.dt.strftime("%Y-%m-%d")

# Save the reformatted table. The path must be a quoted string: the previous
# unquoted `.matches_df.pkl` was a SyntaxError that stopped the cell running.
df.to_pickle("./matches_df.pkl")

In [None]:
# import sweetviz as sv
# # Sweetviz 보고서 생성

# report = sv.analyze(df)
# report.show_html("report_df.html")