In [1]:
from load_data import load_data, num_attribs, cat_attribs, get_camp_fixtures
from export_football_data import label_columns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
import machine_learning.utility as utility
import enums
import time
from datetime import datetime

# Load the football dataset via the project loader (sports type: Football,
# camp type: Any) and preview the first rows as the cell output.
football_data = load_data(
    enums.SportsType.Football,
    enums.CampType.Any,
)
football_data.head()

football file count: 122
football data shape: (375234, 37)


Unnamed: 0,camp_type,league_type,my_camp_3matches_gf_avg,my_camp_3matches_ga_avg,my_6matches_gf_avg,my_6matches_ga_avg,my_camp_3matches_pts,my_6matches_pts,op_camp_3matches_gf_avg,op_camp_3matches_ga_avg,...,1.5_under,1.5_over,2.5_under,2.5_over,3.5_under,3.5_over,4.5_under,4.5_over,5.5_under,5.5_over
0,0,0,1.667,0.667,2.0,0.833,9,18,1.333,0.333,...,1,0,1,0,1,0,1,0,1,0
1,1,0,1.333,0.333,1.167,0.333,9,15,1.667,0.667,...,1,0,1,0,1,0,1,0,1,0
2,0,1,1.667,1.0,1.333,0.833,6,10,0.667,2.333,...,0,1,0,1,1,0,1,0,1,0
3,1,1,0.667,2.333,0.667,1.667,1,5,1.667,1.0,...,0,1,0,1,1,0,1,0,1,0
4,0,1,2.333,1.667,1.5,2.167,7,7,1.0,1.667,...,0,1,1,0,1,0,1,0,1,0


In [2]:
# Split the full dataset into one frame per camp type (Home first, then Away),
# using the same project helper for both.
home_football_data, away_football_data = (
    get_camp_fixtures(camp, football_data)
    for camp in (enums.CampType.Home, enums.CampType.Away)
)

In [3]:
# Training datasets.
# Features: every column except the label columns.
# Target: 'result_type', the match result (win=0, draw=1, loss=2).
home_train, home_train_result = (
    home_football_data.drop(columns=label_columns),
    home_football_data['result_type'],
)

away_train, away_train_result = (
    away_football_data.drop(columns=label_columns),
    away_football_data['result_type'],
)

In [4]:
# Feature preprocessing.
# Numeric attributes are standardised and categorical attributes are one-hot
# encoded; the two results are combined by the ColumnTransformer.
numeric_step = ("num", StandardScaler(), num_attribs)
categorical_step = ("cat", OneHotEncoder(), cat_attribs)
columns_pipeline = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
)

# The column transformer is currently the only active step of the pipeline.
feature_pipeline = Pipeline(
    steps=[
        # ("binning", utility.BinningRestPeriod()),  # disabled step, kept for reference
        ('columns', columns_pipeline),
    ],
)

In [5]:
# Model training (Home).
# Prepare the model input: fit the preprocessing pipeline on the home features
# and transform them in the same call.
prepared_home_train = feature_pipeline.fit_transform(home_train)

# Model: random forest with balanced class weights and a fixed seed so runs
# are repeatable.
rtf_clf_home = RandomForestClassifier(
    n_estimators=122,
    class_weight='balanced',
    random_state=42,
)

In [9]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Grid search over the Home random forest.
# The grid is a single dict with five `min_samples_split` values, so 5 candidates
# are evaluated, each with 3-fold cross-validation (15 fits in total).
# NOTE(review): the output saved under this cell reports `min_samples_leaf`
# results, i.e. it comes from an earlier version of this grid — re-run the cell
# to refresh it.
param_grid = [{'min_samples_split': [2, 4, 8, 16, 32]}]

grid_search = GridSearchCV(
    estimator=rtf_clf_home,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=4,
    verbose=2,
    return_train_score=True,
)
grid_search.fit(prepared_home_train, home_train_result)

# Report the mean cross-validated test score next to each parameter setting.
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

print(f"best_params: {grid_search.best_params_}")
print(f"best_estimator: {grid_search.best_estimator_}")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:  3.1min finished
0.472963537419317 {'min_samples_leaf': 2}
0.46335353406141233 {'min_samples_leaf': 4}
0.45300265967369696 {'min_samples_leaf': 8}
0.44505561862730986 {'min_samples_leaf': 16}
0.439038040262876 {'min_samples_leaf': 32}
best_params: {'min_samples_leaf': 2}
best_estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=122,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
              