In [1]:
cd ..

/Users/satouwataru/Desktop/DiveIntoCode/git/diveintocode-ml/GraduationWork/v4/codes/model


In [2]:
cd ..

/Users/satouwataru/Desktop/DiveIntoCode/git/diveintocode-ml/GraduationWork/v4/codes


In [3]:
cd ..

/Users/satouwataru/Desktop/DiveIntoCode/git/diveintocode-ml/GraduationWork/v4


In [10]:
import numpy as np
import pandas as pd
import codes.common as c
from sklearn.model_selection import train_test_split
import optuna.integration.lightgbm as lgb_o
from sklearn.metrics import mean_squared_error
import pickle

'''
model : rightGBM
description:試合結果の3値分類（勝分敗）
''' 
class model():
    def __init__(self):
        self.common = c.common()
        self.common.PY_NAME = 'model_1'
        self.y_col = 'y_H_result'
        
        self.x_train, self.x_val, self.x_test = None, None, None
        self.y_train, self.y_val = None, None
        self.model = None
        self.f_model_name = None
        
    def get_y_pred(self):
        
        self.preprocessing()
        
        self.set_model()
        
        y_pred, y_pred_proba = self.predict()
        
        df_y = pd.concat([pd.DataFrame(y_pred_proba, columns = ['proba_0_m1', 'proba_1_m1', 'proba_2_m1']), pd.DataFrame(y_pred, columns = ['pred_m1'])], axis = 1)
        
        return df_y
        
    def predict(self):
        y_pred_proba = self.model.predict(self.x_test, num_iteration=self.model.best_iteration)
        y_pred = np.argmax(y_pred_proba, axis=1)
        return y_pred, y_pred_proba
    
    def set_model(self):
        year = str(self.x_test[:1]['年月日'].values[0])[:4]
        self.f_model_name = 'data/model/base_models/model_1/model_for_' + year +'.sav'
        try:
            self.model = pickle.load(open(self.f_model_name, 'rb'))
        except:
            self.fit()
            
    def fit(self):
        lgb_train = lgb_o.Dataset(self.x_train, self.y_train)
        lgb_eval = lgb_o.Dataset(self.x_val, self.y_val) 
        # 学習用パラメータ
        lgbm_params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 3
        }
        # 学習
        model = lgb_o.train(lgbm_params,
                        lgb_train,
                        valid_sets=lgb_eval,
                        verbose_eval=200,)
        self.model = model
        # 保存
        pickle.dump(self.model, open(self.f_model_name, 'wb'))
        # Accuracy の計算
        y_pred_proba = self.model.predict(self.x_val, num_iteration=self.model.best_iteration)
        y_pred = np.argmax(y_pred_proba, axis=1)
        accuracy = sum(self.y_val == y_pred) / len(self.y_val)
        print('accuracy:', accuracy)

    def preprocessing(self):
        # 読み込み
        df = pd.read_csv("data/model/base_models/preprocessing/preprocessed_1.csv", index_col=0)
        # カテゴリ列処理
        category_columns = ['カテゴリ', 'H_team', 'A_team', 'H_監督', 'A_監督']
        df[category_columns] = df[category_columns].astype('category')
        # 目的変数のデータ数を揃える
        train = df[df['train_test']=='train'].drop(columns = ['train_test'])
        train = train.sort_values('年月日', ascending=False)
        train_0 = train[train['y_H_result'] == 0]
        train_1 = train[train['y_H_result'] == 1]
        train_2 = train[train['y_H_result'] == 2]
        n_row_0 = train_0.shape[0]
        n_row_1 = train_1.shape[0]
        n_row_2 = train_2.shape[0]
        n_row = min(n_row_0, n_row_1, n_row_2)
        train = pd.concat([train_0.iloc[:n_row], train_1.iloc[:n_row], train_2.iloc[:n_row]])
        # 不要な列を削除
        train = self.common.drop_y_col(train, self.y_col)
        # train, val, testに分割
        x_train = train.drop(columns = self.y_col)
        y_train = train[self.y_col]
        self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(x_train, y_train, stratify = y_train)
        test = df[df['train_test']=='test'].drop(columns = ['train_test'])
        test = self.common.drop_y_col(test, self.y_col)
        self.x_test = test.drop(columns = self.y_col)

In [11]:
m = model()
df = m.get_y_pred()

[32m[I 2021-06-09 08:11:39,670][0m A new study created in memory with name: no-name-70a681df-f345-4cd3-887c-25e6cb17416b[0m



  0%|          | 0/7 [00:00<?, ?it/s][A[A[A




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1603
[LightGBM] [Info] Number of data points in the train set: 2643, number of used features: 36
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[200]	valid_0's multi_logloss: 1.23039
[400]	valid_0's multi_logloss: 1.50973


feature_fraction, val_score: inf:   0%|          | 0/7 [00:31<?, ?it/s]

[600]	valid_0's multi_logloss: 1.75758





[800]	valid_0's multi_logloss: 1.97762
[1000]	valid_0's multi_logloss: 2.16682





feature_fraction, val_score: 2.166817:   0%|          | 0/7 [00:03<?, ?it/s][A[A[A


feature_fraction, val_score: 2.166817:  14%|#4        | 1/7 [00:03<00:19,  3.17s/it][A[A[A[32m[I 2021-06-09 08:11:42,847][0m Trial 0 finished with value: 2.1668168915617936 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 2.1668168915617936.[0m



feature_fraction, val_score: 2.166817:  14%|#4        | 1/7 [00:03<00:19,  3.17s/it][A[A[A[33m[W 2021-06-09 08:11:42,899][0m Trial 1 failed because of the following error: LightGBMError('Check failed: (best_split_info.left_count) > (0) at /tmp/pip-req-build-vigvzzq1/compile/src/treelearner/serial_tree_learner.cpp, line 651 .\n')
Traceback (most recent call last):
  File "<ipython-input-10-9984028503c2>", line 45, in set_model
    self.model = pickle.load(open(self.f_model_name, 'rb'))
FileNotFoundError: [Errno 2] No such file or directory: 'data/model/base_models/model_1/model_for_2017.sav'

During handling of the above 

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1603
[LightGBM] [Info] Number of data points in the train set: 2643, number of used features: 36
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


LightGBMError: Check failed: (best_split_info.left_count) > (0) at /tmp/pip-req-build-vigvzzq1/compile/src/treelearner/serial_tree_learner.cpp, line 651 .


In [None]:
df