02で前処理をしたデータの読み込みとモデルの学習を行うためのnotebookです。  
ここで作成したモデルは **src/models/** フォルダに格納して推論の際に使うようにして下さい。

## 必要なライブラリのimport

In [2]:
import pickle
import time
import warnings
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Notebook-wide settings: show up to 500 columns when displaying a DataFrame,
# and suppress FutureWarning messages for the rest of the session.
pd.options.display.max_columns = 500
warnings.simplefilter('ignore', FutureWarning)

In [3]:
# Load the preprocessed train/test tables (files dated 20240619).
# NOTE(review): the training cell further down reloads train/test from the
# processed20240620_* files and resets categorical_feats to [], so every name
# assigned here is overwritten before it is used -- confirm which snapshot is
# intended and drop the other load.
test = pd.read_csv('../data/processed/processed20240619_test.csv')
train = pd.read_csv('../data/processed/processed20240619_train.csv')

# Target vector, plus the feature list: every train column except
# card_id, first_active_month and target (original column order is kept).
target = train['target']
features = [
    col for col in train.columns
    if col not in {'card_id', 'first_active_month', 'target'}
]
# Columns handed to LightGBM as categorical features
categorical_feats = ['feature_2', 'feature_3']



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
import pickle
from sklearn.metrics import mean_squared_error
import time

# Load the preprocessed train/test tables (files dated 20240620)
test = pd.read_csv('../data/processed/processed20240620_test.csv')
train = pd.read_csv('../data/processed/processed20240620_train.csv')

# Target vector, plus the feature list: every train column except
# card_id, first_active_month and target (original column order is kept).
target = train['target']
features = train.columns[
    ~train.columns.isin(['card_id', 'first_active_month', 'target'])
].tolist()
# Empty list: no column is declared as a categorical feature for LightGBM
categorical_feats = list()

# 5-fold cross-validation splitter (shuffled, fixed seed)
folds = KFold(n_splits=5, shuffle=True, random_state=15)

# Buffers: out-of-fold predictions for train, accumulated predictions for test
oof = np.zeros(train.shape[0])
predictions = np.zeros(test.shape[0])

# Timestamp taken before training (not read anywhere in the visible code)
start = time.time()

# Collects the per-fold feature importances
feature_importance_df = pd.DataFrame()

# LightGBM hyper-parameters for the regression model
param = dict(
    num_leaves=111,
    min_data_in_leaf=149,
    objective='regression',
    max_depth=9,
    learning_rate=0.005,
    boosting="gbdt",
    feature_fraction=0.7522,
    bagging_freq=1,
    bagging_fraction=0.7083,
    bagging_seed=11,
    metric='rmse',
    lambda_l1=0.2634,
    random_state=133,
    verbosity=-1,
)

# Train one LightGBM model per CV fold, fill the out-of-fold predictions,
# collect feature importances, and pickle each fold model for inference.

# Create the model directory up front so a missing folder cannot abort the run
# only after the first fold has already finished training.
model_dir = Path('../src/models')
model_dir.mkdir(parents=True, exist_ok=True)

n_folds = folds.get_n_splits()  # keeps the progress message in sync with the splitter
num_round = 10000               # upper bound; early stopping decides the actual round count
fold_importances = []           # one frame per fold, concatenated once after the loop

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print(f"Fold {fold_+1}/{n_folds}")

    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature=categorical_feats)

    # Stop when the validation RMSE has not improved for 200 rounds; log every 100 rounds
    clf = lgb.train(params=param,
                    train_set=trn_data,
                    num_boost_round=num_round,
                    valid_sets=[val_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=200),
                               lgb.log_evaluation(100)])

    # Out-of-fold prediction for the held-out rows, using the best iteration
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    # Feature importances for this fold ("fold" is 1-based here)
    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Save the model; the file name uses the 0-based fold index
    with open(model_dir / f'model_fold_{fold_}.pkl', 'wb') as f:
        pickle.dump(clf, f)

# Single concat instead of growing the frame inside the loop
feature_importance_df = pd.concat(fold_importances, axis=0)

# RMSE over all out-of-fold predictions (arguments in y_true, y_pred order)
print(f"CV score: {mean_squared_error(target, oof)**0.5:.5f}")

Fold 1/5
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 3.7764
[200]	valid_0's rmse: 3.72857
[300]	valid_0's rmse: 3.70481
[400]	valid_0's rmse: 3.6922
[500]	valid_0's rmse: 3.6848
[600]	valid_0's rmse: 3.68041
[700]	valid_0's rmse: 3.67747
[800]	valid_0's rmse: 3.67551
[900]	valid_0's rmse: 3.67409
[1000]	valid_0's rmse: 3.67305
[1100]	valid_0's rmse: 3.67235
[1200]	valid_0's rmse: 3.67201
[1300]	valid_0's rmse: 3.67126
[1400]	valid_0's rmse: 3.67107
[1500]	valid_0's rmse: 3.67094
[1600]	valid_0's rmse: 3.67078
[1700]	valid_0's rmse: 3.67061
[1800]	valid_0's rmse: 3.6703
[1900]	valid_0's rmse: 3.67052
Early stopping, best iteration is:
[1779]	valid_0's rmse: 3.67026
Fold 2/5
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 3.70514
[200]	valid_0's rmse: 3.66213
[300]	valid_0's rmse: 3.64103
[400]	valid_0's rmse: 3.63048
[500]	valid_0's rmse: 3.62498
[600]	valid_0's rmse: 3.62146
[700]	valid_0's rmse: 3.61909
[800]	

In [6]:
# Predict on the test set with every saved fold model and average the results.
n_folds = 5  # must match the number of model_fold_*.pkl files written during training

# Start from a fresh accumulator: reusing the array created in the training cell
# would add a second round of fold predictions whenever this cell is re-run.
predictions = np.zeros(len(test))

for fold_ in range(n_folds):
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # that were written by this notebook.
    with open(f'../src/models/model_fold_{fold_}.pkl', 'rb') as f:
        clf = pickle.load(f)
    fold_predictions = clf.predict(test[features])
    predictions += fold_predictions

# Average over the folds
predictions /= n_folds

# Build the submission CSV (card_id, target)
sub_df = pd.DataFrame({"card_id":test["card_id"].values})
sub_df["target"] = predictions
sub_df.to_csv("submit_original_lightGBM.csv", index=False)