In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import time

In [2]:
from sklearn.model_selection import train_test_split, cross_val_predict, TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
import lightgbm as lgb

In [3]:
from src.metrics import cosine_similarity

# 環境設定

In [4]:
# Input-data and output directories, given as paths relative to the working directory.
DATA_PATH, RESULT_PATH = "./data", "./results"

# 関数定義

In [5]:
# Time-series cross-validation splitter that works on blocks of consecutive rows.
# A scikit-learn CV splitter needs split(), yielding (train_idx, test_idx) index
# arrays, and get_n_splits().
class GroupTimeSeriesSplit:
    """Time-series CV splitter that keeps blocks of consecutive rows together.

    Rows are cut, in their existing order, into blocks of ``group_samples``
    consecutive rows (the last block may be shorter). The block numbers are
    split with ``TimeSeriesSplit`` and each fold is then expanded back into
    row indices, so a block is never divided between train and test.

    Args:
        n_splits: Number of folds, passed to ``TimeSeriesSplit``.
        group_samples: Number of consecutive rows per block.

    Raises:
        ValueError: If ``group_samples`` is smaller than 1.
    """

    def __init__(self, n_splits=5, group_samples=10000):
        if group_samples < 1:
            raise ValueError(f"group_samples must be >= 1, got {group_samples}")
        self.n_splits = n_splits
        self.group_samples = group_samples

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` row-index arrays per fold.

        Args:
            X: Data with a ``shape`` attribute; only ``X.shape[0]`` is used.
            y: Ignored; accepted for scikit-learn API compatibility.
            groups: Ignored; blocks are derived from row position only.

        Yields:
            Tuples of 1-D integer arrays (train rows, test rows).
        """
        n = X.shape[0]
        group_samples = self.group_samples
        # Ceiling division: a trailing partial block still counts as a block.
        group_count = (n + group_samples - 1) // group_samples
        # Use a separate name so the ``groups`` argument is not clobbered.
        group_ids = np.arange(group_count)

        def group_idx_to_idx(idx):
            """Expand an iterable of block numbers into the row indices they cover.

            With ``group_samples=10000``, ``group_idx_to_idx([1])`` returns
            ``np.arange(10000, 20000)`` (clipped to ``n`` for the last block).
            """
            return np.concatenate([
                np.arange(group_samples * g, min(n, group_samples * (g + 1)))
                for g in idx
            ])

        # NOTE(review): TimeSeriesSplit presumably needs more blocks than
        # n_splits - confirm against the scikit-learn documentation.
        cv = TimeSeriesSplit(self.n_splits)
        for train_index, test_index in cv.split(group_ids):
            yield group_idx_to_idx(train_index), group_idx_to_idx(test_index)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds.

        The arguments are ignored; they exist because scikit-learn utilities
        call ``cv.get_n_splits(X, y, groups)``.
        """
        return self.n_splits

In [6]:
def time_series_cross_val_predict(model, X_train, y_train, cv) -> tuple[np.ndarray, np.ndarray]:
    """Collect out-of-fold predictions following the splits produced by ``cv``.

    For every ``(train_idx, val_idx)`` pair from ``cv.split(X_train, y_train)``
    the model is fitted on the training rows and used to predict the
    validation rows. Rows are always selected by position, so pandas objects
    with a non-default index are handled correctly.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Feature matrix (NumPy array, pandas DataFrame, or array-like).
        y_train: Targets (NumPy array, pandas Series, or array-like).
        cv: Splitter whose ``split(X, y)`` yields positional index arrays.

    Returns:
        ``(y_true, y_pred)``: the true targets and the predictions of all
        validation folds, concatenated in fold order. Both are empty arrays
        when ``cv`` yields no splits.
    """

    def take_rows(data, idx):
        # pandas objects: plain [] is label/column based, so use positional .iloc.
        if hasattr(data, "iloc"):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    y_true_chunks = []
    y_pred_chunks = []
    for i, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        print(f"CV{i}回目")
        # NOTE(review): the same model object is re-fitted on every fold. This
        # assumes fit() retrains from scratch with the constructor
        # hyperparameters (the scikit-learn convention) - confirm for the
        # estimator in use.
        model.fit(take_rows(X_train, train_idx), take_rows(y_train, train_idx))
        y_pred_chunks.append(np.asarray(model.predict(take_rows(X_train, val_idx))))
        y_true_chunks.append(np.asarray(take_rows(y_train, val_idx)))

    # Concatenate once instead of growing arrays inside the loop. The leading
    # empty float array keeps the result valid (and float-typed) with no folds.
    empty = np.array([])
    return (
        np.concatenate([empty, *y_true_chunks]),
        np.concatenate([empty, *y_pred_chunks]),
    )

# 分析

## データ読み込み

In [7]:
# Read the train and test tables from the parquet files under DATA_PATH.
train_data, test_data = (
    pd.read_parquet(Path(DATA_PATH) / file_name)
    for file_name in ("train.parquet", "test.parquet")
)

## モデルの学習

In [8]:
# Train/validation split.
# TODO: this is a random (shuffled) split. If the rows are time-ordered, this
# may leak future information into training - consider a time-aware split
# instead (e.g. shuffle=False, or the GroupTimeSeriesSplit class in this notebook).
target_col = "target"
feature_cols = train_data.drop(columns=["id", target_col]).columns
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data[feature_cols],
    train_data[target_col],
    test_size=0.3,
    random_state=0,  # fixed seed so the split and the scores are reproducible
)

print(f'X_trainのshape:{X_train.shape}')
print(f'y_trainのshape:{y_train.shape}')
print(f'X_validのshape:{X_valid.shape}')
print(f'y_validのshape:{y_valid.shape}')

X_trainのshape:(462000, 695)
y_trainのshape:(462000,)
X_validのshape:(198000, 695)
y_validのshape:(198000,)


In [9]:
# Feature selection: fit a SelectKBest(k=100) selector scored with f_regression
# on the training split only, then apply the same selection to both splits.
selector = SelectKBest(score_func=f_regression, k=100).fit(X_train, y_train)
X_train, X_valid = selector.transform(X_train), selector.transform(X_valid)

print(f'X_trainのshape:{X_train.shape}')
print(f'X_validのshape:{X_valid.shape}')

X_trainのshape:(462000, 100)
X_validのshape:(198000, 100)


In [10]:
# LightGBM hyperparameters, unpacked into lgb.LGBMRegressor in the training cell.
params = dict(
    boosting_type="gbdt",
    objective="regression",
    metric="rmse",
    num_leaves=16,
    learning_rate=0.1,
    n_estimators=100000,
    random_state=0,
)

In [12]:
%%time
# Build the model from the hyperparameter dict.
model = lgb.LGBMRegressor(**params)

# Train with early stopping, evaluating on both the training and validation sets.
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword of
# fit() was deprecated and later removed from LightGBM's scikit-learn API.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)



[1]	training's rmse: 0.999728	valid_1's rmse: 0.999801
[2]	training's rmse: 0.999499	valid_1's rmse: 0.999613
[3]	training's rmse: 0.999283	valid_1's rmse: 0.999451
[4]	training's rmse: 0.999051	valid_1's rmse: 0.999262
[5]	training's rmse: 0.998869	valid_1's rmse: 0.999113
[6]	training's rmse: 0.998714	valid_1's rmse: 0.998992
[7]	training's rmse: 0.998559	valid_1's rmse: 0.99887
[8]	training's rmse: 0.998407	valid_1's rmse: 0.998757
[9]	training's rmse: 0.998259	valid_1's rmse: 0.998653
[10]	training's rmse: 0.998118	valid_1's rmse: 0.998575
[11]	training's rmse: 0.998003	valid_1's rmse: 0.998483
[12]	training's rmse: 0.997883	valid_1's rmse: 0.998409
[13]	training's rmse: 0.997752	valid_1's rmse: 0.998345
[14]	training's rmse: 0.997647	valid_1's rmse: 0.998268
[15]	training's rmse: 0.997549	valid_1's rmse: 0.998232
[16]	training's rmse: 0.997456	valid_1's rmse: 0.99817
[17]	training's rmse: 0.997336	valid_1's rmse: 0.99809
[18]	training's rmse: 0.997248	valid_1's rmse: 0.99803
[19]	

KeyboardInterrupt: 

In [57]:
# Score the fitted model on both splits with the project's cosine_similarity metric.
y_train_pred, y_valid_pred = model.predict(X_train), model.predict(X_valid)

train_score = cosine_similarity(y_train, y_train_pred)
print(f"trainの精度：{train_score}")

valid_score = cosine_similarity(y_valid, y_valid_pred)
print(f"validの精度：{valid_score}")

trainの精度：0.30204719310715356
validの精度：0.10335585223513896


## テストデータの予測

In [58]:
# Prepare the test set: keep the ids for the output file and run every other
# column through the fitted feature selector.
# NOTE(review): this assumes test_data minus "id" has the same columns as the
# training features - confirm.
id_test = test_data["id"].values.reshape((-1,))
X_test = test_data.drop(columns="id").copy()
X_test_selected = selector.transform(X_test)

In [59]:
# Predict the target for the test rows from the selected features.
y_test_pred = model.predict(X_test_selected)

In [60]:
# Save the predictions as a two-column CSV (id, target).
# Create the output directory first so a missing ./results folder does not make
# the notebook fail at the very last step.
result_dir = Path(RESULT_PATH)
result_dir.mkdir(parents=True, exist_ok=True)
result = pd.DataFrame({"id": id_test, "target": y_test_pred})
result.to_csv(result_dir / "20230820_LGBM_feature_selection.csv", index=False)

# 参考サイト
- https://qiita.com/rockhopper/items/a68ceb3248f2b3a41c89