In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import time

In [5]:
from sklearn.model_selection import cross_val_predict, KFold, TimeSeriesSplit
from sklearn.linear_model import Ridge
import lightgbm as lgb

In [6]:
from src.metrics import cosine_similarity

# 環境設定

In [7]:
# Input data directory and output directory for results, both relative to the
# working directory the notebook is run from.
DATA_PATH, RESULT_PATH = "./data", "./results"

# 関数定義

In [17]:
# Group-wise time-series CV splitter.
# A CV splitter must expose ``split`` (yielding index arrays) and ``get_n_splits``.
class GroupTimeSeriesSplit:
    """Time-series cross-validation over contiguous, fixed-size blocks of rows.

    Rows are bucketed, in order, into groups of ``group_samples`` rows (the
    last group may be shorter). ``TimeSeriesSplit`` is then applied to the
    group ids, and the group adjacent to the start of each test window is
    removed from the training set so that train and test are separated by a
    one-group gap.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits handed to ``TimeSeriesSplit``.
    group_samples : int, default=10000
        Number of consecutive rows per group.

    Raises
    ------
    ValueError
        If ``group_samples`` is smaller than 1.
    """

    def __init__(self, n_splits=5, group_samples=10000):
        if group_samples < 1:
            raise ValueError(f"group_samples must be >= 1, got {group_samples}")
        self.n_splits = n_splits
        self.group_samples = group_samples

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` row-index arrays for each split.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute
            Only ``X.shape[0]`` (the number of rows) is used.
        y, groups : ignored
            Accepted for compatibility with the scikit-learn splitter API.

        Yields
        ------
        train_idx, test_idx : np.ndarray
            Row indices of the training and test rows of the split.
        """
        n_rows = X.shape[0]
        size = self.group_samples
        n_groups = (n_rows + size - 1) // size  # ceil(n_rows / size)
        group_ids = np.arange(n_groups)

        def group_idx_to_idx(group_idx):
            """Expand an iterable of group ids into the row indices they cover.

            With ``group_samples=10000``:
            group_idx_to_idx([1])
            >>> array([10000, 10001, ..., 19999])
            """
            spans = [
                np.arange(size * g, min(n_rows, size * (g + 1)))
                for g in group_idx
            ]
            if not spans:
                # np.concatenate rejects an empty list; this happens when the
                # gap removes the only training group of a split.
                return np.arange(0)
            return np.concatenate(spans)

        cv = TimeSeriesSplit(self.n_splits)
        for train_groups, test_groups in cv.split(group_ids):
            # Drop the groups neighbouring the first test group. With
            # TimeSeriesSplit the training groups always precede the test
            # window, so in practice only the preceding neighbour is removed.
            first_test = test_groups[0]
            keep = ~np.isin(train_groups, [first_test - 1, first_test + 1])
            yield group_idx_to_idx(train_groups[keep]), group_idx_to_idx(test_groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; all arguments are ignored.

        The optional arguments mirror the scikit-learn splitter signature so
        the object can be handed to utilities that call
        ``cv.get_n_splits(X, y, groups)``.
        """
        return self.n_splits

In [18]:
def time_series_cross_val_predict(model, X_train, y_train, cv) -> tuple[np.ndarray, np.ndarray]:
    """Compute out-of-fold predictions following the splits produced by ``cv``.

    For every ``(train_idx, val_idx)`` pair yielded by ``cv.split`` the model
    is fitted on the training rows and used to predict the validation rows.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place once per split; after the call it holds the fit of
        the last split.
    X_train : array-like
        Feature matrix; converted with ``np.asarray`` so rows are selected by
        position (DataFrames are accepted).
    y_train : array-like
        Targets aligned with the rows of ``X_train``.
    cv : object with ``split(X, y)``
        Yields ``(train_idx, val_idx)`` positional index arrays.

    Returns
    -------
    (y_true, y_pred) : tuple of np.ndarray
        True targets and predictions of all validation rows, concatenated in
        split order. Both are empty arrays when ``cv`` yields no split.
    """
    # Positional row selection below needs ndarray semantics: indexing a
    # DataFrame with an integer array would select columns instead of rows.
    X_arr = np.asarray(X_train)
    y_arr = np.asarray(y_train)

    # Seed each list with an empty float array so the result keeps the same
    # dtype promotion as incremental concatenation onto ``np.array([])`` and
    # so that zero splits still produce empty arrays.
    true_chunks = [np.array([])]
    pred_chunks = [np.array([])]
    for train_idx, val_idx in cv.split(X_arr, y_arr):
        # NOTE: scikit-learn estimators re-estimate from scratch on every
        # fit() call while keeping their constructor hyper-parameters
        # (warm_start aside); verify this for any other kind of model.
        model.fit(X_arr[train_idx], y_arr[train_idx])
        pred_chunks.append(model.predict(X_arr[val_idx]))
        true_chunks.append(y_arr[val_idx])

    # Concatenate once instead of re-copying the growing arrays per split.
    return np.concatenate(true_chunks), np.concatenate(pred_chunks)

# 分析

## データ読み込み

In [10]:
# Read the training and test tables (in that order) from the data directory.
train_data, test_data = (
    pd.read_parquet(Path(DATA_PATH, file_name))
    for file_name in ("train.parquet", "test.parquet")
)

## モデルの学習

In [11]:
# Split the training frame into features and target.
# TODO: should the time-series nature of the data be taken into account here?
target_col = "target"
# Take the feature names straight from the column index: dropping labels from
# the Index avoids materialising a copy of the whole frame just to read names.
feature_cols = train_data.columns.drop(["id", target_col])
X_train = train_data[feature_cols]
y_train = train_data[target_col]
print(f'X_trainのshape:{X_train.shape}')
print(f'y_trainのshape:{y_train.shape}')

X_trainのshape:(660000, 695)
y_trainのshape:(660000,)


In [19]:
# Model under evaluation.
model = Ridge()

# Score the model on out-of-fold predictions from the grouped time-series CV.
cv_splitter = GroupTimeSeriesSplit()
y_train_, y_train_pred = time_series_cross_val_predict(
    model, X_train.to_numpy(), y_train.to_numpy(), cv=cv_splitter
)
print(f'cosine similarity {cosine_similarity(y_train_, y_train_pred)}')

before: train_index: [ 0  1  2  3  4  5  6  7  8  9 10]
before: test_index: [11 12 13 14 15 16 17 18 19 20 21]
after: train_index: [0 1 2 3 4 5 6 7 8 9]
after: test_index: [11 12 13 14 15 16 17 18 19 20 21]
before: train_index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
before: test_index: [22 23 24 25 26 27 28 29 30 31 32]
after: train_index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
after: test_index: [22 23 24 25 26 27 28 29 30 31 32]
before: train_index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32]
before: test_index: [33 34 35 36 37 38 39 40 41 42 43]
after: train_index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]
after: test_index: [33 34 35 36 37 38 39 40 41 42 43]
before: train_index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43]
bef

In [36]:
# Refit the model on the complete training set before predicting the test data.
model.fit(X=X_train, y=y_train)

## テストデータの予測

In [37]:
# Predict on the test set.
id_test = test_data["id"].to_numpy()
# Select exactly the columns the model was trained on, in training order,
# rather than "everything except id": this keeps train and test features
# aligned even if the test file orders its columns differently or carries
# extra ones.
X_test = test_data[feature_cols]
y_test_pred = model.predict(X_test)

In [38]:
# Display the test predictions as a single-column table for a quick visual check.
pd.DataFrame(y_test_pred)

Unnamed: 0,0
0,0.087330
1,-0.103369
2,0.007209
3,-0.098552
4,0.015430
...,...
261656,-0.044642
261657,0.033134
261658,0.048891
261659,-0.079927


In [41]:
# Save the predictions as a two-column table: id, target.
# np.ravel keeps this working whether predict() returned shape (n,) or (n, 1).
result = pd.DataFrame({"id": id_test, "target": np.ravel(y_test_pred)})
# Create the output directory on first use so to_csv does not fail on a
# fresh checkout.
result_dir = Path(RESULT_PATH)
result_dir.mkdir(parents=True, exist_ok=True)
result.to_csv(result_dir / "20230818_ridge.csv", index=False)

# 参考サイト
- fitが複数回呼ばれた際の挙動について
    - https://stackoverflow.com/questions/49841324/what-does-calling-fit-multiple-times-on-the-same-model-do