In [53]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import time

In [54]:
from sklearn.model_selection import cross_val_predict, KFold, TimeSeriesSplit
from sklearn.linear_model import Ridge
import lightgbm as lgb

In [55]:
from src.metrics import cosine_similarity

# 環境設定

In [56]:
# Directory the input parquet files are read from, and directory results are written to.
DATA_PATH, RESULT_PATH = "./data", "./results"

# 関数定義

In [57]:
# Custom K-fold splitter.
# A CV splitter apparently needs a `split` method that yields index arrays.
class MyKFold:
    """K-fold over contiguous row blocks, purging the blocks that border the test fold.

    Rows are cut into consecutive blocks of ``group_samples`` rows (the last
    block may be shorter). The blocks are divided, in order, into ``n_splits``
    contiguous test folds; the first ``n_blocks % n_splits`` folds receive one
    extra block, which is the sizing rule of scikit-learn's unshuffled
    ``KFold``. For each fold, the one block directly before and the one block
    directly after the test fold are excluded from the training indices.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2 and no larger than the number of
        blocks.
    group_samples : int, default=10000
        Number of consecutive rows per block.
    """

    def __init__(self, n_splits=5, group_samples=10000):
        self.n_splits = n_splits
        self.group_samples = group_samples

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` integer arrays for every fold.

        Parameters
        ----------
        X : object with ``shape`` or ``len``
            Only the number of rows is read.
        y, groups : ignored
            Accepted for compatibility with the scikit-learn splitter interface.

        Raises
        ------
        ValueError
            If ``n_splits`` or ``group_samples`` is invalid, if there are fewer
            blocks than folds, or if purging leaves a fold without any
            training rows.
        """
        n_splits = self.n_splits
        block = self.group_samples
        if not isinstance(n_splits, (int, np.integer)) or n_splits < 2:
            raise ValueError(f"n_splits must be an integer >= 2, got {n_splits!r}.")
        if not isinstance(block, (int, np.integer)) or block < 1:
            raise ValueError(f"group_samples must be an integer >= 1, got {block!r}.")

        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        n_blocks = (n_samples + block - 1) // block  # ceiling division
        if n_splits > n_blocks:
            raise ValueError(
                f"Cannot have n_splits={n_splits} greater than the number of "
                f"blocks ({n_blocks}) for n_samples={n_samples}."
            )

        base, extra = divmod(n_blocks, n_splits)
        stop = 0
        for fold in range(n_splits):
            # Test fold covers blocks [start, stop).
            start = stop
            stop = start + base + (1 if fold < extra else 0)
            test_index = np.arange(start * block, min(stop * block, n_samples))

            # Training keeps blocks [0, start - 1) and [stop + 1, n_blocks):
            # block start - 1 and block stop are the purged neighbours.
            train_index = np.concatenate([
                np.arange(0, max(start - 1, 0) * block),
                np.arange(min((stop + 1) * block, n_samples), n_samples),
            ])
            if train_index.size == 0:
                raise ValueError(
                    f"Fold {fold} has no training rows left after purging the "
                    "neighbouring blocks; use fewer splits or a smaller group_samples."
                )
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_splits

In [58]:
# Print the train and test index arrays of every fold for 100,000 rows.
for train_idx, test_idx in MyKFold(n_splits=5).split(np.arange(100000), np.arange(100000)):
    print(train_idx, test_idx, sep="\n")

[30000 30001 30002 ... 99997 99998 99999]
[    0     1     2 ... 19997 19998 19999]
[    0     1     2 ... 99997 99998 99999]
[20000 20001 20002 ... 39997 39998 39999]
[    0     1     2 ... 99997 99998 99999]
[40000 40001 40002 ... 59997 59998 59999]
[    0     1     2 ... 99997 99998 99999]
[60000 60001 60002 ... 79997 79998 79999]
[    0     1     2 ... 69997 69998 69999]
[80000 80001 80002 ... 99997 99998 99999]


# 分析

## データ読み込み

In [59]:
# Read the train and test tables from the data directory.
train_data, test_data = (
    pd.read_parquet(Path(DATA_PATH) / file_name)
    for file_name in ("train.parquet", "test.parquet")
)

## モデルの学習

In [60]:
# Sampling (intended to shorten run time).
# frac=1 keeps every row, so this only shuffles; lower frac to actually subsample.
# random_state pins the shuffle so a re-run produces the same order.
# NOTE(review): the cells below in this notebook read train_data, not sample_train_data.
sample_train_data = train_data.sample(frac=1, random_state=42)

In [63]:
# Split the training table into features and target.
# TODO: should the split account for the data being a time series?
target_col = "target"
# Drop the non-feature labels from the column index itself; this avoids building
# a throw-away DataFrame just to read its column names.
feature_cols = train_data.columns.drop(["id", target_col])
X_train = train_data[feature_cols]
y_train = train_data[target_col]
print(f'X_trainのshape:{X_train.shape}')
print(f'y_trainのshape:{y_train.shape}')

X_trainのshape:(660000, 695)
y_trainのshape:(660000,)


In [64]:
# Model configuration
model = Ridge()

# Evaluate the model: collect out-of-fold predictions with the custom splitter
# and pass them to cosine_similarity together with the true targets.
y_train_pred = cross_val_predict(estimator=model, X=X_train, y=y_train, cv=MyKFold())
print(f'cosine similarity {cosine_similarity(y_train, y_train_pred)}')

cosine similarity 0.033795105656280744


In [38]:
# Fit the model on the full training features and target.
model.fit(X=X_train, y=y_train)

## テストデータの予測

In [39]:
# Predict
id_test = test_data["id"].to_numpy()
# Select the training feature columns by name so the test matrix has exactly the
# columns, in the same order, that the model was fitted on. Indexing already
# returns a new frame, so no extra copy is needed.
X_test = test_data[feature_cols]
y_test_pred = model.predict(X_test)

In [40]:
# Display the test predictions as a single-column table.
pd.DataFrame(data=y_test_pred)

Unnamed: 0,0
0,0.087330
1,-0.103369
2,0.007209
3,-0.098552
4,0.015430
...,...
261656,-0.044642
261657,0.033134
261658,0.048891
261659,-0.079927


In [41]:
# Save
# Build the submission table directly: one "id" column and one "target" column.
result = pd.DataFrame({"id": id_test, "target": y_test_pred})
# to_csv does not create missing directories, so make sure the output directory exists.
Path(RESULT_PATH).mkdir(parents=True, exist_ok=True)
result.to_csv(Path(RESULT_PATH, "20230818_ridge.csv"), index=False)