In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import time
import pickle

In [2]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import shap

In [3]:
from src.metrics import cosine_similarity

# 環境設定

In [4]:
# Directory the input parquet files (train.parquet / test.parquet) are read from.
DATA_PATH = "./data"
# Directory the prediction CSV is written to.
RESULT_PATH = "./results"

# 関数定義

# 分析

## データ読み込み

In [5]:
# Read the training and test tables from the data directory.
data_dir = Path(DATA_PATH)
train_data = pd.read_parquet(data_dir / "train.parquet")
test_data = pd.read_parquet(data_dir / "test.parquet")

## 特徴量選択

In [11]:
# Train/validation split
# TODO: should the split take into account that this is time-series data?
target_col = "target"
# Take the feature names straight from the column index; calling
# DataFrame.drop only to read `.columns` builds a throwaway DataFrame.
feature_cols = train_data.columns.drop(["id", target_col])
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data[feature_cols],
    train_data[target_col],
    test_size=0.3,
    random_state=0,  # fixed seed so the split is identical on every run
)
print(f'X_trainのshape:{X_train.shape}')
print(f'y_trainのshape:{y_train.shape}')
print(f'X_validのshape:{X_valid.shape}')
print(f'y_validのshape:{y_valid.shape}')

X_trainのshape:(462000, 695)
y_trainのshape:(462000,)
X_validのshape:(198000, 695)
y_validのshape:(198000,)


In [10]:
# Load the pickled model (a LightGBM model, judging by the file name).
# The context manager closes the file handle once loading finishes.
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
with open("./models/20230822_lightgbm_ver3.sav", "rb") as model_file:
    model_load = pickle.load(model_file)

In [18]:
# Build a (feature, importance) table from the loaded model, highest score first.
# NOTE(review): scores are labelled with X_train's column order, which assumes
# the model was trained on the same columns in the same order — confirm.
importance = (
    pd.DataFrame(
        model_load.feature_importances_,
        index=X_train.columns,
        columns=["importance"],
    )
    .sort_values(by="importance", ascending=False)
    .reset_index()
    .rename(columns={"index": "feature"})
)
print(importance.shape)
importance.head()

(695, 2)


Unnamed: 0,feature,importance
0,feature_f_000,2613
1,feature_a_205,1828
2,feature_f_001,1791
3,feature_a_186,1752
4,feature_a_210,1746


In [26]:
# Keep the first 100 rows of the importance ranking and restrict both splits
# to those feature columns.
feature_names = importance["feature"].iloc[:100]
X_train_selected = X_train.loc[:, feature_names].copy()
X_valid_selected = X_valid.loc[:, feature_names].copy()
print(f"X_train_selected:{X_train_selected.shape}")
print(f"X_valid_selected:{X_valid_selected.shape}")

X_train_selected:(462000, 100)
X_valid_selected:(198000, 100)


## モデルの学習

In [28]:
# Hyper-parameters handed to the LightGBM regressor
params = dict(
    boosting_type="gbdt",
    objective="regression",
    metric="rmse",
    num_leaves=16,
    learning_rate=0.1,
    n_estimators=30000,  # lets you put a rough upper bound on the computation time
    random_state=0,
)

In [29]:
# Create the estimator instance from the `params` dictionary.
# Note: despite the name `clf`, this is an LGBMRegressor (a regressor).
clf = lgb.LGBMRegressor(**params)

In [31]:
%%time
# モデルの学習
clf.fit(
    X_train_selected, 
    y_train,
    eval_set=[(X_train_selected, y_train),(X_valid_selected, y_valid)],
    early_stopping_rounds=10
)

[1]	training's rmse: 0.999085	valid_1's rmse: 1.00125
[2]	training's rmse: 0.998846	valid_1's rmse: 1.00108
[3]	training's rmse: 0.998598	valid_1's rmse: 1.0009
[4]	training's rmse: 0.998368	valid_1's rmse: 1.0007
[5]	training's rmse: 0.998163	valid_1's rmse: 1.00054
[6]	training's rmse: 0.99794	valid_1's rmse: 1.00041
[7]	training's rmse: 0.997729	valid_1's rmse: 1.00026
[8]	training's rmse: 0.997532	valid_1's rmse: 1.00012
[9]	training's rmse: 0.997344	valid_1's rmse: 1.00002
[10]	training's rmse: 0.997147	valid_1's rmse: 0.999886
[11]	training's rmse: 0.996927	valid_1's rmse: 0.999683
[12]	training's rmse: 0.996749	valid_1's rmse: 0.999553
[13]	training's rmse: 0.996592	valid_1's rmse: 0.999428
[14]	training's rmse: 0.996441	valid_1's rmse: 0.999322
[15]	training's rmse: 0.996259	valid_1's rmse: 0.999193
[16]	training's rmse: 0.996099	valid_1's rmse: 0.999097
[17]	training's rmse: 0.995918	valid_1's rmse: 0.998991
[18]	training's rmse: 0.995776	valid_1's rmse: 0.99889
[19]	training'

In [33]:
# Score the fitted model on both splits with the project's cosine_similarity metric.
y_train_pred = clf.predict(X_train_selected)
y_valid_pred = clf.predict(X_valid_selected)

train_score = cosine_similarity(y_train, y_train_pred)
print(f"trainの精度：{train_score}")

valid_score = cosine_similarity(y_valid, y_valid_pred)
print(f"validの精度：{valid_score}")

trainの精度：0.9507245239718416
validの精度：0.7590429137448513


## テストデータの予測

In [34]:
# Predict on the test set using the same selected feature columns as in training.
id_test = test_data.loc[:, "id"].values.reshape(-1)
X_test = test_data.drop(columns="id").copy()
X_test_selected = X_test.loc[:, feature_names].copy()
y_test_pred = clf.predict(X_test_selected)

In [35]:
# Save the predictions as a two-column CSV (id, target).
result = (
    pd.DataFrame(data=y_test_pred, index=id_test, columns=["target"])
    .reset_index()
    .rename(columns={"index": "id"})
)
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (otherwise the predictions would be lost to an OSError).
output_dir = Path(RESULT_PATH)
output_dir.mkdir(parents=True, exist_ok=True)
result.to_csv(output_dir / "20230823_result.csv", index=False)