In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import time

In [2]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# 環境設定

In [3]:
DATA_PATH = "./data"
RESULT_PATH = "./results"

# 関数定義

In [4]:
def cosine_similarity(y_true, y_pred):
    if isinstance(y_true, pd.Series):
        y_true = y_true.values.reshape(-1)
    if isinstance(y_pred, pd.Series):
        y_pred = y_pred.values.reshape(-1)
    score = np.dot(y_true, y_pred) / (np.linalg.norm(y_true) * np.linalg.norm(y_pred))
    return score

# 分析

## データ読み込み

In [5]:
train_data = pd.read_parquet(Path(DATA_PATH, "train.parquet"))
test_data = pd.read_parquet(Path(DATA_PATH, "test.parquet"))

## モデルの学習

In [12]:
# サンプリング（時間短縮のため）
sample_train_data = train_data.sample(frac=0.2)

In [13]:
# データ分割
# TODO: 時系列データであることを考慮すべき？
feature_cols = sample_train_data.drop(columns=["id", "target"]).columns
target_col = "target"
X_train, X_valid, y_train, y_valid = train_test_split(
    sample_train_data[feature_cols], 
    sample_train_data[target_col], 
    test_size=0.3
)
print(f'X_trainのshape:{X_train.shape}')
print(f'y_trainのshape:{y_train.shape}')
print(f'X_validのshape:{X_valid.shape}')
print(f'y_validのshape:{y_valid.shape}')

X_trainのshape:(462000, 695)
y_trainのshape:(462000,)
X_validのshape:(198000, 695)
y_validのshape:(198000,)


In [14]:
# パラメータの準備
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric': 'rmse',
    'num_leaves':16,
    'learning_rate':0.1,
    'n_estimators':100000,
    'random_state':0
}

In [15]:
# インスタンスの作成
clf = lgb.LGBMRegressor(**params)

In [16]:
# モデルの学習
start_time = time.time()
clf.fit(
    X_train, 
    y_train,
    eval_set=[(X_train, y_train),(X_valid, y_valid)],
    early_stopping_rounds=100
)
print(f"学習にかかった時間：{time.time() - start_time}")



[1]	training's rmse: 0.999121	valid_1's rmse: 1.00086
[2]	training's rmse: 0.998823	valid_1's rmse: 1.00063
[3]	training's rmse: 0.9985	valid_1's rmse: 1.00039
[4]	training's rmse: 0.998214	valid_1's rmse: 1.00016
[5]	training's rmse: 0.99796	valid_1's rmse: 0.999965
[6]	training's rmse: 0.997724	valid_1's rmse: 0.999764
[7]	training's rmse: 0.997486	valid_1's rmse: 0.999577
[8]	training's rmse: 0.997255	valid_1's rmse: 0.999388
[9]	training's rmse: 0.997013	valid_1's rmse: 0.999202
[10]	training's rmse: 0.996826	valid_1's rmse: 0.999062
[11]	training's rmse: 0.99663	valid_1's rmse: 0.998897
[12]	training's rmse: 0.996399	valid_1's rmse: 0.998734
[13]	training's rmse: 0.99619	valid_1's rmse: 0.998586
[14]	training's rmse: 0.995988	valid_1's rmse: 0.998431
[15]	training's rmse: 0.995821	valid_1's rmse: 0.998297
[16]	training's rmse: 0.995634	valid_1's rmse: 0.998187
[17]	training's rmse: 0.995461	valid_1's rmse: 0.998035
[18]	training's rmse: 0.995297	valid_1's rmse: 0.997943
[19]	train

KeyboardInterrupt: 

In [11]:
# 精度
y_train_pred = clf.predict(X_train)
y_valid_pred = clf.predict(X_valid)
print(f"trainの精度：{cosine_similarity(y_train, y_train_pred)}")
print(f"validの精度：{cosine_similarity(y_valid, y_valid_pred)}")

trainの精度：0.6031205117146435
validの精度：0.18877603184573888


## テストデータの予測

In [28]:
# 予測
id_test = test_data["id"].values.reshape(-1)
X_test = test_data.drop(columns=["id"]).copy()
y_test_pred = clf.predict(X_test)

In [29]:
pd.DataFrame(y_test_pred)

Unnamed: 0,0
0,-0.537965
1,-0.062307
2,-0.049181
3,-0.126975
4,-0.271614
...,...
261656,-0.057380
261657,0.096382
261658,-0.125886
261659,-0.077150


In [31]:
# 保存
result = pd.DataFrame(data=y_test_pred, index=id_test, columns=["target"]).reset_index().rename(columns={"index": "id"})
result.to_csv(Path(RESULT_PATH, "20230817_result.csv"), index=False)