In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import time

In [2]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [3]:
from src.metrics import cosine_similarity

# 環境設定

In [4]:
DATA_PATH = "./data"
RESULT_PATH = "./results"

# 関数定義

# 分析

## データ読み込み

In [5]:
train_data = pd.read_parquet(Path(DATA_PATH, "train.parquet"))
test_data = pd.read_parquet(Path(DATA_PATH, "test.parquet"))

## 次元削減

In [6]:
# サンプリング（時間短縮のため）
sample_train_data = train_data.sample(frac=1)

In [21]:
# データ分割
feature_cols = sample_train_data.drop(columns=["id", "target"]).columns
target_col = "target"
X = sample_train_data[feature_cols]
y = sample_train_data[target_col]

In [33]:
# 主成分分析
pca = PCA(n_components=30)
X_pca = pca.fit_transform(X)

In [34]:
print(f"pca前のshape: {X.shape}")
print(f"pca後のshape: {X_pca.shape}")

pca前のshape: (660000, 695)
pca後のshape: (660000, 30)


## モデルの学習

In [35]:
X_train, X_valid, y_train, y_valid = train_test_split(X_pca, y, test_size=0.3)
print(f'X_trainのshape:{X_train.shape}')
print(f'y_trainのshape:{y_train.shape}')
print(f'X_validのshape:{X_valid.shape}')
print(f'y_validのshape:{y_valid.shape}')

X_trainのshape:(462000, 30)
y_trainのshape:(462000,)
X_validのshape:(198000, 30)
y_validのshape:(198000,)


In [36]:
# パラメータの準備
params = {
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric': 'rmse',
    'num_leaves':16,
    'learning_rate':0.1,
    'n_estimators':100000,
    'random_state':0
}

In [37]:
# インスタンスの作成
clf = lgb.LGBMRegressor(**params)

In [38]:
# モデルの学習
start_time = time.time()
clf.fit(
    X_train, 
    y_train,
    eval_set=[(X_train, y_train),(X_valid, y_valid)],
    early_stopping_rounds=100
)
print(f"学習にかかった時間：{time.time() - start_time}")



[1]	training's rmse: 1.00045	valid_1's rmse: 0.998201
[2]	training's rmse: 1.00026	valid_1's rmse: 0.998089
[3]	training's rmse: 1.00006	valid_1's rmse: 0.997994
[4]	training's rmse: 0.999868	valid_1's rmse: 0.9979
[5]	training's rmse: 0.999706	valid_1's rmse: 0.997816
[6]	training's rmse: 0.999537	valid_1's rmse: 0.997707
[7]	training's rmse: 0.999396	valid_1's rmse: 0.997614
[8]	training's rmse: 0.999247	valid_1's rmse: 0.997525
[9]	training's rmse: 0.999114	valid_1's rmse: 0.997454
[10]	training's rmse: 0.998974	valid_1's rmse: 0.997403
[11]	training's rmse: 0.998865	valid_1's rmse: 0.997342
[12]	training's rmse: 0.99872	valid_1's rmse: 0.997266
[13]	training's rmse: 0.998603	valid_1's rmse: 0.997214
[14]	training's rmse: 0.99849	valid_1's rmse: 0.997171
[15]	training's rmse: 0.998369	valid_1's rmse: 0.997134
[16]	training's rmse: 0.998251	valid_1's rmse: 0.99709
[17]	training's rmse: 0.998145	valid_1's rmse: 0.997036
[18]	training's rmse: 0.998019	valid_1's rmse: 0.997004
[19]	trai

In [39]:
# 精度
y_train_pred = clf.predict(X_train)
y_valid_pred = clf.predict(X_valid)
print(f"trainの精度：{cosine_similarity(y_train, y_train_pred)}")
print(f"validの精度：{cosine_similarity(y_valid, y_valid_pred)}")

trainの精度：0.7165653300734072
validの精度：0.18483027607534092


## テストデータの予測

In [40]:
# 予測
id_test = test_data["id"].values.reshape(-1)
X_test = test_data.drop(columns=["id"]).copy()
X_test_pca = pca.transform(X_test)
y_test_pred = clf.predict(X_test_pca)

In [41]:
pd.DataFrame(y_test_pred)

Unnamed: 0,0
0,0.317946
1,0.163241
2,-0.069377
3,0.152284
4,-0.087966
...,...
261656,-0.103450
261657,0.477344
261658,0.032422
261659,-0.018614


In [42]:
# 保存
result = pd.DataFrame(data=y_test_pred, index=id_test, columns=["target"]).reset_index().rename(columns={"index": "id"})
result.to_csv(Path(RESULT_PATH, "20230818_result_pca_30%.csv"), index=False)