In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px

In [20]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# 環境設定

In [2]:
# Directory that holds the input CSV files; the relative path is resolved
# against the notebook's current working directory.
DATA_PATH = "./data"

# 関数定義

In [6]:
def count_feature_xs(df, feature_x: str):
    """Count the columns of ``df`` whose name contains ``feature_x``.

    The match is a plain substring test (``feature_x in column_name``), not a
    prefix match, so any column name containing the text anywhere is counted.

    Args:
        df: Object exposing a ``columns`` iterable of column names.
        feature_x: Text to look for inside each column name.

    Returns:
        The number of matching columns as an ``int``.
    """
    return sum(1 for column_name in df.columns if feature_x in column_name)

In [58]:
def cosine_similarity(y_true, y_pred):
    """Cosine similarity between targets and predictions, as a LightGBM eval metric.

    Both inputs are converted to flat 1-D float arrays, so Series, DataFrame
    columns, lists and ``(n, 1)`` column vectors are all accepted.

    Args:
        y_true: Array-like of ground-truth target values.
        y_pred: Array-like of predicted values, with the same number of elements.

    Returns:
        A tuple ``("cosine_similarity", score, True)``: the metric name, its
        value, and a flag telling the caller that higher is better.
        ``score`` is ``0.0`` when either vector has zero norm, because the
        similarity is undefined there and a NaN would break comparisons.
    """
    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_pred = np.asarray(y_pred, dtype=float).reshape(-1)

    denominator = np.linalg.norm(y_true) * np.linalg.norm(y_pred)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        score = 0.0
    else:
        score = float(np.dot(y_true, y_pred) / denominator)
    return "cosine_similarity", score, True

# 分析

## データ読み込み

In [4]:
# Load the training set from train.csv inside the data directory.
train_csv_path = Path(DATA_PATH) / "train.csv"
train_data = pd.read_csv(train_csv_path)

## EDA

In [5]:
train_data.head()

Unnamed: 0,id,feature_a_000,feature_a_001,feature_a_002,feature_a_003,feature_a_004,feature_a_005,feature_a_006,feature_a_007,feature_a_008,...,feature_d_131,feature_d_132,feature_d_133,feature_d_134,feature_f_000,feature_f_001,feature_f_002,feature_f_003,feature_f_004,target
0,0,4,5,4,5,5,1,3,1,4,...,4,2,3,2,4,2,4,1,4,0.000415
1,1,5,4,5,4,5,3,2,3,3,...,3,1,2,1,5,1,3,1,3,0.000415
2,2,1,1,5,1,2,1,1,3,1,...,4,4,3,2,1,2,3,1,3,3.039352
3,3,5,3,4,5,4,5,5,2,3,...,2,1,3,2,3,4,2,4,2,-0.512833
4,4,1,4,1,4,2,1,1,1,2,...,2,3,2,4,2,5,1,5,1,0.000415


In [16]:
# Overall shape and id uniqueness of the training data.
print(f"学習データのshape：{train_data.shape}")
print(f"ユニークID数：{train_data['id'].nunique()}")

# Number of columns in each feature group, feature_a through feature_f.
feature_xs = [f"feature_{group}" for group in "abcdef"]
for feature_x in feature_xs:
    n_columns = count_feature_xs(train_data, feature_x)
    print(f'{feature_x}の個数：{n_columns}')

学習データのshape：(660000, 697)
ユニークID数：660000
feature_aの個数：285
feature_bの個数：135
feature_cの個数：135
feature_dの個数：135
feature_eの個数：0
feature_fの個数：5


## モデルの学習

In [47]:
# Subsample 10% of the rows to shorten training time.
# The seed is fixed so that a re-run draws the same rows; it matches the
# random_state used for the model.
sample_train_data = train_data.sample(frac=0.1, random_state=0)

In [48]:
# Split the sampled data into training (70%) and validation (30%) parts.
target_col = "target"
# Every column except the row identifier and the target is used as a feature.
feature_cols = sample_train_data.drop(columns=["id", target_col]).columns
X_train, X_valid, y_train, y_valid = train_test_split(
    sample_train_data[feature_cols],
    sample_train_data[target_col],
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible across runs
)
print(f'X_trainのshape:{X_train.shape}')
print(f'y_trainのshape:{y_train.shape}')
print(f'X_validのshape:{X_valid.shape}')
print(f'y_validのshape:{y_valid.shape}')

X_trainのshape:(46200, 695)
y_trainのshape:(46200,)
X_validのshape:(19800, 695)
y_validのshape:(19800,)


In [49]:
# Hyperparameters for the LightGBM regressor.
# 'metric' is the string 'None' because evaluation uses the custom metric
# passed to fit(); n_estimators is set very high and training is expected
# to be cut short by early stopping.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='None',
    num_leaves=16,
    learning_rate=0.1,
    n_estimators=100000,
    random_state=0,
)

In [50]:
# Instantiate the LightGBM regressor with the hyperparameters in `params`.
clf = lgb.LGBMRegressor(**params)

In [51]:
# Train the model.
# Both splits are passed as eval sets so the custom cosine-similarity metric
# is reported for each of them.
# Early stopping is configured through the callback API: the
# `early_stopping_rounds` keyword of fit() was deprecated and then removed
# in LightGBM 4.0, whereas the callback works on old and new releases.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric=cosine_similarity,
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)



[1]	training's cosine_similarity: 0.0974995	valid_1's cosine_similarity: 0.0320802
[2]	training's cosine_similarity: 0.132258	valid_1's cosine_similarity: 0.0325134
[3]	training's cosine_similarity: 0.154967	valid_1's cosine_similarity: 0.043343
[4]	training's cosine_similarity: 0.167521	valid_1's cosine_similarity: 0.0390904
[5]	training's cosine_similarity: 0.182614	valid_1's cosine_similarity: 0.0397032
[6]	training's cosine_similarity: 0.190145	valid_1's cosine_similarity: 0.0400236
[7]	training's cosine_similarity: 0.201617	valid_1's cosine_similarity: 0.0408597
[8]	training's cosine_similarity: 0.208086	valid_1's cosine_similarity: 0.0394109
[9]	training's cosine_similarity: 0.21875	valid_1's cosine_similarity: 0.0415791
[10]	training's cosine_similarity: 0.224351	valid_1's cosine_similarity: 0.041644
[11]	training's cosine_similarity: 0.229944	valid_1's cosine_similarity: 0.0422214
[12]	training's cosine_similarity: 0.237375	valid_1's cosine_similarity: 0.0463554
[13]	training's

## 予測

In [61]:
# Evaluation score: cosine similarity between targets and predictions on
# the training and validation splits.
y_train_pred = clf.predict(X_train)
y_valid_pred = clf.predict(X_valid)

# cosine_similarity returns (name, score, is_higher_better); keep the score.
train_score = cosine_similarity(y_train, y_train_pred)[1]
valid_score = cosine_similarity(y_valid, y_valid_pred)[1]
print(f"trainの精度：{train_score}")
print(f"validの精度：{valid_score}")

trainの精度：0.9790831427870641
validの精度：0.17378829580373475
