In [None]:
import pandas as pd
# LightGBMをインポート
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the training data.
df = pd.read_csv('../data/mufg/train.csv')

# Cast the categorical feature columns to the pandas `category` dtype.
categorical_features = [
    'Educ_background', 'Incumbent', 'Mar_relationship', 'Mortgage',
    'Fin_loan', 'Default', 'Prev_result', 'Month', 'Cont_means'
]
df[categorical_features] = df[categorical_features].astype('category')

# Split into features and target: drop the identifier column and the target.
X = df.drop(columns=["Id", "Y"])
y = df['Y']

# Hold out 20% of the rows for validation (fixed seed for a reproducible split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LightGBM datasets; the validation set references the training set
# so that both share the same feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Hyperparameters.
params = {
    'objective': 'regression',    # regression task
    'metric': 'rmse',             # evaluation metric (root mean squared error)
}

# Train the model.
# `num_round` is an upper bound: training stops early once the validation RMSE
# has not improved for `early_stopping_rounds` consecutive rounds. Without this
# callback `bst.best_iteration` is never set to a meaningful value and all
# `num_round` trees are always built and used for prediction.
num_round = 1000
early_stopping_rounds = 50
bst = lgb.train(
    params,
    train_data,
    num_round,
    valid_sets=[val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        lgb.log_evaluation(period=100),  # print the validation metric every 100 rounds
    ],
)

# Predict on the validation set using the best iteration found by early stopping.
y_pred = bst.predict(X_val, num_iteration=bst.best_iteration)

# Compute and report the RMSE.
# The square root is taken explicitly because the `squared=False` argument of
# `mean_squared_error` is deprecated/removed in recent scikit-learn releases.
# NOTE: the same validation set also drives early stopping, so this score is
# slightly optimistic; use a separate hold-out set for an unbiased estimate.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
print(f'Validation RMSE: {rmse}')


In [2]:
!pip install lightgbm


Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/f6/9d/fae632fd823b407448b9cd2b28288172c040415e2c9ab401cca9e67b4192/lightgbm-4.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata
  Downloading lightgbm-4.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Downloading lightgbm-4.0.0-py3-none-manylinux_2_28_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.0.0
[0m