↑「ドライブにコピー」をクリックしてください。  
自分のGoogleドライブにコピーされるので、  
次にアクセスするときは、その自分のGoogleドライブにあるものを使ってください。  
「j-league.ipynb のコピー」のようなファイルが、  
「Colab Notebooks」フォルダの中にあると思いますので。  
  
CSVなどは、SIGNATEのコンペサイトの  
「データ」にあります。  
コードに関しては「ナレッジ」にあります。

# SIGNATEのJリーグの観客動員数予測向けColab

## ライブラリ等の準備  
ここにアクセスするたびに実行する必要あり。  
途中で  
　Press <enter> to keep the current choice[*], or type selection number:  
と聞かれれば、「1」を入力する。

In [None]:
# Python 3.10をインストール
!sudo apt-get update -y
!sudo apt-get install python3.10 python3.10-distutils

# デフォルトのpython3を3.10に切り替える
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
!sudo update-alternatives --config python3

# pipをインストール
!curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
!python3 get-pip.py

# バージョンを確認
!python3 --version

# ライブラリをインストール
!pip install numpy==1.23.1 pandas==1.4.4 matplotlib==3.6.1 scikit-learn==1.1.1


0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done

Note, selecting 'python3-distutils' instead of 'python3.10-distutils'
python3-distutils is already t

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE

# --------------------------------------------------
# 1. Load the data
# --------------------------------------------------
# All CSV files are read from the current working directory.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# header=None: the submission template is read without a header row,
# so its columns are addressed by integer position (0, 1, ...).
sample_submit = pd.read_csv("sample_submit.csv", header=None)

print("データの読み込み完了")

# --------------------------------------------------
# 2. Combine (merge) the data
# --------------------------------------------------
# Stack the base training rows and the additional training rows vertically
train_combined = pd.concat([train, train_add], sort=False).reset_index(drop=True)
# Stack the match-condition rows the same way
condition_combined = pd.concat([condition, condition_add], sort=False).reset_index(drop=True)

# First attach the match conditions on "id", then attach the stadium table.
# The stadium key has a different name on each side ('stadium' in train/test,
# 'name' in stadium.csv), hence left_on / right_on.
train_all = pd.merge(train_combined, condition_combined, on="id", how="left")
train_all = pd.merge(train_all, stadium, left_on="stadium", right_on="name", how="left")

test_all = pd.merge(test, condition_combined, on="id", how="left")
test_all = pd.merge(test_all, stadium, left_on="stadium", right_on="name", how="left")

print("データの結合完了")
print(f"学習データ数: {len(train_all)}, 評価データ数: {len(test_all)}")
# --------------------------------------------------
# 3. 特徴量の作成 (修正版)
# --------------------------------------------------
# 日付処理
def process_gameday(df):
    """Add "month" and "week" columns derived from the "gameday" column.

    Each "gameday" value is treated as text: the part before the first "/"
    is converted to an integer and stored in "month"; the first character
    following the first "(" is stored in "week".

    The frame is modified in place and also returned.
    """
    def _month(text):
        return int(text.split("/")[0])

    def _weekday(text):
        after_paren = text.split("(")[1]
        return after_paren[0]

    gameday = df["gameday"]
    df["month"] = gameday.map(_month)
    df["week"] = gameday.map(_weekday)
    return df

# Add the "month" / "week" columns to both the training and the test frame.
train_all = process_gameday(train_all)
test_all = process_gameday(test_all)

# TV放送局の数をカウント（欠損値対策を追加）
def count_tv(x):
    """Return how many "／"-separated entries the TV field contains.

    A missing value (as judged by ``pd.isnull``) counts as 0. Anything else is
    converted to ``str`` first; the result is the number of full-width slashes
    plus one.
    """
    return 0 if pd.isnull(x) else str(x).count("／") + 1

# Derive "tv_num" from the "tv" column of each frame via count_tv.
train_all["tv_num"] = train_all["tv"].apply(count_tv)
test_all["tv_num"] = test_all["tv"].apply(count_tv)

# 天気の集約
def process_weather(x):
    """Collapse a weather description into "rain", "sunny" or "other".

    Missing values map to "other". Otherwise the text is "rain" if it contains
    "雨", else "sunny" if it contains "晴", else "other" (rain wins when both
    characters are present).
    """
    if not pd.isnull(x):
        for keyword, label in (("雨", "rain"), ("晴", "sunny")):
            if keyword in x:
                return label
    return "other"

# Simplified weather category for both frames.
train_all["weather_simple"] = train_all["weather"].apply(process_weather)
test_all["weather_simple"] = test_all["weather"].apply(process_weather)

# Stack train (without the target) and test so that they are dummy-encoded
# together; "is_train" remembers which rows came from where.
train_all["is_train"] = 1
test_all["is_train"] = 0
combined = pd.concat([train_all.drop("y", axis=1), test_all], sort=False)

# --- [Important] dummy-variable fix ---
# Without a prefix the generated names could clash with existing string
# columns such as "home_team", so every dummy column gets a "D_" prefix.
cols_to_dummy = ["week", "stage", "weather_simple", "home", "away"]
prefixes = ["D_week", "D_stage", "D_weather", "D_home", "D_away"]

combined_dummies = pd.get_dummies(combined, columns=cols_to_dummy, prefix=prefixes)

# --------------------------------------------------
# 4. Prepare the training data (fixed version)
# --------------------------------------------------
# Feature selection:
# keep only the columns starting with "D_" (the dummies created above)
# plus three numeric columns.
dummy_cols = [c for c in combined_dummies.columns if c.startswith("D_")]
features = ["capa", "month", "tv_num"] + dummy_cols

# Split the combined frame back into train / test using the marker column.
X_train = combined_dummies[combined_dummies["is_train"] == 1][features]
y_train = train_all["y"]
X_test = combined_dummies[combined_dummies["is_train"] == 0][features]

# Force everything to numeric (just in case) and fill missing values with 0.
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

print(f"特徴量の数: {len(features)}")
print("学習データの準備完了")

# --------------------------------------------------
# 5. Fit the model and predict
# --------------------------------------------------
from sklearn.ensemble import RandomForestRegressor

# Build and fit the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score on the training data. The model was fitted on these very rows,
# so this is a training error, not an estimate of generalisation.
pred_train = model.predict(X_train)
rmse = np.sqrt(MSE(y_train, pred_train))
print(f"学習データのRMSEスコア: {rmse}")

# Predict the test data
pred_test = model.predict(X_test)

# --------------------------------------------------
# 6. Write the submission file
# --------------------------------------------------
# Column 1 of the header-less template receives the predictions.
sample_submit[1] = pred_test
sample_submit.to_csv("submit_rf_fixed.csv", index=False, header=False)

print("提出用ファイル 'submit_rf_fixed.csv' を作成しました。")

データの読み込み完了
データの結合完了
学習データ数: 1953, 評価データ数: 313
特徴量の数: 101
学習データの準備完了
学習データのRMSEスコア: 1288.4882168910697
提出用ファイル 'submit_rf_fixed.csv' を作成しました。


In [None]:
# --------------------------------------------------
# 線形回帰で実行するコード
# --------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression # 線形回帰
from sklearn.metrics import mean_squared_error as MSE

# ※データ読み込み（train, testなど）は完了している前提で進めます

# --- 1. 特徴量の作成（前処理） ---
# 日付処理
def process_gameday(df):
    """Add "month" and "week" columns derived from the "gameday" column.

    Each "gameday" value is treated as text: the part before the first "/"
    is converted to an integer and stored in "month"; the first character
    following the first "(" is stored in "week".

    The frame is modified in place and also returned.
    """
    def _month(text):
        return int(text.split("/")[0])

    def _weekday(text):
        after_paren = text.split("(")[1]
        return after_paren[0]

    gameday = df["gameday"]
    df["month"] = gameday.map(_month)
    df["week"] = gameday.map(_weekday)
    return df

# train_all / test_all are not built in this cell; they are expected to exist
# already from an earlier cell. (Re)compute their "month" / "week" columns.
train_all = process_gameday(train_all)
test_all = process_gameday(test_all)

# TV放送局の数をカウント
def count_tv(x):
    """Return how many "／"-separated entries the TV field contains.

    A missing value (as judged by ``pd.isnull``) counts as 0. Anything else is
    converted to ``str`` first; the result is the number of full-width slashes
    plus one.
    """
    return 0 if pd.isnull(x) else str(x).count("／") + 1

# Derive "tv_num" from the "tv" column of each frame via count_tv.
train_all["tv_num"] = train_all["tv"].apply(count_tv)
test_all["tv_num"] = test_all["tv"].apply(count_tv)

# 天気の集約
def process_weather(x):
    """Collapse a weather description into "rain", "sunny" or "other".

    Missing values map to "other". Otherwise the text is "rain" if it contains
    "雨", else "sunny" if it contains "晴", else "other" (rain wins when both
    characters are present).
    """
    if not pd.isnull(x):
        for keyword, label in (("雨", "rain"), ("晴", "sunny")):
            if keyword in x:
                return label
    return "other"

# Simplified weather category for both frames.
train_all["weather_simple"] = train_all["weather"].apply(process_weather)
test_all["weather_simple"] = test_all["weather"].apply(process_weather)

# Stack train (without the target) and test so that they are dummy-encoded
# together; "is_train" remembers which rows came from where.
train_all["is_train"] = 1
test_all["is_train"] = 0
combined = pd.concat([train_all.drop("y", axis=1), test_all], sort=False)

# --- 2. Dummy encoding (with the error workaround) ---
# A prefix is attached so that the dummy columns can be told apart from the
# remaining string columns.
cols_to_dummy = ["week", "stage", "weather_simple", "home", "away"]
prefixes = ["D_week", "D_stage", "D_weather", "D_home", "D_away"]

combined_dummies = pd.get_dummies(combined, columns=cols_to_dummy, prefix=prefixes)

# --- 3. Prepare the training data ---
# Select the features: three numeric columns plus every "D_" dummy column.
dummy_cols = [c for c in combined_dummies.columns if c.startswith("D_")]
features = ["capa", "month", "tv_num"] + dummy_cols

# Split the combined frame back into train / test using the marker column.
X_train = combined_dummies[combined_dummies["is_train"] == 1][features]
y_train = train_all["y"]
X_test = combined_dummies[combined_dummies["is_train"] == 0][features]

# Linear regression does not tolerate missing values, so fill them with 0.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

print(f"特徴量の数: {len(features)}")

# --- 4. Fit the model and predict (linear regression) ---
model = LinearRegression()
model.fit(X_train, y_train)

# Score on the training data. The model was fitted on these very rows,
# so this is a training error, not an estimate of generalisation.
pred_train = model.predict(X_train)
rmse = np.sqrt(MSE(y_train, pred_train))
print(f"学習データのRMSEスコア: {rmse}")

# Predict the test data
pred_test = model.predict(X_test)

# --- 5. Write the submission file ---
# Column 1 of the header-less template receives the predictions.
sample_submit[1] = pred_test
sample_submit.to_csv("submit_linear.csv", index=False, header=False)

print("提出用ファイル 'submit_linear.csv' を作成しました。")

特徴量の数: 101
学習データのRMSEスコア: 3186.218184369981
提出用ファイル 'submit_linear.csv' を作成しました。


In [None]:
# --------------------------------------------------
# 3. 特徴量の作成 (修正版)
# --------------------------------------------------
# 日付処理
def process_gameday(df):
    """Add "month" and "week" columns derived from the "gameday" column.

    Each "gameday" value is treated as text: the part before the first "/"
    is converted to an integer and stored in "month"; the first character
    following the first "(" is stored in "week".

    The frame is modified in place and also returned.
    """
    def _month(text):
        return int(text.split("/")[0])

    def _weekday(text):
        after_paren = text.split("(")[1]
        return after_paren[0]

    gameday = df["gameday"]
    df["month"] = gameday.map(_month)
    df["week"] = gameday.map(_weekday)
    return df

# train_all / test_all are not built in this cell; they are expected to exist
# already from an earlier cell. (Re)compute their "month" / "week" columns.
train_all = process_gameday(train_all)
test_all = process_gameday(test_all)

# TV放送局の数をカウント（欠損値対策を追加）
def count_tv(x):
    """Return how many "／"-separated entries the TV field contains.

    A missing value (as judged by ``pd.isnull``) counts as 0. Anything else is
    converted to ``str`` first; the result is the number of full-width slashes
    plus one.
    """
    return 0 if pd.isnull(x) else str(x).count("／") + 1

# Derive "tv_num" from the "tv" column of each frame via count_tv.
train_all["tv_num"] = train_all["tv"].apply(count_tv)
test_all["tv_num"] = test_all["tv"].apply(count_tv)

# 天気の集約
def process_weather(x):
    """Collapse a weather description into "rain", "sunny" or "other".

    Missing values map to "other". Otherwise the text is "rain" if it contains
    "雨", else "sunny" if it contains "晴", else "other" (rain wins when both
    characters are present).
    """
    if not pd.isnull(x):
        for keyword, label in (("雨", "rain"), ("晴", "sunny")):
            if keyword in x:
                return label
    return "other"

# Simplified weather category for both frames.
train_all["weather_simple"] = train_all["weather"].apply(process_weather)
test_all["weather_simple"] = test_all["weather"].apply(process_weather)

# Stack train (without the target) and test so that they are dummy-encoded
# together; "is_train" remembers which rows came from where.
train_all["is_train"] = 1
test_all["is_train"] = 0
combined = pd.concat([train_all.drop("y", axis=1), test_all], sort=False)

# --- [Important] dummy-variable fix ---
# Without a prefix the generated names could clash with existing string
# columns such as "home_team", so every dummy column gets a "D_" prefix.
cols_to_dummy = ["week", "stage", "weather_simple", "home", "away"]
prefixes = ["D_week", "D_stage", "D_weather", "D_home", "D_away"]

combined_dummies = pd.get_dummies(combined, columns=cols_to_dummy, prefix=prefixes)

# --------------------------------------------------
# 4. Prepare the training data (fixed version)
# --------------------------------------------------
# Feature selection:
# keep only the columns starting with "D_" (the dummies created above)
# plus three numeric columns.
dummy_cols = [c for c in combined_dummies.columns if c.startswith("D_")]
features = ["capa", "month", "tv_num"] + dummy_cols

# Split the combined frame back into train / test using the marker column.
X_train = combined_dummies[combined_dummies["is_train"] == 1][features]
y_train = train_all["y"]
X_test = combined_dummies[combined_dummies["is_train"] == 0][features]

# Force everything to numeric (just in case) and fill missing values with 0.
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

print(f"特徴量の数: {len(features)}")
print("学習データの準備完了")

# --------------------------------------------------
# 5. Fit the model and predict
# --------------------------------------------------
from sklearn.ensemble import RandomForestRegressor

# Build and fit the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score on the training data. The model was fitted on these very rows,
# so this is a training error, not an estimate of generalisation.
pred_train = model.predict(X_train)
rmse = np.sqrt(MSE(y_train, pred_train))
print(f"学習データのRMSEスコア: {rmse}")

# Predict the test data
pred_test = model.predict(X_test)

# --------------------------------------------------
# 6. Write the submission file
# --------------------------------------------------
# Column 1 of the header-less template receives the predictions.
sample_submit[1] = pred_test
sample_submit.to_csv("submit_rf_fixed.csv", index=False, header=False)

print("提出用ファイル 'submit_rf_fixed.csv' を作成しました。")

特徴量の数: 101
学習データの準備完了
学習データのRMSEスコア: 1288.4882168910697
提出用ファイル 'submit_rf_fixed.csv' を作成しました。


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split

# --------------------------------------------------
# 1. データの読み込み
# --------------------------------------------------
# All CSV files are read from the current working directory.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# header=None: the submission template is read without a header row.
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --------------------------------------------------
# 2. Combine the data
# --------------------------------------------------
# Stack base + additional rows vertically for both training and condition data.
train_combined = pd.concat([train, train_add], sort=False).reset_index(drop=True)
condition_combined = pd.concat([condition, condition_add], sort=False).reset_index(drop=True)

# Attach match conditions on "id", then the stadium table
# ('stadium' on the left side matches 'name' in stadium.csv).
train_all = pd.merge(train_combined, condition_combined, on="id", how="left")
train_all = pd.merge(train_all, stadium, left_on="stadium", right_on="name", how="left")

test_all = pd.merge(test, condition_combined, on="id", how="left")
test_all = pd.merge(test_all, stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 3. 特徴量の作成
# --------------------------------------------------
# 日付情報の処理
def process_gameday(df):
    """Add "month" and "week" columns derived from the "gameday" column.

    Each "gameday" value is treated as text: the part before the first "/"
    is converted to an integer and stored in "month"; the first character
    following the first "(" is stored in "week".

    The frame is modified in place and also returned.
    """
    def _month(text):
        return int(text.split("/")[0])

    def _weekday(text):
        after_paren = text.split("(")[1]
        return after_paren[0]

    gameday = df["gameday"]
    df["month"] = gameday.map(_month)
    df["week"] = gameday.map(_weekday)
    return df

# Add the "month" / "week" columns to both the training and the test frame.
train_all = process_gameday(train_all)
test_all = process_gameday(test_all)

# TV放送数のカウント
def count_tv(x):
    """Return how many "／"-separated entries the TV field contains.

    A missing value (as judged by ``pd.isnull``) counts as 0. Anything else is
    converted to ``str`` first; the result is the number of full-width slashes
    plus one.
    """
    return 0 if pd.isnull(x) else str(x).count("／") + 1

# Derive "tv_num" from the "tv" column of each frame via count_tv.
train_all["tv_num"] = train_all["tv"].apply(count_tv)
test_all["tv_num"] = test_all["tv"].apply(count_tv)

# 天気の集約
def process_weather(x):
    """Collapse a weather description into "rain", "sunny" or "other".

    Missing values map to "other". Otherwise the text is "rain" if it contains
    "雨", else "sunny" if it contains "晴", else "other" (rain wins when both
    characters are present).
    """
    if not pd.isnull(x):
        for keyword, label in (("雨", "rain"), ("晴", "sunny")):
            if keyword in x:
                return label
    return "other"

# Simplified weather category for both frames.
train_all["weather_simple"] = train_all["weather"].apply(process_weather)
test_all["weather_simple"] = test_all["weather"].apply(process_weather)

# --------------------------------------------------
# 4. Dummy encoding
# --------------------------------------------------
# Stack train (without the target) and test so that they are dummy-encoded
# together; "is_train" remembers which rows came from where.
train_all["is_train"] = 1
test_all["is_train"] = 0
combined = pd.concat([train_all.drop("y", axis=1), test_all], sort=False)

cols_to_dummy = ["week", "stage", "weather_simple", "home", "away"]
prefixes = ["D_week", "D_stage", "D_weather", "D_home", "D_away"]

combined_dummies = pd.get_dummies(combined, columns=cols_to_dummy, prefix=prefixes)

# --------------------------------------------------
# 5. Prepare the training data (hold-out validation)
# --------------------------------------------------
# Features: three numeric columns plus every "D_" dummy column.
dummy_cols = [c for c in combined_dummies.columns if c.startswith("D_")]
features = ["capa", "month", "tv_num"] + dummy_cols

# Split the combined frame back into train / test using the marker column.
X_full = combined_dummies[combined_dummies["is_train"] == 1][features]
y_full = train_all["y"]
X_test = combined_dummies[combined_dummies["is_train"] == 0][features]

# Missing-value handling: coerce to numeric, then fill with 0.
X_full = X_full.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# [Hold-out method]
# Split the full training data once more into a training part and a
# validation part (80:20).
# random_state=42 fixes the split so it is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

print(f"分割後の学習データ数: {len(X_train)}")
print(f"分割後の検証データ数: {len(X_valid)}")

# --------------------------------------------------
# 6. Fit and evaluate the model (random forest)
# --------------------------------------------------
# Parameters chosen to limit overfitting
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42
)

# Fit on the training part of the split only
model.fit(X_train, y_train)

# Check the accuracy on both parts
pred_train = model.predict(X_train)
rmse_train = np.sqrt(MSE(y_train, pred_train))

pred_valid = model.predict(X_valid)
rmse_valid = np.sqrt(MSE(y_valid, pred_valid))

print("-" * 30)
print(f"学習データRMSE: {rmse_train:.2f}")
print(f"検証データRMSE: {rmse_valid:.2f}") # this is the metric that matters
print("-" * 30)

# --------------------------------------------------
# 7. Refit on all data and write the submission file
# --------------------------------------------------
# Once validation is done, build the final model on every training row,
# using the same hyper-parameters as above.
model_final = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42
)
model_final.fit(X_full, y_full)

pred_test = model_final.predict(X_test)

# Column 1 of the header-less template receives the predictions.
sample_submit[1] = pred_test
sample_submit.to_csv("submit_rf_holdout.csv", index=False, header=False)

print("提出用ファイル 'submit_rf_holdout.csv' を作成しました。")

分割後の学習データ数: 1562
分割後の検証データ数: 391
------------------------------
学習データRMSE: 2907.79
検証データRMSE: 4090.85
------------------------------
提出用ファイル 'submit_rf_holdout.csv' を作成しました。


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split

# --------------------------------------------------
# 1. データの読み込み
# --------------------------------------------------
# All CSV files are read from the current working directory.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# header=None: the submission template is read without a header row.
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --------------------------------------------------
# 2. Combine the data
# --------------------------------------------------
# Stack base + additional rows vertically for both training and condition data.
train_combined = pd.concat([train, train_add], sort=False).reset_index(drop=True)
condition_combined = pd.concat([condition, condition_add], sort=False).reset_index(drop=True)

# Attach match conditions on "id", then the stadium table
# ('stadium' on the left side matches 'name' in stadium.csv).
train_all = pd.merge(train_combined, condition_combined, on="id", how="left")
train_all = pd.merge(train_all, stadium, left_on="stadium", right_on="name", how="left")

test_all = pd.merge(test, condition_combined, on="id", how="left")
test_all = pd.merge(test_all, stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 3. 特徴量の作成
# --------------------------------------------------
# (1) Stadium capacity (capa)
# Coerce to numeric FIRST and take the mean of the coerced values; taking the
# mean of the raw column defeats the purpose of errors="coerce" whenever the
# column really contains non-numeric entries. Each frame is imputed with its
# own mean.
for frame in (train_all, test_all):
    capa_numeric = pd.to_numeric(frame["capa"], errors="coerce")
    frame["capa"] = capa_numeric.fillna(capa_numeric.mean())

# (2) Popularity of the home and away team (target encoding)
# Each team name is replaced by the mean attendance ("y") of that team's
# rows in the training data.
# NOTE(review): the means are computed from ALL training rows, so every
# training row's own target leaks into its encoding. Any hold-out score
# computed on a later split of train_all is therefore likely optimistic.
te_cols = ["home_team", "away_team"]

# Fallback for teams that never occur in the training data.
global_mean = train_all["y"].mean()

for col in te_cols:
    # team -> mean attendance, learned from the training data only
    avg_map = train_all.groupby(col)["y"].mean().to_dict()

    # Keys missing from the dict map to NaN, which is then replaced by the
    # global mean. New columns are named e.g. TE_home_team.
    train_all[f"TE_{col}"] = train_all[col].map(avg_map).fillna(global_mean)
    test_all[f"TE_{col}"] = test_all[col].map(avg_map).fillna(global_mean)

# --------------------------------------------------
# 4. 学習データの準備
# --------------------------------------------------
# Use only the three selected features:
# 1. TE_home_team (popularity of the home team)
# 2. capa (stadium capacity)
# 3. TE_away_team (popularity of the away team)
features = ["TE_home_team", "capa", "TE_away_team"]

print(f"使用する特徴量: {features}")

X = train_all[features]
y = train_all["y"]
X_test = test_all[features]

# Split into a training and a validation part (80:20).
# NOTE(review): the TE_* columns were computed earlier in this cell from the
# full training set, i.e. before this split, so the validation rows contributed
# to their own encodings - the validation RMSE below is probably optimistic.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# --------------------------------------------------
# 5. Training with LightGBM
# --------------------------------------------------
model = lgb.LGBMRegressor(
    random_state=42,
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1
)

# Fit with early stopping: stop once the validation metric has not improved
# for 50 rounds; log the evaluation every 100 rounds.
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(100)
    ]
)

# --------------------------------------------------
# 6. Check the accuracy and write the submission
# --------------------------------------------------
# Accuracy on the validation part
pred_valid = model.predict(X_valid)
rmse_valid = np.sqrt(MSE(y_valid, pred_valid))
print(f"\n検証データRMSE: {rmse_valid:.2f}")

# Predict the test data
pred_test = model.predict(X_test)

# Save the file (column 1 of the header-less template receives the predictions)
sample_submit[1] = pred_test
sample_submit.to_csv("submit_lgbm_best3.csv", index=False, header=False)

print("提出用ファイル 'submit_lgbm_best3.csv' を作成しました。")

使用する特徴量: ['TE_home_team', 'capa', 'TE_away_team']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 138
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 3
[LightGBM] [Info] Start training from score 10685.768246
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 3889.03	valid_0's l2: 1.51246e+07
[200]	valid_0's rmse: 3810.65	valid_0's l2: 1.4521e+07
[300]	valid_0's rmse: 3803.81	valid_0's l2: 1.4469e+07
Early stopping, best iteration is:
[284]	valid_0's rmse: 3800.56	valid_0's l2: 1.44442e+07

検証データRMSE: 3800.56
提出用ファイル 'submit_lgbm_best3.csv' を作成しました。


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. データの読み込み
# --------------------------------------------------
# All CSV files are read from the current working directory.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# header=None: the submission template is read without a header row.
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --------------------------------------------------
# 2. Combine the data
# --------------------------------------------------
# Stack base + additional rows vertically for both training and condition data.
train_combined = pd.concat([train, train_add], sort=False).reset_index(drop=True)
condition_combined = pd.concat([condition, condition_add], sort=False).reset_index(drop=True)

# Attach match conditions on "id", then the stadium table
# ('stadium' on the left side matches 'name' in stadium.csv).
train_all = pd.merge(train_combined, condition_combined, on="id", how="left")
train_all = pd.merge(train_all, stadium, left_on="stadium", right_on="name", how="left")

test_all = pd.merge(test, condition_combined, on="id", how="left")
test_all = pd.merge(test_all, stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 3. 特徴量の作成
# --------------------------------------------------
def preprocess(df):
    """Add calendar, capacity, TV and match-round features to *df*.

    Reads the columns "gameday", "capa", "tv" and "match" and writes
    "month", "week", "is_weekend", "capa" (numeric, imputed), "tv_num" and
    "match_num". The frame is modified in place and also returned.

    Differences from a naive implementation:
    * "is_weekend" looks at the whole text inside the parentheses of
      "gameday", so a label such as "水・祝" is flagged as well, not only
      labels whose first character is 土 / 日 / 祝.
    * The capacity mean used for imputation is taken after numeric coercion.
    """
    def is_day_off(gameday_text):
        # Text between "(" and ")", e.g. "土" or "水・祝".
        day_label = gameday_text.split("(")[1].split(")")[0]
        return int(any(mark in day_label for mark in ("土", "日", "祝")))

    # (1) Date information
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    # Day of week: first character inside the parentheses
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    df["is_weekend"] = df["gameday"].apply(is_day_off)

    # (2) Stadium capacity: coerce first, then impute with the coerced mean
    capa_numeric = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa_numeric.fillna(capa_numeric.mean())

    # (3) Number of TV broadcasters (proxy for public attention)
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # (4) Match round (opening / final rounds may behave differently).
    # Only the first run of digits is extracted, e.g. "第3節..." -> 3.
    df["match_num"] = (
        df["match"].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    )
    # Rows without any digit get the column mean.
    df["match_num"] = df["match_num"].fillna(df["match_num"].mean())

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 4. [Important] Leak-free target encoding (K-fold)
# --------------------------------------------------
# Training rows are encoded out-of-fold to avoid target leakage; test rows are
# encoded with statistics from the whole training set.

target_cols = ["home_team", "away_team", "stage"]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create the target columns up front. They are initialised as FLOAT (0.0):
# an integer column would have to be up-cast when the float means are written
# into it below, which pandas reports as an incompatible-dtype assignment.
for c in target_cols:
    train_all[f"TE_{c}"] = 0.0
    test_all[f"TE_{c}"] = 0.0

# --- Training side (K-fold) ---
for train_idx, val_idx in kf.split(train_all):
    X_tr, X_val = train_all.iloc[train_idx], train_all.iloc[val_idx]

    for c in target_cols:
        # Means come from the fitting part only and are mapped onto the
        # held-out part. kf.split yields positions, so address the rows through
        # X_val's index labels rather than passing positions to .loc.
        mean_map = X_tr.groupby(c)["y"].mean()
        train_all.loc[X_val.index, f"TE_{c}"] = X_val[c].map(mean_map)

# Categories absent from a fold's fitting part were mapped to NaN;
# replace those with the overall mean.
global_mean = train_all["y"].mean()
for c in target_cols:
    train_all[f"TE_{c}"] = train_all[f"TE_{c}"].fillna(global_mean)

# --- Test side (uses all training rows) ---
for c in target_cols:
    mean_map = train_all.groupby(c)["y"].mean()
    test_all[f"TE_{c}"] = test_all[c].map(mean_map).fillna(global_mean)

# --------------------------------------------------
# 5. 学習の実行
# --------------------------------------------------
# 使用する特徴量
features = [
    "capa", "month", "match_num", "tv_num", "is_weekend",  # basic match information
    "TE_home_team", "TE_away_team", "TE_stage",            # target-encoded popularity
]

print(f"使用する特徴量: {features}")

X = train_all[features]
y = train_all["y"]
X_test = test_all[features]

# LightGBM parameters (slightly tuned)
model = lgb.LGBMRegressor(
    random_state=42,
    n_estimators=2000,
    learning_rate=0.01,    # small steps; early stopping decides the round count
    num_leaves=31,
    colsample_bytree=0.8,  # sub-sample the features per tree to curb overfitting
    n_jobs=-1
)

# K-fold cross validation: fit once per fold and average the per-fold test
# predictions. The number of folds is taken from `kf` so that the average
# stays correct if n_splits is ever changed.
n_folds = kf.get_n_splits()
preds = np.zeros(len(X_test))
scores = []

for train_idx, val_idx in kf.split(X, y):
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(0)  # suppress per-round logging
        ]
    )

    # Fold score on the held-out part
    val_pred = model.predict(X_val_fold)
    score = np.sqrt(MSE(y_val_fold, val_pred))
    scores.append(score)

    # Accumulate this fold's share of the averaged test prediction
    preds += model.predict(X_test) / n_folds

print("-" * 30)
print(f"各FoldのRMSE: {[round(s, 2) for s in scores]}")
print(f"平均RMSE: {round(np.mean(scores), 2)}")
print("-" * 30)

# --------------------------------------------------
# 6. Write the submission file
# --------------------------------------------------
# Column 1 of the header-less template receives the averaged predictions.
sample_submit[1] = preds
sample_submit.to_csv("submit_lgbm_kfold_te.csv", index=False, header=False)

print("提出用ファイル 'submit_lgbm_kfold_te.csv' を作成しました。")

 24883.97222222 36571.41176471 13350.12121212 11037.45
 12159.06451613 13350.12121212 16930.17142857 11114.32352941
 25067.35294118 11037.45       13804.94871795 16930.17142857
 12159.06451613 23819.22857143 15217.5        23819.22857143
 17499.90625    15217.5        11114.32352941 13350.12121212
 14628.89189189 16930.17142857 17499.90625    11114.32352941
 11037.45       25067.35294118 11645.66666667 12159.06451613
 13350.12121212 16930.17142857 15902.         11645.66666667
 12159.06451613 11037.45       17499.90625    12623.10526316
 36571.41176471 25067.35294118 24883.97222222 17541.35483871
 25067.35294118 12623.10526316 23819.22857143 13804.94871795
 17499.90625    36571.41176471 24883.97222222 17541.35483871
 15217.5        13350.12121212 36571.41176471  4214.42222222
  5093.15789474  8305.425       5419.45833333  3467.38888889
  9910.28205128  3905.51162791  3162.83333333  3905.51162791
  4577.86046512  3614.20588235  4577.86046512  5938.66666667
  6284.4         9840.1627907 

使用する特徴量: ['capa', 'month', 'match_num', 'tv_num', 'is_weekend', 'TE_home_team', 'TE_away_team', 'TE_stage']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 454
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 8
[LightGBM] [Info] Start training from score 10685.768246
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 458
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 8
[LightGBM] [Info] Start training from score 10599.138284
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to r

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. データの読み込み
# --------------------------------------------------
# All CSV files are read from the current working directory.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# header=None: the submission template is read without a header row.
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --------------------------------------------------
# 2. Combine the data
# --------------------------------------------------
# Stack base + additional rows vertically for both training and condition data.
train_combined = pd.concat([train, train_add], sort=False).reset_index(drop=True)
condition_combined = pd.concat([condition, condition_add], sort=False).reset_index(drop=True)

# Attach match conditions on "id", then the stadium table
# ('stadium' on the left side matches 'name' in stadium.csv).
train_all = pd.merge(train_combined, condition_combined, on="id", how="left")
train_all = pd.merge(train_all, stadium, left_on="stadium", right_on="name", how="left")

test_all = pd.merge(test, condition_combined, on="id", how="left")
test_all = pd.merge(test_all, stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 3. 高度な特徴量作成
# --------------------------------------------------
# (A) Build a lookup: team -> home prefecture.
# Take the prefecture at the start of the stadium address. The alternatives are
# spelled out (東京都 / 北海道 / 京都府 / 大阪府 / two-or-three characters + 県)
# because a greedy "anything up to 都/道/府/県" pattern returns the LONGEST such
# prefix, e.g. "京都府京都" for an address in 京都府京都市.
# Assumes addresses begin with the prefecture name - TODO confirm against stadium.csv.
pref_pattern = r'^\s*(東京都|北海道|(?:京都|大阪)府|.{2,3}県)'
train_all["prefecture"] = train_all["address"].str.extract(pref_pattern, expand=False)
test_all["prefecture"] = test_all["address"].str.extract(pref_pattern, expand=False)

# A team's home prefecture is the one it most often plays home matches in.
# mode() is empty when a team has no known prefecture at all; fall back to NaN
# instead of raising on the [0] lookup.
home_pref_map = (
    train_all.groupby("home_team")["prefecture"]
    .apply(lambda x: x.mode().iloc[0] if x.notna().any() else np.nan)
    .to_dict()
)

def preprocess(df):
    """Add date, match, capacity, weather and derby features to ``df`` in place.

    Reads the module-level ``home_pref_map`` (team name -> prefecture).

    Args:
        df: DataFrame with the columns ``gameday``, ``match``, ``capa``,
            ``temperature``, ``humidity``, ``away_team`` and ``prefecture``.
            ``gameday`` is split on "/" and "(", so it is assumed to look like
            ``"MM/DD(day)"`` with an optional holiday marker inside the
            parentheses (e.g. ``"03/20(水・祝)"``) — confirm against the data.

    Returns:
        The same DataFrame object, with ``month``, ``week``, ``is_weekend``,
        ``match_num``, ``humidity_val``, ``discomfort_index``, ``away_pref`` and
        ``is_derby`` added, and ``capa`` / ``temperature`` coerced to numbers
        with missing values filled.
    """
    # --- Date / season ---
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    # Everything after "(" — the weekday plus any holiday marker.
    day_label = df["gameday"].apply(lambda x: x.split("(")[1])
    df["week"] = day_label.str[0]
    # Search the whole label: checking only its first character can never see
    # the holiday marker "祝", so holiday weekdays would be missed.
    df["is_weekend"] = day_label.str.contains("[土日祝]").astype(int)

    # --- Match importance ---
    # First run of digits in the match label (the round number) as a float.
    df["match_num"] = df["match"].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    df["match_num"] = df["match_num"].fillna(df["match_num"].mean())

    # --- Stadium capacity ---
    # Take the fill value from the coerced column; the mean of the raw column
    # is not defined when it holds non-numeric text.
    capa = pd.to_numeric(df["capa"], errors='coerce')
    df["capa"] = capa.fillna(capa.mean())

    # --- Weather / comfort (discomfort index) ---
    # discomfort index = 0.81T + 0.01H(0.99T - 14.3) + 46.3
    def parse_humidity(x):
        # Strip the percent sign; anything that is not a parsable string
        # (missing value, non-string, bad text) falls back to 50%.
        try:
            return float(x.replace("%", ""))
        except (AttributeError, TypeError, ValueError):
            return 50.0

    # Missing or unparsable temperatures are replaced with 20.0.
    df["temperature"] = pd.to_numeric(df["temperature"], errors='coerce').fillna(20.0)
    df["humidity_val"] = df["humidity"].apply(parse_humidity)

    T = df["temperature"]
    H = df["humidity_val"]
    df["discomfort_index"] = 0.81 * T + 0.01 * H * (0.99 * T - 14.3) + 46.3

    # --- Derby flag (short travel distance) ---
    # Home prefecture of the away team; NaN for teams missing from home_pref_map.
    df["away_pref"] = df["away_team"].map(home_pref_map)
    # 1 when the venue is in the away team's home prefecture.
    df["is_derby"] = (df["prefecture"] == df["away_pref"]).astype(int)

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 4. Target encoding (used by the log-target model below)
# --------------------------------------------------
# NOTE: the encoding itself uses the raw target y, not log(y).
target_cols = ["home_team", "away_team", "stage", "week"]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Start the encoded columns as float NaN: they receive float means below, and an
# integer placeholder would force a dtype upcast on assignment.
for c in target_cols:
    train_all[f"TE_{c}"] = np.nan
    test_all[f"TE_{c}"] = np.nan

# K-fold target encoding: each validation row is encoded with category means
# computed on the other folds only, so a row never sees its own target.
for train_idx, val_idx in kf.split(train_all):
    X_tr, X_val = train_all.iloc[train_idx], train_all.iloc[val_idx]
    for c in target_cols:
        mean_map = X_tr.groupby(c)["y"].mean()
        # kf.split yields positions; translate them to index labels for .loc.
        train_all.loc[train_all.index[val_idx], f"TE_{c}"] = X_val[c].map(mean_map)

# Categories unseen in the fitting folds map to NaN; fall back to the global mean.
global_mean = train_all["y"].mean()
for c in target_cols:
    train_all[f"TE_{c}"] = train_all[f"TE_{c}"].fillna(global_mean)
    # Test rows are encoded with means over the full training set.
    mean_map_full = train_all.groupby(c)["y"].mean()
    test_all[f"TE_{c}"] = test_all[c].map(mean_map_full).fillna(global_mean)

# --------------------------------------------------
# 5. [Most important] Training on the log-transformed target
# --------------------------------------------------
features = [
    "capa", "month", "match_num", "is_weekend",
    "discomfort_index", "is_derby",  # new features
    "TE_home_team", "TE_away_team", "TE_stage", "TE_week"
]

print(f"使用する特徴量: {features}")

X = train_all[features]
# The model is fitted on log(y + 1); predictions are mapped back with expm1.
y_log = np.log1p(train_all["y"])

X_test = test_all[features]

model_params = dict(
    random_state=42,
    n_estimators=3000,
    learning_rate=0.01,
    num_leaves=31,
    colsample_bytree=0.8,
    n_jobs=-1,
)

models = []                        # one fitted model per fold
preds_log = np.zeros(len(X_test))  # fold-averaged test predictions, log scale
scores = []                        # per-fold RMSE on the original scale
n_folds = kf.get_n_splits()        # keeps the averaging in step with kf

# Cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_log)):
    X_train_fold, y_train_fold = X.iloc[train_idx], y_log.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y_log.iloc[val_idx]

    # A fresh estimator per fold so every fitted model can be kept in `models`.
    model = lgb.LGBMRegressor(**model_params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(0)
        ],
        eval_metric='rmse'
    )
    models.append(model)

    # Validation: predict in log space, then score on the original scale.
    val_pred_log = model.predict(X_val_fold)
    val_pred_real = np.expm1(val_pred_log)
    y_val_real = np.expm1(y_val_fold)

    score = np.sqrt(MSE(y_val_real, val_pred_real))
    scores.append(score)

    # Accumulate the test prediction in log space, averaged over the folds.
    preds_log += model.predict(X_test) / n_folds

print("-" * 30)
print(f"各FoldのRMSE: {[int(s) for s in scores]}")
print(f"平均RMSE: {int(np.mean(scores))}")
print("-" * 30)

# --------------------------------------------------
# 6. Write the submission file
# --------------------------------------------------
# Map the averaged log-scale prediction back to the original scale.
final_preds = np.expm1(preds_log)

sample_submit[1] = final_preds
sample_submit.to_csv("submit_lgbm_log_derby.csv", index=False, header=False)

print("提出用ファイル 'submit_lgbm_log_derby.csv' を作成しました。")

 24883.97222222 36571.41176471 13350.12121212 11037.45
 12159.06451613 13350.12121212 16930.17142857 11114.32352941
 25067.35294118 11037.45       13804.94871795 16930.17142857
 12159.06451613 23819.22857143 15217.5        23819.22857143
 17499.90625    15217.5        11114.32352941 13350.12121212
 14628.89189189 16930.17142857 17499.90625    11114.32352941
 11037.45       25067.35294118 11645.66666667 12159.06451613
 13350.12121212 16930.17142857 15902.         11645.66666667
 12159.06451613 11037.45       17499.90625    12623.10526316
 36571.41176471 25067.35294118 24883.97222222 17541.35483871
 25067.35294118 12623.10526316 23819.22857143 13804.94871795
 17499.90625    36571.41176471 24883.97222222 17541.35483871
 15217.5        13350.12121212 36571.41176471  4214.42222222
  5093.15789474  8305.425       5419.45833333  3467.38888889
  9910.28205128  3905.51162791  3162.83333333  3905.51162791
  4577.86046512  3614.20588235  4577.86046512  5938.66666667
  6284.4         9840.1627907 

使用する特徴量: ['capa', 'month', 'match_num', 'is_weekend', 'discomfort_index', 'is_derby', 'TE_home_team', 'TE_away_team', 'TE_stage', 'TE_week']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 735
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 10
[LightGBM] [Info] Start training from score 9.026965
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 739
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 10
[LightGBM] [Info] Start training from score 9.015385
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set

In [None]:
# Optunaのインストール
!pip install optuna

import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the data
# --------------------------------------------------
# Match tables (original + additional) and the test set.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
# Per-match condition tables (original + additional).
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
# Stadium master table and the header-less submission template.
stadium = pd.read_csv("stadium.csv")
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --------------------------------------------------
# 2. Join the tables
# --------------------------------------------------
# Stack original and additional rows under a fresh 0..n-1 index.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)

# Attach conditions by match id, then stadium details by stadium name.
train_all = (
    train_combined
    .merge(condition_combined, on="id", how="left")
    .merge(stadium, left_on="stadium", right_on="name", how="left")
)

test_all = (
    test
    .merge(condition_combined, on="id", how="left")
    .merge(stadium, left_on="stadium", right_on="name", how="left")
)

# --------------------------------------------------
# 3. 特徴量の作成 (効果が高かったものに絞る)
# --------------------------------------------------
def preprocess(df):
    """Add date, match, capacity and TV-broadcast features to ``df`` in place.

    Args:
        df: DataFrame with the columns ``gameday``, ``match``, ``capa`` and
            ``tv``. ``gameday`` is split on "/" and "(", so it is assumed to
            look like ``"MM/DD(day)"`` with an optional holiday marker inside
            the parentheses (e.g. ``"03/20(水・祝)"``) — confirm against the data.

    Returns:
        The same DataFrame object, with ``month``, ``week``, ``is_weekend``,
        ``match_num`` and ``tv_num`` added and ``capa`` coerced to numbers with
        missing values filled by the column mean.
    """
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    # Everything after "(" — the weekday plus any holiday marker.
    day_label = df["gameday"].apply(lambda x: x.split("(")[1])
    df["week"] = day_label.str[0]
    # Search the whole label: checking only its first character can never see
    # the holiday marker "祝", so holiday weekdays would be missed.
    df["is_weekend"] = day_label.str.contains("[土日祝]").astype(int)

    # First run of digits in the match label (the round number) as a float.
    df["match_num"] = df["match"].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    df["match_num"] = df["match_num"].fillna(df["match_num"].mean())

    # Take the fill value from the coerced column; the mean of the raw column
    # is not defined when it holds non-numeric text.
    capa = pd.to_numeric(df["capa"], errors='coerce')
    df["capa"] = capa.fillna(capa.mean())

    # Number of broadcasters: entries are separated by a full-width slash;
    # a missing value counts as zero.
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 4. K-fold target encoding (out-of-fold, to avoid target leakage)
# --------------------------------------------------
target_cols = ["home_team", "away_team", "stage", "week"]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Start the encoded columns as float NaN: they receive float means below, and an
# integer placeholder would force a dtype upcast on assignment.
for c in target_cols:
    train_all[f"TE_{c}"] = np.nan
    test_all[f"TE_{c}"] = np.nan

# Training rows: each validation row is encoded with category means computed
# on the other folds only.
for train_idx, val_idx in kf.split(train_all):
    X_tr, X_val = train_all.iloc[train_idx], train_all.iloc[val_idx]
    for c in target_cols:
        mean_map = X_tr.groupby(c)["y"].mean()
        # kf.split yields positions; translate them to index labels for .loc.
        train_all.loc[train_all.index[val_idx], f"TE_{c}"] = X_val[c].map(mean_map)

# Categories unseen in the fitting folds map to NaN; fall back to the global mean.
global_mean = train_all["y"].mean()
for c in target_cols:
    train_all[f"TE_{c}"] = train_all[f"TE_{c}"].fillna(global_mean)
    # Test rows are encoded with means over the full training set.
    mean_map_full = train_all.groupby(c)["y"].mean()
    test_all[f"TE_{c}"] = test_all[c].map(mean_map_full).fillna(global_mean)

# --------------------------------------------------
# 5. Hyper-parameter tuning with Optuna
# --------------------------------------------------
features = ["capa", "month", "match_num", "tv_num", "is_weekend",
            "TE_home_team", "TE_away_team", "TE_stage", "TE_week"]

X = train_all[features]
y = train_all["y"]
X_test = test_all[features]

print("Optunaで最適なパラメータを探索中... (これには少し時間がかかります)")

def objective(trial):
    """Optuna objective: mean 3-fold CV RMSE of an LGBMRegressor on (X, y).

    Reads the module-level ``X`` and ``y``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean RMSE over the three validation folds (lower is better).
    """
    # NOTE(review): the TE_* columns in X were built with a different 5-fold
    # split, so these 3-fold validation scores may be slightly optimistic.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'random_state': 42,
        'n_estimators': 1000,
        'n_jobs': -1,
        # Row subsampling only takes effect when bagging is enabled through a
        # non-zero subsample_freq; without it the tuned `subsample` is ignored.
        'subsample_freq': 1,
        # Search ranges
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    }

    # 3 folds instead of 5 to keep each trial fast.
    cv_scores = []
    kf_opt = KFold(n_splits=3, shuffle=True, random_state=42)

    for train_idx, val_idx in kf_opt.split(X, y):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMRegressor(**params)
        model.fit(X_tr, y_tr,
                  eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
                 )
        pred = model.predict(X_val)
        cv_scores.append(np.sqrt(MSE(y_val, pred)))

    return np.mean(cv_scores)

# Run the search (20 trials). The sampler is seeded so reruns explore the same
# parameter sequence, in line with the fixed random_state used everywhere else.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print(f"\nBest RMSE: {study.best_value}")
print(f"Best Params: {study.best_params}")

# --------------------------------------------------
# 6. Final training with the best parameters
# --------------------------------------------------
best_params = dict(study.best_params)
# Add the fixed (non-tuned) parameters.
best_params['objective'] = 'regression'
best_params['random_state'] = 42
best_params['n_estimators'] = 5000  # more rounds than in tuning; early stopping trims them
best_params['n_jobs'] = -1
best_params['subsample_freq'] = 1   # must match the tuning setup so `subsample` applies

# Average the test predictions of 5 fold models for a more stable result.
final_preds = np.zeros(len(X_test))
kf_final = KFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kf_final.get_n_splits()  # keeps the averaging in step with kf_final

for train_idx, val_idx in kf_final.split(X, y):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor(**best_params)
    model.fit(X_tr, y_tr,
              eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
             )

    final_preds += model.predict(X_test) / n_folds

# --------------------------------------------------
# 7. Write the submission file
# --------------------------------------------------
sample_submit[1] = final_preds
sample_submit.to_csv("submit_lgbm_optuna.csv", index=False, header=False)
print("提出用ファイル 'submit_lgbm_optuna.csv' を作成しました。")

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


 24883.97222222 36571.41176471 13350.12121212 11037.45
 12159.06451613 13350.12121212 16930.17142857 11114.32352941
 25067.35294118 11037.45       13804.94871795 16930.17142857
 12159.06451613 23819.22857143 15217.5        23819.22857143
 17499.90625    15217.5        11114.32352941 13350.12121212
 14628.89189189 16930.17142857 17499.90625    11114.32352941
 11037.45       25067.35294118 11645.66666667 12159.06451613
 13350.12121212 16930.17142857 15902.         11645.66666667
 12159.06451613 11037.45       17499.90625    12623.10526316
 36571.41176471 25067.35294118 24883.97222222 17541.35483871
 25067.35294118 12623.10526316 23819.22857143 13804.94871795
 17499.90625    36571.41176471 24883.97222222 17541.35483871
 15217.5        13350.12121212 36571.41176471  4214.42222222
  5093.15789474  8305.425       5419.45833333  3467.38888889
  9910.28205128  3905.51162791  3162.83333333  3905.51162791
  4577.86046512  3614.20588235  4577.86046512  5938.66666667
  6284.4         9840.1627907 

Optunaで最適なパラメータを探索中... (これには少し時間がかかります)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used 

[I 2026-01-08 05:04:41,056] Trial 0 finished with value: 3452.6672966249625 and parameters: {'learning_rate': 0.06328827404563954, 'num_leaves': 89, 'max_depth': 12, 'min_child_samples': 8, 'colsample_bytree': 0.8298641442181429, 'subsample': 0.5524207173182353}. Best is trial 0 with value: 3452.6672966249625.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:41,948] Trial 1 finished with value: 3391.5379611611297 and parameters: {'learning_rate': 0.0913287172682225, 'num_leaves': 56, 'max_depth': 7, 'min_child_samples': 15, 'colsample_bytree': 0.6982231013346409, 'subsample': 0.8106631379977312}. Best is trial 1 with value: 3391.5379611611297.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019560 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:43,987] Trial 2 finished with value: 3345.2614886010238 and parameters: {'learning_rate': 0.015303521800446494, 'num_leaves': 41, 'max_depth': 15, 'min_child_samples': 29, 'colsample_bytree': 0.5859558051879059, 'subsample': 0.976311353589147}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:44,315] Trial 3 finished with value: 3468.592560677403 and parameters: {'learning_rate': 0.09266749969202134, 'num_leaves': 40, 'max_depth': 14, 'min_child_samples': 49, 'colsample_bytree': 0.6661842314451949, 'subsample': 0.5052665789625413}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676


[I 2026-01-08 05:04:44,591] Trial 4 finished with value: 3393.61432887806 and parameters: {'learning_rate': 0.07982010832860782, 'num_leaves': 93, 'max_depth': 9, 'min_child_samples': 13, 'colsample_bytree': 0.5596301681607654, 'subsample': 0.5404433190793368}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10711.512289
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:45,304] Trial 5 finished with value: 3405.377528671872 and parameters: {'learning_rate': 0.03990910532623422, 'num_leaves': 57, 'max_depth': 13, 'min_child_samples': 32, 'colsample_bytree': 0.8659024634109131, 'subsample': 0.6877989891458396}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9


[I 2026-01-08 05:04:45,600] Trial 6 finished with value: 3362.5663989716854 and parameters: {'learning_rate': 0.07739100524290649, 'num_leaves': 96, 'max_depth': 13, 'min_child_samples': 25, 'colsample_bytree': 0.8210627102619303, 'subsample': 0.6990063278896237}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10711.512289
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47

[I 2026-01-08 05:04:46,903] Trial 7 finished with value: 3351.8924274253445 and parameters: {'learning_rate': 0.013294953349967466, 'num_leaves': 35, 'max_depth': 9, 'min_child_samples': 30, 'colsample_bytree': 0.6371457189882599, 'subsample': 0.5663447738176122}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:47,568] Trial 8 finished with value: 3450.741943904557 and parameters: {'learning_rate': 0.030779552930127928, 'num_leaves': 37, 'max_depth': 14, 'min_child_samples': 37, 'colsample_bytree': 0.8270179788420826, 'subsample': 0.8674144553077883}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382


[I 2026-01-08 05:04:47,935] Trial 9 finished with value: 3357.248777525719 and parameters: {'learning_rate': 0.07461130120623759, 'num_leaves': 97, 'max_depth': 10, 'min_child_samples': 20, 'colsample_bytree': 0.9216343631014339, 'subsample': 0.8170059010403145}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10711.512289
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train se

[I 2026-01-08 05:04:48,766] Trial 10 finished with value: 3476.48504329495 and parameters: {'learning_rate': 0.020873625047068876, 'num_leaves': 22, 'max_depth': 19, 'min_child_samples': 45, 'colsample_bytree': 0.5060386738597296, 'subsample': 0.9869294031506257}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:50,763] Trial 11 finished with value: 3473.9124881705447 and parameters: {'learning_rate': 0.005975440309220132, 'num_leaves': 37, 'max_depth': 18, 'min_child_samples': 36, 'colsample_bytree': 0.6122196367658278, 'subsample': 0.9760335338255934}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:52,593] Trial 12 finished with value: 3374.0308733535635 and parameters: {'learning_rate': 0.005315775748080377, 'num_leaves': 26, 'max_depth': 16, 'min_child_samples': 26, 'colsample_bytree': 0.6245490919274266, 'subsample': 0.6369902805786699}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:53,535] Trial 13 finished with value: 3443.2858633074416 and parameters: {'learning_rate': 0.021165907289749545, 'num_leaves': 74, 'max_depth': 6, 'min_child_samples': 40, 'colsample_bytree': 0.7406901916986869, 'subsample': 0.9030251515721462}. Best is trial 2 with value: 3345.2614886010238.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total 

[I 2026-01-08 05:04:55,557] Trial 14 finished with value: 3318.373983687218 and parameters: {'learning_rate': 0.046522437027167865, 'num_leaves': 47, 'max_depth': 10, 'min_child_samples': 29, 'colsample_bytree': 0.5586937000071693, 'subsample': 0.6207338157298801}. Best is trial 14 with value: 3318.373983687218.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:56,485] Trial 15 finished with value: 3338.4094355166067 and parameters: {'learning_rate': 0.0462764532441007, 'num_leaves': 48, 'max_depth': 16, 'min_child_samples': 19, 'colsample_bytree': 0.5113511178125495, 'subsample': 0.627357166602016}. Best is trial 14 with value: 3318.373983687218.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:57,072] Trial 16 finished with value: 3324.3451065795994 and parameters: {'learning_rate': 0.048146586902270645, 'num_leaves': 73, 'max_depth': 17, 'min_child_samples': 20, 'colsample_bytree': 0.5032520993112154, 'subsample': 0.63322255565953}. Best is trial 14 with value: 3318.373983687218.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start trai

[I 2026-01-08 05:04:57,504] Trial 17 finished with value: 3341.1989074324247 and parameters: {'learning_rate': 0.05880971200889422, 'num_leaves': 69, 'max_depth': 11, 'min_child_samples': 22, 'colsample_bytree': 0.5543082188686923, 'subsample': 0.7275635522365068}. Best is trial 14 with value: 3318.373983687218.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train se

[I 2026-01-08 05:04:58,240] Trial 18 finished with value: 3460.9897474407867 and parameters: {'learning_rate': 0.03633794465949243, 'num_leaves': 80, 'max_depth': 20, 'min_child_samples': 11, 'colsample_bytree': 0.9901557281561353, 'subsample': 0.6275554692096065}. Best is trial 14 with value: 3318.373983687218.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10707.560676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 1302, number of used features: 9
[LightGBM] [Info] Start training from score 10469.601382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 450
[LightGBM] [Info] Number of data points in the train se

[I 2026-01-08 05:04:58,674] Trial 19 finished with value: 3373.2064573784387 and parameters: {'learning_rate': 0.05929558290289875, 'num_leaves': 66, 'max_depth': 17, 'min_child_samples': 16, 'colsample_bytree': 0.7240497531328467, 'subsample': 0.6564897865766395}. Best is trial 14 with value: 3318.373983687218.



Best RMSE: 3318.373983687218
Best Params: {'learning_rate': 0.046522437027167865, 'num_leaves': 47, 'max_depth': 10, 'min_child_samples': 29, 'colsample_bytree': 0.5586937000071693, 'subsample': 0.6207338157298801}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 483
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 9
[LightGBM] [Info] Start training from score 10685.768246
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 487
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 9
[LightGBM] [Info] Start training from score 10599.138284
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 sec

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split

# --------------------------------------------------
# 1. Load the input files
# --------------------------------------------------
# Every input is a CSV in the working directory; the submission template is
# read without a header row.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --------------------------------------------------
# 2. Combine the tables
# --------------------------------------------------
# Stack each base file on its "_add" counterpart and renumber the rows 0..n-1.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)

# Left-join the conditions on "id", then the stadium table on the stadium name.
train_all = train_combined.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

test_all = test.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

# --------------------------------------------------
# 3. シンプルな特徴量作成
# --------------------------------------------------
def preprocess(df):
    """Add the basic features used by the simple holdout model.

    The frame is modified in place and also returned.

    Args:
        df: Merged match frame containing the columns ``capa``, ``gameday``,
            ``match`` and ``tv``.

    Returns:
        The same ``df`` object, with ``capa`` converted to numeric and the
        columns ``month``, ``match_num``, ``tv_num`` and ``week`` added.
    """
    # (1) Capacity: coerce to numeric first, then impute with the mean of the
    #     coerced values. Taking the mean of the raw column fails (or is
    #     meaningless) when it still contains non-numeric text.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())

    # (2) Month (seasonality): the integer before the first "/" of gameday.
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))

    # (3) Round number: first run of digits in "match"; rows without any digit
    #     are filled with the mean round number of this frame.
    match_num = df["match"].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    df["match_num"] = match_num.fillna(match_num.mean())

    # (4) Number of TV broadcasters: count of "／"-separated entries, 0 if missing.
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # (5) Day of week: the first character after "(" in gameday.
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 4. Holdout split & target encoding
# --------------------------------------------------
# Split first, then compute the per-category mean of y from the training part
# only, so validation and test rows never contribute to their own encoding.

# Candidate features (weather, address, ... are deliberately left out)
base_cols = ["capa", "month", "match_num", "tv_num", "home_team", "away_team", "stage", "week"]
target_cols = ["home_team", "away_team", "stage", "week"]  # replaced by their target mean

# Take explicit copies: new TE_* columns are assigned below, and assigning onto
# a column subset of another frame triggers pandas' SettingWithCopyWarning.
X = train_all[base_cols].copy()
y = train_all["y"]
X_test = test_all[base_cols].copy()

# Split the data (train : validation = 8 : 2)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_valid = X_valid.copy()

# --- Target encoding ---
# Both the training rows joined with y and the global fallback are the same for
# every column, so build them once.
temp_df = pd.concat([X_train, y_train], axis=1)
global_mean = y_train.mean()

for col in target_cols:
    # Category -> mean y, learned from the training rows only
    mean_map = temp_df.groupby(col)["y"].mean().to_dict()

    # Apply the training-derived map to all three frames; categories that were
    # not seen in training fall back to the overall training mean.
    for frame in (X_train, X_valid, X_test):
        frame[f"TE_{col}"] = frame[col].map(mean_map).fillna(global_mean)

# The raw string columns are no longer needed
X_train = X_train.drop(target_cols, axis=1)
X_valid = X_valid.drop(target_cols, axis=1)
X_test = X_test.drop(target_cols, axis=1)

print("使用する特徴量:", X_train.columns.tolist())

# --------------------------------------------------
# 5. Model training (LightGBM)
# --------------------------------------------------
model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
)

# Stop once the validation metric has not improved for 50 rounds; keep the
# per-iteration evaluation log silent.
fit_callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
    lgb.log_evaluation(0),
]
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=fit_callbacks)

# --------------------------------------------------
# 6. Evaluation and submission file
# --------------------------------------------------
# RMSE on the held-out validation rows
pred_valid = model.predict(X_valid)
rmse = np.sqrt(MSE(y_valid, pred_valid))
rule = "-" * 30
print(rule)
print(f"検証データRMSE: {rmse:.2f}")
print(rule)

# Predict the test rows and write them into column 1 of the template
pred_test = model.predict(X_test)
sample_submit[1] = pred_test
sample_submit.to_csv("submit_simple_holdout.csv", index=False, header=False)

print("提出用ファイル 'submit_simple_holdout.csv' を作成しました。")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f"TE_{col}"] = X_test[col].apply(apply_te)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f"TE_{col}"] = X_test[col].apply(apply_te)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f"TE_{col}"] = X_test[col].apply(apply_te)
A value is trying to be set on a copy of a slice from a

使用する特徴量: ['capa', 'month', 'match_num', 'tv_num', 'TE_home_team', 'TE_away_team', 'TE_stage', 'TE_week']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012609 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 208
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 8
[LightGBM] [Info] Start training from score 10685.768246
------------------------------
検証データRMSE: 3710.74
------------------------------
提出用ファイル 'submit_simple_holdout.csv' を作成しました。


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split

# --------------------------------------------------
# 1. Load and combine the input files
# --------------------------------------------------
# Every input is a CSV in the working directory; the submission template is
# read without a header row.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Stack each base file on its "_add" counterpart and renumber the rows 0..n-1.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)

# Left-join the conditions on "id", then the stadium table on the stadium name.
train_all = train_combined.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

test_all = test.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

# --------------------------------------------------
# 2. 特徴量の作成（あなたの仮説を反映！）
# --------------------------------------------------
def preprocess(df):
    """Add the basic features plus season-progress features.

    The frame is modified in place and also returned.

    Args:
        df: Merged match frame containing the columns ``capa``, ``gameday``,
            ``match``, ``tv`` and ``stage`` (``stage`` must be a string in
            every row).

    Returns:
        The same ``df`` object, with ``capa`` converted to numeric and the
        columns ``month``, ``week``, ``tv_num``, ``match_num``,
        ``season_progress`` and ``is_climax`` added.
    """
    # (1) Basic data.
    # Capacity: impute with the mean of the coerced values; the raw column may
    # still hold non-numeric text, for which a mean cannot be computed.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # (2) Round number: first run of digits in "match"; rows without any digit
    #     are filled with the mean round number of this frame.
    match_num = df["match"].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    df["match_num"] = match_num.fillna(match_num.mean())

    # (3) Season progress and climax flag.
    # Per the original author's note, J1 had 34 rounds and J2 had 42 (as of
    # 2014): a stage containing "Ｊ１" is divided by 34, anything else by 42.
    # Computed column-wise, so an empty frame is handled as well.
    max_match = df["stage"].map(lambda stage: 34 if "Ｊ１" in stage else 42)
    df["season_progress"] = df["match_num"] / max_match

    # Progress of 0.85 or more (the final 15%) counts as the climax.
    df["is_climax"] = (df["season_progress"] >= 0.85).astype("int64")

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. Holdout split & target encoding
# --------------------------------------------------
base_cols = ["capa", "month", "match_num", "season_progress", "is_climax", "tv_num",
             "home_team", "away_team", "stage", "week"]
target_cols = ["home_team", "away_team", "stage", "week"]  # replaced by their target mean

# Take explicit copies: new TE_* columns are assigned below, and assigning onto
# a column subset of another frame triggers pandas' SettingWithCopyWarning.
X = train_all[base_cols].copy()
y = train_all["y"]
X_test = test_all[base_cols].copy()

# Split the data (train : validation = 8 : 2)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_valid = X_valid.copy()

# Target-encode using the training rows only (keeps validation/test rows out of
# the encoding). The joined frame and the fallback mean do not depend on the
# column, so build them once.
temp_df = pd.concat([X_train, y_train], axis=1)
global_mean = y_train.mean()

for col in target_cols:
    # Category -> mean y, learned from the training rows only
    mean_map = temp_df.groupby(col)["y"].mean().to_dict()

    # Categories not seen in training fall back to the overall training mean.
    for frame in (X_train, X_valid, X_test):
        frame[f"TE_{col}"] = frame[col].map(mean_map).fillna(global_mean)

# Drop the raw string columns
X_train = X_train.drop(target_cols, axis=1)
X_valid = X_valid.drop(target_cols, axis=1)
X_test = X_test.drop(target_cols, axis=1)

print("使用する特徴量:", X_train.columns.tolist())

# --------------------------------------------------
# 4. Model training (LightGBM)
# --------------------------------------------------
model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
)

# Stop once the validation metric has not improved for 50 rounds; keep the
# per-iteration evaluation log silent.
fit_callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
    lgb.log_evaluation(0),
]
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=fit_callbacks)

# --------------------------------------------------
# 5. Evaluation and submission
# --------------------------------------------------
# RMSE on the held-out validation rows
pred_valid = model.predict(X_valid)
rmse = np.sqrt(MSE(y_valid, pred_valid))
rule = "-" * 30
print(rule)
print(f"検証データRMSE: {rmse:.2f}")
print(rule)

# Predict the test rows and write them into column 1 of the template
pred_test = model.predict(X_test)
sample_submit[1] = pred_test
sample_submit.to_csv("submit_season_climax.csv", index=False, header=False)
print("提出用ファイル 'submit_season_climax.csv' を作成しました。")

使用する特徴量: ['capa', 'month', 'match_num', 'season_progress', 'is_climax', 'tv_num', 'TE_home_team', 'TE_away_team', 'TE_stage', 'TE_week']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 10
[LightGBM] [Info] Start training from score 10685.768246
------------------------------
検証データRMSE: 3675.04
------------------------------
提出用ファイル 'submit_season_climax.csv' を作成しました。


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f"TE_{col}"] = X_test[col].apply(apply_te)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f"TE_{col}"] = X_test[col].apply(apply_te)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[f"TE_{col}"] = X_test[col].apply(apply_te)
A value is trying to be set on a copy of a slice from a

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the input files
# --------------------------------------------------
# Every input is a CSV in the working directory; the submission template is
# read without a header row.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --------------------------------------------------
# 2. Combine the tables
# --------------------------------------------------
# Stack each base file on its "_add" counterpart and renumber the rows 0..n-1.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)

# Left-join the conditions on "id", then the stadium table on the stadium name.
train_all = train_combined.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

test_all = test.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

# --------------------------------------------------
# 3. 前処理 (シンプルイズベスト)
# --------------------------------------------------
def preprocess(df):
    """Add the basic features plus a three-way weather category.

    The frame is modified in place and also returned.

    Args:
        df: Merged match frame containing the columns ``capa``, ``gameday``,
            ``match``, ``tv`` and ``weather``.

    Returns:
        The same ``df`` object, with ``capa`` converted to numeric and the
        columns ``month``, ``week``, ``match_num``, ``tv_num`` and
        ``weather_simple`` added.
    """
    # Capacity: impute with the mean of the coerced values; the raw column may
    # still hold non-numeric text, for which a mean cannot be computed.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())
    # Month: the integer before the first "/" of gameday
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    # Day of week: the first character after "(" in gameday
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    # Round number: first run of digits in "match"; rows without any digit are
    # filled with the mean round number of this frame
    match_num = df["match"].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    df["match_num"] = match_num.fillna(match_num.mean())
    # Number of TV broadcasters: count of "／"-separated entries, 0 if missing
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # Weather, simplified. Rain is checked before sun, so a description that
    # mentions both is classified as "rain".
    def process_weather(x):
        if pd.isnull(x):
            return "other"
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        return "other"

    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 4. K-Fold ensemble training
# --------------------------------------------------
# One model is trained per fold; the test predictions of all folds are summed
# into test_preds and averaged after the loop.

# Numeric features used as-is
base_features = ["capa", "month", "match_num", "tv_num"]
# Columns to target-encode
te_targets = ["home_team", "away_team", "stage", "week", "weather_simple"]
# Model inputs; identical for every fold, so built once
use_cols = base_features + [f"TE_{c}" for c in te_targets]

X = train_all
y = train_all["y"]
X_test = test_all

# 5-fold split, shuffled with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold test predictions (starts at zero)
test_preds = np.zeros(len(X_test))

# Validation RMSE of each fold
scores = []

print("K-Fold アンサンブル学習を開始します...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"--- Fold {fold+1} / 5 ---")

    # Split; copies are taken so that adding TE_* columns never touches train_all
    X_train_fold = X.iloc[train_idx].copy()
    y_train_fold = y.iloc[train_idx].copy()
    X_valid_fold = X.iloc[val_idx].copy()
    y_valid_fold = y.iloc[val_idx].copy()
    X_test_fold = X_test.copy()  # test frame for this fold

    # --- Target encoding from this fold's training rows only ---
    # Validation and test rows never contribute to the encoding.
    # NOTE(review): the training rows themselves are encoded with means that
    # include their own target value.
    global_mean = y_train_fold.mean()
    for col in te_targets:
        # Category -> mean y within this fold's training rows
        mean_map = X_train_fold.groupby(col)["y"].mean().to_dict()

        # Categories not seen in the training rows fall back to the fold mean
        for frame in (X_train_fold, X_valid_fold, X_test_fold):
            frame[f"TE_{col}"] = frame[col].map(mean_map).fillna(global_mean)

    # Model training (LightGBM)
    model = lgb.LGBMRegressor(
        random_state=42,
        n_estimators=1000,
        learning_rate=0.03,  # slightly lower than the holdout runs
        num_leaves=31,
        n_jobs=-1
    )

    model.fit(
        X_train_fold[use_cols], y_train_fold,
        eval_set=[(X_valid_fold[use_cols], y_valid_fold)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(0)
        ]
    )

    # Score on this fold's validation rows
    val_pred = model.predict(X_valid_fold[use_cols])
    score = np.sqrt(MSE(y_valid_fold, val_pred))
    scores.append(score)
    print(f"Fold {fold+1} RMSE: {score:.2f}")

    # Accumulate the test prediction (divided by the fold count afterwards)
    test_preds += model.predict(X_test_fold[use_cols])

# --------------------------------------------------
# 5. Build the final result
# --------------------------------------------------
# Average the summed test predictions over the number of folds that were
# actually trained (one score is appended per fold) instead of a literal 5,
# so the average stays correct if n_splits is changed.
final_preds = test_preds / len(scores)

print("-" * 30)
print(f"全Foldの平均RMSE: {np.mean(scores):.2f}")
print("-" * 30)

# Write the averaged predictions into column 1 of the template
sample_submit[1] = final_preds
sample_submit.to_csv("submit_kfold_ensemble.csv", index=False, header=False)

print("提出用ファイル 'submit_kfold_ensemble.csv' を作成しました。")

K-Fold アンサンブル学習を開始します...
--- Fold 1 / 5 ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 212
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 9
[LightGBM] [Info] Start training from score 10685.768246
Fold 1 RMSE: 3715.73
--- Fold 2 / 5 ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 211
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 9
[LightGBM] [Info] Start training from score 10599.138284
Fold 2 RMSE: 3145.45
--- Fold 3 / 5 ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the 

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load and combine the input files
# --------------------------------------------------
# Every input is a CSV in the working directory; the submission template is
# read without a header row.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Stack each base file on its "_add" counterpart and renumber the rows 0..n-1.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)
# Left-join the conditions on "id", then the stadium table on the stadium name.
train_all = train_combined.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)
test_all = test.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

# --------------------------------------------------
# 2. 特徴量エンジニアリング (ここを強化！)
# --------------------------------------------------
def preprocess(df):
    """Add the basic features plus NHK and bad-weather flags.

    The frame is modified in place and also returned.

    Args:
        df: Merged match frame containing the columns ``capa``, ``gameday``,
            ``match``, ``tv`` and ``weather``.

    Returns:
        The same ``df`` object, with ``capa`` converted to numeric and the
        columns ``month``, ``week``, ``match_num``, ``tv_num``, ``is_nhk``,
        ``is_bad_weather`` and ``weather_simple`` added.
    """
    # Basic data.
    # Capacity: impute with the mean of the coerced values; the raw column may
    # still hold non-numeric text, for which a mean cannot be computed.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    # Round number: first run of digits in "match", 0 when there is none
    df["match_num"] = (
        df["match"].astype(str).str.extract(r'(\d+)', expand=False).astype(float).fillna(0)
    )

    # Number of TV broadcasters: count of "／"-separated entries, 0 if missing
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # Addition 1: 1 when the broadcaster text mentions "NHK総合"
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    # Addition 2: bad-weather flag, 1 when the description mentions snow,
    # thunder or a typhoon (conditions worse than plain rain)
    def check_bad_weather(x):
        if pd.isnull(x):
            return 0
        return 1 if any(word in x for word in ("雪", "雷", "台風")) else 0

    df["is_bad_weather"] = df["weather"].apply(check_bad_weather)

    # Existing weather category. Rain is checked before sun, so a description
    # that mentions both is classified as "rain".
    def process_weather(x):
        if pd.isnull(x):
            return "other"
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        return "other"

    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. K-Fold Target Encoding & Interaction Features
# --------------------------------------------------
base_features = ["capa", "month", "match_num", "tv_num", "is_nhk", "is_bad_weather"]
te_targets = ["home_team", "away_team", "stage", "week", "weather_simple"]

X = train_all
y = train_all["y"]
X_test = test_all

# 5-fold split, shuffled with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Running sum of the per-fold test predictions
test_preds = np.zeros(len(X_test))
# Validation RMSE of each fold
scores = []

print("K-Fold 学習開始 (NHKフラグ & ドリームマッチ度を追加)...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    # Per-fold copies, so the added columns never touch train_all / test_all
    X_tr, y_tr = X.iloc[train_idx].copy(), y.iloc[train_idx].copy()
    X_val, y_val = X.iloc[val_idx].copy(), y.iloc[val_idx].copy()
    X_te = X_test.copy()
    fold_frames = (X_tr, X_val, X_te)

    # Target encoding learned from this fold's training rows; categories not
    # seen there fall back to the fold's mean target.
    global_mean = y_tr.mean()
    for col in te_targets:
        mean_map = X_tr.groupby(col)["y"].mean().to_dict()
        for frame in fold_frames:
            frame[f"TE_{col}"] = frame[col].map(mean_map).fillna(global_mean)

    # Addition 3: interaction feature, the product of the encoded home-team
    # and away-team values
    for frame in fold_frames:
        frame["big_match_score"] = frame["TE_home_team"] * frame["TE_away_team"]

    # Feature set fed to the model
    use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match_score"]

    # Training: more trees and a lower learning rate than the previous cell
    model = lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.02,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
    )
    fit_callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=False),
        lgb.log_evaluation(0),
    ]
    model.fit(
        X_tr[use_cols],
        y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        callbacks=fit_callbacks,
    )

    # Evaluation on this fold's validation rows
    val_pred = model.predict(X_val[use_cols])
    score = np.sqrt(MSE(y_val, val_pred))
    scores.append(score)
    print(f"Fold {fold+1} RMSE: {score:.2f}")

    # Accumulate the test prediction (averaged after the loop)
    test_preds += model.predict(X_te[use_cols])

# --------------------------------------------------
# 4. Output the result
# --------------------------------------------------
# Average the summed test predictions over the number of folds that were
# actually trained (one score is appended per fold) instead of a literal 5,
# so the average stays correct if n_splits is changed.
final_preds = test_preds / len(scores)
print("-" * 30)
print(f"平均RMSE: {np.mean(scores):.2f}")
print("-" * 30)

# Write the averaged predictions into column 1 of the template
sample_submit[1] = final_preds
sample_submit.to_csv("submit_nhk_bigmatch.csv", index=False, header=False)
print("作成完了: submit_nhk_bigmatch.csv")

K-Fold 学習開始 (NHKフラグ & ドリームマッチ度を追加)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 466
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 10
[LightGBM] [Info] Start training from score 10685.768246
Fold 1 RMSE: 3700.91
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 465
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 10
[LightGBM] [Info] Start training from score 10599.138284
Fold 2 RMSE: 3178.12
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 466
[LightGBM] [Info] Number of data poin

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold
import datetime

# --------------------------------------------------
# 1. Load and combine the input files
# --------------------------------------------------
# Every input is a CSV in the working directory; the submission template is
# read without a header row.
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Stack each base file on its "_add" counterpart and renumber the rows 0..n-1.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)

# Left-join the conditions on "id", then the stadium table on the stadium name.
train_all = train_combined.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)
test_all = test.merge(condition_combined, on="id", how="left").merge(
    stadium, left_on="stadium", right_on="name", how="left"
)

# --------------------------------------------------
# 2. 特徴量エンジニアリング (スタジアムと祝日を強化)
# --------------------------------------------------
def preprocess(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Reads the columns ``capa``, ``gameday``, ``match``, ``tv``, ``year`` and
    ``weather``; writes ``capa``, ``month``, ``week``, ``match_num``,
    ``tv_num``, ``year``, ``date_str``, ``is_holiday``, ``is_nhk`` and
    ``weather_simple``.

    Args:
        df: pandas DataFrame holding the columns listed above. ``gameday`` is
            parsed as ``"<month>/<day>(<weekday character>..."``.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Coerce to numbers first and impute from the coerced values. Taking the
    # mean of the raw column fails as soon as it holds a non-numeric string,
    # which is exactly the case errors="coerce" is there to absorb.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())

    # Leading integer of gameday is the month; the character right after "("
    # is the weekday.
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    # First run of digits in the match label; 0 when there is none.
    df["match_num"] = df["match"].astype(str).str.extract(r'(\d+)').astype(float).fillna(0)
    # Number of "／"-separated entries in tv; 0 when tv is missing.
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # Keep the year as an integer feature so the model can pick up a trend.
    df["year"] = df["year"].astype(int)

    # Japanese public holidays as "MM/DD", keyed by year. Several holidays
    # move from year to year (Monday holidays, equinoxes, substitute days),
    # so one year-agnostic list mislabels them.
    holidays_by_year = {
        2012: {
            "01/01", "01/02", "01/09", "02/11", "03/20", "04/29", "04/30",
            "05/03", "05/04", "05/05", "07/16", "09/17", "09/22", "10/08",
            "11/03", "11/23", "12/23", "12/24",
        },
        2013: {
            "01/01", "01/14", "02/11", "03/20", "04/29", "05/03", "05/04",
            "05/05", "05/06", "07/15", "09/16", "09/23", "10/14", "11/03",
            "11/04", "11/23", "12/23",
        },
        2014: {
            "01/01", "01/13", "02/11", "03/21", "04/29", "05/03", "05/04",
            "05/05", "05/06", "07/21", "09/15", "09/23", "10/13", "11/03",
            "11/23", "11/24", "12/23",
        },
    }
    # Years without a table fall back to the previous year-agnostic list.
    fallback_holidays = {
        "01/01", "01/14", "02/11", "03/20", "04/29", "05/03", "05/04", "05/05", "05/06",
        "07/15", "07/21", "09/15", "09/16", "09/23", "10/13", "10/14", "11/03", "11/04",
        "11/23", "11/24", "12/23",
    }

    # Part of gameday before "(", compared against the "MM/DD" entries above.
    df["date_str"] = df["gameday"].apply(lambda x: x.split("(")[0])

    # A day off is a Saturday, a Sunday, or a listed holiday of that year.
    df["is_holiday"] = [
        int(
            weekday in ("土", "日")
            or month_day in holidays_by_year.get(int(year), fallback_holidays)
        )
        for weekday, month_day, year in zip(df["week"], df["date_str"], df["year"])
    ]

    # 1 when the tv text mentions "NHK総合".
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    def process_weather(x):
        """Collapse the weather text to rain / sunny / cloudy / other (first match wins)."""
        if pd.isnull(x):
            return "other"
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        if "曇" in x:
            return "cloudy"
        return "other"
    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

# Apply the feature engineering to both the training and the test frame.
train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. K-Fold target encoding (now including the stadium column)
# --------------------------------------------------
# Features passed to the model unchanged.
base_features = ["capa", "year", "month", "match_num", "tv_num", "is_nhk", "is_holiday"]

# Columns replaced by the per-category mean of y; "stadium" is the addition
# this experiment is about.
te_targets = ["home_team", "away_team", "stage", "week", "stadium", "weather_simple"]

X = train_all
y = train_all["y"]
X_test = test_all

# The fold count is named once so the test-prediction average at the end
# cannot drift out of sync with the splitter.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
scores = []

print("学習開始: スタジアムTE ＆ 祝日判定入り...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr = X.iloc[train_idx].copy()
    y_tr = y.iloc[train_idx].copy()
    X_val = X.iloc[val_idx].copy()
    y_val = y.iloc[val_idx].copy()
    X_te = X_test.copy()

    # --- Target encoding ---
    # Category means come from the training part of the fold only; categories
    # missing from it fall back to that part's overall mean of y. The fallback
    # is the same for every column, so it is computed once per fold.
    # NOTE(review): the map is applied back onto the rows it was computed
    # from, so each training row's encoding includes its own target value.
    global_mean = y_tr.mean()
    for col in te_targets:
        mean_map = X_tr.groupby(col)["y"].mean().to_dict()
        for part in (X_tr, X_val, X_te):
            part[f"TE_{col}"] = part[col].map(mean_map).fillna(global_mean)

    # --- Interaction: product of the two teams' encoded means ---
    for part in (X_tr, X_val, X_te):
        part["big_match"] = part["TE_home_team"] * part["TE_away_team"]

    # Full list of model inputs.
    use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match"]

    # --- Model training (LightGBM) ---
    model = lgb.LGBMRegressor(
        random_state=42,
        n_estimators=3000,
        learning_rate=0.02,
        num_leaves=40,       # raised to allow somewhat more complex patterns
        colsample_bytree=0.8,
        n_jobs=-1
    )

    # NOTE(review): early stopping monitors the same fold that is scored
    # below, so the printed RMSE comes from data that chose the stopping round.
    model.fit(
        X_tr[use_cols], y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(0)
        ]
    )

    # Validation RMSE of this fold.
    score = np.sqrt(MSE(y_val, model.predict(X_val[use_cols])))
    scores.append(score)
    print(f"Fold {fold+1} RMSE: {score:.2f}")

    # Accumulate test predictions; they are averaged after the loop.
    test_preds += model.predict(X_te[use_cols])

# --------------------------------------------------
# 4. Write the submission file
# --------------------------------------------------
final_preds = test_preds / N_SPLITS

print("-" * 30)
print(f"平均RMSE: {np.mean(scores):.2f}")
print("-" * 30)

# Column 1 of the header-less template receives the predictions.
sample_submit[1] = final_preds
sample_submit.to_csv("submit_stadium_te.csv", index=False, header=False)
print("作成完了: submit_stadium_te.csv")

学習開始: スタジアムTE ＆ 祝日判定入り...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 526
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 13
[LightGBM] [Info] Start training from score 10685.768246
Fold 1 RMSE: 3549.42
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015332 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 524
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 13
[LightGBM] [Info] Start training from score 10599.138284
Fold 2 RMSE: 2944.06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 525
[LightGBM] [Info] Number of data points in the tr

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the CSV files
# --------------------------------------------------
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# Read without a header row, so the columns are labelled 0, 1, ...
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Join: stack each base file on its "_add" counterpart (fresh 0..n-1 index),
# then left-join the condition table on "id" and the stadium table on name.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)
train_all = train_combined.merge(condition_combined, on="id", how="left")
train_all = train_all.merge(stadium, left_on="stadium", right_on="name", how="left")
test_all = test.merge(condition_combined, on="id", how="left")
test_all = test_all.merge(stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 2. 特徴量エンジニアリング (ベスト版に戻す)
# --------------------------------------------------
def preprocess(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Reads ``capa``, ``gameday``, ``match``, ``tv`` and ``weather``; writes
    ``capa``, ``month``, ``week``, ``match_num``, ``tv_num``, ``is_nhk`` and
    ``weather_simple``.

    Args:
        df: pandas DataFrame holding the columns listed above. ``gameday`` is
            parsed as ``"<month>/<day>(<weekday character>..."``.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Coerce to numbers first and impute from the coerced values. Taking the
    # mean of the raw column fails as soon as it holds a non-numeric string,
    # which is exactly the case errors="coerce" is there to absorb.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())

    # Leading integer of gameday is the month; the character right after "("
    # is the weekday.
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    # First run of digits in the match label; 0 when there is none.
    df["match_num"] = df["match"].astype(str).str.extract(r'(\d+)').astype(float).fillna(0)
    # Number of "／"-separated entries in tv; 0 when tv is missing.
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # 1 when the tv text mentions "NHK総合".
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    def process_weather(x):
        """Collapse the weather text to rain / sunny / other (rain is checked first)."""
        if pd.isnull(x):
            return "other"
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        return "other"
    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

# Apply the feature engineering to both the training and the test frame.
train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. Seed averaging (train with three random seeds and average)
# --------------------------------------------------
# The stadium target encoding is dropped again in this cell.
base_features = ["capa", "month", "match_num", "tv_num", "is_nhk"]
te_targets = ["home_team", "away_team", "stage", "week", "weather_simple"]

X = train_all
y = train_all["y"]
X_test = test_all

# Three different random seeds; each drives both the fold split and the model.
SEEDS = [42, 2023, 9999]
# The fold count is named once so the per-seed average below cannot drift out
# of sync with the splitter.
N_SPLITS = 5
final_test_preds = np.zeros(len(X_test))

print("Seed Averaging 学習開始 (時間がかかります)...")

for seed in SEEDS:
    print(f"\n=== Training with Random Seed: {seed} ===")

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    seed_preds = np.zeros(len(X_test))
    scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_tr = X.iloc[train_idx].copy()
        y_tr = y.iloc[train_idx].copy()
        X_val = X.iloc[val_idx].copy()
        y_val = y.iloc[val_idx].copy()
        X_te = X_test.copy()

        # --- Target encoding ---
        # Category means come from the training part of the fold only;
        # categories missing from it fall back to that part's overall mean of
        # y, which is computed once per fold.
        # NOTE(review): the map is applied back onto the rows it was computed
        # from, so each training row's encoding includes its own target value.
        global_mean = y_tr.mean()
        for col in te_targets:
            mean_map = X_tr.groupby(col)["y"].mean().to_dict()
            for part in (X_tr, X_val, X_te):
                part[f"TE_{col}"] = part[col].map(mean_map).fillna(global_mean)

        # --- Interaction: product of the two teams' encoded means ---
        for part in (X_tr, X_val, X_te):
            part["big_match"] = part["TE_home_team"] * part["TE_away_team"]

        use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match"]

        # --- LightGBM ---
        model = lgb.LGBMRegressor(
            random_state=seed,  # varies per outer iteration
            n_estimators=2000,
            learning_rate=0.03,
            num_leaves=31,
            colsample_bytree=0.8,
            n_jobs=-1
        )

        # NOTE(review): early stopping monitors the same fold that is scored
        # below, so the reported RMSE comes from data that chose the stopping round.
        model.fit(
            X_tr[use_cols], y_tr,
            eval_set=[(X_val[use_cols], y_val)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(0)
            ]
        )

        # Validation RMSE of this fold.
        score = np.sqrt(MSE(y_val, model.predict(X_val[use_cols])))
        scores.append(score)

        seed_preds += model.predict(X_te[use_cols])

    # Mean validation RMSE over the folds of this seed.
    print(f"Seed {seed} Average RMSE: {np.mean(scores):.2f}")

    # Add this seed's fold-averaged prediction; the sum is divided by the
    # number of seeds after the loop.
    final_test_preds += seed_preds / N_SPLITS

# --------------------------------------------------
# 4. Write the submission file
# --------------------------------------------------
# Average over the seeds.
final_test_preds = final_test_preds / len(SEEDS)

# Column 1 of the header-less template receives the predictions.
sample_submit[1] = final_test_preds
sample_submit.to_csv("submit_seed_ensemble.csv", index=False, header=False)

print("\n作成完了: submit_seed_ensemble.csv")

Seed Averaging 学習開始 (時間がかかります)...

=== Training with Random Seed: 42 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015012 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 466
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 10
[LightGBM] [Info] Start training from score 10685.768246
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 465
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 10
[LightGBM] [Info] Start training from score 10599.138284
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [I

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the CSV files
# --------------------------------------------------
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# Read without a header row, so the columns are labelled 0, 1, ...
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Join: stack each base file on its "_add" counterpart (fresh 0..n-1 index),
# then left-join the condition table on "id" and the stadium table on name.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)
train_all = train_combined.merge(condition_combined, on="id", how="left")
train_all = train_all.merge(stadium, left_on="stadium", right_on="name", how="left")
test_all = test.merge(condition_combined, on="id", how="left")
test_all = test_all.merge(stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 2. 特徴量エンジニアリング
# --------------------------------------------------
def preprocess(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Reads ``capa``, ``gameday``, ``match``, ``tv``, ``time``, ``temperature``,
    ``humidity`` and ``weather``; writes ``capa``, ``month``, ``week``,
    ``match_num``, ``tv_num``, ``is_nhk``, ``kick_off_hour``, ``temperature``,
    ``humidity`` and ``weather_simple``.

    Args:
        df: pandas DataFrame holding the columns listed above. ``gameday`` is
            parsed as ``"<month>/<day>(<weekday character>..."`` and ``time``
            as ``"<hour>:<minute>"``.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # --- Basic columns ---
    # Coerce to numbers first and impute from the coerced values. Taking the
    # mean of the raw column fails as soon as it holds a non-numeric string,
    # which is exactly the case errors="coerce" is there to absorb.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    # First run of digits in the match label; 0 when there is none.
    df["match_num"] = df["match"].astype(str).str.extract(r'(\d+)').astype(float).fillna(0)
    # Number of "／"-separated entries in tv; 0 when tv is missing.
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    # --- Kick-off hour: "14:04" -> 14 ---
    df["kick_off_hour"] = df["time"].apply(lambda x: int(x.split(":")[0]))

    # --- Temperature: numeric, with missing/unparseable values set to 20.0 ---
    df["temperature"] = pd.to_numeric(df["temperature"], errors='coerce').fillna(20.0)

    # --- Humidity: "50%" -> 50.0 ---
    def parse_humidity(x):
        """Return the humidity as a float, or 50.0 when it is missing or unparseable."""
        try:
            value = float(str(x).replace("%", ""))
        except ValueError:
            return 50.0
        # float("nan") parses without error, so a missing value has to be
        # caught explicitly or it would slip through as NaN.
        return 50.0 if pd.isnull(value) else value
    df["humidity"] = df["humidity"].apply(parse_humidity)

    def process_weather(x):
        """Collapse the weather text to rain / sunny / cloudy / other (first match wins)."""
        if pd.isnull(x):
            return "other"
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        if "曇" in x:
            return "cloudy"
        return "other"
    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

# Apply the feature engineering to both the training and the test frame.
train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. K-Fold & seed averaging
# --------------------------------------------------
# The kick-off hour is target-encoded as well, i.e. replaced by the mean of y
# over the training matches that start in the same hour.
te_targets = ["home_team", "away_team", "stage", "week", "kick_off_hour", "weather_simple"]

# Numeric features passed to the model unchanged.
base_features = ["capa", "month", "match_num", "tv_num", "is_nhk", "temperature", "humidity"]

X = train_all
y = train_all["y"]
X_test = test_all

# Three seeds; each drives both the fold split and the model.
SEEDS = [42, 2023, 108]
# The fold count is named once so the per-seed average below cannot drift out
# of sync with the splitter.
N_SPLITS = 5
final_test_preds = np.zeros(len(X_test))

print(f"学習開始: 時間・気温・湿度を追加 (Seed x{len(SEEDS)})...")

for seed in SEEDS:
    print(f"\n--- Seed: {seed} ---")

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    seed_preds = np.zeros(len(X_test))
    scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_tr = X.iloc[train_idx].copy()
        y_tr = y.iloc[train_idx].copy()
        X_val = X.iloc[val_idx].copy()
        y_val = y.iloc[val_idx].copy()
        X_te = X_test.copy()

        # Target encoding: category means come from the training part of the
        # fold only; categories missing from it fall back to that part's
        # overall mean of y, which is computed once per fold.
        # NOTE(review): the map is applied back onto the rows it was computed
        # from, so each training row's encoding includes its own target value.
        global_mean = y_tr.mean()
        for col in te_targets:
            mean_map = X_tr.groupby(col)["y"].mean().to_dict()
            for part in (X_tr, X_val, X_te):
                part[f"TE_{col}"] = part[col].map(mean_map).fillna(global_mean)

        # Interaction: product of the two teams' encoded means.
        for part in (X_tr, X_val, X_te):
            part["big_match"] = part["TE_home_team"] * part["TE_away_team"]

        # Full list of model inputs.
        use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match"]

        # LightGBM
        model = lgb.LGBMRegressor(
            random_state=seed,
            n_estimators=3000,
            learning_rate=0.03,
            num_leaves=34,  # slightly raised because more features are used
            colsample_bytree=0.8,
            n_jobs=-1
        )

        # NOTE(review): early stopping monitors the same fold that is scored
        # below, so the reported RMSE comes from data that chose the stopping round.
        model.fit(
            X_tr[use_cols], y_tr,
            eval_set=[(X_val[use_cols], y_val)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(0)
            ]
        )

        # Validation RMSE of this fold.
        score = np.sqrt(MSE(y_val, model.predict(X_val[use_cols])))
        scores.append(score)

        seed_preds += model.predict(X_te[use_cols])

    print(f"Seed {seed} RMSE: {np.mean(scores):.2f}")
    # Add this seed's fold-averaged prediction; divided by the seed count below.
    final_test_preds += seed_preds / N_SPLITS

# --------------------------------------------------
# 4. Write the submission file
# --------------------------------------------------
final_test_preds = final_test_preds / len(SEEDS)

# Column 1 of the header-less template receives the predictions.
sample_submit[1] = final_test_preds
sample_submit.to_csv("submit_time_weather.csv", index=False, header=False)

print("\n作成完了: submit_time_weather.csv")

学習開始: 時間・気温・湿度を追加 (Seed x3)...

--- Seed: 42 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 788
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 13
[LightGBM] [Info] Start training from score 10685.768246
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 784
[LightGBM] [Info] Number of data points in the train set: 1562, number of used features: 13
[LightGBM] [Info] Start training from score 10599.138284
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 791
[LightGBM] [Info] Number of data points in the train set: 1562, numb

In [None]:
# --------------------------------------------------
# 0. ライブラリのインストール
# --------------------------------------------------
# CatBoostはColabに標準で入っていないためインストールします
!pip install catboost

import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor # 新戦力
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the CSV files
# --------------------------------------------------
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# Read without a header row, so the columns are labelled 0, 1, ...
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Join: stack each base file on its "_add" counterpart (fresh 0..n-1 index),
# then left-join the condition table on "id" and the stadium table on name.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)
train_all = train_combined.merge(condition_combined, on="id", how="left")
train_all = train_all.merge(stadium, left_on="stadium", right_on="name", how="left")
test_all = test.merge(condition_combined, on="id", how="left")
test_all = test_all.merge(stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 2. 特徴量エンジニアリング (前回失敗した要素を除去し、ベスト版に戻す)
# --------------------------------------------------
def preprocess(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Reads ``capa``, ``gameday``, ``match``, ``tv`` and ``weather``; writes
    ``capa``, ``month``, ``week``, ``match_num``, ``tv_num``, ``is_nhk`` and
    ``weather_simple``.

    Args:
        df: pandas DataFrame holding the columns listed above. ``gameday`` is
            parsed as ``"<month>/<day>(<weekday character>..."``.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Coerce to numbers first and impute from the coerced values. Taking the
    # mean of the raw column fails as soon as it holds a non-numeric string,
    # which is exactly the case errors="coerce" is there to absorb.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())

    # Leading integer of gameday is the month; the character right after "("
    # is the weekday.
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    # First run of digits in the match label; 0 when there is none.
    df["match_num"] = df["match"].astype(str).str.extract(r'(\d+)').astype(float).fillna(0)
    # Number of "／"-separated entries in tv; 0 when tv is missing.
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # 1 when the tv text mentions "NHK総合".
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    def process_weather(x):
        """Collapse the weather text to rain / sunny / other (rain is checked first)."""
        if pd.isnull(x):
            return "other"
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        return "other"
    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

# Apply the feature engineering to both the training and the test frame.
train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. K-Fold ensemble (LightGBM + CatBoost)
# --------------------------------------------------
# Columns replaced by the per-category mean of y.
te_targets = ["home_team", "away_team", "stage", "week", "weather_simple"]
# Features passed to the models unchanged.
base_features = ["capa", "month", "match_num", "tv_num", "is_nhk"]

X = train_all
y = train_all["y"]
X_test = test_all

# The fold count is named once so the per-fold weights of the test
# predictions below cannot drift out of sync with the splitter.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Accumulators for the fold-averaged test predictions of each model.
lgb_preds = np.zeros(len(X_test))
cat_preds = np.zeros(len(X_test))
scores = []

print("ハイブリッド学習開始 (LightGBM + CatBoost)...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr = X.iloc[train_idx].copy()
    y_tr = y.iloc[train_idx].copy()
    X_val = X.iloc[val_idx].copy()
    y_val = y.iloc[val_idx].copy()
    X_te = X_test.copy()

    # --- Target encoding ---
    # Category means come from the training part of the fold only; categories
    # missing from it fall back to that part's overall mean of y, which is
    # computed once per fold.
    # NOTE(review): the map is applied back onto the rows it was computed
    # from, so each training row's encoding includes its own target value.
    global_mean = y_tr.mean()
    for col in te_targets:
        mean_map = X_tr.groupby(col)["y"].mean().to_dict()
        for part in (X_tr, X_val, X_te):
            part[f"TE_{col}"] = part[col].map(mean_map).fillna(global_mean)

    # --- Interaction: product of the two teams' encoded means ---
    for part in (X_tr, X_val, X_te):
        part["big_match"] = part["TE_home_team"] * part["TE_away_team"]

    # Full list of model inputs.
    use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match"]

    # NOTE(review): both models early-stop on the same fold that is scored
    # below, so the printed RMSEs come from data that chose the stopping round.

    # ==========================================
    # Model 1: LightGBM
    # ==========================================
    model_lgb = lgb.LGBMRegressor(
        random_state=42,
        n_estimators=3000,
        learning_rate=0.03,
        num_leaves=31,
        n_jobs=-1,
        verbose=-1
    )
    model_lgb.fit(
        X_tr[use_cols], y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    val_pred_lgb = model_lgb.predict(X_val[use_cols])
    lgb_preds += model_lgb.predict(X_te[use_cols]) / N_SPLITS

    # ==========================================
    # Model 2: CatBoost
    # ==========================================
    model_cat = CatBoostRegressor(
        random_state=42,
        iterations=3000,
        learning_rate=0.03,
        depth=6,
        loss_function='RMSE',
        verbose=False,
        allow_writing_files=False
    )
    model_cat.fit(
        X_tr[use_cols], y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        early_stopping_rounds=100
    )
    val_pred_cat = model_cat.predict(X_val[use_cols])
    cat_preds += model_cat.predict(X_te[use_cols]) / N_SPLITS

    # ==========================================
    # Ensemble score (simple average of the two models)
    # ==========================================
    val_pred_avg = (val_pred_lgb + val_pred_cat) / 2
    score = np.sqrt(MSE(y_val, val_pred_avg))
    scores.append(score)

    print(f"Fold {fold+1} RMSE: {score:.2f} (LGB:{np.sqrt(MSE(y_val, val_pred_lgb)):.0f}, CAT:{np.sqrt(MSE(y_val, val_pred_cat)):.0f})")

# --------------------------------------------------
# 4. Write the submission file
# --------------------------------------------------
print("-" * 30)
print(f"平均RMSE: {np.mean(scores):.2f}")
print("-" * 30)

# Submit the plain average of the two models' test predictions.
final_preds = (lgb_preds + cat_preds) / 2

# Column 1 of the header-less template receives the predictions.
sample_submit[1] = final_preds
sample_submit.to_csv("submit_ensemble_lgbm_cat.csv", index=False, header=False)

print("作成完了: submit_ensemble_lgbm_cat.csv")

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
ハイブリッド学習開始 (LightGBM + CatBoost)...
Fold 1 RMSE: 3626.67 (LGB:3685, CAT:3639)
Fold 2 RMSE: 3135.98 (LGB:3165, CAT:3197)
Fold 3 RMSE: 3342.22 (LGB:3361, CAT:3389)
Fold 4 RMSE: 2901.93 (LGB:2933, CAT:2923)
Fold 5 RMSE: 3054.26 (LGB:3151, CAT:3030)
------------------------------
平均RMSE: 3212.21
------------------------------
作成完了: submit_ensemble_lgbm_cat.csv


In [None]:
# --------------------------------------------------
# 0. ライブラリのインストール
# --------------------------------------------------
!pip install catboost xgboost

import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor
import xgboost as xgb  # 新戦力
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the CSV files and join them
# --------------------------------------------------
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# Read without a header row, so the columns are labelled 0, 1, ...
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Stack each base file on its "_add" counterpart (fresh 0..n-1 index), then
# left-join the condition table on "id" and the stadium table on name.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)
train_all = train_combined.merge(condition_combined, on="id", how="left")
train_all = train_all.merge(stadium, left_on="stadium", right_on="name", how="left")
test_all = test.merge(condition_combined, on="id", how="left")
test_all = test_all.merge(stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 2. 特徴量エンジニアリング (実績のあるベスト版)
# --------------------------------------------------
def preprocess(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Reads ``capa``, ``gameday``, ``match``, ``tv`` and ``weather``; writes
    ``capa``, ``month``, ``week``, ``match_num``, ``tv_num``, ``is_nhk`` and
    ``weather_simple``.

    Args:
        df: pandas DataFrame holding the columns listed above. ``gameday`` is
            parsed as ``"<month>/<day>(<weekday character>..."``.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Coerce to numbers first and impute from the coerced values. Taking the
    # mean of the raw column fails as soon as it holds a non-numeric string,
    # which is exactly the case errors="coerce" is there to absorb.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())

    # Leading integer of gameday is the month; the character right after "("
    # is the weekday.
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])
    # First run of digits in the match label; 0 when there is none.
    df["match_num"] = df["match"].astype(str).str.extract(r'(\d+)').astype(float).fillna(0)
    # Number of "／"-separated entries in tv; 0 when tv is missing.
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # 1 when the tv text mentions "NHK総合".
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    def process_weather(x):
        """Collapse the weather text to rain / sunny / other (rain is checked first)."""
        if pd.isnull(x):
            return "other"
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        return "other"
    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

# Apply the feature engineering to both the training and the test frame.
train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. K-Fold ensemble (LightGBM + CatBoost + XGBoost)
# --------------------------------------------------
# Features passed to the models unchanged, and columns to target-encode.
base_features = ["capa", "month", "match_num", "tv_num", "is_nhk"]
te_targets = ["home_team", "away_team", "stage", "week", "weather_simple"]

X = train_all
y = train_all["y"]
X_test = test_all

# The fold count is named once so the per-fold weights of the test
# predictions below cannot drift out of sync with the splitter.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Accumulators for the fold-averaged test predictions of the three models.
pred_lgb = np.zeros(len(X_test))
pred_cat = np.zeros(len(X_test))
pred_xgb = np.zeros(len(X_test))

scores = []

print("最強アンサンブル学習開始 (LGBM + Cat + XGB)...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr = X.iloc[train_idx].copy()
    y_tr = y.iloc[train_idx].copy()
    X_val = X.iloc[val_idx].copy()
    y_val = y.iloc[val_idx].copy()
    X_te = X_test.copy()

    # --- Target encoding ---
    # Category means come from the training part of the fold only; categories
    # missing from it fall back to that part's overall mean of y, which is
    # computed once per fold.
    # NOTE(review): the map is applied back onto the rows it was computed
    # from, so each training row's encoding includes its own target value.
    global_mean = y_tr.mean()
    for col in te_targets:
        mean_map = X_tr.groupby(col)["y"].mean().to_dict()
        for part in (X_tr, X_val, X_te):
            part[f"TE_{col}"] = part[col].map(mean_map).fillna(global_mean)

    # --- Interaction: product of the two teams' encoded means ---
    for part in (X_tr, X_val, X_te):
        part["big_match"] = part["TE_home_team"] * part["TE_away_team"]

    use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match"]

    # NOTE(review): all three models early-stop on the same fold that is
    # scored below, so the printed RMSEs come from data that chose the
    # stopping round.

    # ==========================================
    # 1. LightGBM
    # ==========================================
    model_lgb = lgb.LGBMRegressor(
        random_state=42, n_estimators=3000, learning_rate=0.03, num_leaves=31, n_jobs=-1, verbose=-1
    )
    model_lgb.fit(
        X_tr[use_cols], y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    p_lgb = model_lgb.predict(X_val[use_cols])
    pred_lgb += model_lgb.predict(X_te[use_cols]) / N_SPLITS

    # ==========================================
    # 2. CatBoost
    # ==========================================
    model_cat = CatBoostRegressor(
        random_state=42, iterations=3000, learning_rate=0.03, depth=6, loss_function='RMSE', verbose=False, allow_writing_files=False
    )
    model_cat.fit(
        X_tr[use_cols], y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        early_stopping_rounds=100
    )
    p_cat = model_cat.predict(X_val[use_cols])
    pred_cat += model_cat.predict(X_te[use_cols]) / N_SPLITS

    # ==========================================
    # 3. XGBoost
    # ==========================================
    model_xgb = xgb.XGBRegressor(
        random_state=42,
        n_estimators=3000,
        learning_rate=0.03,
        max_depth=6,
        n_jobs=-1,
        early_stopping_rounds=100
    )
    model_xgb.fit(
        X_tr[use_cols], y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        verbose=False
    )
    p_xgb = model_xgb.predict(X_val[use_cols])
    pred_xgb += model_xgb.predict(X_te[use_cols]) / N_SPLITS

    # ==========================================
    # Ensemble score (simple average of the three models)
    # ==========================================
    p_avg = (p_lgb + p_cat + p_xgb) / 3
    score = np.sqrt(MSE(y_val, p_avg))
    scores.append(score)
    print(f"Fold {fold+1} RMSE: {score:.2f} (LGB:{np.sqrt(MSE(y_val, p_lgb)):.0f}, CAT:{np.sqrt(MSE(y_val, p_cat)):.0f}, XGB:{np.sqrt(MSE(y_val, p_xgb)):.0f})")

# --------------------------------------------------
# 4. Write the submission file
# --------------------------------------------------
print("-" * 30)
print(f"平均RMSE: {np.mean(scores):.2f}")
print("-" * 30)

# Plain, equally weighted average of the three models' test predictions.
final_preds = (pred_lgb + pred_cat + pred_xgb) / 3

# Column 1 of the header-less template receives the predictions.
sample_submit[1] = final_preds
sample_submit.to_csv("submit_3model_ensemble.csv", index=False, header=False)
print("作成完了: submit_3model_ensemble.csv")

最強アンサンブル学習開始 (LGBM + Cat + XGB)...
Fold 1 RMSE: 3702.27 (LGB:3685, CAT:3639, XGB:3954)
Fold 2 RMSE: 3183.49 (LGB:3165, CAT:3197, XGB:3449)
Fold 3 RMSE: 3312.35 (LGB:3361, CAT:3389, XGB:3358)
Fold 4 RMSE: 2882.58 (LGB:2933, CAT:2923, XGB:2914)
Fold 5 RMSE: 3075.09 (LGB:3151, CAT:3030, XGB:3227)
------------------------------
平均RMSE: 3231.16
------------------------------
作成完了: submit_3model_ensemble.csv


In [None]:
# --------------------------------------------------
# 0. ライブラリのインストール
# --------------------------------------------------
!pip install catboost

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the CSV files
# --------------------------------------------------
train = pd.read_csv("train.csv")
train_add = pd.read_csv("train_add.csv")
test = pd.read_csv("test.csv")
condition = pd.read_csv("condition.csv")
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")
# Read without a header row, so the columns are labelled 0, 1, ...
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# Join: stack each base file on its "_add" counterpart (fresh 0..n-1 index),
# then left-join the condition table on "id" and the stadium table on name.
train_combined = pd.concat([train, train_add], sort=False, ignore_index=True)
condition_combined = pd.concat([condition, condition_add], sort=False, ignore_index=True)
train_all = train_combined.merge(condition_combined, on="id", how="left")
train_all = train_all.merge(stadium, left_on="stadium", right_on="name", how="left")
test_all = test.merge(condition_combined, on="id", how="left")
test_all = test_all.merge(stadium, left_on="stadium", right_on="name", how="left")

# --------------------------------------------------
# 2. 特徴量エンジニアリング (実績No.1の構成)
# --------------------------------------------------
def preprocess(df):
    """Add the model's feature columns to ``df`` in place and return it.

    Reads the ``capa``, ``gameday``, ``match``, ``tv`` and ``weather``
    columns and writes:

    * ``capa`` - coerced to numeric; unparseable and missing values are
      replaced by the mean of the successfully parsed values.
    * ``month`` - integer in front of the first ``/`` of ``gameday``.
    * ``week`` - the single character right after ``(`` in ``gameday``.
    * ``match_num`` - first run of digits in ``match`` as float, 0 if none.
    * ``tv_num`` - number of ``／``-separated entries in ``tv``, 0 if missing.
    * ``is_nhk`` - 1 when ``tv`` contains ``NHK総合``, else 0.
    * ``weather_simple`` - ``"rain"``, ``"sunny"`` or ``"other"``.

    Args:
        df: DataFrame holding the columns listed above. Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Coerce first, then average: taking the mean of the raw column fails
    # (or yields a meaningless number) while it still holds non-numeric text.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())

    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])

    # expand=False returns a Series instead of a one-column DataFrame.
    df["match_num"] = (
        df["match"]
        .astype(str)
        .str.extract(r"(\d+)", expand=False)
        .astype(float)
        .fillna(0)
    )
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # Dedicated flag for broadcasts that list "NHK総合".
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    def process_weather(x):
        """Collapse a weather description into rain / sunny / other."""
        if pd.isnull(x):
            return "other"
        # Rain is tested first, so a description mentioning both counts as rain.
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        return "other"

    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. K-Fold cross validation (CatBoost)
# --------------------------------------------------
# Columns passed to the model as they are.
base_features = ["capa", "month", "match_num", "tv_num", "is_nhk"]
# Categorical columns that get replaced by per-category target means.
te_targets = ["home_team", "away_team", "stage", "week", "weather_simple"]
# Full model input: raw features, encoded categories and the interaction term.
use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match"]

X = train_all
y = train_all["y"]
X_test = test_all

kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = []
test_preds = np.zeros(len(X_test))

print("CatBoost単体での学習を開始します...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    # Work on copies so the per-fold columns never touch train_all / test_all.
    X_tr, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
    y_tr, y_val = y.iloc[train_idx].copy(), y.iloc[val_idx].copy()
    X_te = X_test.copy()
    fold_frames = (X_tr, X_val, X_te)

    # --- Target encoding, fitted on this fold's training rows only ---
    # Categories absent from the training rows fall back to the fold mean.
    # NOTE(review): X_tr itself is encoded with means that include each row's
    # own target; only the validation/test encodings are out-of-sample.
    global_mean = y_tr.mean()
    for col in te_targets:
        mean_map = X_tr.groupby(col)["y"].mean().to_dict()
        for frame in fold_frames:
            frame[f"TE_{col}"] = frame[col].map(mean_map).fillna(global_mean)

    # --- Interaction: product of the home and away team encodings ---
    for frame in fold_frames:
        frame["big_match"] = frame["TE_home_team"] * frame["TE_away_team"]

    # --- CatBoost regressor ---
    model = CatBoostRegressor(
        iterations=3000,            # number of trees
        learning_rate=0.03,
        depth=6,                    # tree depth
        loss_function='RMSE',
        random_state=42,
        verbose=False,              # keep the training log out of the output
        allow_writing_files=False,
    )

    # NOTE(review): the fold used as eval_set for early stopping is the same
    # fold that is scored below, so the reported RMSE may be optimistic.
    model.fit(
        X_tr[use_cols],
        y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        early_stopping_rounds=100,
    )

    # Score the held-out fold.
    val_pred = model.predict(X_val[use_cols])
    score = np.sqrt(MSE(y_val, val_pred))
    scores.append(score)

    print(f"Fold {fold+1} RMSE: {score:.2f}")

    # Accumulate the test predictions; they are averaged after the loop.
    test_preds += model.predict(X_te[use_cols])

# --------------------------------------------------
# 4. Build the submission file
# --------------------------------------------------
# test_preds holds the sum of every fold model's test predictions. Divide by
# the splitter's own fold count rather than a literal 5 so the average stays
# correct if n_splits is changed.
final_preds = test_preds / kf.get_n_splits()

print("-" * 30)
print(f"CatBoost 平均RMSE: {np.mean(scores):.2f}")
print("-" * 30)

# Store the predictions under column label 1 and write the frame without
# index or header row.
sample_submit[1] = final_preds
sample_submit.to_csv("submit_catboost_only.csv", index=False, header=False)

print("作成完了: submit_catboost_only.csv")

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
CatBoost単体での学習を開始します...
Fold 1 RMSE: 3639.37
Fold 2 RMSE: 3196.70
Fold 3 RMSE: 3389.49
Fold 4 RMSE: 2923.26
Fold 5 RMSE: 3029.65
------------------------------
CatBoost 平均RMSE: 3235.69
------------------------------
作成完了: submit_catboost_only.csv


In [None]:
# =============================================================================
# SIGNATE Jリーグ観客動員数予測モデル (CatBoost + Target Encoding)
# =============================================================================

# --------------------------------------------------
# 0. ライブラリの準備
# --------------------------------------------------
# CatBoostは標準では入っていないためインストールが必要です
!pip install catboost

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

# --------------------------------------------------
# 1. Load the data
# --------------------------------------------------
train = pd.read_csv("train.csv")            # training data (includes attendance)
train_add = pd.read_csv("train_add.csv")    # additional training data
test = pd.read_csv("test.csv")              # rows to predict (no attendance; for submission)
condition = pd.read_csv("condition.csv")    # weather and other per-match details
condition_add = pd.read_csv("condition_add.csv")
stadium = pd.read_csv("stadium.csv")        # stadium specifications
# The submission template has no header row, so its columns are labelled 0, 1, ...
sample_submit = pd.read_csv("sample_submit.csv", header=None)

# --- Combine the data ---
# Stack the base and additional files vertically and renumber the rows from 0.
train_combined = pd.concat([train, train_add], sort=False).reset_index(drop=True)
condition_combined = pd.concat([condition, condition_add], sort=False).reset_index(drop=True)


def _with_match_details(games):
    """Left-join condition rows (on ``id``) and stadium rows (``stadium`` == ``name``)."""
    detailed = pd.merge(games, condition_combined, on="id", how="left")
    return pd.merge(detailed, stadium, left_on="stadium", right_on="name", how="left")


# Train and test receive exactly the same two joins.
train_all = _with_match_details(train_combined)
test_all = _with_match_details(test)

# --------------------------------------------------
# 2. 特徴量エンジニアリング (データ加工)
# --------------------------------------------------
def preprocess(df):
    """Add the model's feature columns to ``df`` in place and return it.

    Reads the ``capa``, ``gameday``, ``match``, ``tv`` and ``weather``
    columns and writes:

    * ``capa`` - coerced to numeric; unparseable and missing values are
      replaced by the mean of the successfully parsed values.
    * ``month`` - integer in front of the first ``/`` of ``gameday``.
    * ``week`` - the single character right after ``(`` in ``gameday``.
    * ``match_num`` - first run of digits in ``match`` as float, 0 if none.
    * ``tv_num`` - number of ``／``-separated entries in ``tv``, 0 if missing.
    * ``is_nhk`` - 1 when ``tv`` contains ``NHK総合``, else 0.
    * ``weather_simple`` - ``"rain"``, ``"sunny"`` or ``"other"``.

    Args:
        df: DataFrame holding the columns listed above. Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # capa (capacity) may contain text. Coerce first, then average: taking the
    # mean of the raw column fails (or yields a meaningless number) while it
    # still holds non-numeric strings.
    capa = pd.to_numeric(df["capa"], errors="coerce")
    df["capa"] = capa.fillna(capa.mean())

    # Month and weekday from the date string,
    # e.g. "03/12(土)" -> month: 3, week: "土".
    df["month"] = df["gameday"].apply(lambda x: int(x.split("/")[0]))
    df["week"] = df["gameday"].apply(lambda x: x.split("(")[1][0])

    # Keep only the number from strings such as "第N節".
    # expand=False returns a Series instead of a one-column DataFrame.
    df["match_num"] = (
        df["match"]
        .astype(str)
        .str.extract(r"(\d+)", expand=False)
        .astype(float)
        .fillna(0)
    )

    # Number of broadcasters: split on the full-width slash and count the parts
    # (hypothesis: more media exposure means a larger crowd).
    df["tv_num"] = df["tv"].apply(lambda x: 0 if pd.isnull(x) else len(str(x).split("／")))

    # --- NHK flag ---
    # Matches whose broadcaster list contains "NHK総合" get a dedicated flag
    # (domain-knowledge feature).
    df["is_nhk"] = df["tv"].apply(lambda x: 1 if "NHK総合" in str(x) else 0)

    def process_weather(x):
        """Collapse a weather description into rain / sunny / other."""
        if pd.isnull(x):
            return "other"
        # Rain is tested first, so a description mentioning both counts as rain.
        if "雨" in x:
            return "rain"
        if "晴" in x:
            return "sunny"
        return "other"

    df["weather_simple"] = df["weather"].apply(process_weather)

    return df

# Apply the feature engineering to both frames.
train_all = preprocess(train_all)
test_all = preprocess(test_all)

# --------------------------------------------------
# 3. K-Fold cross validation (main train/validate loop)
# --------------------------------------------------
# Columns passed to the model as they are.
base_features = ["capa", "month", "match_num", "tv_num", "is_nhk"]
# Categorical columns that get target-encoded (turned into numbers).
te_targets = ["home_team", "away_team", "stage", "week", "weather_simple"]
# Final list of model input columns.
use_cols = base_features + [f"TE_{c}" for c in te_targets] + ["big_match"]

X = train_all
y = train_all["y"]  # target: the attendance to predict
X_test = test_all   # rows for the final prediction

# 5-fold cross validation.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Accumulators for the results.
scores = []
test_preds = np.zeros(len(X_test))

print("CatBoost単体での学習を開始します...")

# Each iteration trains on four folds and validates on the remaining one.
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    # Work on copies so the per-fold columns never touch train_all / test_all.
    X_tr, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
    y_tr, y_val = y.iloc[train_idx].copy(), y.iloc[val_idx].copy()
    X_te = X_test.copy()
    fold_frames = (X_tr, X_val, X_te)

    # --- Target encoding, done inside the loop (after the split) ---
    # The category means come from the training rows (X_tr) only and are then
    # applied to the validation and test rows, so validation targets never
    # leak into the encoding. Categories absent from X_tr fall back to the
    # training-fold mean.
    # NOTE(review): X_tr itself is encoded with means that include each row's
    # own target; only the validation/test encodings are out-of-sample.
    global_mean = y_tr.mean()
    for col in te_targets:
        mean_map = X_tr.groupby(col)["y"].mean().to_dict()
        for frame in fold_frames:
            frame[f"TE_{col}"] = frame[col].map(mean_map).fillna(global_mean)

    # --- Interaction: product of the home and away team encodings ---
    # The value becomes large when both encodings are high.
    for frame in fold_frames:
        frame["big_match"] = frame["TE_home_team"] * frame["TE_away_team"]

    # --- Model (CatBoost) ---
    model = CatBoostRegressor(
        iterations=3000,            # upper bound on the number of trees
        learning_rate=0.03,
        depth=6,                    # tree depth
        loss_function='RMSE',
        random_state=42,
        verbose=False,              # keep the training log out of the output
        allow_writing_files=False,
    )

    # The validation fold is passed as eval_set so early stopping can monitor it.
    # NOTE(review): the same fold is scored below, so the reported RMSE may be
    # optimistic.
    model.fit(
        X_tr[use_cols],
        y_tr,
        eval_set=[(X_val[use_cols], y_val)],
        early_stopping_rounds=100,
    )

    # Predict and score the validation fold (RMSE).
    val_pred = model.predict(X_val[use_cols])
    score = np.sqrt(MSE(y_val, val_pred))
    scores.append(score)

    print(f"Fold {fold+1} RMSE: {score:.2f}")

    # Accumulate the test predictions; they are averaged after the loop.
    test_preds += model.predict(X_te[use_cols])

# --------------------------------------------------
# 4. Report the results
# --------------------------------------------------
# test_preds holds the sum of every fold model's test predictions. Divide by
# the splitter's own fold count rather than a literal 5 so the average stays
# correct if n_splits is changed.
final_preds = test_preds / kf.get_n_splits()

print("-" * 30)
print(f"CatBoost 平均RMSE: {np.mean(scores):.2f}")
print("-" * 30)

# =============================================================================
# Export the processed data and the submission
# =============================================================================
processed_train_path = "processed_train_data.csv"
processed_test_path = "processed_test_data.csv"
submission_path = "submit_catboost_only.csv"

# 1. Preprocessed training data (train_all): the merged frame plus the
#    engineered columns such as tv_num and is_nhk.
train_all.to_csv(processed_train_path, index=False, encoding='utf-8-sig')
print("保存完了: processed_train_data.csv (前処理済み学習データ)")

# 2. Preprocessed test data (test_all).
test_all.to_csv(processed_test_path, index=False, encoding='utf-8-sig')
print("保存完了: processed_test_data.csv (前処理済みテストデータ)")

# 3. Submission file. It is written here explicitly: otherwise the download
#    below only works when a file left over from an earlier run exists, and
#    the final_preds computed in this cell would never be saved.
sample_submit[1] = final_preds
sample_submit.to_csv(submission_path, index=False, header=False)
print("作成完了: submit_catboost_only.csv")

# --------------------------------------------------
# Download helper for Google Colab
# --------------------------------------------------
# Only the import is guarded, so an error raised by a download itself is not
# mistaken for "not running on Colab".
try:
    from google.colab import files
except ImportError:
    print("Google Colab環境ではないため、自動ダウンロードはスキップしました。")
    print("ファイルはカレントディレクトリに保存されています。")
else:
    print("ファイルをダウンロードします...")
    for path in (processed_train_path, processed_test_path, submission_path):
        files.download(path)

CatBoost単体での学習を開始します...
Fold 1 RMSE: 3639.37
Fold 2 RMSE: 3196.70
Fold 3 RMSE: 3389.49
Fold 4 RMSE: 2923.26
Fold 5 RMSE: 3029.65
------------------------------
CatBoost 平均RMSE: 3235.69
------------------------------
保存完了: processed_train_data.csv (前処理済み学習データ)
保存完了: processed_test_data.csv (前処理済みテストデータ)
ファイルをダウンロードします...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>