# Titanic Inference Notebook - exp0001

**テストデータ予測・提出・台帳更新**

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import yaml
import json
import subprocess
from datetime import datetime
from pathlib import Path
import warnings

# Silence library warnings so the notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below — consider narrowing it while debugging.
warnings.filterwarnings("ignore")

# Load the experiment configuration. The encoding is pinned to UTF-8 so that
# non-ASCII values in the YAML parse the same way on every platform instead of
# depending on the locale's default encoding.
with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print(f"実験ID: {cfg['experiment']['id']}")
print(f"推論開始時刻: {datetime.now()}")

In [None]:
# データ読み込み・前処理（training.ipynbと同じ処理）
def load_and_preprocess_data(cfg):
    """Load the raw train/test CSVs and build the model feature matrices.

    Train and test rows are concatenated before preprocessing, so imputation
    statistics (median / mode) and the fare quantile bins are computed over
    both sets together; the frame is then split back by row position.

    Args:
        cfg: Parsed experiment config. Reads ``cfg["paths"]["raw_dir"]`` (the
            directory holding ``train.csv`` and ``test.csv``),
            ``cfg["features"]["use"]`` (feature column names) and
            ``cfg["data"]["target"]`` (target column name).

    Returns:
        A tuple ``(X_train, y_train, X_test, test_data)`` where ``X_train`` /
        ``X_test`` hold only the configured feature columns, ``y_train`` is the
        target column of the training rows, and ``test_data`` is the full
        preprocessed test frame (all columns).
    """
    raw_dir = cfg["paths"]["raw_dir"]
    train_df = pd.read_csv(f"{raw_dir}/train.csv")
    test_df = pd.read_csv(f"{raw_dir}/test.csv")
    n_train = len(train_df)

    # Stack train on top of test so every transformation is applied once.
    all_data = pd.concat([train_df, test_df], ignore_index=True)

    # Impute missing values. The result is assigned back to the column rather
    # than using ``inplace=True`` on a column selection: that chained form
    # operates on a temporary under pandas Copy-on-Write and leaves the NaNs.
    all_data["Age"] = all_data["Age"].fillna(all_data["Age"].median())
    all_data["Fare"] = all_data["Fare"].fillna(all_data["Fare"].median())
    all_data["Embarked"] = all_data["Embarked"].fillna(all_data["Embarked"].mode()[0])

    # Title: the word directly before the first "." in the name, collapsed
    # into a handful of groups. Raw string avoids an invalid "\." escape;
    # expand=False yields a Series for the single capture group.
    all_data["Title"] = all_data["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    title_mapping = {
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Dr": "Rare",
        "Rev": "Rare",
        "Col": "Rare",
        "Major": "Rare",
        "Mlle": "Miss",
        "Countess": "Rare",
        "Ms": "Mrs",
        "Lady": "Rare",
        "Jonkheer": "Rare",
        "Don": "Rare",
        "Dona": "Rare",
        "Mme": "Mrs",
        "Capt": "Rare",
        "Sir": "Rare",
    }
    # Titles missing from the mapping (or names without a title) become "Rare".
    all_data["Title"] = all_data["Title"].map(title_mapping).fillna("Rare")

    # Family features: the passenger counts as one family member.
    all_data["FamilySize"] = all_data["SibSp"] + all_data["Parch"] + 1
    all_data["IsAlone"] = (all_data["FamilySize"] == 1).astype(int)

    # Age bands. Intervals are right-closed, so the first band is (0, 12] and
    # an age of exactly 0 would fall outside every band.
    all_data["AgeBand"] = pd.cut(
        all_data["Age"],
        bins=[0, 12, 18, 35, 60, 100],
        labels=["Child", "Teen", "Adult", "Middle", "Senior"],
    )

    # Fare bands: quartiles of the combined train+test fare distribution.
    all_data["FareBand"] = pd.qcut(all_data["Fare"], q=4, labels=["Low", "Medium", "High", "VeryHigh"])

    # Split back by position: the first n_train rows are the training set.
    train_data = all_data.iloc[:n_train].copy()
    test_data = all_data.iloc[n_train:].copy()

    feature_cols = cfg["features"]["use"]

    X_train = train_data[feature_cols]
    y_train = train_data[cfg["data"]["target"]]
    X_test = test_data[feature_cols]

    return X_train, y_train, X_test, test_data


# Build the model inputs with the preprocessing function defined in this cell.
(X_train, y_train, X_test, test_data) = load_and_preprocess_data(cfg)

# Report the shape and column names of the matrix that will be scored.
print(f"テストデータ形状: {X_test.shape}")
print(f"特徴量: {X_test.columns.tolist()}")

In [None]:
# Load one trained LightGBM booster per CV fold; abort on the first missing file.
models, model_paths = [], []

for fold in range(cfg["cv"]["n_splits"]):
    model_path = f"model/fold{fold}.lgb"

    # Guard clause: every fold is required, so a missing file is fatal.
    if not Path(model_path).exists():
        raise FileNotFoundError(f"モデルファイルが見つかりません: {model_path}")

    model = lgb.Booster(model_file=model_path)
    models.append(model)
    model_paths.append(model_path)
    print(f"Fold {fold} モデル読み込み: {model_path}")

print(f"\n読み込み完了: {len(models)}個のモデル")

In [None]:
# Score the test set with every fold model and average the predictions.
fold_predictions = []  # per-fold raw predictions, in fold order
test_predictions = np.zeros(len(X_test))  # running equal-weight average

print("テストデータ予測開始...")
for i, model in enumerate(models):
    pred = model.predict(X_test, num_iteration=model.best_iteration)
    fold_predictions.append(pred)
    # Each fold contributes 1/len(models) of the ensemble prediction.
    test_predictions += pred / len(models)
    print(f"Fold {i}: 予測完了 (平均: {pred.mean():.4f})")

print("\nアンサンブル予測完了")
print(
    "予測値統計: "
    f"min={test_predictions.min():.4f}, "
    f"max={test_predictions.max():.4f}, "
    f"mean={test_predictions.mean():.4f}"
)

In [None]:
# Decision threshold. Start from the conventional 0.5 and, when out-of-fold
# predictions are available, replace it with the cut-off that maximises F1 on
# them. Any failure in that search keeps the default (best effort).
threshold = 0.5

try:
    from sklearn.metrics import precision_recall_curve

    oof_df = pd.read_parquet("oof.parquet")
    precision, recall, thresholds = precision_recall_curve(oof_df["y_true"], oof_df["y_pred"])

    # precision/recall have one more element than thresholds (a trailing point
    # with no associated threshold). Drop it so every F1 score lines up with
    # exactly one candidate threshold.
    precision, recall = precision[:-1], recall[:-1]
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_threshold_idx = int(np.argmax(f1_scores))
    threshold = float(thresholds[best_threshold_idx])

    print(f"OOFから最適閾値を計算: {threshold:.4f}")
except Exception as exc:
    # Catch Exception rather than everything so Ctrl-C still interrupts, and
    # say why the search was skipped instead of hiding the cause.
    print(f"OOF threshold search skipped ({type(exc).__name__}: {exc})")
    print(f"デフォルト閾値を使用: {threshold}")

# Binary predictions for the submission. ``>=`` matches precision_recall_curve,
# whose precision/recall at thresholds[i] count scores >= thresholds[i] as
# positive.
binary_predictions = (test_predictions >= threshold).astype(int)

print(f"予測分布: {np.bincount(binary_predictions)}")
print(f"生存率: {binary_predictions.mean():.3f}")

In [None]:
# Build the submission table: the id column of the test rows next to the
# thresholded predictions, using the column names given in the config.
submission = pd.DataFrame(
    {
        cfg["data"]["id"]: test_data[cfg["data"]["id"]],
        cfg["data"]["target"]: binary_predictions,
    }
)

submission_path = "submissions/submission.csv"
# Create the output directory on a fresh checkout instead of failing in to_csv.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"提出ファイル作成: {submission_path}")
print("\n提出データサンプル:")
print(submission.head(10))
print(f"\n提出データ形状: {submission.shape}")

In [None]:
# Submit to Kaggle through the CLI, then look up the submission just created.
import csv
import io
import os
import re


def _run_cli(args):
    """Run a command from an argument list (no shell) and never raise on launch.

    Passing a list avoids shell quoting problems with the submission message.
    If the executable cannot be started, a synthetic failed result (return
    code 127, error text in ``stderr``) is returned so callers can handle it
    like any other non-zero exit.
    """
    try:
        return subprocess.run(args, capture_output=True, text=True)
    except OSError as exc:
        return subprocess.CompletedProcess(args, 127, stdout="", stderr=str(exc))


def _parse_latest_submission(csv_text, description):
    """Pick this run's entry out of the CSV submission listing.

    The row whose description equals ``description`` is preferred; otherwise
    the first data row is used. Rows with fewer than six fields are ignored.
    Returns ``{}`` when no usable row exists.

    NOTE(review): columns are looked up by header name (``ref``, ``date``,
    ``description``, ``publicScore``), assuming the CLI prints a header such as
    ``fileName,date,description,status,publicScore,privateScore`` — confirm
    against the installed Kaggle CLI. Unknown headers fall back to the original
    positional layout (0=id, 1=description, 2=submitted_at, 4=public score).
    The listing may not expose a numeric submission id at all, in which case
    "id" holds the first column's value.
    """
    rows = [row for row in csv.reader(io.StringIO(csv_text.strip())) if row]
    if len(rows) < 2:
        return {}

    header = rows[0]
    records = [row for row in rows[1:] if len(row) >= 6]
    if not records:
        return {}

    column = {name: idx for idx, name in enumerate(header)}

    def cell(row, name, fallback_idx):
        idx = column.get(name, fallback_idx)
        return row[idx] if idx < len(row) else ""

    latest = next(
        (row for row in records if cell(row, "description", 1) == description),
        records[0],
    )
    score = cell(latest, "publicScore", 4)
    return {
        "id": cell(latest, "ref", 0),
        "description": cell(latest, "description", 1),
        "submitted_at": cell(latest, "date", 2),
        # A score that is not available yet is recorded as None.
        "public_score": None if score in ("", "None") else score,
    }


# Resolve the Git SHA: prefer the recorded file, then ask git, else "unknown".
try:
    with open("git_sha.txt", "r") as f:
        git_sha = f.read().strip()
except (OSError, UnicodeError):
    try:
        git_sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip()[:8]
    except (OSError, subprocess.CalledProcessError, UnicodeError):
        git_sha = "unknown"

# Submission message, rendered from the configured template.
message = cfg["kaggle"]["message_template"].format(exp_id=cfg["experiment"]["id"], git_sha=git_sha)
competition = cfg["kaggle"]["competition"]

print("Kaggle提出開始...")
print(f"Competition: {competition}")
print(f"Message: {message}")

# Run the submit command.
result = _run_cli(
    ["kaggle", "competitions", "submit", "-c", competition, "-f", submission_path, "-m", message]
)

if result.returncode == 0:
    print("✅ 提出成功!")
    print(result.stdout)

    # Echo the competition name reported back by the CLI, if present.
    submission_match = re.search(r"Successfully submitted to (.+)", result.stdout)
    if submission_match:
        competition_name = submission_match.group(1)
        print(f"Competition: {competition_name}")
else:
    print("❌ 提出失敗")
    print(f"Error: {result.stderr}")

# Only query the history after a successful submit; otherwise the newest entry
# belongs to an earlier run and would be recorded against this experiment.
submission_info = {}
if result.returncode == 0:
    print("\n提出履歴を取得中...")
    submissions_result = _run_cli(["kaggle", "competitions", "submissions", "-c", competition, "-v"])

    if submissions_result.returncode == 0:
        submission_info = _parse_latest_submission(submissions_result.stdout, message)
        if submission_info:
            print(f"Submission ID: {submission_info['id']}")
            print(f"Public Score: {submission_info['public_score']}")
    else:
        print(f"提出履歴取得エラー: {submissions_result.stderr}")

In [None]:
# Build the submission manifest: a JSON record of what produced this submission
# (models, threshold, config/OOF paths, git SHA, prediction statistics and the
# Kaggle submission details gathered earlier).
manifest = {
    "exp_id": cfg["experiment"]["id"],
    "generated_at": datetime.now().isoformat(),
    "models": model_paths,
    "threshold": float(threshold),
    "postprocess": None,
    "oof_path": "oof.parquet",
    "config_path": "config.yaml",
    "git_sha": git_sha,
    "wandb_run": None,  # filled in below when wandb_run.txt is available
    "prediction_stats": {
        "mean": float(test_predictions.mean()),
        "std": float(test_predictions.std()),
        "min": float(test_predictions.min()),
        "max": float(test_predictions.max()),
        "survival_rate": float(binary_predictions.mean()),
    },
    "kaggle_submission": submission_info,
    "notes": cfg["experiment"]["description"],
}

# Attach the W&B run reference: the first line of wandb_run.txt, if the file
# can be read. The file is optional, so a read failure is deliberately ignored.
try:
    with open("wandb_run.txt", "r", encoding="utf-8") as f:
        wandb_info = f.read().strip().split("\n")
    # An empty file yields "", which is stored as None rather than a blank URL.
    manifest["wandb_run"] = wandb_info[0].strip() or None
except (OSError, UnicodeError):
    pass

# Save the manifest. UTF-8 is explicit because ensure_ascii=False writes
# non-ASCII text verbatim, which the locale's default encoding may not accept.
manifest_path = "submissions/submission.manifest.json"
Path(manifest_path).parent.mkdir(parents=True, exist_ok=True)
with open(manifest_path, "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

print(f"\nSubmission manifest保存: {manifest_path}")
print(json.dumps(manifest, indent=2, ensure_ascii=False))

In [None]:
# 実験台帳を更新
def append_experiments_csv(row_data, path="../../experiments.csv"):
    """Append one experiment record to the CSV ledger.

    The header row is written first when the ledger does not exist yet or
    exists but is empty, so the file always starts with the column names.

    Args:
        row_data: Mapping from ledger column name to value. Columns that are
            absent from the mapping are written as empty cells.
        path: Location of the ledger CSV; the file is opened in append mode.

    Raises:
        ValueError: If ``row_data`` contains a key that is not a ledger column
            (``csv.DictWriter`` rejects unknown fields by default).
    """
    import csv
    import os

    header = [
        "exp_id",
        "date",
        "git_sha",
        "wandb_url",
        "cv_metric",
        "cv_mean",
        "cv_std",
        "lb_public",
        "lb_private",
        "data_rev",
        "seed",
        "n_splits",
        "cv_method",
        "split_id",
        "notes",
        "submission_id",
        "submitted_at",
    ]

    # A zero-byte file counts as "no ledger yet": without this check an empty
    # placeholder file would receive data rows but never a header.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0

    # newline="" lets the csv module control line endings itself.
    with open(path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        if needs_header:
            writer.writeheader()
        writer.writerow(row_data)


# Load the CV metrics; a missing file is tolerated and recorded as empty values.
try:
    with open("metrics.json", "r", encoding="utf-8") as f:
        metrics = json.load(f)
except FileNotFoundError:
    metrics = {"cv": {"mean": None, "std": None}}

# Tolerate a metrics file without a "cv" section (or without mean/std) the same
# way as a missing file, instead of failing with KeyError just before the
# ledger is written.
cv_metrics = metrics.get("cv") or {}

# Load the CV split identifier (first row of cv_folds.parquet). This is best
# effort: any read problem falls back to "unknown", but the reason is shown
# and Ctrl-C is no longer swallowed.
try:
    cv_folds_df = pd.read_parquet("cv_folds.parquet")
    split_id = cv_folds_df["split_id"].iloc[0]
except Exception as exc:
    print(f"cv_folds.parquet unavailable ({type(exc).__name__}: {exc}); split_id set to 'unknown'")
    split_id = "unknown"

# Assemble the ledger row.
experiment_row = {
    "exp_id": cfg["experiment"]["id"],
    "date": cfg["experiment"]["date"],
    "git_sha": git_sha,
    "wandb_url": manifest.get("wandb_run"),
    "cv_metric": "auc",
    "cv_mean": cv_metrics.get("mean"),
    "cv_std": cv_metrics.get("std"),
    "lb_public": submission_info.get("public_score"),
    "lb_private": None,  # to be filled in after the competition ends
    "data_rev": "v1",
    "seed": cfg["cv"]["seed"],
    "n_splits": cfg["cv"]["n_splits"],
    "cv_method": cfg["cv"]["method"],
    "split_id": split_id,
    "notes": cfg["experiment"]["description"],
    "submission_id": submission_info.get("id"),
    "submitted_at": submission_info.get("submitted_at"),
}

# Append the row to the ledger and echo the key fields.
append_experiments_csv(experiment_row)
print("\n✅ 実験台帳を更新しました")
print(f"実験ID: {experiment_row['exp_id']}")
print(f"CV AUC: {experiment_row['cv_mean']} ± {experiment_row['cv_std']}")
print(f"Submission ID: {experiment_row['submission_id']}")
print(f"Public Score: {experiment_row['lb_public']}")

In [None]:
# Final report: banner, run facts, produced artifacts, completion time.
_rule = "=" * 50

print("\n" + _rule)
print("INFERENCE SUMMARY")
print(_rule)

# Key facts about this inference run, one per line.
for _line in (
    f"実験ID: {cfg['experiment']['id']}",
    f"使用モデル: {len(models)}個のfoldアンサンブル",
    f"予測閾値: {threshold:.4f}",
    f"予測生存率: {binary_predictions.mean():.3f}",
    f"提出ファイル: {submission_path}",
    f"Submission ID: {submission_info.get('id', 'N/A')}",
    f"Public Score: {submission_info.get('public_score', 'N/A')}",
):
    print(_line)

# Files written by this notebook.
print("\n成果物:")
for _line in (
    f"- 提出CSV: {submission_path}",
    f"- Manifest: {manifest_path}",
    "- 実験台帳: ../../experiments.csv (更新済み)",
):
    print(_line)

print("\n" + _rule)
print(f"推論完了時刻: {datetime.now()}")
print(_rule)