# RSNA-2025 ベースライン学習 (exp0001_baseline)

このノートブックでは、ベースラインモデル（GradientBoosting）の学習を実行します。

- 実験ID: exp0001_baseline
- モデル: GradientBoostingClassifier
- 特徴量: 年齢、性別、モダリティ
- 目的変数: Aneurysm Present

## 実験設定

実験の再現性を確保するため、以下の設定を使用します：
- SEED = 130
- test_size = 0.2
- stratified split


In [None]:
# 0) Setup (Colab)
import os
import sys
import subprocess
from pathlib import Path

# The presence of google.colab in sys.modules is used as the "running on
# Colab" signal; everything inside the branch below is Colab-only.
IN_COLAB = 'google.colab' in sys.modules
print('IN_COLAB =', IN_COLAB)

if IN_COLAB:
    # Invoke pip through the current interpreter (sys.executable -m pip).
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', '-U', 'pip'], check=True)
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',
                    'pandas', 'polars', 'seaborn', 'scikit-learn', 'matplotlib', 'gcsfs', 'fsspec'], check=True)

    # GCP authentication (ADC)
    from google.colab import auth  # type: ignore
    auth.authenticate_user()

    # Fetch this repository from GitHub.
    REPO_URL = 'https://github.com/Kohei-Arita/RSNA-2025.git'
    REPO_DIR = Path('/content/RSNA-2025')

    # Set the working directory to the parent of the checkout.
    os.chdir(REPO_DIR.parent)

    # Clone only when the checkout is missing. The destination is passed
    # explicitly so the result does not depend on the current directory.
    if not REPO_DIR.exists():
        subprocess.run(['git', 'clone', REPO_URL, str(REPO_DIR)], check=True)
    os.chdir(REPO_DIR)

    # Put the repository's src directory on the import path. The membership
    # check keeps sys.path from accumulating duplicates when the cell is re-run.
    src_dir = str(Path.cwd() / 'src')
    if src_dir not in sys.path:
        sys.path.insert(0, src_dir)

# GCS bucket settings
GCS_BUCKET = 'rsna2025-prod'
GCS_BASE = f'gs://{GCS_BUCKET}'
print('GCS_BASE =', GCS_BASE)


In [None]:
# 1) Load data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

# Random seed for this experiment.
SEED = 130

# Location of the training table under the GCS base URI.
train_uri = f'{GCS_BASE}/train.csv'

# storage_options is passed through pandas to the gs:// filesystem layer.
# NOTE(review): the original note says gcsfs picks up Colab's ADC with this
# token value - confirm 'cloud' against the gcsfs credential documentation.
gcs_options = {'token': 'cloud'}
train = pd.read_csv(train_uri, storage_options=gcs_options)

n_series = train.shape[0]
print(f"Number of training series: {n_series}")
display(train.head())


In [None]:
# 2) Feature engineering
# Turn PatientAge into a number: keep the part before " - " (range form
# "xx - yy"), then pull the first numeric token out of it. Values with no
# numeric token become NaN via errors='coerce'.
age_text = train['PatientAge'].astype(str)
age_lower = age_text.str.split(' - ').str[0]
age_token = age_lower.str.extract(r'([0-9]+(?:\.[0-9]+)?)')[0]
age_vals = pd.to_numeric(age_token, errors='coerce')

# Features: age, sex (Male=1), and one-hot encoded modality.
# Missing ages are filled with the median computed over every row of `train`.
is_male = train['PatientSex'].eq('Male')
X = pd.DataFrame({
    'age': age_vals.fillna(age_vals.median()),
    'sex': is_male.astype(int),
})
mod_dummies = pd.get_dummies(train['Modality'], prefix='mod')
X = pd.concat([X, mod_dummies], axis=1)

# Target: keep the column untouched when it is already int64/int32,
# otherwise cast it to int.
target_col = train['Aneurysm Present']
if target_col.dtype == np.int64 or target_col.dtype == np.int32:
    y = target_col
else:
    y = target_col.astype(int)

print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution: {y.value_counts()}")


In [None]:
# 3) Train / validation split
# Hold out 20% of the rows for validation, stratified on y, with the
# random state fixed to SEED.
split_options = {'test_size': 0.2, 'stratify': y, 'random_state': SEED}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Report the size and class balance of each part.
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Training target distribution:\n{y_train.value_counts()}")
print(f"Validation target distribution:\n{y_val.value_counts()}")


In [None]:
# 4) Model training
# Hyperparameters are collected in one dict and unpacked into the estimator.
gbm_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 8,
    'random_state': SEED,
}
gbm = GradientBoostingClassifier(**gbm_params)
gbm.fit(X_train, y_train)

# Score both splits. Column 1 of predict_proba is taken as the score; it
# corresponds to the second entry of gbm.classes_ (the positive label when
# the target is 0/1).
train_probs = gbm.predict_proba(X_train)[:, 1]
val_probs = gbm.predict_proba(X_val)[:, 1]

# ROC AUC on the training data
train_auc = roc_auc_score(y_train, train_probs)
# ROC AUC on the validation data
val_auc = roc_auc_score(y_val, val_probs)

print(f"GBM Training AUC: {train_auc:.4f}")
print(f"GBM Validation AUC: {val_auc:.4f}")


In [None]:
# 5) Feature importance visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each feature column with the fitted model's importance value and
# order the rows from most to least important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': gbm.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
top_features = feature_importance.head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=top_features)
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances')
plt.show()

# Full ranking as a table.
display(feature_importance)


In [None]:
# 6) Save the model and its settings
import pickle
import json

# Output directory for this experiment, created if it does not exist.
models_dir = Path('models/exp0001_baseline')
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted model as a pickle file.
model_path = models_dir / 'gbm_baseline.pkl'
model_path.write_bytes(pickle.dumps(gbm))

# Save the column names (needed at inference time) together with the
# scores, the seed and the estimator parameters.
MOD_COLUMNS = list(mod_dummies.columns)
metadata = {
    'feature_columns': list(X.columns),
    'mod_columns': MOD_COLUMNS,
    'train_auc': float(train_auc),
    'val_auc': float(val_auc),
    'seed': SEED,
    'model_params': gbm.get_params()
}

metadata_path = models_dir / 'metadata.json'
metadata_path.write_text(json.dumps(metadata, indent=2))

print(f"Model saved to: {models_dir}")
print(f"Training AUC: {train_auc:.4f}")
print(f"Validation AUC: {val_auc:.4f}")
