# RSNA Aneurysm Detection Inference - exp0001

**目的**: テストデータでの推論とKaggle提出

**実行内容**:
- 学習済みモデル読み込み
- テストデータ推論（TTA適用）
- アンサンブル予測
- 閾値適用・提出ファイル生成
- Kaggle API自動提出

In [None]:
# 基本ライブラリ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
import json
from pathlib import Path
import warnings
from datetime import datetime
import subprocess

# PyTorch関連
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
from torch.cuda.amp import autocast

# 画像処理
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2

# 進捗バー
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Load the experiment configuration and the evaluation metrics file
with open("config.yaml", "r") as config_file:
    cfg = yaml.safe_load(config_file)

with open("evaluation_metrics.json", "r") as metrics_file:
    eval_metrics = json.load(metrics_file)

print(
    f"Running inference for: {cfg['experiment']['id']}",
    f"Model: {cfg['model']['architecture']}",
    f"Recommended threshold: {eval_metrics['recommended_threshold']:.4f}",
    sep="\n",
)

# Select the compute device named in the configuration
device = torch.device(cfg["environment"]["device"])
print(f"Using device: {device}")

In [None]:
# Model definition (must match the one used during training)
class AneurysmModel(nn.Module):
    """ResNet-50 backbone followed by a small MLP classification head.

    The head's dropout rate and hidden width are read from the module-level
    ``cfg`` (``cfg["model"]["dropout"]`` and ``cfg["model"]["hidden_dim"]``).

    Args:
        model_name: Backbone architecture; only ``"resnet50"`` is supported.
        num_classes: Number of output logits produced by the head.
        pretrained: Forwarded to ``torchvision.models.resnet50``.

    Raises:
        ValueError: If ``model_name`` is not a supported architecture.
    """

    def __init__(self, model_name="resnet50", num_classes=1, pretrained=True):
        super().__init__()

        # Fail fast with a clear message. Previously an unknown name fell
        # through and crashed later because ``in_features`` was never bound.
        if model_name != "resnet50":
            raise ValueError(
                f"Unsupported model architecture: {model_name!r} (supported: 'resnet50')"
            )

        self.backbone = models.resnet50(pretrained=pretrained)
        in_features = self.backbone.fc.in_features
        # Replace the stock FC layer so the backbone returns its features
        # unchanged and the custom head below does the classification.
        self.backbone.fc = nn.Identity()

        self.classifier = nn.Sequential(
            nn.Dropout(cfg["model"]["dropout"]),
            nn.Linear(in_features, cfg["model"]["hidden_dim"]),
            nn.ReLU(),
            nn.Dropout(cfg["model"]["dropout"]),
            nn.Linear(cfg["model"]["hidden_dim"], num_classes),
        )

    def forward(self, x):
        """Return the raw logits (no sigmoid applied) for the batch ``x``."""
        features = self.backbone(x)
        output = self.classifier(features)
        return output


print("Model class defined")

In [None]:
# Load the trained checkpoint of every fold that is present on disk
models_list = []
for fold in range(cfg["cv"]["n_folds"]):
    model_path = f"model/fold_{fold}_best.pth"

    # Missing checkpoints are reported and skipped rather than treated as fatal.
    if not Path(model_path).exists():
        print(f"⚠️  Model file not found: {model_path}")
        continue

    # pretrained=False: the weights come from the checkpoint loaded below.
    model = AneurysmModel(
        model_name=cfg["model"]["architecture"],
        num_classes=cfg["model"]["num_classes"],
        pretrained=False,
    )
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    models_list.append(model)
    print(f"Loaded model for fold {fold + 1}")

print(f"\nTotal models loaded: {len(models_list)}")
if not models_list:
    raise FileNotFoundError("No model files found! Run training.ipynb first.")

In [None]:
# Load the test metadata (placeholder path - adjust to the actual data layout)
data_dir = Path(cfg["paths"]["data_dir"])
test_df = pd.read_csv(data_dir.joinpath("test.csv"))  # path of the actual test file

print(f"Test data shape: {test_df.shape}")
print(f"Test data columns: {test_df.columns.tolist()}")
print("\nFirst few rows:", test_df.head(), sep="\n")

In [None]:
# Test-time transforms (one pipeline per TTA variant)
def get_test_transforms(tta_type=None):
    """Build the albumentations pipeline for a single TTA variant.

    The pipeline is: resize to ``cfg["data"]["image_size"]``, the optional
    TTA augmentation, normalisation with the mean/std taken from
    ``cfg["data"]["normalization"]``, then conversion to a tensor.

    Args:
        tta_type: ``None`` (no augmentation), ``"hflip"``, ``"vflip"`` or
            ``"rotate"``.

    Returns:
        An ``A.Compose`` wrapping the assembled transform list.

    Raises:
        ValueError: If ``tta_type`` is not one of the supported values.
    """
    supported_tta = (None, "hflip", "vflip", "rotate")
    # Reject typos up front: an unknown name used to be silently treated like
    # ``None``, which would count the un-augmented view twice in a TTA average.
    if tta_type not in supported_tta:
        raise ValueError(
            f"Unsupported tta_type: {tta_type!r} (expected one of {supported_tta})"
        )

    base_transforms = [
        A.Resize(cfg["data"]["image_size"][0], cfg["data"]["image_size"][1]),
    ]

    # TTA augmentation, always applied (p=1.0) when selected
    if tta_type == "hflip":
        base_transforms.append(A.HorizontalFlip(p=1.0))
    elif tta_type == "vflip":
        base_transforms.append(A.VerticalFlip(p=1.0))
    elif tta_type == "rotate":
        # NOTE(review): Rotate samples an angle within the limit, so this
        # variant is presumably not deterministic across calls - confirm
        # before adding it to the TTA list.
        base_transforms.append(A.Rotate(limit=10, p=1.0))

    base_transforms.extend(
        [
            A.Normalize(
                mean=cfg["data"]["normalization"]["mean"],
                std=cfg["data"]["normalization"]["std"],
                max_pixel_value=255.0,
            ),
            ToTensorV2(),
        ]
    )

    return A.Compose(base_transforms)


# TTA configuration: flips are used unless cfg["model"]["inference"]["tta_enabled"] is falsy
if cfg["model"].get("inference", {}).get("tta_enabled", True):
    tta_transforms = [None, "hflip", "vflip"]
else:
    tta_transforms = [None]
print(f"TTA transforms: {tta_transforms}")

In [None]:
# Inference-only dataset
class TestDataset(Dataset):
    """Dataset yielding test images (no labels) looked up by ``image_id``.

    Args:
        df: DataFrame with an ``image_id`` column; its index is reset so that
            positional lookups line up with ``__len__``.
        image_dir: Directory containing ``<image_id>.png`` files.
        transform: Optional callable invoked as ``transform(image=...)`` whose
            result is indexed with ``["image"]``.
    """

    def __init__(self, df, image_dir, transform=None):
        self.df = df.reset_index(drop=True)
        self.image_dir = Path(image_dir)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load, colour-convert and (optionally) transform the image at ``idx``.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        row = self.df.iloc[idx]

        # Image loading (adjust to the actual file format / naming convention)
        image_path = self.image_dir / f"{row['image_id']}.png"
        image = cv2.imread(str(image_path))
        # cv2.imread signals failure by returning None instead of raising;
        # surface it here with the offending path rather than letting
        # cvtColor fail with an unrelated assertion.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform:
            augmented = self.transform(image=image)
            image = augmented["image"]

        return image


print("Test dataset class defined")

In [None]:
# Ensemble inference
def predict_with_tta(models, dataloader, device, use_amp=True):
    """Average the sigmoid probabilities of several models over a dataloader.

    The function itself applies no augmentation: it scores exactly what the
    dataloader yields, so TTA is obtained by calling it once per transform
    variant and averaging the results.

    Args:
        models: Non-empty collection of modules called as ``model(images)``
            and returning logits.
        dataloader: Iterable of image batches (tensors supporting ``.to``).
        device: Device the batches are moved to.
        use_amp: Whether to run the forward passes under ``autocast``.

    Returns:
        1-D ``np.ndarray`` with the ensemble-averaged probabilities,
        flattened in dataloader order (empty if the dataloader is empty).

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("predict_with_tta requires at least one model")

    batch_means = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            images = batch.to(device)

            # Forward pass of every model on the same batch
            with autocast(enabled=use_amp):
                logits_per_model = [model(images) for model in models]

            # Upcast before the sigmoid/average: with autocast enabled the
            # logits may be half precision, and averaging float16
            # probabilities would quantise the final scores.
            batch_predictions = [
                torch.sigmoid(logits.float()).cpu().numpy() for logits in logits_per_model
            ]

            # Mean across models, flattened to one value per output element
            batch_means.append(np.mean(batch_predictions, axis=0).ravel())

    if not batch_means:
        return np.array([])
    return np.concatenate(batch_means)


# Run inference once per TTA variant
tta_predictions = []

for tta_type in tta_transforms:
    print(f"\nRunning inference with TTA: {tta_type}")

    # Build the dataset and dataloader for this variant
    test_dataset = TestDataset(
        test_df,
        cfg["paths"]["processed_data"],  # directory holding the test images
        transform=get_test_transforms(tta_type),
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=cfg["train"]["batch_size"] * 2,  # larger batches at inference time
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )

    # Score the test set with the model ensemble
    predictions = predict_with_tta(
        models_list,
        test_loader,
        device,
        use_amp=cfg["environment"]["mixed_precision"],
    )
    tta_predictions.append(predictions)

    print(f"Predictions shape: {predictions.shape}")
    print(f"Prediction range: [{predictions.min():.4f}, {predictions.max():.4f}]")

# Average the per-variant predictions element-wise
final_predictions = np.asarray(tta_predictions).mean(axis=0)
print(f"\n🎯 Final predictions (TTA ensemble):")
print(
    f"  Shape: {final_predictions.shape}",
    f"  Range: [{final_predictions.min():.4f}, {final_predictions.max():.4f}]",
    f"  Mean: {final_predictions.mean():.4f}",
    f"  Std: {final_predictions.std():.4f}",
    sep="\n",
)

In [None]:
# Visualise the distribution of the test predictions
fig, (hist_ax, cdf_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: histogram of the predicted probabilities
hist_ax.hist(final_predictions, bins=50, alpha=0.7, edgecolor="black")
hist_ax.axvline(
    eval_metrics["recommended_threshold"],
    color="red",
    linestyle="--",
    label=f"Recommended threshold: {eval_metrics['recommended_threshold']:.3f}",
)
hist_ax.set_xlabel("Predicted Probability")
hist_ax.set_ylabel("Frequency")
hist_ax.set_title("Test Predictions Distribution")
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Right panel: empirical cumulative distribution
sorted_preds = np.sort(final_predictions)
cumulative = np.arange(1, len(sorted_preds) + 1) / len(sorted_preds)
cdf_ax.plot(sorted_preds, cumulative)
cdf_ax.axvline(eval_metrics["recommended_threshold"], color="red", linestyle="--")
cdf_ax.set_xlabel("Predicted Probability")
cdf_ax.set_ylabel("Cumulative Probability")
cdf_ax.set_title("Cumulative Distribution")
cdf_ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig("test_predictions_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

# Prediction summary at the recommended threshold
threshold = eval_metrics["recommended_threshold"]
positive_predictions = np.sum(final_predictions >= threshold)
positive_ratio = positive_predictions / len(final_predictions)

print(f"\n📊 Test Predictions Summary:")
print(
    f"  Total samples: {len(final_predictions)}",
    f"  Predicted positive (≥{threshold:.3f}): {positive_predictions} ({positive_ratio:.2%})",
    f"  Predicted negative (<{threshold:.3f}): {len(final_predictions) - positive_predictions} ({1 - positive_ratio:.2%})",
    sep="\n",
)

In [None]:
# Submission file creation
def create_submission_file(test_df, predictions, threshold, output_dir="submissions"):
    """Write the Kaggle submission CSVs and return their paths.

    Two files are written, both named after ``cfg["experiment"]["id"]`` and a
    shared timestamp: one with the raw probabilities (primary submission) and
    one with predictions binarised at ``threshold`` (backup).

    Args:
        test_df: DataFrame with an ``image_id`` column.
        predictions: Per-row probabilities aligned with ``test_df``.
        threshold: Cut-off; values ``>= threshold`` become 1 in the binary file.
        output_dir: Directory for the CSVs; created (including parents) if missing.

    Returns:
        Tuple ``(proba_filename, binary_filename)`` of path strings.
    """
    # parents=True so a nested output directory does not fail on first use.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    experiment_id = cfg["experiment"]["id"]
    # One timestamp shared by both files so they can be matched up later.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Probability version (primary submission): probabilities written as-is
    submission_proba = test_df[["image_id"]].copy()
    submission_proba["aneurysm"] = predictions
    proba_filename = f"{output_dir}/{experiment_id}_proba_{timestamp}.csv"
    submission_proba.to_csv(proba_filename, index=False)

    # Binary version (backup)
    submission_binary = test_df[["image_id"]].copy()
    submission_binary["aneurysm"] = (predictions >= threshold).astype(int)
    binary_filename = f"{output_dir}/{experiment_id}_binary_th{threshold:.3f}_{timestamp}.csv"
    submission_binary.to_csv(binary_filename, index=False)

    return proba_filename, binary_filename


# Generate the submission files
proba_file, binary_file = create_submission_file(
    test_df,
    final_predictions,
    eval_metrics["recommended_threshold"],
)

print(f"\n📄 Submission files created:")
print(
    f"  Probability version: {proba_file}",
    f"  Binary version: {binary_file}",
    sep="\n",
)

# Read the probability file back to check what was actually written
submission_df = pd.read_csv(proba_file)
print(f"\n📋 Submission file preview:")
print(submission_df.head(10))
print(f"\nSubmission statistics:")
print(submission_df["aneurysm"].describe())

In [None]:
# Kaggle API submission helper
def submit_to_kaggle(submission_file, competition_name, message):
    """Submit a file with the ``kaggle`` command-line tool.

    The command is passed as an argument list without a shell, so quotes or
    shell metacharacters in ``message`` (or the other arguments) are sent
    literally instead of being interpreted.

    Args:
        submission_file: Path of the CSV to submit.
        competition_name: Kaggle competition slug.
        message: Submission description.

    Returns:
        ``True`` if the command exited with status 0, ``False`` if it failed
        or could not be started.
    """
    cmd = [
        "kaggle",
        "competitions",
        "submit",
        "-c",
        str(competition_name),
        "-f",
        str(submission_file),
        "-m",
        str(message),
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # E.g. the kaggle executable is not installed or an argument is invalid.
        print(f"❌ Error during submission: {str(e)}")
        return False

    if result.returncode == 0:
        print(f"✅ Successfully submitted to Kaggle!")
        print(f"Output: {result.stdout}")
        return True

    print(f"❌ Submission failed: {result.stderr}")
    return False


# Run the submission
# Tag the submission with the current commit. Fall back to a placeholder when
# git or the repository metadata is unavailable, so a missing checkout does
# not abort the submission.
try:
    git_sha = (
        subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL)
        .decode("ascii")
        .strip()[:8]
    )
except (OSError, subprocess.CalledProcessError):
    git_sha = "unknown"
submission_message = cfg["kaggle"]["submission_message"].format(git_sha=git_sha)

print(f"\n🚀 Submitting to Kaggle...")
print(f"Competition: {cfg['kaggle']['competition']}")
print(f"File: {proba_file}")
print(f"Message: {submission_message}")

# Actual submission (if this call is commented out, submit manually)
success = submit_to_kaggle(proba_file, cfg["kaggle"]["competition"], submission_message)

if success:
    print("\n🏆 Submission completed!")
else:
    print("\n⚠️  Manual submission required:")
    print(f"   kaggle competitions submit -c {cfg['kaggle']['competition']} -f {proba_file} -m '{submission_message}'")

In [None]:
# Save inference results (for traceability)
inference_manifest = {
    "experiment_id": cfg["experiment"]["id"],
    "inference_date": datetime.now().isoformat(),
    "git_sha": git_sha,
    # Model information
    # List the fold checkpoints that exist on disk (the same existence check
    # the loading step uses). Enumerating range(len(models_list)) would name
    # the wrong files whenever a fold checkpoint is missing.
    "models_used": [
        f"fold_{fold}_best.pth"
        for fold in range(cfg["cv"]["n_folds"])
        if Path(f"model/fold_{fold}_best.pth").exists()
    ],
    "model_architecture": cfg["model"]["architecture"],
    # Inference settings
    "tta_enabled": len(tta_transforms) > 1,
    "tta_methods": tta_transforms,
    "threshold_used": float(eval_metrics["recommended_threshold"]),
    "mixed_precision": cfg["environment"]["mixed_precision"],
    # Prediction statistics
    "test_samples": len(final_predictions),
    "prediction_statistics": {
        "mean": float(final_predictions.mean()),
        "std": float(final_predictions.std()),
        "min": float(final_predictions.min()),
        "max": float(final_predictions.max()),
        "positive_predictions": int(positive_predictions),
        "positive_ratio": float(positive_ratio),
    },
    # Submission information
    "submission_files": {"probability": proba_file, "binary": binary_file},
    "kaggle_submission": {"competition": cfg["kaggle"]["competition"], "message": submission_message, "success": success},
    # Quality metrics (taken from training)
    "training_performance": {
        "cv_auc": eval_metrics["cv_mean_auc"],
        "oof_auc": eval_metrics["oof_auc"],
        "cv_reliability_score": eval_metrics["quality_audit"]["cv_reliability_score"],
    },
}

# Write the manifest
with open("inference_manifest.json", "w") as f:
    json.dump(inference_manifest, f, indent=2)

# Also save the test predictions (usable later for ensembling)
test_predictions_df = test_df[["image_id"]].copy()
test_predictions_df["predicted_probability"] = final_predictions
test_predictions_df["experiment_id"] = cfg["experiment"]["id"]
test_predictions_df.to_csv("test_predictions.csv", index=False)

print(f"\n💾 Inference artifacts saved:")
print(f"  - inference_manifest.json")
print(f"  - test_predictions.csv")
print(f"  - test_predictions_distribution.png")
print(f"  - {proba_file}")
print(f"  - {binary_file}")

In [None]:
# Google Drive backup
# The lines starting with "!" are IPython shell escapes; {name} is expanded
# from the notebook's Python variables before the command runs.
backup_dir = f"/content/drive/MyDrive/rsna-aneurysm/{cfg['experiment']['id']}"
!mkdir -p "{backup_dir}/submissions"

# Copy the important files to Drive
!cp inference_manifest.json "{backup_dir}/"
!cp test_predictions.csv "{backup_dir}/"
!cp test_predictions_distribution.png "{backup_dir}/"
!cp "{proba_file}" "{backup_dir}/submissions/"
!cp "{binary_file}" "{backup_dir}/submissions/"

print(f"\n☁️  Files backed up to Google Drive: {backup_dir}")

# Experiment completion summary
print(f"\n" + "=" * 60)
print(f"🎯 EXPERIMENT {cfg['experiment']['id'].upper()} - INFERENCE COMPLETED")
print(f"=" * 60)
print(f"Model: {cfg['model']['architecture']}")
print(f"CV Performance: {eval_metrics['cv_mean_auc']:.4f} ± {eval_metrics['cv_std_auc']:.4f}")
print(f"Test Predictions: {len(final_predictions)} samples")
print(f"Predicted Positive: {positive_predictions} ({positive_ratio:.2%})")
print(f"Kaggle Submission: {'✅ Success' if success else '⚠️ Manual required'}")
print(f"\nNext Steps:")
print(f"  1. Monitor Kaggle LB score")
print(f"  2. Update experiments.csv with LB results")
print(f"  3. Plan next experiment (exp0002)")
print(f"\n🚀 Ready for next experiment!")