# LightGBM による不動産価格予測モデル

## 目的
- リーケージを防止した特徴量エンジニアリング
- クロスバリデーションによるモデル検証
- ハイパーパラメータチューニング
- 特徴量重要度の分析

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Matplotlib configuration: fonts able to render Japanese labels, and an
# ASCII minus sign so negative tick labels do not fall back to a missing glyph.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'Hiragino Sans'],
    'axes.unicode_minus': False,
})

print("ライブラリのインポート完了")

## データの読み込み

In [None]:
# Load the raw competition files; feature engineering is deferred so that it
# can be performed inside each cross-validation fold.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/estyle-community-competition-2025/train.csv",
        "../input/estyle-community-competition-2025/test.csv",
    )
)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Keep independent copies of the identifier columns and of the raw target.
train_ids, test_ids = train['Id'].copy(), test['Id'].copy()
target = train['TradePrice'].copy()

# log1p-transformed target (the models are evaluated with RMSLE).
target_log = np.log1p(target)
print(f"\nTarget statistics:")
print(target.describe())

## リーケージ防止を考慮した特徴量エンジニアリング関数

In [None]:
def _fill_with_type_median(df, col, fold_stats, stat_prefix):
    """Impute ``df[col]`` in place with the per-``Type`` median, then the global median.

    The medians are computed from ``df`` and cached in ``fold_stats`` under
    ``f'{stat_prefix}_by_Type'`` / ``f'{stat_prefix}_global'`` only when they
    are not cached yet; otherwise the cached values are reused.
    """
    by_type_key = f'{stat_prefix}_by_Type'
    global_key = f'{stat_prefix}_global'
    if by_type_key not in fold_stats:
        fold_stats[by_type_key] = df.groupby('Type')[col].median().to_dict()
        fold_stats[global_key] = df[col].median()

    per_type_median = df['Type'].map(fold_stats[by_type_key])
    # The second fillna covers both an unseen Type and a Type whose median is
    # itself NaN (every value of that Type was missing when the stats were fit).
    df[col] = df[col].fillna(per_type_median).fillna(fold_stats[global_key])


def _out_of_fold_target_mean(keys, target, global_mean, n_inner):
    """Return an out-of-fold target-mean encoding of ``keys`` as a float array.

    Rows are assigned to ``n_inner`` folds round-robin by position. Each row is
    encoded with the category mean computed from the other folds only, so a
    row's own target never contributes to its encoding. Categories absent from
    the other folds receive ``global_mean``.
    """
    keys = keys.reset_index(drop=True)
    target = target.reset_index(drop=True)
    fold_id = np.arange(len(keys)) % n_inner
    encoded = np.full(len(keys), global_mean, dtype=float)
    for k in range(n_inner):
        held_out = fold_id == k
        if not held_out.any():
            continue
        fit_means = target[~held_out].groupby(keys[~held_out]).mean()
        mapped = keys[held_out].map(fit_means).to_numpy(dtype=float)
        encoded[held_out] = np.where(np.isnan(mapped), global_mean, mapped)
    return encoded


def create_features(df, is_train=True, train_target=None, fold_stats=None,
                    te_inner_splits=5):
    """
    Build model features while keeping fitted statistics separate from their use.

    Parameters:
    -----------
    df : DataFrame
        Input data. It is copied; the caller's frame is not modified.
    is_train : bool
        Whether ``df`` is the training portion. Target-encoding statistics are
        fitted only when this is True and ``train_target`` is given.
    train_target : Series
        Target values for ``df`` (used for target encoding), aligned to ``df``
        by index.
    fold_stats : dict
        Statistics fitted on a training portion. Any statistic missing from
        the dict is computed from ``df`` and stored in it.
    te_inner_splits : int
        Number of inner folds used to target-encode the training rows
        out-of-fold. Values below 2 disable this and encode the training rows
        with the in-sample category means.

    Returns:
    --------
    df_processed : DataFrame
        Processed data.
    fold_stats : dict
        Statistics used for imputation and target encoding, to be passed back
        in when processing validation or test data.
    categorical_features : list of str
        Names of the columns that are converted to ``category`` dtype.
    """
    df = df.copy()

    if fold_stats is None:
        fold_stats = {}

    # ========== Missing-value indicator flags ==========
    # Computed before any imputation so they reflect the raw data.
    flag_sources = {
        'HasBuilding': 'TotalFloorArea',
        'HasFloorPlan': 'FloorPlan',
        'HasStructure': 'Structure',
        'HasBuildingYear': 'BuildingYear',
        'HasRoadAccess': 'Breadth',
        'HasRoadClassification': 'Classification',
        'HasFrontage': 'Frontage',
        'HasLandShape': 'LandShape',
        'HasDirection': 'Direction',
        'HasStation': 'NearestStation',
        'HasRemarks': 'Remarks',
    }
    for flag, source in flag_sources.items():
        df[flag] = df[source].notnull().astype(int)

    # ========== Imputation ==========
    # Station information
    df['NearestStation'] = df['NearestStation'].fillna('No Station')
    _fill_with_type_median(df, 'MinTimeToNearestStation', fold_stats, 'MinTime')

    # Drop columns treated as redundant
    df = df.drop(['MaxTimeToNearestStation', 'TimeToNearestStation'], axis=1, errors='ignore')

    # Region
    df['Region'] = df['Region'].fillna('Unknown')

    # Building information
    df['FloorPlan'] = df['FloorPlan'].fillna('No Building')
    df['TotalFloorArea'] = df['TotalFloorArea'].fillna(0)
    df['BuildingYear'] = df['BuildingYear'].fillna(0)
    df['Structure'] = df['Structure'].fillna('No Building')
    df['Use'] = df['Use'].fillna('Vacant Land')
    df['Renovation'] = df['Renovation'].fillna('Unknown')
    df = df.drop('Purpose', axis=1, errors='ignore')

    # Land information
    df['LandShape'] = df['LandShape'].fillna('Unknown')
    df['Direction'] = df['Direction'].fillna('Unknown')
    _fill_with_type_median(df, 'Frontage', fold_stats, 'Frontage')

    # Road information
    df['Classification'] = df['Classification'].fillna('No Road')
    df['Breadth'] = df['Breadth'].fillna(0)

    # City planning information
    df['CityPlanning'] = df['CityPlanning'].fillna('Outside City Planning')
    for col in ['CoverageRatio', 'FloorAreaRatio']:
        _fill_with_type_median(df, col, fold_stats, col)

    # Other
    df['DistrictName'] = df['DistrictName'].fillna('Unknown')
    df = df.drop('Remarks', axis=1, errors='ignore')

    # ========== Derived features ==========
    # Counts of missing fields per information group
    df['MissingBuildingInfo'] = (
        (1 - df['HasFloorPlan']) +
        (1 - df['HasBuilding']) +
        (1 - df['HasBuildingYear']) +
        (1 - df['HasStructure'])
    )

    df['MissingRoadInfo'] = (
        (1 - df['HasRoadAccess']) +
        (1 - df['HasRoadClassification']) +
        (1 - df['HasFrontage']) +
        (1 - df['HasDirection'])
    )

    # Building age; -1 marks rows whose BuildingYear was missing (filled with 0 above)
    df['BuildingAge'] = df['Year'] - df['BuildingYear']
    df.loc[df['BuildingYear'] == 0, 'BuildingAge'] = -1

    # Area-related features; division by a zero Area is mapped to 0
    df['FloorAreaRatioActual'] = df['TotalFloorArea'] / df['Area']
    df['FloorAreaRatioActual'] = df['FloorAreaRatioActual'].replace([np.inf, -np.inf], 0).fillna(0)

    df['FloorAreaRatioUsage'] = 0.0
    mask = df['FloorAreaRatio'] > 0
    df.loc[mask, 'FloorAreaRatioUsage'] = (
        df.loc[mask, 'FloorAreaRatioActual'] / df.loc[mask, 'FloorAreaRatio']
    )

    # Quarter dummies
    for q in range(1, 5):
        df[f'Quarter_Q{q}'] = (df['Quarter'] == q).astype(int)

    # ========== Target encoding (high-cardinality columns) ==========
    high_card_cols = ['NearestStation', 'DistrictName', 'Municipality']
    oof_encodings = {}

    if is_train and train_target is not None:
        if isinstance(train_target, pd.Series):
            aligned_target = train_target.reindex(df.index)
        else:
            aligned_target = pd.Series(np.asarray(train_target), index=df.index)
        global_mean = train_target.mean()
        use_oof = bool(te_inner_splits) and te_inner_splits > 1

        for col in high_card_cols:
            if col not in df.columns:
                continue
            # Category means over the whole training portion: these are what
            # validation / test rows are encoded with.
            fold_stats[f'{col}_target_enc'] = aligned_target.groupby(df[col]).mean().to_dict()
            fold_stats[f'{col}_global_mean'] = global_mean
            if use_oof:
                # The training rows themselves must not see their own target,
                # otherwise the encoded feature leaks the label.
                oof_encodings[col] = _out_of_fold_target_mean(
                    df[col], aligned_target, global_mean, te_inner_splits
                )

    # Apply the encodings
    for col in high_card_cols:
        if col in df.columns and f'{col}_target_enc' in fold_stats:
            if col in oof_encodings:
                df[f'{col}_TargetEnc'] = oof_encodings[col]
            else:
                df[f'{col}_TargetEnc'] = df[col].map(
                    fold_stats[f'{col}_target_enc']
                ).fillna(fold_stats[f'{col}_global_mean'])

    # ========== Categorical columns ==========
    # No label encoding is applied; the columns are converted to pandas
    # ``category`` dtype.
    categorical_features = [
        'Type', 'Prefecture', 'Region', 'Structure', 'LandShape',
        'Direction', 'Classification', 'Renovation', 'CityPlanning',
        'NearestStation', 'DistrictName', 'Municipality', 'Use', 'FloorPlan'
    ]

    for col in categorical_features:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df, fold_stats, categorical_features

print("特徴量エンジニアリング関数の定義完了")

## RMSLE計算関数

In [None]:
def rmsle(y_true, y_pred):
    """
    Root Mean Squared Logarithmic Error.

    Computed as the square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

def rmsle_lgb(y_pred, dtrain):
    """
    RMSLE evaluation function for LightGBM.

    Both the labels read from ``dtrain`` and the predictions are mapped through
    ``expm1`` before RMSLE is computed. Returns the tuple
    ``('rmsle', score, False)``.
    """
    actual = np.expm1(dtrain.get_label())
    # Floor the back-transformed predictions at zero before scoring.
    predicted = np.clip(np.expm1(y_pred), 0, None)
    return 'rmsle', rmsle(actual, predicted), False

print("評価関数の定義完了")

## クロスバリデーション実装

In [None]:
def train_and_predict_with_cv(train_df, test_df, target, n_splits=5, seed=42):
    """
    Train LightGBM with K-fold cross-validation and predict the test set.

    Parameters:
    -----------
    train_df : DataFrame
        Training data.
    test_df : DataFrame
        Test data.
    target : Series
        Target variable on its original scale; it is log1p-transformed here.
    n_splits : int
        Number of folds.
    seed : int
        Random seed used for the fold split and for LightGBM.

    Returns:
    --------
    oof_predictions : ndarray
        Out-of-fold predictions (log1p scale).
    test_predictions : ndarray
        Test predictions averaged over the folds (log1p scale).
    models : list
        Trained model of each fold.
    feature_importance_df : DataFrame
        Gain importance per feature and fold.
    cv_scores : list
        RMSLE of each fold.
    train_processed_df : DataFrame
        The full training data processed with the statistics of the last fold.
    test_processed_df : DataFrame
        The test data processed with the statistics of the last fold.
    """
    log_target = np.log1p(target)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Accumulators filled fold by fold.
    oof_predictions = np.zeros(len(train_df))
    test_predictions = np.zeros(len(test_df))
    models, cv_scores, importance_frames = [], [], []

    # Only populated on the final fold.
    train_processed_df, test_processed_df = None, None

    lgb_params = dict(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        min_child_samples=20,
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=seed,
        n_jobs=-1,
        verbose=-1,
    )

    print(f"\n{'='*60}")
    print(f"クロスバリデーション開始: {n_splits} Folds")
    print(f"{'='*60}\n")

    non_feature_cols = ['Id', 'TradePrice']

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(train_df), start=1):
        print(f"\n--- Fold {fold}/{n_splits} ---")

        fit_labels = log_target.iloc[fit_rows]
        holdout_labels = log_target.iloc[holdout_rows]

        # Statistics are fitted on the training part of the fold and then
        # reused unchanged for the held-out part.
        fit_frame, fold_stats, cat_features = create_features(
            train_df.iloc[fit_rows].copy(),
            is_train=True,
            train_target=fit_labels,
        )
        holdout_frame, _, _ = create_features(
            train_df.iloc[holdout_rows].copy(),
            is_train=False,
            fold_stats=fold_stats,
        )

        fit_frame = fit_frame.drop(non_feature_cols, axis=1, errors='ignore')
        holdout_frame = holdout_frame.drop(non_feature_cols, axis=1, errors='ignore')

        # Restrict both frames to the columns they share.
        common_cols = fit_frame.columns.intersection(holdout_frame.columns)
        fit_frame = fit_frame[common_cols]
        holdout_frame = holdout_frame[common_cols]

        # Positional indices of the categorical columns.
        cat_positions = []
        for name in cat_features:
            if name in fit_frame.columns:
                cat_positions.append(fit_frame.columns.get_loc(name))

        fit_set = lgb.Dataset(
            fit_frame,
            fit_labels,
            categorical_feature=cat_positions,
        )
        holdout_set = lgb.Dataset(
            holdout_frame,
            holdout_labels,
            categorical_feature=cat_positions,
            reference=fit_set,
        )

        model = lgb.train(
            lgb_params,
            fit_set,
            num_boost_round=1000,
            valid_sets=[fit_set, holdout_set],
            valid_names=['train', 'valid'],
            feval=rmsle_lgb,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=100),
            ],
        )

        # Out-of-fold predictions are stored on the log scale.
        holdout_pred_log = model.predict(holdout_frame, num_iteration=model.best_iteration)
        oof_predictions[holdout_rows] = holdout_pred_log

        # Score the fold on the original scale.
        fold_score = rmsle(
            np.expm1(holdout_labels),
            np.maximum(np.expm1(holdout_pred_log), 0),
        )
        cv_scores.append(fold_score)
        print(f"Fold {fold} RMSLE: {fold_score:.6f}")

        # Test predictions use this fold's statistics and model.
        X_test_processed, _, _ = create_features(
            test_df,
            is_train=False,
            fold_stats=fold_stats,
        )
        X_test_processed = X_test_processed.drop(['Id'], axis=1, errors='ignore')[common_cols]

        test_predictions += (
            model.predict(X_test_processed, num_iteration=model.best_iteration) / n_splits
        )

        models.append(model)
        importance_frames.append(pd.DataFrame({
            'feature': fit_frame.columns,
            'importance': model.feature_importance(importance_type='gain'),
            'fold': fold,
        }))

        if fold != n_splits:
            continue

        # Final fold: keep processed copies of the full train and test data,
        # both built with this fold's statistics.
        full_frame, _, _ = create_features(
            train_df,
            is_train=False,
            fold_stats=fold_stats,
        )
        train_processed_df = full_frame.drop(
            ['Id', 'TradePrice'], axis=1, errors='ignore'
        )[common_cols]
        test_processed_df = X_test_processed.copy()

    # Overall score across all out-of-fold predictions.
    overall_score = rmsle(target, np.maximum(np.expm1(oof_predictions), 0))

    print(f"\n{'='*60}")
    print(f"クロスバリデーション結果")
    print(f"{'='*60}")
    print(f"各Foldのスコア: {[f'{s:.6f}' for s in cv_scores]}")
    print(f"平均スコア: {np.mean(cv_scores):.6f} (+/- {np.std(cv_scores):.6f})")
    print(f"Overall OOF Score: {overall_score:.6f}")
    print(f"{'='*60}\n")

    feature_importance_df = pd.concat(importance_frames, axis=0)

    return (
        oof_predictions,
        test_predictions,
        models,
        feature_importance_df,
        cv_scores,
        train_processed_df,
        test_processed_df,
    )

print("クロスバリデーション関数の定義完了")

## モデルの学習と予測

In [None]:
# Run the cross-validated training, then unpack its seven results.
cv_outputs = train_and_predict_with_cv(train, test, target, n_splits=5, seed=42)
(
    oof_preds,
    test_preds,
    trained_models,
    feature_importance,
    cv_scores,
    train_processed,
    test_processed,
) = cv_outputs

## 処理済みデータの保存

In [None]:
import os

# Report the shapes of the processed frames.
print(f"訓練データ処理済み: {train_processed.shape}")
print(f"テストデータ処理済み: {test_processed.shape}")

# Write both frames as CSV, creating the output directory if needed.
output_dir = "../output"
os.makedirs(output_dir, exist_ok=True)

for processed_frame, csv_path in (
    (train_processed, f"{output_dir}/train_features_processed.csv"),
    (test_processed, f"{output_dir}/test_features_processed.csv"),
):
    processed_frame.to_csv(csv_path, index=False)

print(f"\n処理済みデータを保存しました:")
print(f"- {output_dir}/train_features_processed.csv")
print(f"- {output_dir}/test_features_processed.csv")

## 特徴量重要度の可視化

In [None]:
# Mean and standard deviation of the importance across folds, highest mean first.
importance_grouped = (
    feature_importance
    .groupby('feature')['importance']
    .agg(['mean', 'std'])
    .reset_index()
    .sort_values('mean', ascending=False)
)

# Plot the top features as a horizontal bar chart with std error bars.
top_n = 30
top_features = importance_grouped.head(top_n)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(bar_positions, top_features['mean'], xerr=top_features['std'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance (Gain)')
ax.set_title(f'Top {top_n} Feature Importance')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
fig.savefig('../output/feature_importance_lightgbm.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n=== Top 30 Feature Importance ===")
print(top_features.to_string(index=False))

## 予測結果の分析

In [None]:
# Bring the out-of-fold predictions back to the original scale, floored at 0.
oof_preds_original = np.clip(np.expm1(oof_preds), 0, None)
residuals = target - oof_preds_original

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
scatter_ax, residual_ax = axes

# Left panel: actual vs predicted on log-log axes with the identity line.
price_range = [target.min(), target.max()]
scatter_ax.scatter(target, oof_preds_original, alpha=0.3, s=1)
scatter_ax.plot(price_range, price_range, 'r--', lw=2)
scatter_ax.set_xlabel('Actual Price (Yen)')
scatter_ax.set_ylabel('Predicted Price (Yen)')
scatter_ax.set_title('Actual vs Predicted (OOF)')
scatter_ax.set_xscale('log')
scatter_ax.set_yscale('log')

# Right panel: residuals against the predicted value.
residual_ax.scatter(oof_preds_original, residuals, alpha=0.3, s=1)
residual_ax.axhline(y=0, color='r', linestyle='--', lw=2)
residual_ax.set_xlabel('Predicted Price (Yen)')
residual_ax.set_ylabel('Residuals (Yen)')
residual_ax.set_title('Residual Plot')
residual_ax.set_xscale('log')

plt.tight_layout()
plt.savefig('../output/prediction_analysis_lightgbm.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics of the predictions.
print("\n=== OOF Prediction Statistics ===")
print(pd.Series(oof_preds_original).describe())

## サブミッションファイルの作成

In [None]:
# Bring the test predictions back to the original scale, floored at 0.
test_preds_original = np.clip(np.expm1(test_preds), 0, None)

# Assemble the submission table and write it to disk.
submission = pd.DataFrame({
    'Id': test_ids,
    'TradePrice': test_preds_original
})
submission.to_csv('../output/submission_lightgbm.csv', index=False)

print("\n=== Submission File Created ===")
print(f"Shape: {submission.shape}")
print(f"\nSample:")
print(submission.head(10))
print(f"\nPrediction Statistics:")
print(submission['TradePrice'].describe())
print(f"\nFile saved: ../output/submission_lightgbm.csv")

## モデルパフォーマンスのサマリー

In [None]:
# Bar chart of the per-fold scores with the mean drawn as a dashed line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(range(1, len(cv_scores) + 1), cv_scores, alpha=0.7, color='steelblue')
ax.axhline(y=np.mean(cv_scores), color='red', linestyle='--', label=f'Mean: {np.mean(cv_scores):.6f}')
ax.set_xlabel('Fold')
ax.set_ylabel('RMSLE')
ax.set_title('Cross-Validation Scores by Fold')
ax.legend()
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('../output/cv_scores_lightgbm.png', dpi=300, bbox_inches='tight')
plt.show()

# Text summary of the run.
print("\n" + "="*60)
print("モデルパフォーマンスサマリー")
print("="*60)
print(f"モデル: LightGBM")
print(f"特徴量数: {len(importance_grouped)}")
print(f"CVフォールド数: {len(cv_scores)}")
print(f"\nクロスバリデーションスコア:")
for i, score in enumerate(cv_scores, 1):
    print(f"  Fold {i}: {score:.6f}")
print(f"\n平均RMSLE: {np.mean(cv_scores):.6f}")
print(f"標準偏差: {np.std(cv_scores):.6f}")
print(f"Overall OOF RMSLE: {rmsle(target, oof_preds_original):.6f}")
print("="*60)