In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from scipy import stats
from scipy.stats import norm

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
# `df` is the training set; `test` is the set to predict on.
df, test = (
    pd.read_csv("../input/estyle-community-competition-2025/train.csv"),
    pd.read_csv("../input/estyle-community-competition-2025/test.csv"),
)

In [None]:
# Basic overview of the data: frame shapes, column dtypes, and a per-column
# missing-value summary (`missing_df`) that later cells iterate over.
print("=== Train Data Shape ===")
print(f"Train shape: {df.shape}")
print(f"Test shape: {test.shape}")
print("\n=== Train Data Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print("\n=== Missing Values Summary ===")
# Count the missing values once and reuse the result for both summary columns.
missing_counts = df.isnull().sum()
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing_counts,
    'Missing_Percentage': (missing_counts / len(df) * 100).round(2),
    'Dtype': df.dtypes
})
# Keep only the columns that actually contain missing values, worst first.
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)
print(missing_df)

In [None]:
# For every object-dtype (categorical) column, report its cardinality, its five
# most frequent values, and up to ten distinct non-null sample values.
categorical_cols = df.select_dtypes(include=['object']).columns
print("=== Categorical Variables Analysis ===\n")
for col in categorical_cols:
    column_values = df[col]
    unique_count = column_values.nunique()
    top_values = column_values.value_counts().head(5)
    sample_values = column_values.dropna().unique()[:10].tolist()
    # A single print with a newline separator emits the same four-part report.
    print(
        f"\n{col}:",
        f"  Unique values: {unique_count}",
        f"  Top 5 values:\n{top_values}",
        f"  Sample values: {sample_values}",
        sep="\n",
    )

In [None]:
# Descriptive statistics for the int64 / float64 columns.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("=== Numeric Variables Statistics ===\n")
numeric_summary = df[numeric_cols].describe()
print(numeric_summary)

# Shape of the target distribution: skewness and kurtosis of the raw
# TradePrice, plus the skewness after a log1p transform for comparison.
trade_price = df['TradePrice']
print("\n=== Target Variable (TradePrice) Distribution ===")
print(f"Skewness: {trade_price.skew():.4f}")
print(f"Kurtosis: {trade_price.kurtosis():.4f}")
print(f"Log-transformed Skewness: {np.log1p(trade_price).skew():.4f}")

In [None]:
# Analyze missing-value patterns.
# 1. Missing rate per `Type` group, for the ten columns at the top of
#    `missing_df` (which is sorted by missing percentage, highest first).
print("=== Missing Values Pattern by Type ===\n")
grouped_by_type = df.groupby('Type')
for col in missing_df['Column'].iloc[:10]:
    print(f"\n{col} missing rate by Type:")
    # Percentage of null entries within each Type group.
    type_missing = grouped_by_type[col].apply(
        lambda group: group.isna().sum() / group.size * 100
    )
    print(type_missing.sort_values(ascending=False))


In [None]:
# Analyze how missingness in a feature relates to the target price.
print("=== Correlation between Missing Values and Target ===\n")

# For the ten columns at the top of `missing_df`, build a 0/1 missing-value
# flag and compare TradePrice between rows with and without a value.
for col in missing_df['Column'].head(10):
    if col == 'TradePrice':
        # The target itself is skipped.
        continue

    is_missing = df[col].isnull()
    missing_flag = is_missing.astype(int)

    # Pearson correlation between the 0/1 flag and the price. The original cell
    # created the flag but never used it, so the correlation announced in the
    # heading was never reported. Result is NaN if the flag is constant.
    flag_price_corr = missing_flag.corr(df['TradePrice'])

    # Mean TradePrice for rows where the feature is present vs. missing.
    price_with_value = df.loc[~is_missing, 'TradePrice'].mean()
    price_without_value = df.loc[is_missing, 'TradePrice'].mean()
    # Relative difference; falls back to 0 when the baseline mean is not positive
    # (or is NaN), which avoids dividing by zero.
    diff_pct = ((price_with_value - price_without_value) / price_without_value * 100) if price_without_value > 0 else 0

    print(f"{col}:")
    print(f"  Avg price with value: ¥{price_with_value:,.0f}")
    print(f"  Avg price without value: ¥{price_without_value:,.0f}")
    print(f"  Correlation (missing flag vs TradePrice): {flag_price_corr:+.4f}")
    print(f"  Difference: {diff_pct:+.1f}%\n")

In [None]:
# Compare missing-value rates between train and test for the columns they share.
print("=== Missing Values Comparison: Train vs Test ===\n")

# Columns present in both frames, in train column order.
shared_cols = [name for name in df.columns if name in test.columns]

# Percentage of missing values per shared column in each frame.
train_missing_pct = df[shared_cols].isnull().sum() / len(df) * 100
test_missing_pct = test[shared_cols].isnull().sum() / len(test) * 100

comparison_data = [
    {
        'Column': name,
        'Train_Missing_%': train_missing_pct[name],
        'Test_Missing_%': test_missing_pct[name],
        'Difference': test_missing_pct[name] - train_missing_pct[name],
    }
    for name in shared_cols
]

comparison_df = pd.DataFrame(comparison_data)
# Keep columns with missing values in at least one of the two frames,
# ordered by train missing rate (highest first).
has_missing = (comparison_df['Train_Missing_%'] > 0) | (comparison_df['Test_Missing_%'] > 0)
comparison_df = comparison_df.loc[has_missing].sort_values('Train_Missing_%', ascending=False)
print(comparison_df.head(20))

In [None]:
# Detailed per-column profile of the categorical variables, gathered to inform
# missing-value handling.
print("=== Categorical Variables Detailed Info ===\n")
categorical_info = []
for col in categorical_cols:
    column_values = df[col]
    modes = column_values.mode()
    non_null_count = column_values.count()

    # Most frequent value, or None when the column has no mode at all.
    top_value = modes[0] if len(modes) > 0 else None

    # Share of non-null rows taken by the most frequent value.
    if non_null_count > 0:
        top_value_freq = (column_values.value_counts().iloc[0] / non_null_count * 100).round(2)
    else:
        top_value_freq = 0

    categorical_info.append({
        'Column': col,
        'Unique_Count': column_values.nunique(),
        'Missing_Rate_%': (column_values.isnull().sum() / len(df) * 100).round(2),
        'Top_Value': top_value,
        'Top_Value_Freq_%': top_value_freq,
        'Is_Ordinal': 'Unknown',  # has to be decided manually
    })

cat_info_df = pd.DataFrame(categorical_info)
print(cat_info_df.sort_values('Missing_Rate_%', ascending=False))