# 顔特徴点検出データの探索 / Facial Keypoints Data Exploration

このノートブックでは、Kaggle顔特徴点検出データセットの探索的データ分析を行います。

This notebook performs exploratory data analysis on the Kaggle facial keypoints detection dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path for imports
sys.path.append(os.path.join('..', 'src'))

from data.preprocessing import DataPreprocessor
from utils.visualization import (
    plot_keypoints_on_image, 
    visualize_data_distribution,
    KEYPOINT_NAMES
)

# Plot styling for the rest of the notebook: select the matplotlib style sheet
# named 'seaborn-v0_8', then make "husl" seaborn's default color palette.
# NOTE(review): the 'seaborn-v0_8' style name is presumably only registered in
# matplotlib >= 3.6 — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

%matplotlib inline

## 1. データ読み込み / Data Loading

In [None]:
# Read the Kaggle training CSV into `df`.
# The file must be downloaded beforehand; the path below is relative, so
# change it if the CSV is stored somewhere else.
data_path = "../training.csv"  # Adjust path as needed

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # A missing file is reported instead of raised; `df` is set to None so
    # that the following cells can detect it and skip their work.
    print("データファイルが見つかりません / Data file not found")
    print("Please download the Kaggle facial keypoints detection data")
    df = None
else:
    # Reached only when the read succeeded.
    print("データが正常に読み込まれました / Data loaded successfully")
    print(f"Shape: {df.shape}")

## 2. データ基本情報 / Basic Data Information

In [None]:
if df is not None:
    # Headline figures for the loaded frame: row count, column count and
    # whether a column literally named 'Image' is present.
    n_rows = len(df)
    n_cols = len(df.columns)
    has_image_column = 'Image' in df.columns

    print("データセット基本情報 / Dataset Basic Information")
    print("=" * 50)
    print(f"サンプル数 / Number of samples: {n_rows}")
    print(f"列数 / Number of columns: {n_cols}")
    print(f"画像列 / Image column: {has_image_column}")

    # Every column other than 'Image' is counted as a keypoint column.
    keypoint_cols = [name for name in df.columns if name != 'Image']
    print(f"特徴点列数 / Keypoint columns: {len(keypoint_cols)}")

    # Rich (notebook) rendering of the first five rows.
    print("\n最初の5行 / First 5 rows:")
    display(df.head(5))

## 3. 欠損値分析 / Missing Value Analysis

In [None]:
if df is not None:
    # Null cells per column, as an absolute count and as a percentage of all
    # rows in the frame.
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100

    # One table row per column; keep only columns that have at least one
    # null and list the worst-affected columns first.
    missing_df = (
        pd.DataFrame({
            'Missing Count': missing_values,
            'Missing Percentage': missing_percentage,
        })
        .loc[lambda table: table['Missing Count'] > 0]
        .sort_values('Missing Percentage', ascending=False)
    )

    print("欠損値分析 / Missing Value Analysis")
    print("=" * 50)
    display(missing_df)

    # Bar chart of the same table; skipped when no column has nulls.
    if not missing_df.empty:
        bar_positions = range(len(missing_df))

        plt.figure(figsize=(12, 6))
        plt.bar(bar_positions, missing_df['Missing Percentage'])
        plt.title('Missing Values by Keypoint Feature')
        plt.xlabel('Keypoint Features')
        plt.ylabel('Missing Percentage (%)')
        # Label each bar with its column name, slanted to avoid overlap.
        plt.xticks(bar_positions, missing_df.index, rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

## 4. 画像データ分析 / Image Data Analysis

In [None]:
if df is not None:
    # The first row's 'Image' cell is a whitespace-separated string of pixel
    # values; parse it into a flat float32 array.
    sample_image_str = df['Image'].iloc[0]
    sample_pixels = np.array(sample_image_str.split(), dtype=np.float32)

    # Side length under the assumption that the image is square; the square
    # root is truncated to an int.
    pixel_count = len(sample_pixels)
    side = int(np.sqrt(pixel_count))

    print("画像データ分析 / Image Data Analysis")
    print("=" * 50)
    print(f"画像ピクセル数 / Image pixels: {pixel_count}")
    print(f"想定画像サイズ / Expected image size: {side}x{side}")
    print(f"ピクセル値範囲 / Pixel value range: {sample_pixels.min():.1f} - {sample_pixels.max():.1f}")
    print(f"ピクセル値平均 / Pixel value mean: {sample_pixels.mean():.1f}")
    print(f"ピクセル値標準偏差 / Pixel value std: {sample_pixels.std():.1f}")

## 5. サンプル画像の可視化 / Sample Image Visualization

In [None]:
if df is not None:
    # Show up to eight images from the start of the frame on a fixed 2x4 grid.
    num_samples = min(8, len(df))
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()

    for i, ax in enumerate(axes):
        # Switch the frame and ticks off for every panel, including the ones
        # that stay unused when the frame has fewer than eight rows;
        # otherwise those panels would be drawn as empty boxes.
        ax.axis('off')
        if i >= num_samples:
            continue

        # Parse the whitespace-separated pixel string and lay it out as a
        # 96x96 image (reshape raises if the pixel count is not 96 * 96).
        image_str = df['Image'].iloc[i]
        image = np.array(image_str.split(), dtype=np.float32).reshape(96, 96)

        ax.imshow(image, cmap='gray')
        ax.set_title(f'Sample {i+1}')

    plt.suptitle('サンプル画像 / Sample Images', fontsize=16)
    plt.tight_layout()
    plt.show()

## 6. 特徴点データ分析 / Keypoint Data Analysis

In [None]:
if df is not None:
    # Statistics are produced by the project's DataPreprocessor, which is
    # handed the CSV path (not the frame already loaded in `df`).
    stats = DataPreprocessor.analyze_dataset(data_path)

    print("特徴点データ統計 / Keypoint Data Statistics")
    print("=" * 50)
    print(f"全特徴点を持つサンプル / Samples with all keypoints: {stats['samples_with_all_keypoints']}")
    print(f"欠損データを持つサンプル / Samples with missing data: {stats['samples_with_missing_data']}")

    # List only the first ten entries of the per-keypoint statistics to keep
    # the output short. Each entry maps a name to its mean/std/min/max.
    print("\n特徴点座標統計 / Keypoint Coordinate Statistics:")
    first_entries = list(stats['keypoint_statistics'].items())[:10]
    for name, stat in first_entries:
        print(f"{name}: mean={stat['mean']:.2f}, std={stat['std']:.2f}, range=[{stat['min']:.1f}, {stat['max']:.1f}]")

## 7. 特徴点付き画像の可視化 / Keypoint Visualization

In [None]:
if df is not None:
    # Keep only the rows in which no column is null.
    complete_samples = df.dropna()

    if len(complete_samples) > 0:
        print(f"完全なデータを持つサンプル数 / Samples with complete data: {len(complete_samples)}")

        # The coordinate columns are the same for every row, so build the
        # list once instead of once per plotted sample.
        keypoint_cols = [col for col in df.columns if col != 'Image']

        # One panel per sample, at most four, in a single row.
        num_samples = min(4, len(complete_samples))
        fig, axes = plt.subplots(1, num_samples, figsize=(16, 4))
        if num_samples == 1:
            # With a single panel, subplots returns a bare Axes rather than
            # an array; wrap it so the loop below can iterate uniformly.
            axes = [axes]

        for i, ax in enumerate(axes):
            row = complete_samples.iloc[i]

            # Parse the whitespace-separated pixel string into a 96x96 image.
            image_str = row['Image']
            image = np.array(image_str.split(), dtype=np.float32).reshape(96, 96)

            # The row also carries the image string, so its values come back
            # as generic Python objects; cast the coordinates to float before
            # grouping consecutive columns into pairs.
            keypoints = row[keypoint_cols].values.astype(float).reshape(-1, 2)

            # First value of each pair is plotted horizontally, the second
            # vertically, on top of the image.
            ax.imshow(image, cmap='gray')
            ax.scatter(keypoints[:, 0], keypoints[:, 1], c='red', s=20, alpha=0.8)
            ax.set_title(f'Sample {i+1} with Keypoints')
            ax.axis('off')

        plt.suptitle('特徴点付きサンプル画像 / Sample Images with Keypoints', fontsize=14)
        plt.tight_layout()
        plt.show()
    else:
        print("完全なデータを持つサンプルがありません / No samples with complete data found")

## 8. 特徴点分布の可視化 / Keypoint Distribution Visualization

In [None]:
if df is not None:
    # Everything except the 'Image' column is handed over as keypoint data,
    # as a plain array, together with the matching column names.
    keypoint_cols = [name for name in df.columns if name != 'Image']
    keypoint_data = df[keypoint_cols].values

    # Plotting is delegated to the project's visualization helper;
    # save_path=None is passed through unchanged.
    fig = visualize_data_distribution(keypoint_data, keypoint_names=keypoint_cols, save_path=None)
    plt.show()

## 9. データ分割の検証 / Data Split Validation

In [None]:
if df is not None:
    # Run the project's split helper on the CSV and report the size of each
    # part. Any failure along the way is printed rather than raised.
    try:
        train_df, val_df, test_df = DataPreprocessor.split_data(
            data_path, val_size=0.2, test_size=0.1, random_state=42
        )

        print("データ分割結果 / Data Split Results")
        print("=" * 50)

        # Shares are expressed relative to the row count of the frame loaded
        # in this notebook.
        total_rows = len(df)
        report_rows = (
            ("訓練データ / Training data", train_df),
            ("検証データ / Validation data", val_df),
            ("テストデータ / Test data", test_df),
        )
        for caption, part in report_rows:
            print(f"{caption}: {len(part)} samples ({len(part)/total_rows*100:.1f}%)")
        print(f"合計 / Total: {len(train_df) + len(val_df) + len(test_df)} samples")

        # Pie chart of the three part sizes.
        labels = ['Train', 'Validation', 'Test']
        sizes = [len(part) for part in (train_df, val_df, test_df)]
        colors = ['lightblue', 'lightgreen', 'lightcoral']

        plt.figure(figsize=(8, 6))
        plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
        plt.title('データ分割 / Data Split Distribution')
        # Equal aspect ratio so the pie is drawn as a circle.
        plt.axis('equal')
        plt.show()

    except Exception as e:
        print(f"データ分割エラー / Data split error: {e}")

## 10. まとめ / Summary

このノートブックでは以下の分析を行いました：

This notebook performed the following analyses:

1. **データの基本情報 / Basic data information**
2. **欠損値の分析 / Missing value analysis**
3. **画像データの特性 / Image data characteristics**
4. **特徴点データの統計 / Keypoint data statistics**
5. **サンプル画像の可視化 / Sample image visualization**
6. **特徴点分布の分析 / Keypoint distribution analysis**
7. **データ分割の検証 / Data split validation**

次のステップとして、モデル訓練用のデータローダーを作成し、実際の訓練を開始することができます。

As next steps, you can create data loaders for model training and start the actual training process.