# NASA Space Apps Challenge 2025: Exoplanet Detection EDA

## Challenge: A World Away – Hunting for Exoplanets with AI

This notebook performs comprehensive exploratory data analysis on three major NASA exoplanet datasets:
- **Kepler Objects of Interest (KOI)**: Discoveries from the Kepler Space Telescope
- **K2 Planets and Candidates**: Observations from K2, the Kepler telescope's extended mission
- **TESS Objects of Interest (TOI)**: Transiting Exoplanet Survey Satellite data

Our goal is to build an AI system that can identify and classify potential exoplanets from stellar observations.

In [None]:
# Import Libraries and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Visualization settings
# Reset matplotlib to its default style, then switch seaborn's colour cycle to "husl".
plt.style.use('default')
sns.set_palette("husl")
# NOTE: this silences every warning for the rest of the session, including
# pandas/numpy warnings that might point at real data problems.
warnings.filterwarnings('ignore')

# Display settings
# None removes the cap on displayed columns, so wide tables are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("📊 Libraries imported successfully!")
print("🚀 Ready for NASA Space Apps Challenge EDA!")

## 1. Load Datasets

Let's load our three NASA exoplanet datasets and examine their basic structure.

In [None]:
# Load the three NASA exoplanet tables (Kepler KOI, K2, TESS TOI) from disk
data_path = Path("../data/raw")

# Options shared by every read: lines starting with '#' are skipped as
# comments, and low_memory=False disables chunked parsing of each file.
csv_options = {"comment": "#", "low_memory": False}

try:
    kepler_df = pd.read_csv(data_path / "kepler.csv", **csv_options)
    print("✅ Kepler dataset loaded successfully")

    k2_df = pd.read_csv(data_path / "k2.csv", **csv_options)
    print("✅ K2 dataset loaded successfully")

    tess_df = pd.read_csv(data_path / "tess.csv", **csv_options)
    print("✅ TESS dataset loaded successfully")
except FileNotFoundError as e:
    # A missing file is reported, not raised; frames loaded before the
    # failure stay defined, later ones do not.
    print(f"❌ Error loading datasets: {e}")
    print("Please ensure the datasets are in '../data/raw/' directory")
else:
    # Only reached when all three files were read.
    print("\n📊 Dataset Summary:")
    loaded = (("Kepler", kepler_df), ("K2", k2_df), ("TESS", tess_df))
    for label, frame in loaded:
        n_objects, n_features = frame.shape
        print(f"   {label}: {n_objects:,} objects, {n_features} features")
    total_objects = sum(frame.shape[0] for _, frame in loaded)
    print(f"   Total: {total_objects:,} exoplanet candidates")

## 2. Dataset Overview and Shape Analysis

Let's examine the structure of each dataset: its shape, column names, and first few rows.

In [None]:
# Kepler Dataset Overview
# Prints the shape and the full column list, then leaves the first rows as the
# cell's last expression so the notebook renders them as a table.
print("🔭 KEPLER OBJECTS OF INTEREST (KOI)")
print("=" * 50)
print(f"Shape: {kepler_df.shape}")
print(f"Columns: {list(kepler_df.columns)}")
print("\nFirst few rows:")
kepler_df.head()

In [None]:
# K2 Dataset Overview
# Same layout as the Kepler cell, except that only the first 20 column names
# are printed (the trailing "..." marks the truncation).
print("🛰️ K2 PLANETS AND CANDIDATES")
print("=" * 50)
print(f"Shape: {k2_df.shape}")
print(f"Columns: {list(k2_df.columns[:20])}...")  # Show first 20 columns
print("\nFirst few rows:")
k2_df.head()

In [None]:
# TESS Dataset Overview
# Same layout as the K2 cell: shape, the first 20 column names, then the first
# rows rendered as the cell's output.
print("🌟 TESS OBJECTS OF INTEREST (TOI)")
print("=" * 50)
print(f"Shape: {tess_df.shape}")
print(f"Columns: {list(tess_df.columns[:20])}...")  # Show first 20 columns
print("\nFirst few rows:")
tess_df.head()

## 3. Missing Values Analysis

Understanding the completeness of our data across all three datasets.

In [None]:
def analyze_missing_values(df, dataset_name):
    """Report per-column missingness for one dataset.

    Prints how many columns are complete or incomplete, followed by the ten
    columns with the highest share of missing values.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the printed header.

    Returns:
        DataFrame with columns ``Column``, ``Missing_Count`` and
        ``Missing_Percentage``, holding only columns that have at least one
        missing value, ordered by descending percentage.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    summary = pd.DataFrame({
        'Column': null_counts.index,
        'Missing_Count': null_counts.values,
        'Missing_Percentage': null_share.values,
    })
    summary = summary.sort_values('Missing_Percentage', ascending=False)

    # Fully populated columns carry no information for this report.
    missing_df = summary.loc[summary['Missing_Count'] > 0]

    n_columns = len(df.columns)
    n_incomplete = len(missing_df)

    print(f"\n📊 {dataset_name} - Missing Values Summary")
    print(f"Total columns: {n_columns}")
    print(f"Columns with missing values: {n_incomplete}")
    print(f"Complete columns: {n_columns - n_incomplete}")

    if n_incomplete == 0:
        print("🎉 No missing values found!")
    else:
        print("\nTop 10 columns with most missing values:")
        print(missing_df.head(10).to_string(index=False))

    return missing_df

# Analyze missing values for each dataset
# Each call prints its report and returns the table of incomplete columns,
# kept here for later inspection.
kepler_missing = analyze_missing_values(kepler_df, "KEPLER")
k2_missing = analyze_missing_values(k2_df, "K2")
tess_missing = analyze_missing_values(tess_df, "TESS")

In [None]:
# Visualize missing values with heatmaps: one stacked panel per dataset
fig, axes = plt.subplots(3, 1, figsize=(15, 12))

panel_titles = [
    "Kepler Objects of Interest",
    "K2 Planets and Candidates",
    "TESS Objects of Interest",
]
# (frame, title, axis) triples in the same top-to-bottom order as the panels
datasets = list(zip([kepler_df, k2_df, tess_df], panel_titles, axes))

for df, title, ax in datasets:
    # Restrict the plot to the first 20 numeric columns so the labels stay legible
    sample_cols = df.select_dtypes(include=[np.number]).columns[:20]
    missing_matrix = df[sample_cols].isnull()

    # Transposed so features run down the y-axis and samples along the x-axis
    sns.heatmap(
        missing_matrix.T,
        ax=ax,
        cmap='viridis',
        cbar=True,
        xticklabels=False,
        yticklabels=True,
    )
    ax.set_xlabel('Samples')
    ax.set_ylabel('Features')
    ax.set_title(f'{title} - Missing Values Pattern (First 20 Numeric Columns)')

plt.tight_layout()
plt.show()

## 4. Class Distribution Analysis

Analyzing the target class distributions across all three datasets to understand the balance of confirmed planets, candidates, and false positives.

In [None]:
# Identify target columns for each dataset
def find_target_columns(df):
    """Return the columns whose name suggests a disposition/classification label.

    A column qualifies when its lower-cased name contains any of the
    keywords below. Matches are returned in the DataFrame's column order.
    """
    keywords = ('disposition', 'disp', 'status', 'classification')
    return [
        col
        for col in df.columns
        if any(keyword in col.lower() for keyword in keywords)
    ]

# Find target columns
# Name-based candidates only; the class-distribution cell further down prefers
# a known column name and uses these lists only as a fallback.
kepler_targets = find_target_columns(kepler_df)
k2_targets = find_target_columns(k2_df)
tess_targets = find_target_columns(tess_df)

print("🎯 Target Columns Found:")
print(f"Kepler: {kepler_targets}")
print(f"K2: {k2_targets}")
print(f"TESS: {tess_targets}")

# Analyze class distributions
def analyze_class_distribution(df, target_col, dataset_name):
    """Print and return the class frequencies of one label column.

    Args:
        df: DataFrame holding the label column.
        target_col: Name of the column to tabulate.
        dataset_name: Label used in the printed output.

    Returns:
        The ``value_counts()`` Series for ``target_col`` (missing labels are
        not counted), or ``None`` when the column does not exist.
    """
    if target_col not in df.columns:
        print(f"❌ Column '{target_col}' not found in {dataset_name}")
        return None

    labels = df[target_col]
    class_counts = labels.value_counts()
    class_shares = labels.value_counts(normalize=True) * 100

    print(f"\n📊 {dataset_name} - Class Distribution ({target_col}):")
    for class_name, count in class_counts.items():
        share = class_shares[class_name]
        print(f"   {class_name}: {count:,} ({share:.1f}%)")

    return class_counts

# Try to identify the main target columns.
#
# The three results are pre-seeded with None so that they are always defined,
# even when the broad `except` below fires part-way through. The plotting and
# summary code that follows already treats None as "no class data", so a
# failure here degrades to an empty panel instead of a NameError.
kepler_classes = k2_classes = tess_classes = None

try:
    # For each dataset: prefer the known disposition column, otherwise fall
    # back to the first name-based candidate found earlier.
    if 'koi_disposition' in kepler_df.columns:
        kepler_classes = analyze_class_distribution(kepler_df, 'koi_disposition', 'KEPLER')
    elif len(kepler_targets) > 0:
        kepler_classes = analyze_class_distribution(kepler_df, kepler_targets[0], 'KEPLER')
    else:
        print("❌ No disposition column found in Kepler dataset")
        kepler_classes = None

    if 'disposition' in k2_df.columns:
        k2_classes = analyze_class_distribution(k2_df, 'disposition', 'K2')
    elif len(k2_targets) > 0:
        k2_classes = analyze_class_distribution(k2_df, k2_targets[0], 'K2')
    else:
        print("❌ No disposition column found in K2 dataset")
        k2_classes = None

    if 'tfopwg_disp' in tess_df.columns:
        tess_classes = analyze_class_distribution(tess_df, 'tfopwg_disp', 'TESS')
    elif len(tess_targets) > 0:
        tess_classes = analyze_class_distribution(tess_df, tess_targets[0], 'TESS')
    else:
        print("❌ No disposition column found in TESS dataset")
        tess_classes = None

except Exception as e:
    # Best-effort cell: report the problem and carry on with whatever was computed.
    print(f"❌ Error analyzing class distributions: {e}")

In [None]:
# Visualize class distributions: one pie chart per dataset, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('🎯 Class Distribution Across NASA Exoplanet Datasets', fontsize=16, fontweight='bold')

# (class counts or None, short name, long name) for each panel, left to right
datasets_info = [
    (kepler_classes, 'KEPLER', 'Kepler Objects of Interest'),
    (k2_classes, 'K2', 'K2 Planets and Candidates'),
    (tess_classes, 'TESS', 'TESS Objects of Interest'),
]

for ax, (class_data, dataset_name, full_name) in zip(axes, datasets_info):
    has_classes = class_data is not None and len(class_data) > 0

    if not has_classes:
        # Nothing to chart: show a placeholder message instead
        ax.text(0.5, 0.5, f'No class data\navailable for\n{dataset_name}',
                ha='center', va='center', fontsize=12)
    else:
        slice_colors = plt.cm.Set3(range(len(class_data)))
        _, _, pct_labels = ax.pie(
            class_data.values,
            labels=class_data.index,
            autopct='%1.1f%%',
            startangle=90,
            colors=slice_colors,
        )
        # Make the percentage labels inside the slices easier to read
        for pct_label in pct_labels:
            pct_label.set_color('black')
            pct_label.set_fontweight('bold')
            pct_label.set_fontsize(10)

    # Both cases carry the same title
    ax.set_title(f'{dataset_name}\n({full_name})', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Summary statistics
print("\n📋 Class Distribution Summary:")

# TESS stores its TFOPWG disposition as short codes rather than words, so plain
# keyword matching never recognises them and TESS would be missing from the
# totals. Code meanings follow the NASA Exoplanet Archive TOI column
# definitions: CP = confirmed planet, KP = known planet, PC = planetary
# candidate, APC = ambiguous planetary candidate, FP = false positive,
# FA = false alarm (counted with the false positives here).
_TESS_CODE_BUCKETS = {
    'cp': 'confirmed',
    'kp': 'confirmed',
    'pc': 'candidate',
    'apc': 'candidate',
    'fp': 'false_positive',
    'fa': 'false_positive',
}


def _disposition_bucket(class_name):
    """Map a raw disposition label to 'confirmed', 'candidate', 'false_positive' or None."""
    label = str(class_name).strip().lower()
    if label in _TESS_CODE_BUCKETS:
        return _TESS_CODE_BUCKETS[label]
    if 'false' in label or 'positive' in label:
        return 'false_positive'
    # 'candidate' is tested before 'planet' so that a label such as
    # "planetary candidate" is not counted as a confirmed planet.
    if 'candidate' in label:
        return 'candidate'
    if 'confirmed' in label or 'planet' in label:
        return 'confirmed'
    return None  # e.g. REFUTED: left out of all three totals


bucket_totals = {'confirmed': 0, 'candidate': 0, 'false_positive': 0}

for class_data, dataset_name, _ in datasets_info:
    if class_data is None:
        continue
    for class_name, count in class_data.items():
        bucket = _disposition_bucket(class_name)
        if bucket is not None:
            bucket_totals[bucket] += count

total_confirmed = bucket_totals['confirmed']
total_candidates = bucket_totals['candidate']
total_false_positives = bucket_totals['false_positive']

print(f"   Estimated Confirmed Planets: {total_confirmed:,}")
print(f"   Estimated Candidates: {total_candidates:,}")
print(f"   Estimated False Positives: {total_false_positives:,}")
print(f"   Total Objects: {total_confirmed + total_candidates + total_false_positives:,}")

## 5. Numeric Feature Distributions

Analyzing the distributions of key astrophysical parameters across all three datasets.

In [None]:
# Identify key numeric features
def identify_key_features(df, dataset_name):
    """Select numeric columns whose name hints at an astrophysical quantity.

    A numeric column is kept when its lower-cased name contains any of the
    substrings in ``key_patterns``. The first ten matches are printed.

    Args:
        df: DataFrame to scan.
        dataset_name: Label used in the printed header.

    Returns:
        List of matching column names, in the DataFrame's column order.
    """
    key_patterns = [
        'period', 'radius', 'temp', 'magnitude', 'depth', 'duration',
        'impact', 'flux', 'stellar', 'planet', 'transit', 'depth',
        'snr', 'mass', 'density', 'distance', 'brightness'
    ]

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    key_features = [
        col
        for col in numeric_cols
        if any(pattern in col.lower() for pattern in key_patterns)
    ]

    print(f"🔍 Key Features for {dataset_name}:")
    for rank, feature in enumerate(key_features[:10], start=1):
        print(f"   {rank:2d}. {feature}")

    not_shown = len(key_features) - 10
    if not_shown > 0:
        print(f"   ... and {not_shown} more features")

    return key_features

# Find key features for each dataset
# The returned name lists feed the distribution plots, the correlation
# analysis and the cross-dataset comparison below.
kepler_features = identify_key_features(kepler_df, 'KEPLER')
k2_features = identify_key_features(k2_df, 'K2')
tess_features = identify_key_features(tess_df, 'TESS')

# Function to plot feature distributions
def plot_feature_distributions(df, features, dataset_name, max_features=8):
    """Draw a grid of histograms for the first ``max_features`` features.

    Args:
        df: DataFrame holding the feature columns.
        features: Column names to plot; only the first ``max_features`` are used.
        dataset_name: Label used in the figure title and messages.
        max_features: Upper bound on the number of histograms.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    selected = features[:max_features]

    if len(selected) == 0:
        print(f"❌ No features to plot for {dataset_name}")
        return

    # Grid layout: at most four panels per row, with as many rows as needed
    n_features = len(selected)
    n_cols = min(4, n_features)
    n_rows = (n_features + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array of axes, so one flatten covers
    # the single-panel, single-row and multi-row cases alike.
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows), squeeze=False)
    fig.suptitle(f'🌟 {dataset_name} - Key Feature Distributions', fontsize=16, fontweight='bold')
    axes = grid.flatten()

    for ax, feature in zip(axes, selected):
        try:
            observed = df[feature].dropna()

            if len(observed) == 0:
                # Column exists but holds no usable values
                ax.text(0.5, 0.5, f'No valid data\nfor {feature}',
                        ha='center', va='center', fontsize=10)
                ax.set_title(f'{feature}\n(No Data)', fontsize=10)
            else:
                ax.hist(observed, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
                ax.set_title(f'{feature}\n(n={len(observed):,})', fontsize=10, fontweight='bold')
                ax.set_xlabel(feature)
                ax.set_ylabel('Frequency')

                # Mean and standard deviation in a small box at the top-left corner
                stats_text = f'Mean: {observed.mean():.2e}\nStd: {observed.std():.2e}'
                ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
                        verticalalignment='top', fontsize=8,
                        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        except Exception as e:
            # Keep the rest of the grid alive; show the failure inside the panel
            ax.text(0.5, 0.5, f'Error plotting\n{feature}\n{str(e)}',
                    ha='center', va='center', fontsize=8)
            ax.set_title(f'{feature}\n(Error)', fontsize=10)

    # Hide unused subplots in the last row
    for spare_ax in axes[n_features:]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

print("📊 Plotting feature distributions for each dataset...\n")

# Plot distributions for each dataset
# Each call draws one figure using the function's default of eight features.
plot_feature_distributions(kepler_df, kepler_features, 'KEPLER')
plot_feature_distributions(k2_df, k2_features, 'K2')
plot_feature_distributions(tess_df, tess_features, 'TESS')

## 6. Feature Correlation Analysis

Exploring correlations between key astrophysical parameters to understand feature relationships.

In [None]:
# Function to create correlation matrix
def analyze_correlations(df, features, dataset_name, min_features=5):
    """Plot a correlation heatmap and list strongly correlated feature pairs.

    Up to twelve features with more than 50% non-null values are used, taken
    in order of descending coverage. Pairs with ``|r| > 0.7`` are printed.

    Args:
        df: DataFrame holding the feature columns.
        features: Candidate column names.
        dataset_name: Label used in messages and the plot title.
        min_features: Minimum number of candidates required to proceed.

    Returns:
        The correlation matrix (DataFrame) of the selected features, or
        ``None`` when fewer than ``min_features`` candidates were given or
        fewer than three of them have sufficient coverage.
    """
    if len(features) < min_features:
        print(f"❌ {dataset_name}: Need at least {min_features} features for correlation analysis")
        return None

    # Share of non-null values (in percent) for every candidate present in df
    coverage_by_feature = {
        name: (1 - df[name].isnull().mean()) * 100
        for name in features
        if name in df.columns
    }

    # Best-covered first; keep at most 12, each with more than 50% coverage
    ranked = sorted(coverage_by_feature.items(), key=lambda item: item[1], reverse=True)
    top_features = [name for name, coverage in ranked[:12] if coverage > 50]

    if len(top_features) < 3:
        print(f"❌ {dataset_name}: Insufficient features with good data coverage")
        return None

    print(f"🔗 {dataset_name} - Correlation Analysis:")
    print(f"   Selected {len(top_features)} features with >50% data coverage")

    corr_data = df[top_features].corr()

    # Heatmap of the lower triangle; k=1 keeps the diagonal visible
    plt.figure(figsize=(12, 10))
    upper_triangle = np.triu(np.ones_like(corr_data, dtype=bool), k=1)

    sns.heatmap(
        corr_data,
        mask=upper_triangle,
        annot=True,
        fmt='.2f',
        cmap='RdYlBu_r',
        center=0,
        square=True,
        cbar_kws={'label': 'Correlation Coefficient'},
    )

    plt.title(f'🔗 {dataset_name} - Feature Correlation Matrix',
              fontsize=16, fontweight='bold', pad=20)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Visit each unordered pair once (strict upper triangle) and keep the strong ones
    n_selected = len(corr_data.columns)
    all_pairs = (
        (corr_data.columns[i], corr_data.columns[j], corr_data.iloc[i, j])
        for i in range(n_selected)
        for j in range(i + 1, n_selected)
    )
    high_corr_pairs = [pair for pair in all_pairs if abs(pair[2]) > 0.7]

    if not high_corr_pairs:
        print("\n✅ No highly correlated feature pairs found (|r| > 0.7)")
    else:
        print(f"\n⚠️  High Correlations (|r| > 0.7):")
        strongest_first = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
        for feat1, feat2, corr in strongest_first:
            print(f"   {feat1} ↔ {feat2}: {corr:.3f}")

    return corr_data

# Analyze correlations for each dataset
# Each result is either a correlation matrix or None when the dataset did not
# have enough well-covered features; a separator line is printed in between.
print("🔗 Analyzing feature correlations across all datasets...\n")

kepler_corr = analyze_correlations(kepler_df, kepler_features, 'KEPLER')
print("\n" + "="*60 + "\n")

k2_corr = analyze_correlations(k2_df, k2_features, 'K2')
print("\n" + "="*60 + "\n")

tess_corr = analyze_correlations(tess_df, tess_features, 'TESS')

## 7. Cross-Dataset Comparison

Comparing feature availability and data quality across the three NASA datasets.

In [None]:
# Cross-dataset comparison
def compare_datasets():
    """Tabulate size, feature count, memory use and missingness per dataset.

    Reads the notebook-level frames (``kepler_df``, ``k2_df``, ``tess_df``)
    and key-feature lists, prints the comparison table and returns it.

    Returns:
        DataFrame with one row per dataset and the columns ``Dataset``,
        ``Total Objects``, ``Total Features``, ``Key Features``,
        ``Memory Usage (MB)`` and ``Missing Data (%)``.
    """
    frames = {
        'KEPLER': kepler_df,
        'K2': k2_df,
        'TESS': tess_df,
    }
    key_feature_lists = {
        'KEPLER': kepler_features,
        'K2': k2_features,
        'TESS': tess_features,
    }

    comparison_df = pd.DataFrame([
        {
            'Dataset': name,
            'Total Objects': len(frame),
            'Total Features': len(frame.columns),
            'Key Features': len(key_feature_lists[name]),
            # deep=True also counts the contents of object (string) columns
            'Memory Usage (MB)': frame.memory_usage(deep=True).sum() / 1024 / 1024,
            # Missing cells as a share of all cells in the table
            'Missing Data (%)': (frame.isnull().sum().sum() / (len(frame) * len(frame.columns))) * 100,
        }
        for name, frame in frames.items()
    ])

    print("🚀 NASA Exoplanet Datasets Comparison:")
    print("=" * 80)
    print(comparison_df.to_string(index=False, float_format='%.1f'))

    return comparison_df

# Build and print the comparison table; the 2x2 comparison figure further down reads it.
comparison_summary = compare_datasets()

# Feature overlap analysis
def analyze_feature_overlap():
    """Count how many column names the three datasets share.

    Column names of the notebook-level frames (``kepler_df``, ``k2_df``,
    ``tess_df``) are compared case-insensitively. The counts are printed and
    returned.

    Returns:
        Dict mapping each region of the three-set Venn diagram
        (``'All Three'``, ``'KEPLER & K2'``, ``'KEPLER & TESS'``,
        ``'K2 & TESS'``, ``'KEPLER Only'``, ``'K2 Only'``, ``'TESS Only'``)
        to the number of column names in it.
    """
    # Get column names (lowercase for comparison)
    kepler_cols = {col.lower() for col in kepler_df.columns}
    k2_cols = {col.lower() for col in k2_df.columns}
    tess_cols = {col.lower() for col in tess_df.columns}

    # Pairwise overlaps exclude names that all three datasets share
    all_three = kepler_cols & k2_cols & tess_cols
    kepler_k2 = (kepler_cols & k2_cols) - all_three
    kepler_tess = (kepler_cols & tess_cols) - all_three
    k2_tess = (k2_cols & tess_cols) - all_three

    # Unique to each
    kepler_only = kepler_cols - k2_cols - tess_cols
    k2_only = k2_cols - kepler_cols - tess_cols
    tess_only = tess_cols - kepler_cols - k2_cols

    print("\n🔍 Feature Overlap Analysis:")
    print("=" * 50)
    print(f"📊 Common to all three datasets: {len(all_three)}")
    if all_three:
        # Sets of strings iterate in a hash-dependent order that changes
        # between interpreter runs; sorting keeps the examples reproducible.
        print(f"   Examples: {sorted(all_three)[:5]}")

    print(f"\n🤝 Shared between pairs:")
    print(f"   KEPLER & K2 only: {len(kepler_k2)}")
    print(f"   KEPLER & TESS only: {len(kepler_tess)}")
    print(f"   K2 & TESS only: {len(k2_tess)}")

    print(f"\n🎯 Unique to each dataset:")
    print(f"   KEPLER unique: {len(kepler_only)}")
    print(f"   K2 unique: {len(k2_only)}")
    print(f"   TESS unique: {len(tess_only)}")

    # Region sizes of the three-set Venn diagram
    overlap_data = {
        'All Three': len(all_three),
        'KEPLER & K2': len(kepler_k2),
        'KEPLER & TESS': len(kepler_tess),
        'K2 & TESS': len(k2_tess),
        'KEPLER Only': len(kepler_only),
        'K2 Only': len(k2_only),
        'TESS Only': len(tess_only)
    }

    return overlap_data

overlap_results = analyze_feature_overlap()

# Visualize dataset comparison: four bar charts sharing one layout
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('🌌 NASA Exoplanet Datasets - Comparative Analysis', fontsize=16, fontweight='bold')

bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# One entry per panel: axis, summary column, title, y-label, bar-label formatter
panels = [
    (ax1, 'Total Objects', 'Total Objects per Dataset', 'Number of Objects',
     lambda value: f'{value:,}'),
    (ax2, 'Total Features', 'Total Features per Dataset', 'Number of Features',
     str),
    (ax3, 'Missing Data (%)', 'Missing Data Percentage', 'Missing Data (%)',
     lambda value: f'{value:.1f}%'),
    (ax4, 'Memory Usage (MB)', 'Memory Usage', 'Memory (MB)',
     lambda value: f'{value:.1f}'),
]

for panel_ax, column, panel_title, y_label, format_value in panels:
    heights = comparison_summary[column]

    panel_ax.bar(comparison_summary['Dataset'], heights, color=bar_colors)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_ylabel(y_label)

    # Put each value just above its bar, offset by 1% of the tallest bar
    for i, v in enumerate(heights):
        panel_ax.text(i, v + max(heights)*0.01, format_value(v),
                      ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 8. Data Quality Assessment

Comprehensive evaluation of data quality issues and recommendations for preprocessing.

In [None]:
# Comprehensive data quality assessment
def assess_data_quality(df, dataset_name):
    """Print a data-quality report for one dataset and return its headline numbers.

    The report covers size, missing data, column dtypes, duplicate rows, IQR
    outliers (first 10 numeric columns), case-consistency of text values
    (first 5 object columns), preprocessing recommendations and a 0-100
    quality score.

    Args:
        df: DataFrame to assess.
        dataset_name: Label used in the printed report and the result.

    Returns:
        Dict with the keys ``dataset``, ``total_rows``, ``total_cols``,
        ``complete_cols``, ``high_missing_cols``, ``duplicate_rows``,
        ``consistency_issues`` and ``quality_score``.
    """
    print(f"🔍 Data Quality Assessment for {dataset_name}")
    print("=" * 60)

    # 1. Basic statistics
    total_rows = len(df)
    total_cols = len(df.columns)
    total_cells = total_rows * total_cols

    print("📊 Dataset Overview:")
    print(f"   • Total rows: {total_rows:,}")
    print(f"   • Total columns: {total_cols:,}")
    print(f"   • Total cells: {total_cells:,}")

    # 2. Missing data analysis
    missing_counts = df.isnull().sum()
    missing_percentages = (missing_counts / total_rows) * 100

    # Categorize columns by missing data
    complete_cols = missing_counts[missing_counts == 0]
    low_missing = missing_counts[(missing_counts > 0) & (missing_percentages <= 10)]
    medium_missing = missing_counts[(missing_percentages > 10) & (missing_percentages <= 50)]
    high_missing = missing_counts[missing_percentages > 50]

    print("\n🚫 Missing Data Analysis:")
    print(f"   • Complete columns (0% missing): {len(complete_cols)}")
    print(f"   • Low missing (1-10%): {len(low_missing)}")
    print(f"   • Medium missing (11-50%): {len(medium_missing)}")
    print(f"   • High missing (>50%): {len(high_missing)}")

    if len(high_missing) > 0:
        print(f"   ⚠️  High missing columns: {list(high_missing.index[:5])}")

    # 3. Data type analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    object_cols = df.select_dtypes(include=['object']).columns
    datetime_cols = df.select_dtypes(include=['datetime64']).columns

    print("\n📋 Data Types:")
    print(f"   • Numeric columns: {len(numeric_cols)}")
    print(f"   • Text/Object columns: {len(object_cols)}")
    print(f"   • Datetime columns: {len(datetime_cols)}")

    # 4. Duplicate analysis
    duplicate_rows = df.duplicated().sum()
    # Guarded so an empty frame reports 0% instead of dividing by zero
    duplicate_percentage = (duplicate_rows / total_rows) * 100 if total_rows else 0.0

    print("\n🔄 Duplicate Analysis:")
    print(f"   • Duplicate rows: {duplicate_rows:,} ({duplicate_percentage:.2f}%)")

    # 5. Outlier detection for numeric columns (basic)
    outlier_summary = {}      # column -> number of IQR outliers
    outlier_pct_by_col = {}   # column -> outliers as % of the column's non-null values
    if len(numeric_cols) > 0:
        print("\n📈 Outlier Analysis (IQR method):")
        outlier_count = 0
        for col in numeric_cols[:10]:  # Check first 10 numeric columns
            n_valid = df[col].notna().sum()
            if n_valid > 0:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
                outlier_pct = (outliers / n_valid) * 100

                # Print only the first 5 columns that have outliers (not ranked)
                if outliers > 0 and outlier_count < 5:
                    print(f"   • {col}: {outliers:,} outliers ({outlier_pct:.1f}%)")
                    outlier_count += 1

                outlier_summary[col] = outliers
                outlier_pct_by_col[col] = outlier_pct

    # 6. Value consistency check
    print("\n✅ Data Consistency:")
    consistency_issues = 0

    # Check for mixed case in text columns
    for col in object_cols[:5]:  # Check first 5 object columns
        text_values = df[col].dropna().astype(str)
        if len(text_values) > 0:
            # Fewer distinct values after lower-casing means some values
            # differ only by letter case
            unique_lower = text_values.str.lower().nunique()
            unique_original = text_values.nunique()

            if unique_lower < unique_original:
                consistency_issues += 1
                print(f"   ⚠️  {col}: Potential case sensitivity issues")

    if consistency_issues == 0:
        print("   ✅ No obvious consistency issues detected")

    # 7. Summary recommendations
    print("\n💡 Preprocessing Recommendations:")

    if len(high_missing) > 0:
        print(f"   • Consider removing columns with >50% missing data ({len(high_missing)} columns)")

    if duplicate_rows > 0:
        print(f"   • Remove {duplicate_rows:,} duplicate rows")

    if len(medium_missing) > 0:
        print(f"   • Apply imputation strategies for {len(medium_missing)} columns with 11-50% missing data")

    if any(outlier_summary.values()):
        # Uses the same percentage that is printed above (share of non-null
        # values), so sparse columns are judged on the data they contain.
        high_outlier_cols = sum(1 for pct in outlier_pct_by_col.values() if pct > 5)
        if high_outlier_cols > 0:
            print(f"   • Review outliers in {high_outlier_cols} columns (>5% outliers)")

    if consistency_issues > 0:
        print("   • Standardize text values for consistency")

    # Score: start at 100, then subtract the mean missing percentage, the
    # duplicate percentage and 5 points per inconsistent text column.
    mean_missing_pct = missing_percentages.mean() if total_cells else 0.0
    quality_score = 100 - (mean_missing_pct + duplicate_percentage + consistency_issues * 5)
    quality_score = max(0, min(100, quality_score))  # Clamp between 0-100

    print(f"\n⭐ Overall Data Quality Score: {quality_score:.1f}/100")

    return {
        'dataset': dataset_name,
        'total_rows': total_rows,
        'total_cols': total_cols,
        'complete_cols': len(complete_cols),
        'high_missing_cols': len(high_missing),
        'duplicate_rows': duplicate_rows,
        'consistency_issues': consistency_issues,
        'quality_score': quality_score
    }

# Assess data quality for all datasets
# Each call prints a full report and returns a dict of headline numbers; the
# three dicts are combined into one table in the final summary cell.
print("🔍 Comprehensive Data Quality Assessment")
print("=" * 80)

kepler_quality = assess_data_quality(kepler_df, 'KEPLER')
print("\n" + "="*80 + "\n")

k2_quality = assess_data_quality(k2_df, 'K2')
print("\n" + "="*80 + "\n")

tess_quality = assess_data_quality(tess_df, 'TESS')

## 9. Summary and Next Steps

Key findings from the exploratory data analysis and recommendations for the NASA Space Apps Challenge solution.

In [None]:
# Create comprehensive summary
banner = "=" * 80

print("🌟 NASA Space Apps Challenge 2025 - EDA Summary")
print(banner)
print("Challenge: A World Away: Hunting for Exoplanets with AI")
print(banner)

# One row per dataset, built from the dicts returned by assess_data_quality
quality_summary = pd.DataFrame([kepler_quality, k2_quality, tess_quality])

print("\n📊 DATASET OVERVIEW:")
overview_columns = ['dataset', 'total_rows', 'total_cols', 'quality_score']
print(quality_summary[overview_columns].to_string(index=False))

print("\n🎯 KEY FINDINGS:")
print(f"   • Total exoplanet objects across all datasets: {quality_summary['total_rows'].sum():,}")
print(f"   • Combined feature space: {quality_summary['total_cols'].sum()} total columns")
print(f"   • Average data quality score: {quality_summary['quality_score'].mean():.1f}/100")
best_row = quality_summary['quality_score'].idxmax()
print(f"   • Best quality dataset: {quality_summary.loc[best_row, 'dataset']}")

print("\n🔍 DATA CHALLENGES IDENTIFIED:")
total_missing_cols = quality_summary['high_missing_cols'].sum()
total_duplicates = quality_summary['duplicate_rows'].sum()

if total_missing_cols > 0:
    print(f"   • High missing data: {total_missing_cols} columns with >50% missing values")
if total_duplicates > 0:
    print(f"   • Duplicate records: {total_duplicates:,} duplicate rows found")

# Everything below is static text, printed one line at a time
closing_lines = [
    "\n💡 MACHINE LEARNING STRATEGY:",
    "   1. Data Preprocessing:",
    "      • Normalize class labels across datasets (CONFIRMED, CANDIDATE, FALSE_POSITIVE)",
    "      • Handle missing values using domain-appropriate imputation",
    "      • Remove or impute high-missing columns (>50% missing)",
    "      • Feature scaling for algorithm compatibility",
    "\n   2. Feature Engineering:",
    "      • Focus on common astrophysical parameters (period, radius, temperature)",
    "      • Create derived features from transit measurements",
    "      • Address high correlation pairs to reduce multicollinearity",
    "\n   3. Model Selection:",
    "      • Baseline: Logistic Regression for interpretability",
    "      • Advanced: Random Forest/XGBoost for complex patterns",
    "      • Evaluation: Precision, Recall, F1-score, AUC-ROC",
    "      • Cross-validation for robust performance assessment",
    "\n🚀 NEXT STEPS FOR IMPLEMENTATION:",
    "   ✅ EDA Complete - Comprehensive analysis finished",
    "   📋 Create preprocessing pipeline (src/preprocess.py)",
    "   🤖 Implement model training (src/train.py)",
    "   🎯 Build prediction interface (src/predict.py)",
    "   🌐 Develop Streamlit web app (app.py)",
    "   📖 Update documentation (README.md)",
    "\n🏆 SUCCESS METRICS:",
    "   • Model Accuracy: Target >85% on test set",
    "   • Class Balance: Handle imbalanced classes effectively",
    "   • Generalization: Consistent performance across datasets",
    "   • User Experience: Intuitive web interface for planet hunters",
    "\n⭐ Ready to build the complete exoplanet classification solution!",
]
for line in closing_lines:
    print(line)

print(banner)