In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Suppress every warning for the rest of the session so cell output stays clean.
# NOTE(review): this also hides deprecation notices from pandas/matplotlib.
warnings.filterwarnings('ignore')

# Plot appearance: apply the 'seaborn-v0_8' matplotlib style sheet first, then
# let seaborn replace the default colour cycle with its "husl" palette
# (order matters: the style sheet would otherwise reset the colour cycle).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# pandas display limits: never truncate columns, show at most 100 rows
display_options = (
    ('display.max_columns', None),
    ('display.max_rows', 100),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)


In [None]:
# Load the dataset.
# Lines starting with '#' are treated as comments and skipped by the parser.
df = pd.read_csv('../data/exoplanet.csv', comment='#')

# Report the table dimensions.
# NOTE(review): the row count is printed as the number of exoplanets, which
# assumes one row per planet -- confirm against the data source.
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of exoplanets: {n_rows:,}")
print(f"Number of features: {n_cols:,}")

# Column dtypes and non-null counts under a banner
banner = "=" * 50
print("\n" + banner)
print("DATASET OVERVIEW")
print(banner)
df.info()


In [None]:
# Preview the first rows with the notebook's rich table rendering
print("First 5 rows of the dataset:")
print("-" * 50)
display(df.head())

# Numbered listing of every column, counting from 1
print("\nColumn names:")
print("-" * 20)
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_name}")


In [None]:
# Columns kept for the habitability analysis
# (grouping below is inferred from the column-name prefixes)
key_columns = [
    # identifiers
    'pl_name', 'hostname',
    # planet columns (pl_*)
    'pl_rade', 'pl_radj', 'pl_bmasse', 'pl_bmassj',
    'pl_orbper', 'pl_orbsmax', 'pl_eqt',
    # host-star columns (st_*)
    'st_teff', 'st_rad', 'st_mass',
    # discovery and system metadata
    'discoverymethod', 'disc_year', 'sy_dist',
]

# Independent copy of just those columns, so later edits never touch `df`
df_key = df[key_columns].copy()

print("Key columns for analysis:")
print("-" * 30)
print("\n".join(f"• {column_name}" for column_name in key_columns))

print(f"\nKey dataset shape: {df_key.shape}")
display(df_key.head())


In [None]:
# Missing data analysis
print("MISSING DATA ANALYSIS")
print("=" * 40)

# Null count per column and its share of all rows, in percent
missing_data = df_key.isnull().sum()
missing_percent = (missing_data / len(df_key)) * 100

# One row per column, most incomplete columns first
missing_df = pd.DataFrame(
    {
        'Column': missing_data.index,
        'Missing_Count': missing_data.values,
        'Missing_Percentage': missing_percent.values,
    }
).sort_values('Missing_Percentage', ascending=False)

print(missing_df.to_string(index=False))

# Visualize missing data: bar chart of percentages (left) and a cell-level
# map of where the nulls sit (right)
fig, (ax_bar, ax_heat) = plt.subplots(1, 2, figsize=(12, 6))

missing_df.plot(x='Column', y='Missing_Percentage', kind='bar', ax=ax_bar)
ax_bar.set_title('Missing Data Percentage by Column')
plt.setp(ax_bar.get_xticklabels(), rotation=45)  # tilt the long column names
ax_bar.set_ylabel('Missing Percentage (%)')

sns.heatmap(df_key.isnull(), cbar=True, yticklabels=False, cmap='viridis', ax=ax_heat)
ax_heat.set_title('Missing Data Heatmap')

fig.tight_layout()
plt.show()


In [None]:
# Statistical summary of key numerical features
numerical_cols = [
    'pl_rade', 'pl_radj', 'pl_bmasse', 'pl_bmassj', 'pl_orbper',
    'pl_orbsmax', 'pl_eqt', 'st_teff', 'st_rad', 'st_mass',
]
numeric_features = df_key[numerical_cols]

print("STATISTICAL SUMMARY")
print("=" * 50)
display(numeric_features.describe())

# Correlation analysis: pairwise correlations drawn as an annotated heatmap
plt.figure(figsize=(12, 8))
correlation_matrix = numeric_features.corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Correlation Matrix of Exoplanet Features')
plt.tight_layout()
plt.show()

print("\nKey Correlations:")
print("-" * 20)
# List strong correlations (|r| > 0.5), visiting each unordered pair once by
# walking only the upper triangle of the matrix. NaN coefficients fail the
# comparison and are therefore skipped.
feature_names = correlation_matrix.columns
for row_pos, first_feature in enumerate(feature_names):
    for col_pos in range(row_pos + 1, len(feature_names)):
        corr_val = correlation_matrix.iloc[row_pos, col_pos]
        if abs(corr_val) > 0.5:
            print(f"{first_feature} ↔ {feature_names[col_pos]}: {corr_val:.3f}")


In [None]:
# Discovery trends analysis
print("DISCOVERY TRENDS ANALYSIS")
print("=" * 40)

# Discovery by year: row count per discovery year (groupby drops rows whose
# year is missing)
discovery_by_year = df_key.groupby('disc_year').size()
print(f"Discovery years range: {discovery_by_year.index.min():.0f} - {discovery_by_year.index.max():.0f}")
print(f"Peak discovery year: {discovery_by_year.idxmax():.0f} ({discovery_by_year.max()} discoveries)")

# Discovery methods, most frequent first; percentages are relative to all
# rows of df_key, including rows with no recorded method
discovery_methods = df_key['discoverymethod'].value_counts()
print("\nTop 5 Discovery Methods:")
print("-" * 25)
for method, count in discovery_methods.head().items():
    percentage = (count / len(df_key)) * 100
    print(f"{method}: {count} ({percentage:.1f}%)")

# Visualizations: 2x2 grid of summary plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Discovery timeline
axes[0,0].plot(discovery_by_year.index, discovery_by_year.values, marker='o')
axes[0,0].set_title('Exoplanet Discoveries Over Time')
axes[0,0].set_xlabel('Discovery Year')
axes[0,0].set_ylabel('Number of Discoveries')
axes[0,0].grid(True, alpha=0.3)

# Discovery methods (top 10)
top_methods = discovery_methods.head(10)
axes[0,1].barh(range(len(top_methods)), top_methods.values)
axes[0,1].set_yticks(range(len(top_methods)))
axes[0,1].set_yticklabels(top_methods.index, fontsize=8)
axes[0,1].set_title('Top 10 Discovery Methods')
axes[0,1].set_xlabel('Number of Discoveries')

# Planet radius distribution.
# The panel only shows 0-10 Earth radii, so the 50 bins are computed over that
# same window. Binning over the full data range and then zooming in would
# leave only a few wide bins visible whenever the radii extend well past 10.
radius_window = (0, 10)
axes[1,0].hist(df_key['pl_rade'].dropna(), bins=50, range=radius_window,
               alpha=0.7, edgecolor='black')
axes[1,0].set_title('Distribution of Planet Radius')
axes[1,0].set_xlabel('Planet Radius (Earth radii)')
axes[1,0].set_ylabel('Frequency')
axes[1,0].set_xlim(*radius_window)

# Stellar temperature distribution (full data range, no zoom)
axes[1,1].hist(df_key['st_teff'].dropna(), bins=50, alpha=0.7, edgecolor='black', color='orange')
axes[1,1].set_title('Distribution of Stellar Temperature')
axes[1,1].set_xlabel('Stellar Temperature (K)')
axes[1,1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()
