# 🚀 NASA Space Monitoring - Exploratory Data Analysis

This notebook provides exploratory analysis of Near-Earth Object (NEO) data collected from NASA's API.

## Objectives:
1. Load and explore asteroid approach data
2. Analyze distribution of asteroid characteristics
3. Identify potentially hazardous asteroids
4. Visualize approach patterns
5. Calculate risk metrics

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from pathlib import Path
import warnings

# Silence every warning for the rest of the session so the notebook output
# stays readable. NOTE: this also hides warnings that may be worth seeing
# while debugging.
warnings.filterwarnings('ignore')

# Configure plotting: apply the matplotlib style sheet first, then set the
# seaborn colour palette (kept in this order on purpose).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Confirmation that the setup cell ran to completion.
print("‚úÖ Libraries imported successfully")

## 1. Data Loading

In [None]:
# Locate the processed NEO parquet files and load the most recently
# modified one into `df`.
# NOTE: when no file matches, only a message is printed and `df` is left
# undefined, so the cells below cannot run until data has been produced.
data_dir = Path('../data/processed/neo')
processed_files = [*data_dir.glob('neo_processed_*.parquet')]

if not processed_files:
    print("‚ùå No processed data found. Please run the ingestion and transformation first.")
else:
    # "Latest" means the newest modification time on disk, not the file name.
    latest_file = max(processed_files, key=lambda candidate: candidate.stat().st_mtime)
    print(f"üìÇ Loading: {latest_file.name}")

    df = pd.read_parquet(latest_file)
    print(f"‚úÖ Loaded {len(df):,} records")

    earliest_approach = df['close_approach_date'].min()
    latest_approach = df['close_approach_date'].max()
    print(f"üìÖ Date range: {earliest_approach} to {latest_approach}")

## 2. Data Overview

In [None]:
# Print headline counts for the loaded dataset, then preview its first rows.
banner = "=" * 60
print(banner)
print("DATA SUMMARY")
print(banner)

print(f"Total approaches: {len(df):,}")
print(f"Unique asteroids: {df['asteroid_id'].nunique():,}")

# The same flag column gives both the count (sum) and the share (mean).
hazard_flags = df['is_potentially_hazardous']
print(f"Potentially hazardous: {hazard_flags.sum():,} ({hazard_flags.mean()*100:.1f}%)")

column_names = ', '.join(df.columns)
print(f"\nColumns: {column_names}")

# Last expression of the cell: rendered as the cell output.
df.head()

In [None]:
# Descriptive statistics for the four numeric measures used in this notebook.
summary_columns = [
    'diameter_avg_km',
    'miss_distance_lunar',
    'relative_velocity_kms',
    'risk_score',
]

# Last expression of the cell: rendered as the cell output.
df.loc[:, summary_columns].describe()

## 3. Threat Level Analysis

In [None]:
# Count approaches per threat level and show the split as a pie chart.
threat_counts = df['threat_level'].value_counts()

# Fixed colour per threat level.
threat_palette = {
    'High': '#ff4444',
    'Medium': '#ff9944',
    'Low': '#ffdd44',
    'Minimal': '#44ff44',
}

fig = px.pie(
    names=threat_counts.index,
    values=threat_counts.values,
    color=threat_counts.index,
    color_discrete_map=threat_palette,
    title='Distribution by Threat Level',
)

# Put the percentage and the level name inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

# Text version of the same breakdown; percentages are relative to all rows.
print("\nüìä Threat Level Breakdown:")
total_rows = len(df)
for threat_name, occurrences in threat_counts.items():
    print(f"  {threat_name}: {occurrences} ({occurrences/total_rows*100:.1f}%)")

## 4. Size Distribution Analysis

In [None]:
# Side-by-side view of asteroid size: category counts (left) and the raw
# diameter histogram (right).
size_counts = df['size_category'].value_counts()

category_bars = go.Bar(x=size_counts.index, y=size_counts.values, name='Count')
diameter_hist = go.Histogram(x=df['diameter_avg_km'], nbinsx=30, name='Diameter')

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Size Category Distribution', 'Diameter Distribution'),
    specs=[[{'type': 'bar'}, {'type': 'histogram'}]],
)

# One trace per column of the single-row grid.
for grid_col, trace in enumerate((category_bars, diameter_hist), start=1):
    fig.add_trace(trace, row=1, col=grid_col)

fig.update_layout(title_text="Asteroid Size Analysis", height=400, showlegend=False)
fig.show()

## 5. Proximity Analysis

In [None]:
# Bubble chart of miss distance against relative velocity; marker size follows
# the average diameter and marker colour follows the threat level.
threat_palette = {
    'High': '#ff4444',
    'Medium': '#ff9944',
    'Low': '#ffdd44',
    'Minimal': '#44ff44',
}
axis_labels = {
    'miss_distance_lunar': 'Distance (Lunar Distances)',
    'relative_velocity_kms': 'Velocity (km/s)',
    'diameter_avg_km': 'Diameter (km)',
}

fig = px.scatter(
    df,
    x='miss_distance_lunar',
    y='relative_velocity_kms',
    size='diameter_avg_km',
    color='threat_level',
    color_discrete_map=threat_palette,
    hover_data=['name', 'close_approach_date'],
    title='Asteroid Proximity: Distance vs Velocity',
    labels=axis_labels,
)

# Two dashed gray guides: a horizontal one at the mean velocity and a vertical
# one at 1.0 on the distance axis, annotated as the Moon's distance.
guide_style = dict(line_dash="dash", line_color="gray")
mean_velocity = df['relative_velocity_kms'].mean()
fig.add_hline(y=mean_velocity, annotation_text="Avg Velocity", **guide_style)
fig.add_vline(x=1.0, annotation_text="Moon Distance", **guide_style)

fig.show()

## 6. Temporal Analysis

In [None]:
# Per-day aggregates: number of approaches, number flagged hazardous, and the
# smallest miss distance recorded that day.
approach_day = df['close_approach_date'].dt.date
per_day = df.groupby(approach_day).agg({
    'asteroid_id': 'count',
    'is_potentially_hazardous': 'sum',
    'miss_distance_lunar': 'min',
})

daily_counts = per_day.reset_index()
# Positional rename: group key first, then the three aggregates in dict order.
daily_counts.columns = ['date', 'total_approaches', 'hazardous_count', 'closest_distance']

days = daily_counts['date']

# Each entry pairs a trace with the subplot row it belongs to.
row_traces = [
    (go.Bar(x=days, y=daily_counts['total_approaches'],
            name='Total', marker_color='lightblue'), 1),
    (go.Bar(x=days, y=daily_counts['hazardous_count'],
            name='Hazardous', marker_color='red'), 1),
    (go.Scatter(x=days, y=daily_counts['closest_distance'],
                mode='lines+markers', name='Closest Distance',
                line={'color': 'orange', 'width': 2}), 2),
]

# Top panel: daily counts as bars. Bottom panel: closest distance as a line.
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Daily Approach Count', 'Closest Approach Distance'),
    vertical_spacing=0.15,
)
for trace, grid_row in row_traces:
    fig.add_trace(trace, row=grid_row, col=1)

fig.update_layout(height=600, title_text="Temporal Analysis of Asteroid Approaches")
for axis_title, grid_row in (("Count", 1), ("Distance (LD)", 2)):
    fig.update_yaxes(title_text=axis_title, row=grid_row, col=1)

fig.show()

## 7. Risk Score Analysis

In [None]:
# Histogram of risk scores, coloured by the potentially-hazardous flag.
histogram_options = dict(
    x='risk_score',
    color='is_potentially_hazardous',
    nbins=50,
    title='Risk Score Distribution',
    labels={'risk_score': 'Risk Score (0-100)'},
    color_discrete_map={True: 'red', False: 'blue'},
)
fig = px.histogram(df, **histogram_options)
fig.show()

# Descriptive statistics of the score, split by the hazardous flag.
print("\nüìä Risk Score Statistics:")
risk_by_hazard = df.groupby('is_potentially_hazardous')['risk_score']
print(risk_by_hazard.describe())

## 8. Top 10 Highest Risk Asteroids

In [None]:
# Columns shown in the top-10 listing, in display order.
listing_columns = [
    'name',
    'close_approach_date',
    'miss_distance_lunar',
    'diameter_avg_km',
    'relative_velocity_kms',
    'risk_score',
    'threat_level',
    'is_potentially_hazardous',
]

# Ten rows with the highest risk score; copied so the date reformatting
# below does not touch `df`.
highest_scoring = df.nlargest(10, 'risk_score')
top_risk = highest_scoring.loc[:, listing_columns].copy()

# Render the approach date as plain YYYY-MM-DD text for printing.
formatted_dates = top_risk['close_approach_date'].dt.strftime('%Y-%m-%d')
top_risk['close_approach_date'] = formatted_dates

print("\nüö® TOP 10 HIGHEST RISK ASTEROIDS:\n")
print(top_risk.to_string(index=False))

## 9. Correlation Analysis

In [None]:
# Numeric measures included in the correlation matrix.
numeric_cols = ['diameter_avg_km', 'miss_distance_lunar', 'relative_velocity_kms', 'risk_score']

# Pairwise correlations between the selected columns (pandas default method).
corr_matrix = df.loc[:, numeric_cols].corr()

# Heatmap with each coefficient printed to two decimals in its cell.
heatmap_options = dict(
    text_auto='.2f',
    title='Feature Correlation Heatmap',
    color_continuous_scale='RdBu_r',
    aspect='auto',
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.show()

## 10. Export Summary Report

In [None]:
# Build the headline metrics as an ordered mapping of label -> value.
# The insertion order below is the row order of the printed and saved report.
report = {}
report['Analysis Date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
report['Total Approaches'] = len(df)
report['Unique Asteroids'] = df['asteroid_id'].nunique()
report['Potentially Hazardous'] = df['is_potentially_hazardous'].sum()
report['High Threat Count'] = (df['threat_level'] == 'High').sum()

lunar_distance = df['miss_distance_lunar']
report['Average Distance (LD)'] = lunar_distance.mean()
report['Closest Approach (LD)'] = lunar_distance.min()

report['Average Velocity (km/s)'] = df['relative_velocity_kms'].mean()
report['Average Diameter (km)'] = df['diameter_avg_km'].mean()
report['Average Risk Score'] = df['risk_score'].mean()

# Two-column table: one row per metric.
report_df = pd.DataFrame([*report.items()], columns=['Metric', 'Value'])

rule = "=" * 60
print("\n" + rule)
print("SUMMARY REPORT")
print(rule)
print(report_df.to_string(index=False))

# Persist the table as CSV, creating the analytics directory if it is missing.
output_path = Path('../data/analytics/analysis_summary.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
report_df.to_csv(output_path, index=False)
print(f"\n‚úÖ Summary saved to: {output_path}")

## Conclusions

Key findings from this analysis (template — replace each placeholder below with the values from the summary report in Section 10):

1. **Volume**: Monitoring X asteroids with Y total approaches
2. **Threat Assessment**: Z% classified as potentially hazardous
3. **Proximity**: Closest approach at N lunar distances
4. **Size Distribution**: Most asteroids in the [category] range
5. **Temporal Patterns**: [Observation about timing]

### Recommendations:
- Continue monitoring high-risk asteroids
- Set up automated alerts for close approaches (<1 LD)
- Schedule regular data updates every 6 hours
- Perform further analysis of velocity patterns

---

**Next Steps**: Run the Streamlit dashboard for interactive exploration!