# Import CSV data

In [3]:
# Core data manipulation and analysis libraries
import pandas as pd
import numpy as np

# Advanced parallel processing for large datasets
import dask.dataframe as dd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # Interactive plots

# Machine learning and statistical libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Time handling
import datetime as dt

# Performance monitoring
import time
import os

# Setting plot styles for better visuals
# (applies the 'ggplot' style sheet to figures drawn after this point)
plt.style.use('ggplot')

# Suppress warnings for a cleaner output
# NOTE(review): called without a category or module filter, so every warning
# raised in this kernel is hidden, deprecation warnings from the imported
# libraries included. Consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# Displaying plots inline
%matplotlib inline

In [4]:
# Kaggle dataset path
# NOTE(review): this is an absolute path under /workspaces, so the cell only
# runs where the CSV sits at exactly this location. Adjust it for other setups.
file_path = '/workspaces/NASA-close-ML-Analysis/nasa_close_approach_2023_2024.csv'
# Loading the data into a Pandas DataFrame
# (`data` is read again by the exploration cell below)
data = pd.read_csv(file_path)

# Displaying the first few rows of the dataset to get an overview
# (left as the last expression so the notebook renders it as a table)
data.head()

Unnamed: 0,des,orbit_id,jd,cd,dist,dist_min,dist_max,v_rel,v_inf,t_sigma_f,h
0,2022 YS6,6,2459946.0,2023-Jan-01 01:15,0.011943,0.011913,0.011973,9.719379,9.696398,< 00:01,26.86
1,2022 YR1,8,2459946.0,2023-Jan-01 20:45,0.041749,0.041365,0.042133,6.034629,6.024044,< 00:01,26.04
2,2022 YT3,8,2459947.0,2023-Jan-02 00:43,0.02846,0.028384,0.028536,6.438394,6.423836,< 00:01,25.81
3,2022 YU3,5,2459947.0,2023-Jan-02 05:11,0.025191,0.025017,0.025365,7.133786,7.118944,< 00:01,25.86
4,2022 YY6,4,2459947.0,2023-Jan-02 11:07,0.005439,0.005411,0.005467,20.269551,20.245368,< 00:01,26.15


## 🔍 Exploring the Dataset


In [5]:
# Checking the basic information of the dataset
# (prints the index range, each column's non-null count and dtype, and the
# frame's memory usage)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2554 entries, 0 to 2553
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   des        2554 non-null   object 
 1   orbit_id   2554 non-null   int64  
 2   jd         2554 non-null   float64
 3   cd         2554 non-null   object 
 4   dist       2554 non-null   float64
 5   dist_min   2554 non-null   float64
 6   dist_max   2554 non-null   float64
 7   v_rel      2554 non-null   float64
 8   v_inf      2554 non-null   float64
 9   t_sigma_f  2554 non-null   object 
 10  h          2554 non-null   float64
dtypes: float64(7), int64(1), object(3)
memory usage: 219.6+ KB


## Fetch data from the NASA API

In [9]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# Define the API endpoint
# Base URL that fetch_close_approach_data sends its GET request to; the query
# parameters are supplied separately on each call.
API_URL = "https://ssd-api.jpl.nasa.gov/cad.api"

# Function to fetch close approach data
def fetch_close_approach_data(start_date, end_date, limit=100, timeout=30):
    """
    Fetch close approach data from NASA's CAD API.

    Sends one GET request to ``API_URL`` with the date window, ``"neo"`` set
    to ``"true"``, results sorted by date and capped at ``limit``, then
    returns whatever the JSON response holds under its ``"data"`` key.
    Problems are reported with ``print`` and turned into an empty list
    instead of being raised, so callers can simply test the result for
    truthiness.

    Parameters:
    - start_date (str): Start date in 'YYYY-MM-DD' format.
    - end_date (str): End date in 'YYYY-MM-DD' format.
    - limit (int): Maximum number of records to retrieve (default: 100).
    - timeout (float or tuple): Seconds to wait for the server, passed
      straight to ``requests.get`` (default: 30).

    Returns:
    - data (list): List of close approach data entries. Empty when the
      request fails, the body is not valid JSON, or the response carries
      no "data" key.
    """
    params = {
        "date-min": start_date,
        "date-max": end_date,
        "neo": "true",
        "sort": "date",
        "limit": limit
    }

    try:
        # requests applies no timeout unless one is given, so a stalled
        # connection would otherwise block the notebook kernel forever.
        response = requests.get(API_URL, params=params, timeout=timeout)
        response.raise_for_status()

        # Parse the JSON response
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Error fetching data: {e}")
        return []

    # Guard against a JSON body that is not an object (e.g. a bare list or
    # null), which would make the membership test or the lookup fail.
    if isinstance(payload, dict) and "data" in payload:
        records = payload["data"]
        print(f"Successfully fetched {len(records)} records from {start_date} to {end_date}")
        return records

    print("No data found for the specified date range.")
    return []

# Function to save the data to a CSV file
def save_data_to_csv(data, filename):
    """
    Write close approach records to a CSV file on disk.

    Every entry of ``data`` becomes one row. The header row uses the fixed
    field names listed in the body, and the DataFrame index is not written.
    A confirmation line is printed once the file has been saved.

    Parameters:
    - data (list): List of close approach data entries.
    - filename (str): Name of the file to save the data.
    """
    # Column names applied positionally to each record (expected to follow
    # the layout of the API response)
    field_names = [
        "des",
        "orbit_id",
        "jd",
        "cd",
        "dist",
        "dist_min",
        "dist_max",
        "v_rel",
        "v_inf",
        "t_sigma_f",
        "h",
    ]

    # Build the table and write it out in a single chained step
    pd.DataFrame(data, columns=field_names).to_csv(filename, index=False)
    print(f"Data saved to {filename}")


# Main function to execute the script
if __name__ == "__main__":
    # Define the date range (next 60 days)
    # Read the clock once so both ends of the window come from the same
    # instant; two separate now() calls can land on different calendar days
    # when the cell runs across midnight.
    window_start = datetime.now()
    start_date = window_start.strftime("%Y-%m-%d")
    end_date = (window_start + timedelta(days=60)).strftime("%Y-%m-%d")

    # Fetch the data
    # NOTE(review): `data` is also the name bound to the DataFrame loaded from
    # the CSV file earlier in this notebook; after this cell it refers to the
    # fetched list instead, so re-running that earlier exploration cell
    # without reloading the CSV will fail.
    data = fetch_close_approach_data(start_date, end_date, limit=100)

    # Save the data if available
    # (an empty list, returned on errors or when nothing matched, skips the write)
    if data:
        filename = f"cad_data_{start_date}_to_{end_date}.csv"
        save_data_to_csv(data, filename)


Successfully fetched 40 records from 2024-11-13 to 2025-01-12
Data saved to cad_data_2024-11-13_to_2025-01-12.csv
