# GOES Historical Data Access from GridSat

This code helps download historical GOES data in bulk from GridSat (https://www.ncei.noaa.gov/data/gridsat-goes/access/goes/) and stores the file listing, along with each download link, in CSV files.

In [5]:
# import necessary libraries
import csv
import os
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from urllib.parse import urljoin

import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas as pd
import requests
import xarray as xr
from bs4 import BeautifulSoup

First, we want to see the list of dates with labeled data. Because that list was originally saved as a NumPy array, we convert it back to a Python list after loading it.

In [7]:
# Read the labeled AR file list back from disk.
# west_ar.npy contains the list of labeled AR data from ClimateNet
# (https://gmd.copernicus.org/articles/14/107/2021/).
# NOTE(review): allow_pickle=True unpickles whatever the file contains, so
# only load .npy files that come from a trusted source.
file_list = np.load('west_ar.npy', allow_pickle=True)

# The rest of the notebook works with a plain Python list, so unwrap the
# ndarray if that is what np.load handed back.
file_list = file_list.tolist() if isinstance(file_list, np.ndarray) else file_list

#file_list

# GOES all with links

In [None]:
# Base url from the GridSat.
# The crawl below appends "<year>/<month>/" to this string, so it must keep
# its trailing slash.
base_url = "https://www.ncei.noaa.gov/data/gridsat-goes/access/goes/"

# Function to fetch the raw HTML content of a URL
def fetch_html(url, timeout=30):
    """Download the body of ``url``.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response body as bytes when the server answers with status 200,
        otherwise ``None`` (non-200 status, timeout, or connection error).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure but keep the "None means no content" contract so
        # a single bad request does not abort a long crawl.
        print(f"Request failed for {url}: {exc}")
        return None
    return response.content if response.status_code == 200 else None

# Function to parse the list of .nc files from HTML content
def parse_nc_files(html_content, year, month, page_url=None):
    """Extract one metadata row per ``.nc`` link found in a directory listing.

    Args:
        html_content: HTML of the listing page; a falsy value yields ``[]``.
        year: Year of the listing page, used to rebuild the page URL.
        month: Month of the listing page, used to rebuild the page URL.
        page_url: URL the links are relative to. When omitted it is rebuilt as
            ``f"{base_url}{year}/{month:02d}/"`` from the module-level
            ``base_url``, so the function no longer depends on a loop variable
            of the calling code.

    Returns:
        A list of ``[year, month, day, hour, goes_number, direction, url]``
        rows. Year, month and day are ints taken from the file name; hour is
        an ``"HH:00"`` string; direction is ``'east'`` or ``'west'``.
    """
    rows = []
    if not html_content:
        return rows

    if page_url is None:
        page_url = f"{base_url}{year}/{month:02d}/"

    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all links ending with .nc (assuming they are the .nc files)
    nc_links = soup.find_all('a', href=lambda href: href and href.endswith('.nc'))
    for link in nc_links:
        href = link['href']
        nc_file_url = urljoin(page_url, href)

        # Only the file name carries the metadata; drop any directory part so
        # dots in a path cannot shift the field positions.
        # Expected layout (inferred from the indices used here):
        #   <product>.goesNN.YYYY.MM.DD.HHMM.<...>.nc
        fields = href.rsplit('/', 1)[-1].split('.')
        try:
            file_year = int(fields[2])
            file_month = int(fields[3])
            file_day = int(fields[4])
            hour = fields[5][:2]  # Extract first two characters for hour
            goesXX = fields[1].replace('goes', '')  # Satellite number, e.g. '08'
        except (IndexError, ValueError):
            # Skip links whose name does not follow the expected layout
            # instead of aborting the whole crawl.
            continue

        # Determine east or west: 08, 12 and 13 are treated as east, every
        # other satellite number as west.
        goes_direction = 'east' if goesXX in ('08', '12', '13') else 'west'

        # NOTE(review): any minutes in the time field are discarded here; the
        # stored value is always on the hour.
        hour_minute = f"{hour}:00"
        rows.append([file_year, file_month, file_day, hour_minute,
                     goesXX, goes_direction, nc_file_url])
    return rows

# CSV filename
csv_filename = 'goes_all_w_links.csv'

# Crawl every monthly listing from 1996 through 2017 inclusive and write one
# CSV row per .nc file found (adjust the years as needed - depends on which
# years you are interested in exploring).
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Year', 'Month', 'Day', 'Hour', 'GOES_Num', 'Direction', 'URL'])  # Write header

    # range() excludes its stop value, so stop at 2018 to really include 2017.
    for year in range(1996, 2018):
        for month in range(1, 13):
            year_month_url = f"{base_url}{year}/{month:02d}/"
            html_content = fetch_html(year_month_url)
            if not html_content:
                # Listing missing or unreachable: nothing to record for this month.
                continue
            nc_data = parse_nc_files(html_content, year, month)
            for row in nc_data:
                csv_writer.writerow(row)
                print(f"Saved: {row}")

#print(f"CSV file '{csv_filename}' has been successfully created.")


# GOES West only 

In [None]:
# Keep only the GOES-West entries of the full catalogue.

# Read the catalogue written by the crawl above
csv_filename = 'goes_all_w_links.csv'
df = pd.read_csv(csv_filename)

# Drop every row whose 'Direction' is "east"; all remaining rows are kept
df_filtered = df[df['Direction'] != "east"]

# NOTE(review): a later cell reads 'goes_west.csv'; that file is only produced
# if the export below is enabled (and pointed at the matching path).
# df_filtered.to_csv('/Users/surabhiupadhyay/Documents/aether/GOES_hist/goes_west.csv', index=False)

# Extract Data 7 days prior and 2 days after an AR landfall event

In [None]:
# Reload the labeled AR file list for the date-matching step.
# NOTE(review): allow_pickle=True unpickles whatever the file contains, so
# only load .npy files that come from a trusted source.
file_list = np.load('/Users/surabhiupadhyay/Downloads/west_ar.npy', allow_pickle=True)

# The code below iterates over a plain Python list, so unwrap the ndarray if
# that is what np.load handed back.
file_list = file_list.tolist() if isinstance(file_list, np.ndarray) else file_list

# Function to extract date from file path
def extract_date_from_filepath(filepath):
    """Return the first ``YYYY-MM-DD`` substring of ``filepath``.

    Args:
        filepath: Path (string) that may embed a dash-separated date.

    Returns:
        The matched date text, or ``None`` when the path contains no such
        pattern.
    """
    found = re.search(r'\d{4}-\d{2}-\d{2}', filepath)
    if found is None:
        return None
    return found.group(0)

# Distinct dates named in file_list; entries without a recognisable date are dropped
file_dates = set(filter(None, map(extract_date_from_filepath, file_list)))

# Read the GOES-West catalogue
csv_filename = r'goes_west.csv'
df = pd.read_csv(csv_filename)

# Pull the download links out of the catalogue as a plain list
urls = df['URL'].to_list()

# Function to extract date from URL
def extract_date_from_url(url):
    """Return the first ``YYYY.MM.DD`` stamp in ``url`` as ``YYYY-MM-DD``.

    Args:
        url: URL (string) that may embed a dot-separated date.

    Returns:
        The date with its dots replaced by dashes, or ``None`` when the URL
        contains no such pattern.
    """
    found = re.search(r'\d{4}\.\d{2}\.\d{2}', url)
    if found is None:
        return None
    dotted = found.group(0)
    return dotted.replace('.', '-')

# Distinct dates that appear in the catalogue URLs; URLs without a date are dropped
url_dates = set(filter(None, map(extract_date_from_url, urls)))

# Dates present both in the labeled file list and in the catalogue
common_dates = file_dates & url_dates

# Function to get URLs for a given date
def get_urls_for_date(date_str, url_list, days_before=7, days_after=3):
    """Select the URLs whose embedded date lies in a window around ``date_str``.

    Args:
        date_str: Centre of the window, formatted ``YYYY-MM-DD``.
        url_list: Iterable of URLs; each date is read with
            ``extract_date_from_url`` and URLs without a date are ignored.
        days_before: Number of days before ``date_str`` to include.
        days_after: Number of days after ``date_str`` to include.
            NOTE(review): the section heading announces "2 days after" while
            the code has always used 3; the default stays at 3 to preserve
            existing results - confirm which is intended.

    Returns:
        The URLs, in input order, dated between ``date_str - days_before`` and
        ``date_str + days_after`` (both ends inclusive).

    Raises:
        ValueError: If ``date_str`` is not a valid ``YYYY-MM-DD`` date.
    """
    centre = datetime.strptime(date_str, '%Y-%m-%d')
    start_date = centre - timedelta(days=days_before)
    end_date = centre + timedelta(days=days_after)
    relevant_urls = []

    for url in url_list:
        url_date_str = extract_date_from_url(url)
        if not url_date_str:
            continue
        url_date_obj = datetime.strptime(url_date_str, '%Y-%m-%d')
        if start_date <= url_date_obj <= end_date:
            relevant_urls.append(url)

    return relevant_urls

# Accumulate the URLs that fall in the window around each common date
all_relevant_urls = []
for date in common_dates:
    all_relevant_urls += get_urls_for_date(date, urls)

# Windows of neighbouring dates overlap, so collapse repeated URLs
all_relevant_urls = list(set(all_relevant_urls))

# Optional export of the selected URLs.
# NOTE(review): as written this would store only a 'URL' column, while the
# next cell reads 'Year', 'Month', 'Day' and 'Hour' from this file - confirm
# how 'relevant_data_with_urls.csv' is meant to be produced.
#output_csv_filename = 'relevant_data_with_urls.csv'
#output_df = pd.DataFrame({'URL': all_relevant_urls})
#output_df.to_csv(output_csv_filename, index=False)
#print(f"Relevant URLs saved to {output_csv_filename}")

# Extract 00, 06, 12, and 18 hours from the dataset

In [None]:
import numpy as np
import pandas as pd
from datetime import timedelta

# Read the table of relevant files.
# NOTE(review): the columns 'Year', 'Month', 'Day' and 'Hour' are read below,
# so this file must contain them in addition to the URLs.
csv_filename = r'relevant_data_with_urls.csv'
df = pd.read_csv(csv_filename)

# Build a 'Date' string (YYYY-MM-DD) from the 'Year', 'Month', 'Day' columns
df['Date'] = df.apply(
    lambda rec: f"{int(rec['Year']):04d}-{int(rec['Month']):02d}-{int(rec['Day']):02d}",
    axis=1,
)

# 'Hour' arrives as hh:mm; append the seconds (e.g. '06:00' -> '06:00:00')
# and convert to timedelta so that hours can be compared numerically
df['Hour'] = pd.to_timedelta(df['Hour'].apply(lambda hhmm: hhmm + ':00'))

# The four target hours, expressed as timedeltas
target_hours = list(map(pd.to_timedelta, ['00:00:00', '06:00:00', '12:00:00', '18:00:00']))

# Function to find the closest hour
def find_closest_hour(hours, target_hour):
    """Pick the entry of ``hours`` nearest to ``target_hour``.

    Args:
        hours: Non-empty list of time offsets that support subtraction and
            ``abs`` (timedelta-like values).
        target_hour: The offset to get close to.

    Returns:
        The element of ``hours`` with the smallest absolute difference from
        ``target_hour``; on a tie the earliest element in the list wins.

    Raises:
        ValueError: If ``hours`` is empty.
    """
    return min(hours, key=lambda candidate: abs(candidate - target_hour))

# Keep one row per ('Date', 'Hour') pair: after sorting by 'Hour', the first
# row of each pair is retained
df_no_duplicates = df.sort_values(by='Hour').drop_duplicates(subset=['Date', 'Hour'], keep='first')

# For every date, pick the available observation nearest to each target hour.
# Grouping once per date avoids re-filtering the whole table for every target.
filtered_rows = []
for date, daily_data in df_no_duplicates.groupby('Date'):
    available_hours = daily_data['Hour'].tolist()
    for target_hour in target_hours:
        closest_hour = find_closest_hour(available_hours, target_hour)
        filtered_rows.append(daily_data[daily_data['Hour'] == closest_hour])

if filtered_rows:
    # Concatenate filtered rows into a new DataFrame
    filtered_df = pd.concat(filtered_rows)
    # One observation can be the nearest match for several target hours (for
    # example on a day with a single file); list each row only once.
    filtered_df = filtered_df.drop_duplicates(subset=['Date', 'Hour'], keep='first')
else:
    # pd.concat raises on an empty list; fall back to an empty table that
    # still carries the expected columns.
    filtered_df = df_no_duplicates.iloc[0:0]

# Sort the filtered DataFrame
filtered_df = filtered_df.sort_values(by=['Date', 'Hour'])

# Writing the filtered DataFrame to a new CSV file
output_csv_filename = 'relevant_data_with_urls_filtered.csv'
filtered_df.to_csv(output_csv_filename, index=False)
print(f"Filtered data with relevant URLs saved to {output_csv_filename}")