In [30]:
import argparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import pandas as pd
import numpy as np
import os
from concurrent.futures import ThreadPoolExecutor
import xarray as xr

In [22]:
def list_nc_files_to_csv_and_npy(url, csv_file, npy_file, start_year=1996, end_year=2022, download_path='nc_files'):
    """Index the ``.nc`` links on a listing page, download them, and save the index.

    The page at ``url`` is fetched and every ``<a href>`` ending in ``.nc`` is
    considered. A link is kept when the first run of four digits found in its
    href, read as a year, lies in ``[start_year, end_year]`` (inclusive). Each
    kept file is downloaded with ``download_file`` as soon as it is matched.
    The index of kept files is then written as CSV and as a ``.npy`` file.

    Parameters
    ----------
    url : str
        Address of the HTML page that lists the files; relative hrefs are
        resolved against it.
    csv_file : str
        Path of the CSV index (columns ``File Name``, ``File URL``, ``Year``).
    npy_file : str
        Path of the NumPy index. It stores a list of dicts, i.e. an object
        array, so reading it back requires ``np.load(..., allow_pickle=True)``.
    start_year, end_year : int
        Inclusive year range to keep.
    download_path : str
        Directory that receives the downloaded files; created if missing.

    Raises
    ------
    requests.HTTPError
        If the listing page (or, via ``download_file``, a file) returns an
        error status.
    """
    # A timeout keeps a stalled server from hanging the notebook indefinitely.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Collect every link target that ends with .nc
    soup = BeautifulSoup(response.content, 'html.parser')
    nc_files = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.nc')]

    # download_file writes straight into this directory, so make sure it exists.
    # An empty path means "current directory" and needs no creation.
    if download_path:
        os.makedirs(download_path, exist_ok=True)

    file_list = []
    for nc_file in nc_files:
        # The first four-digit run in the href is taken to be the year
        year_match = re.search(r'(\d{4})', nc_file)
        if not year_match:
            continue
        year = int(year_match.group(1))
        if not start_year <= year <= end_year:
            continue
        file_url = urljoin(url, nc_file)
        file_list.append({'File Name': nc_file, 'File URL': file_url, 'Year': year})
        download_file(file_url, download_path)

    # Explicit columns keep the CSV header in place even when nothing matched
    df = pd.DataFrame(file_list, columns=['File Name', 'File URL', 'Year'])
    df.to_csv(csv_file, index=False)
    print(f"List of .nc files saved to {csv_file}")

    # Save list as .npy file
    np.save(npy_file, file_list)
    print(f"List of .nc files saved to {npy_file}")

In [23]:
def download_file(url, download_path):
    """Stream the resource at ``url`` into the directory ``download_path``.

    The local file name is the last path component of ``url``. The body is
    first written to a ``.part`` file next to the target and renamed only after
    the transfer has finished, so an interrupted download never leaves a
    truncated file under the final name.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    download_path : str
        Directory to store the file in; created if it does not exist.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Create the target directory on demand (an empty path means the CWD)
    if download_path:
        os.makedirs(download_path, exist_ok=True)

    filename = os.path.join(download_path, os.path.basename(url))
    partial = filename + '.part'
    try:
        # The timeout bounds connection set-up and each wait for more data,
        # so a stalled server cannot block the notebook forever.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic on the same filesystem: the final name only ever points to a complete file
        os.replace(partial, filename)
    finally:
        # Only present here if the transfer failed before the rename
        if os.path.exists(partial):
            os.remove(partial)
    print(f"Downloaded: {filename}")

In [24]:
# Listing page that is scanned for .nc links
url = "https://portal.nersc.gov/archive/home/a/arhoades/Shared/www/TE_ERA5_ARs/"

# Output paths for the file index (CSV and NumPy formats)
csv_file, npy_file = "labeled_ARs_TE.csv", "labeled_ARs_TE.npy"

# Inclusive range of years to keep
start_year, end_year = 1996, 2022

# Local directory that receives the downloaded files
download_path = "labeled_data_arhoades"

In [25]:
# Ensure the download directory exists before anything is written into it.
# exist_ok=True makes re-running this cell harmless.
os.makedirs(download_path, exist_ok=True)

In [28]:
# One-time step, left disabled: list the .nc files, download them, and save the index to CSV and .npy.
# Uncomment the call below to (re)build the local copy in `download_path`.
# list_nc_files_to_csv_and_npy(url, csv_file, npy_file, start_year, end_year, download_path)

In [36]:
import matplotlib.pyplot as plt

In [39]:
# NOTE(review): in the visible notebook `ds` is first assigned by the file loop
# further down, so on a fresh kernel run top-to-bottom this cell raises NameError.
ds

In [68]:
# NOTE(review): in the visible notebook `ds_selected` is first assigned by the file
# loop further down, so on a fresh kernel run top-to-bottom this cell raises NameError.
ds_selected

In [122]:
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import numpy as np
from tqdm import tqdm
from datetime import datetime, timedelta

In [110]:
# NetCDF files to scan: keep only .nc entries (this skips stray non-data files
# in the directory) whose name does not contain 'nff'. Sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make
# the order of the collected dates differ between runs.
files = sorted(
    f for f in os.listdir('labeled_data_arhoades')
    if f.endswith('.nc') and 'nff' not in f
)

In [113]:
# Region of interest, passed to .sel() as (start, stop) label slices.
# NOTE(review): slicing latitude from 90 down to 0 presumably relies on the
# files storing latitude in descending order, and the longitude bounds on a
# 0-360 convention -- confirm against the data.
lat_range = (90, 0)
lon_range = (180, 260)

# A timestep qualifies when more than this fraction of the selected grid
# cells has AR_binary_tag == 1.
min_ar_fraction = 0.02

dates = []
for f in tqdm(files):
    # Close each file as soon as it has been scanned so that open handles do
    # not pile up across the several hundred files.
    with xr.open_dataset(os.path.join('labeled_data_arhoades', f)) as ds:
        ds_selected = ds.sel(latitude=slice(*lat_range), longitude=slice(*lon_range))

        # 2-D coordinate grids; not used by the date scan itself (only by the
        # optional plotting snippet). They do not depend on the timestep, so
        # they are built once per file.
        lon, lat = np.meshgrid(ds_selected.longitude, ds_selected.latitude)

        for i in range(len(ds_selected.time)):
            data = ds_selected['AR_binary_tag'][i, :, :]
            values = np.asarray(data)
            if values.size == 0:
                # Fail with a readable message instead of a ZeroDivisionError below
                raise ValueError(
                    f"Empty latitude/longitude selection in {f}; check lat_range and lon_range"
                )

            # Share of grid cells tagged 1 (cells holding any other value, NaN included, count as untagged)
            ar_fraction = np.count_nonzero(values == 1) / values.size
            if ar_fraction > min_ar_fraction:
                # One entry per qualifying timestep, stored as 'YYYY-MM-DD';
                # the same day can therefore appear more than once.
                dates.append(pd.to_datetime(data.time.values).strftime('%Y-%m-%d'))
            

100%|██████████| 324/324 [02:06<00:00,  2.57it/s]


In [114]:
def unique_list(input_list):
    """Return the distinct elements of ``input_list`` in first-seen order.

    Elements must be hashable. When an element occurs several times, only its
    first occurrence is kept.
    """
    # dict keys are unique and remember insertion order, so building a dict
    # from the elements drops later duplicates while preserving the order.
    return list(dict.fromkeys(input_list))

In [116]:
# Number of distinct date strings collected (a day is listed once, however many of its timesteps qualified)
len(unique_list(dates))

6275

In [119]:
# One row per distinct date string, in the order the dates were first collected
dates_df = pd.DataFrame({'Dates':unique_list(dates)})

In [124]:
# Parse the date strings into datetimes so they can be compared against datetime objects.
# Overwrites the column in place; re-running the cell leaves it unchanged.
dates_df['Dates'] = pd.to_datetime(dates_df['Dates'])

In [127]:
# Export only the dates strictly after 2018-08-28.
# NOTE(review): the reason for this cutoff is not stated in the notebook -- document it.
dates_df.loc[
    dates_df['Dates'].gt(datetime(2018, 8, 28))
].to_csv('AR_dates_g17.csv', index=False)

In [128]:
# Export the full list of dates (single 'Dates' column, no index column)
dates_df.to_csv('AR_dates.csv', index=False)

In [None]:
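            # NOTE: disabled plotting snippet. It reads `lon`, `lat`, `data` and `i`, so it
            # presumably belongs inside the per-timestep loop of the date scan.
            # # Set up the map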
            # fig = plt.figure(figsize=(10, 5))
            # ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
            # ax.add_feature(cfeature.COASTLINE)
            # ax.add_feature(cfeature.BORDERS, linestyle=':')
            
            # # Add data to the map
            # plt.contourf(lon, lat, data, transform=ccrs.PlateCarree(), cmap='viridis')
            
            # # Add a colorbar
            # plt.colorbar(ax=ax, orientation='vertical')
            
            # # Title and labels
            # ax.set_xlabel('Longitude')
            # ax.set_ylabel('Latitude')
            # ax.set_title(i)
            
            # ax.set_xlim([-180,-100])
            
            # # Show the plot
            # plt.show()