# Downloading Required OMNI and Hp30 Data

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project's .py files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import os
import sys

# Directory added to the import path so that `data_loader` can be imported.
# It is resolved relative to the current working directory.
ml_utils_dir = os.path.join(os.getcwd(), 'src', 'ml')

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if ml_utils_dir not in sys.path:
    sys.path.append(ml_utils_dir)

# Get functions for processing hp30
from data_loader import process_hp30_data

## Hp30

1. Download data manually from: https://kp.gfz.de/en/hp30-hp60/data

Start date: 01/01/1995    End date: 01/01/2025
Index selection: Hp30     Format Selection: Text File

2. From the resulting page, save the data as a .txt file (Ctrl+S on Windows, Cmd+S on macOS) into the 'src/data' directory

3. Run the following cell to reformat the download into the required dataset

In [None]:
# Read the downloaded Hp30 data; the helper's two return values are used below
# as the frame's index and as its single data column.
times, hp30 = process_hp30_data()

# Wrap the values in a one-column DataFrame keyed by `times`
df = pd.DataFrame(index=times, data=hp30, columns=['hp30'])

# Persist as parquet for efficient loading later on
hp30_parquet_path = os.path.join(os.getcwd(), 'src', 'data', 'hp30df.parquet')
df.to_parquet(hp30_parquet_path)

In [None]:
# Round-trip check: reload the Hp30 parquet file from disk and preview the
# first rows to confirm it was written correctly.
hp30_check_path = os.path.join(os.getcwd(), 'src', 'data', 'hp30df.parquet')
loaded_df = pd.read_parquet(hp30_check_path)
loaded_df.head(n=5)

## OMNI

1. Download data manually from: https://omniweb.gsfc.nasa.gov/form/dx1.html

2. Select "Create File" at the top of the web page

3. Select resolution: "Hourly Averaged"

4. Start date: 19950101   End date: 20250101

5. Select only "Flow speed, km/sec" in the variable selection

6. Open the ASCII Data File

7. From the page that opens, save the file as 'omni.txt' (Ctrl+S on Windows, Cmd+S on macOS) in the 'src/data' directory

8. Run the following cell to reformat the download into the required dataset

In [None]:
# Location of the manually downloaded OMNI ASCII file
file_path = os.path.join(os.getcwd(), 'src', 'data', 'omni.txt')

# The file is read as whitespace-delimited with no header row, so column names
# are supplied here. The separator regex is a raw string: a plain '\s+' is an
# invalid escape sequence that recent Python versions warn about.
omni_df = pd.read_csv(
    file_path,
    sep=r'\s+',
    names=['YEAR', 'DOY', 'HR', 'Velocity'],
    header=None,
)
# NOTE(review): OMNI typically marks missing measurements with fill values
# rather than blanks — confirm these are handled downstream.

# Build timestamps from year + day-of-year (parsed with '%Y%j'), then add the
# hour of day as an offset.
datetime_index = (
    pd.to_datetime(omni_df['YEAR'] * 1000 + omni_df['DOY'], format='%Y%j')
    + pd.to_timedelta(omni_df['HR'], unit='h')
)

# Index by timestamp and drop the now-redundant 'YEAR', 'DOY' and 'HR' columns,
# leaving only 'Velocity'. This is done without in-place mutation so the cell
# can be re-run safely.
df = omni_df.set_index(datetime_index).drop(columns=['YEAR', 'DOY', 'HR'])

# Save as a parquet file; `save_loc` is reused by the verification cell
save_loc = os.path.join(os.getcwd(), 'src', 'data', 'OMNI_solar_wind.parquet')
df.to_parquet(save_loc)

In [None]:
# Round-trip check: reload the OMNI parquet file written to `save_loc` and
# preview the first rows to confirm it was saved correctly.
loaded_df = pd.read_parquet(path=save_loc)
loaded_df.head(n=5)