# HUXt Processing

<div style="background-color: #ffaaaa; padding: 15px; border-radius: 5px;">
<b>Important Note:</b> You must first create the HUXt dataframes by running "ambient_huxt.ipynb".
</div>

In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Project folders, all resolved under <current working directory>/src
src_root = os.path.join(os.getcwd(), 'src')
huxt_utils_dir = os.path.join(src_root, 'huxt')
ml_utils_dir = os.path.join(src_root, 'ml')
data_dir = os.path.join(src_root, 'data')

# Put the local helper-module folders on the import path
# (data_dir is a data location only and is not added)
import sys
sys.path.extend([huxt_utils_dir, ml_utils_dir])

from data_loader import load_huxt_data_as_windows, load_omni_data
import huxt_utils as HU
import fastparquet

In [6]:
%%time 

# Range of CRs (rotation numbers) to convert
start_cr = 1892   # Min 1892
end_cr = 2290     # Max 2290

additional_cols = ['hp30', 'velocity gradient']
folder_name = 'HUXt1'
print(f'Processing {folder_name}')

# Options shared by every per-rotation conversion call
conversion_options = dict(
    extra_columns=additional_cols,
    folder_name=folder_name,
    save=True,
    overwrite=True,
)

# NOTE(review): range() stops before end_cr, so rotation end_cr itself is not
# converted here, whereas the later combine loop iterates up to end_cr
# inclusive -- confirm which bound is intended.
for cr in range(start_cr, end_cr):
    df = HU.huxt_output_to_ml_df(rotation_number=cr, **conversion_options)

Processing HUXt1
CPU times: user 13.4 s, sys: 2.04 s, total: 15.5 s
Wall time: 18.1 s


In [9]:
# Number of per-rotation files buffered before each write to the output file
chunk_size = 20

# The per-rotation input files and the combined output live in the same folder
huxt_data_dir = os.path.join(data_dir, 'HUXt', f'{folder_name}_modified')
output_file = os.path.join(huxt_data_dir, 'full_df.parquet')

# Notebook-level state used by process_chunk()
dfs = list()       # buffer of frames waiting to be written
last_index = None  # last index label already written; used to drop overlap
Nens = 100         # number of ensemble members

# Loaded once; process_chunk() reindexes it onto each chunk's index
OMNI = load_omni_data(data_dir)

def process_chunk():
    """Combine the frames buffered in ``dfs`` and append them to ``output_file``.

    This function takes no arguments and works on notebook-level state:
    it reads ``dfs``, ``last_index``, ``OMNI``, ``Nens``, ``output_file`` and
    the loop variable ``cr`` (used only in the progress messages), empties
    ``dfs`` in place and advances ``last_index``.

    Steps:
      1. Concatenate the buffered frames; where an index label occurs more
         than once, keep the last occurrence.
      2. Drop rows whose index is not strictly greater than ``last_index``
         (overlap with the previously written chunk).
      3. Forward-fill ``OMNI`` onto the chunk's index and add one
         ``v_minus_omni_<i>`` column per ensemble member.
      4. Reorder the columns as (v, gradient, v_minus_omni) triples followed
         by the remaining original columns, then write with fastparquet.

    Returns:
        None. Nothing is written when ``dfs`` is empty or when no rows remain
        after overlap removal.
    """
    global last_index  # rebound below; dfs is only mutated in place

    if not dfs:  # Nothing buffered, nothing to save
        return

    # Concatenate, then drop duplicated index labels keeping the last occurrence
    chunk_df = pd.concat(dfs, ignore_index=False)
    chunk_df = chunk_df[~chunk_df.index.duplicated(keep='last')]

    # Remove overlap with the previously written chunk
    if last_index is not None:
        chunk_df = chunk_df.loc[chunk_df.index > last_index]

    # Everything in this batch may already have been written; without this
    # guard the index[-1] lookup below raises IndexError on an empty frame.
    if len(chunk_df.index) == 0:
        dfs.clear()
        print(f"No new rows for batch at CR {cr}; nothing written")
        return

    # Remember where this chunk ends for the next batch
    # NOTE(review): assumes the index is sorted ascending, so the final label
    # is the latest one -- confirm.
    last_index = chunk_df.index[-1]

    # Align OMNI to the chunk's index, carrying the last known value forward
    OMNI_half_hourly = OMNI.reindex(chunk_df.index, method='ffill')

    # Column groups are taken by position.
    # NOTE(review): assumes the first Nens columns are the ensemble velocities
    # and the next Nens their gradients -- confirm against the frames produced
    # by HU.huxt_output_to_ml_df.
    v_columns = chunk_df.columns[:Nens]
    v_grad_columns = chunk_df.columns[Nens:2*Nens]
    remainder = chunk_df.columns[2*Nens:]

    chunk_df = pd.concat((chunk_df, OMNI_half_hourly), axis=1)

    # Difference between each ensemble member and the 'Velocity' column
    # (presumably supplied by OMNI -- verify); [:, None] broadcasts it
    # across the Nens member columns.
    v_minus_omni = pd.DataFrame(
        chunk_df[v_columns].values - chunk_df['Velocity'].values[:, None],
        columns=[f'v_minus_omni_{i}' for i in range(Nens)],
        index=chunk_df.index
    )

    chunk_df = pd.concat((chunk_df, v_minus_omni), axis=1)

    # Interleave as (v_i, gradient_i, v_minus_omni_i) triples, then the rest.
    # Only these columns are selected, so the OMNI columns joined above are
    # not written to the output.
    interleaved_columns = [
        col
        for triple in zip(v_columns, v_grad_columns, v_minus_omni.columns)
        for col in triple
    ]
    interleaved_columns.extend(remainder)

    chunk_df_rearranged = chunk_df[interleaved_columns]

    # Create the file on the first write, append on later ones
    fastparquet.write(output_file, chunk_df_rearranged, compression="snappy", append=os.path.exists(output_file))

    # Release the buffered frames
    dfs.clear()
    print(f"Saved batch at CR {cr}")

# Build the combined file only if it does not exist yet, because
# process_chunk() appends to whatever is already at output_file.
if not os.path.exists(output_file):
    # Main processing loop: buffer up to chunk_size rotations, then flush them
    for i in range(start_cr, end_cr + 1, chunk_size):
        for cr in range(i, min(i + chunk_size, end_cr + 1)):
            file_path = os.path.join(huxt_data_dir, f'HUXt_rotation_{cr}')
            try:
                df = pd.read_parquet(file_path)
            except FileNotFoundError:
                # Missing rotations are skipped, not fatal
                print(f'file for CR {cr} not created')
            except Exception as err:
                # Still best-effort, but report the real cause instead of
                # claiming the file is missing. KeyboardInterrupt is no
                # longer swallowed, so the run can be interrupted.
                print(f'file for CR {cr} could not be read: {err!r}')
            else:
                dfs.append(df)

        # process_chunk() reads the loop variable cr for its progress message
        process_chunk()

    print('Done')

else:
    print(f'File exists at {output_file}\nDelete old file first')


File exists at /Users/matthewbillcliff/storm_forecasting_MB/src/data/HUXt/HUXt1_modified/full_df.parquet
Delete old file first


In [10]:
%%time

# Sanity-check the combined file

print('File location:', output_file)
df = pd.read_parquet(output_file, engine="fastparquet")

# Every index label should occur exactly once. Index.is_unique answers this
# without building a Python set of every label.
print('Unique indices:', df.index.is_unique)

# Columns should read: v_0, v_0_gradient, v_minus_omni_0, v_1, ... , v_99, v_99_gradient, v_minus_omni_99, hp30
print('Data points:', len(df))
df

File location: /Users/matthewbillcliff/storm_forecasting_MB/src/data/HUXt/HUXt1_modified/full_df.parquet
Unique indices: True
Data points: 521249
CPU times: user 823 ms, sys: 283 ms, total: 1.11 s
Wall time: 1.25 s


Unnamed: 0_level_0,v_0,v_0_gradient,v_minus_omni_0,v_1,v_1_gradient,v_minus_omni_1,v_2,v_2_gradient,v_minus_omni_2,v_3,...,v_97,v_97_gradient,v_minus_omni_97,v_98,v_98_gradient,v_minus_omni_98,v_99,v_99_gradient,v_minus_omni_99,hp30
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1995-01-23 12:00:00,328.940684,-0.268183,-98.059316,410.082112,4.797968,-16.917888,334.594021,-0.152879,-92.405979,553.186576,...,325.692944,-0.250994,-101.307056,322.459970,-0.204344,-104.540030,467.250065,-0.522598,40.250065,1.333
1995-01-23 12:30:00,328.410484,-0.258920,-98.589516,419.800254,4.952422,-7.199746,334.297600,-0.138876,-92.702400,549.546368,...,325.194779,-0.245221,-101.805221,322.053542,-0.200905,-104.946458,466.102915,-0.667925,39.102915,1.333
1995-01-23 13:00:00,328.154662,-0.252712,-109.845338,424.784923,4.985332,-13.215077,334.163390,-0.129546,-103.836610,547.742332,...,324.951508,-0.241283,-113.048492,321.853817,-0.198498,-116.146183,465.391618,-0.748103,27.391618,0.333
1995-01-23 13:30:00,327.661699,-0.240233,-110.338301,434.692553,4.856977,-3.307447,333.922940,-0.110919,-104.077060,544.166332,...,324.476971,-0.233179,-113.523029,321.461824,-0.193401,-116.538176,463.759490,-0.873521,25.759490,0.667
1995-01-23 14:00:00,327.424594,-0.233971,-101.575406,439.484872,4.697148,10.484872,333.816670,-0.101629,-95.183330,542.394336,...,324.245854,-0.229017,-104.754146,321.269745,-0.190713,-107.730255,462.859666,-0.921970,33.859666,0.333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-16 18:00:00,318.074723,-0.132611,-129.925277,329.729774,-0.303402,-118.270226,318.188499,-0.124339,-129.811501,309.395338,...,330.321487,-0.354043,-117.678513,336.137693,-0.398741,-111.862307,322.167533,-0.238005,-125.832467,2.333
2024-10-16 18:30:00,317.943213,-0.130399,-130.056787,329.429753,-0.296617,-118.570247,318.065215,-0.122219,-129.934785,309.369035,...,329.971681,-0.345495,-118.028319,335.743931,-0.388668,-112.256069,321.930863,-0.235324,-126.069137,3.000
2024-10-16 19:00:00,317.813926,-0.128174,-122.186074,329.136541,-0.289791,-110.863459,317.944060,-0.120090,-122.055940,309.342191,...,329.630497,-0.336804,-110.369503,335.360358,-0.378360,-104.639642,321.696886,-0.232627,-118.303114,2.667
2024-10-16 19:30:00,317.686866,-0.125954,-122.313134,328.850171,-0.282942,-111.149829,317.825035,-0.117969,-122.174965,309.314583,...,329.298074,-0.327979,-110.701926,334.987211,-0.367812,-105.012789,321.465609,-0.229926,-118.534391,2.667
