# HUXt Processing

<div style="background-color: #ffaaaa; padding: 15px; border-radius: 5px;">
<b>Important Note:</b> Must create HUXt dataframes from "ambient_huxt.ipynb" first.
</div>

In [8]:
%load_ext autoreload
%autoreload 2

In [9]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

huxt_utils_dir = os.path.join(os.getcwd(), 'src', 'huxt')
ml_utils_dir = os.path.join(os.getcwd(), 'src', 'ml')
data_dir = os.path.join(os.getcwd(), 'src', 'data')

# Add my utils to the path
import sys
sys.path.append(huxt_utils_dir)
sys.path.append(ml_utils_dir)

from data_loader import load_huxt_data_as_windows, load_omni_data
import huxt_utils as HU
import fastparquet

In [14]:
%%time 

# Specify which CRs 
start_cr = 1892   # Min 1892
end_cr = 2290     # Max 2290

additional_cols = ['hp30', 'velocity gradient']
folder_name = f'HUXt1'
print(f'Processing {folder_name}')
for cr in range(start_cr, end_cr):
    df = HU.huxt_output_to_ml_df(rotation_number=cr, extra_columns=additional_cols,
                                 folder_name=folder_name, save=True, overwrite=True)

Processing HUXt1
CPU times: user 15.3 s, sys: 2.28 s, total: 17.6 s
Wall time: 19.9 s


In [18]:
chunk_size = 20  # Process [chunk_size] files at a time
output_file = os.path.join(data_dir, 'HUXt', f'{folder_name}_modified', 'full_df.parquet')
huxt_data_dir = os.path.join(data_dir, 'HUXt', f'{folder_name}_modified')

dfs = []  
last_index = None  # Keep track of last index to remove overlap
Nens = 100         # Specify no. ensembles

OMNI = load_omni_data(data_dir)

def process_chunk():
    """Saves the current chunk of data, removing duplicates and appending to the Parquet file."""
    global dfs, last_index  # Ensure we modify the global list and last index
    
    if not dfs:  # If there's no data, skip saving
        return

    # Concatenate and remove duplicates
    chunk_df = pd.concat(dfs, ignore_index=False)

    # Drop duplicated indices while keeping the last occurrence
    chunk_df = chunk_df[~chunk_df.index.duplicated(keep='last')]

    # Remove overlap with the last processed chunk
    if last_index is not None:
        chunk_df = chunk_df.loc[chunk_df.index > last_index]

    # Update last index for next batch
    last_index = chunk_df.index[-1]
    
    OMNI_half_hourly = OMNI.reindex(chunk_df.index, method='ffill')

    v_columns = chunk_df.columns[:Nens]
    v_grad_columns = chunk_df.columns[Nens:2*Nens]
    remainder = chunk_df.columns[2*Nens:]
    
    chunk_df = pd.concat((chunk_df, OMNI_half_hourly), axis=1)
    
    v_minus_omni = pd.DataFrame(
        chunk_df[v_columns].values - chunk_df['Velocity'].values[:, None],
        columns=[f'v_minus_omni_{i}' for i in range(Nens)],
        index=chunk_df.index
    )
    
    chunk_df = pd.concat((chunk_df, v_minus_omni), axis=1)

    v_minus_omni_columns = v_minus_omni.columns
    
    # Interleave the columns
    interleaved_columns = []
    for a_col, b_col, c_col in zip(v_columns, v_grad_columns, v_minus_omni_columns):
        interleaved_columns.extend([a_col, b_col, c_col])
    interleaved_columns.extend(remainder)
    
    # Reorder the DataFrame using the new column order
    chunk_df_rearranged = chunk_df[interleaved_columns]

    # Append to output file using fastparquet
    fastparquet.write(output_file, chunk_df_rearranged, compression="snappy", append=os.path.exists(output_file))

    # Clear memory
    dfs.clear()
    print(f"Saved batch at CR {cr}")

if not os.path.exists(output_file):
    # Main processing loop
    for i in range(start_cr, end_cr + 1, chunk_size):
        for cr in range(i, min(i + chunk_size, end_cr + 1)):
            file_path = os.path.join(huxt_data_dir, f'HUXt_rotation_{cr}')
            try:
                df = pd.read_parquet(file_path)
                dfs.append(df)
            except:
                print(f'file for CR {cr} not created')
    
        process_chunk()
    
    print('Done')

else:
    print(f'File exists at {output_file}')


Saved batch at CR 1911
Saved batch at CR 1931
Saved batch at CR 1951
Saved batch at CR 1971
Saved batch at CR 1991
Saved batch at CR 2011
Saved batch at CR 2031
Saved batch at CR 2051
Saved batch at CR 2071
Saved batch at CR 2091
Saved batch at CR 2111
Saved batch at CR 2131
Saved batch at CR 2151
Saved batch at CR 2171
Saved batch at CR 2191
Saved batch at CR 2211
Saved batch at CR 2231
Saved batch at CR 2251
Saved batch at CR 2271
file for CR 2290 not created
Saved batch at CR 2290
Done


In [22]:
%%time

# Check that we have processed correctly

print('File location:', output_file)
df = pd.read_parquet(output_file, engine="fastparquet")

# Check if dataset has unique indices
print('Unique indices:', len(df.index) == len(set(df.index)))

# Columns should read: v_0, v_0_gradient, v_minus_omni_0, v_1, ... , v_99, v_99_gradient, v_minus_omni_99, hpo
print('Data points:', len(df))
df

File location: /Users/matthewbillcliff/storm_forecasting_MB/src/data/HUXt/HUXt1_modified/full_df.parquet
Unique indices: True
Data points: 521201
CPU times: user 801 ms, sys: 291 ms, total: 1.09 s
Wall time: 1.33 s


Unnamed: 0_level_0,v_0,v_0_gradient,v_minus_omni_0,v_1,v_1_gradient,v_minus_omni_1,v_2,v_2_gradient,v_minus_omni_2,v_3,...,v_97,v_97_gradient,v_minus_omni_97,v_98,v_98_gradient,v_minus_omni_98,v_99,v_99_gradient,v_minus_omni_99,hp30
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1995-01-24 12:00:00,323.647807,0.176736,-37.352193,396.294698,-0.768079,35.294698,331.401977,0.557320,-29.598023,317.861001,...,406.132005,-1.310546,45.132005,417.081478,-1.565302,56.081478,323.226582,-0.004929,-37.773418,2.000
1995-01-24 12:30:00,323.824543,0.183487,-37.175457,395.526619,-0.796997,34.526619,331.959297,0.568086,-29.040703,317.950429,...,404.821460,-1.311941,43.821460,415.516176,-1.557915,54.516176,323.221653,0.000237,-37.778347,1.000
1995-01-24 13:00:00,324.014782,0.196918,-32.985218,394.700704,-0.850781,37.700704,332.538149,0.588137,-24.461851,318.041737,...,403.508123,-1.313800,46.508123,413.965649,-1.543142,56.965649,323.227055,0.010693,-33.772945,1.000
1995-01-24 13:30:00,324.218379,0.210136,-32.781621,393.825057,-0.896932,36.825057,333.135571,0.605018,-23.864429,318.134809,...,402.193859,-1.313934,45.193859,412.429892,-1.528375,55.429892,323.243039,0.021361,-33.756961,0.667
1995-01-24 14:00:00,324.435055,0.223003,-45.564945,392.906839,-0.936349,22.906839,333.748186,0.618329,-36.251814,318.229529,...,400.880256,-1.312594,30.880256,410.908898,-1.513613,40.908898,323.269778,0.032163,-46.730222,0.333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-16 18:00:00,318.074723,-0.132611,-129.925277,329.729774,-0.303402,-118.270226,318.188499,-0.124339,-129.811501,309.395338,...,330.321487,-0.354043,-117.678513,336.137693,-0.398741,-111.862307,322.167533,-0.238005,-125.832467,2.333
2024-10-16 18:30:00,317.943213,-0.130399,-130.056787,329.429753,-0.296617,-118.570247,318.065215,-0.122219,-129.934785,309.369035,...,329.971681,-0.345495,-118.028319,335.743931,-0.388668,-112.256069,321.930863,-0.235324,-126.069137,3.000
2024-10-16 19:00:00,317.813926,-0.128174,-122.186074,329.136541,-0.289791,-110.863459,317.944060,-0.120090,-122.055940,309.342191,...,329.630497,-0.336804,-110.369503,335.360358,-0.378360,-104.639642,321.696886,-0.232627,-118.303114,2.667
2024-10-16 19:30:00,317.686866,-0.125954,-122.313134,328.850171,-0.282942,-111.149829,317.825035,-0.117969,-122.174965,309.314583,...,329.298074,-0.327979,-110.701926,334.987211,-0.367812,-105.012789,321.465609,-0.229926,-118.534391,2.667
