# Data Labeling and Unification

Here we combine the CACTUS CME event list with the Aditya-L1 SWIS Level-2 data to create a single HDF5 file that contains every Level-2 variable from all three file types (BLK, TH1 and TH2), grouped by date.

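We also label every timestamp with an integer code derived from the angular width of any CACTUS event whose onset lies within ±30 minutes of it: 0 = no event (or width missing/zero), 1 = width < 20°, 2 = 20° ≤ width < 60°, 3 = 60° ≤ width < 120°, 4 = width ≥ 120°.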

In [None]:
#!/usr/bin/env python3
"""
Store ALL SWIS Level-2 CDF variables (including multi-D)
in a single HDF5 file, grouped by date, with event labels
and a global summary. Handles scalar vs array datasets correctly.
"""

import os
import glob
import re
from datetime import timedelta

import numpy as np
import pandas as pd
import h5py
from cdflib import CDF
try:
    from cdflib.epochs import CDFepoch
except ImportError:
    from cdflib.cdfepoch import CDFepoch

from tqdm import tqdm
import tables  # for clearing stale HDF5 handles

# ─── CONFIG ────────────────────────────────────────────────────────────────────
# Root of the SWIS Level-2 tree; main() looks for <BASE_DIR>/<year>/<MM>/UNP_9999 folders.
BASE_DIR     = "/Volumes/Samsung_PSSD_T7_Shield/Aditya L1 Swiss/pradan1.issdc.gov.in/al1/protected/downloadData/aspex/swis/level2"
# CSV of CACTUS events read by load_cactus().
CACTUS_CSV   = "/Volumes/Samsung_PSSD_T7_Shield/Aditya L1 Swiss/Cactus/cactus_all_cmes.csv"
# HDF5 file written by main(); an existing file at this path is deleted first.
OUTPUT_FILE  = "/Volumes/Samsung_PSSD_T7_Shield/Aditya L1 Swiss/sw_level2_full.h5"
# CSV column holding the event onset, parsed as "%Y/%m/%d %H:%M" in UTC.
ONSET_COL    = "onset_time_utc"
# CSV column holding the angular width that is binned into label codes 0-4.
WIDTH_COL    = "angular_width_deg"
# A timestamp is labelled when it lies within +/- this window of an event onset.
MATCH_WINDOW = timedelta(minutes=30)

# ─── HELPERS ────────────────────────────────────────────────────────────────────
def load_cactus(path):
    """Read the CACTUS event CSV and return onset times with label codes.

    The ``ONSET_COL`` column is parsed as ``%Y/%m/%d %H:%M`` in UTC and the
    ``WIDTH_COL`` column is binned into an integer code:

        0  width missing or exactly 0
        1  width < 20
        2  width < 60
        3  width < 120
        4  anything else

    Returns a DataFrame with the two columns ``event_time`` and
    ``label_code``.
    """
    def width_to_code(width):
        # Missing / zero widths carry no usable size information.
        if pd.isna(width) or width == 0:
            return 0
        # The first upper bound the width falls under decides the code.
        for code, upper in enumerate((20, 60, 120), start=1):
            if width < upper:
                return code
        return 4

    events = pd.read_csv(path)
    onset = pd.to_datetime(events[ONSET_COL], format="%Y/%m/%d %H:%M", utc=True)
    events['event_time'] = onset
    events['label_code'] = events[WIDTH_COL].apply(width_to_code)
    print(f"✔️  Loaded {len(events)} CACTUS events")
    return events[['event_time', 'label_code']]

def read_cdf_vars(path):
    """Load every zVariable of a CDF file and locate its time variable.

    Returns a tuple ``(values, time_name)`` where ``values`` maps each
    zVariable name to the result of ``varget`` and ``time_name`` is the
    first variable name containing "epoch" (case-insensitive).

    Raises KeyError when no variable name contains "epoch".
    """
    reader = CDF(path)
    values = {}
    for var_name in reader.cdf_info().zVariables:
        values[var_name] = reader.varget(var_name)

    # The first name mentioning "epoch" is taken as the time axis.
    time_name = next((n for n in values if 'epoch' in n.lower()), None)
    if time_name is None:
        raise KeyError(f"No 'epoch' variable found in {path}")
    return values, time_name

# ─── MAIN ─────────────────────────────────────────────────────────────────────
# Matches the 8-digit YYYYMMDD token that is delimited by underscores in a file name.
_DATE_RE = re.compile(r"_(\d{8})_")


def _find_unp_dirs(base_dir, years=("2024", "2025")):
    """Return the existing ``<base_dir>/<year>/<MM>/UNP_9999`` folders in year/month order."""
    candidates = (
        os.path.join(base_dir, yr, f"{month:02d}", "UNP_9999")
        for yr in years
        for month in range(1, 13)
    )
    return [d for d in candidates if os.path.isdir(d)]


def _dates_in(cdf_paths):
    """Return the sorted, de-duplicated YYYYMMDD strings found in the given file names."""
    found = set()
    for path in cdf_paths:
        match = _DATE_RE.search(os.path.basename(path))
        if match:
            found.add(match.group(1))
    return sorted(found)


def _pick_triplet(cdf_paths, date_str):
    """Return ``{inst: path}`` for BLK/TH1/TH2 on *date_str* (V02 preferred over V01).

    Returns None when any of the three instruments has no V02 or V01 file.
    """
    triplet = {}
    for inst in ("BLK", "TH1", "TH2"):
        tag = f"_L2_{inst}_{date_str}_"
        matches = [p for p in cdf_paths if tag in p]
        chosen = next((p for p in matches if "_V02.cdf" in p), None)
        if chosen is None:
            chosen = next((p for p in matches if "_V01.cdf" in p), None)
        if chosen is None:
            return None
        triplet[inst] = chosen
    return triplet


def _label_timestamps(times, events):
    """Return an int32 label per timestamp; 0 where no event lies within MATCH_WINDOW.

    *times* is a tz-aware DatetimeIndex, *events* the frame returned by
    load_cactus(). When the windows of several events overlap, the event
    that comes later in *events* wins (same as assigning them in order).
    """
    labels = np.zeros(len(times), dtype='i4')
    if len(times) == 0:
        return labels

    # Only events within MATCH_WINDOW of this file's time span can match any
    # timestamp, so skip the rest instead of building a mask for each one.
    earliest = times.min() - MATCH_WINDOW
    latest = times.max() + MATCH_WINDOW
    onset = events['event_time']
    nearby = events[(onset >= earliest) & (onset <= latest)]

    for evt, code in zip(nearby['event_time'], nearby['label_code']):
        mask = ((times >= evt - MATCH_WINDOW) &
                (times <= evt + MATCH_WINDOW))
        labels[mask] = code
    return labels


def _write_instrument(parent, inst, varset):
    """Write every variable of one instrument into the sub-group ``parent/<inst>``."""
    inst_grp = parent.require_group(inst)
    for name, arr in varset.items():
        if np.ndim(arr) == 0:
            # Scalar datasets are written without chunk/filter options.
            inst_grp.create_dataset(name, data=arr)
        else:
            inst_grp.create_dataset(name,
                                    data=arr,
                                    compression='gzip',
                                    compression_opts=4)


def main():
    """Build OUTPUT_FILE from all SWIS Level-2 CDF triplets and the CACTUS event list.

    For every date that has a BLK, TH1 and TH2 file, a group ``/<YYYYMMDD>``
    is written containing:

      * ``utc_time_ms``  - BLK timestamps as integer milliseconds since 1970-01-01 UTC
      * ``label_code``   - per-timestamp CACTUS label (see load_cactus)
      * ``BLK``/``TH1``/``TH2`` - sub-groups holding every variable of that file

    Run-wide counters are stored as attributes of the ``/stats`` group and
    printed at the end.

    Raises:
        ValueError: if the three files of a date do not use the same time
            variable name.
    """
    cactus = load_cactus(CACTUS_CSV)
    stats = dict(cdf_files=0, triplets=0, timestamps=0, labeled=0)

    unp_dirs = _find_unp_dirs(BASE_DIR)
    print(f"📂 Found {len(unp_dirs)} UNP_9999 folders\n")

    # clear stale HDF5 handles and remove old file
    # NOTE(review): `_open_files` is a private PyTables registry; confirm it
    # still exists when upgrading PyTables.
    tables.file._open_files.close_all()
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

    with h5py.File(OUTPUT_FILE, 'w') as hf:
        stat_grp = hf.create_group('stats')

        for unp in tqdm(unp_dirs, desc="Folders"):
            cdfs = glob.glob(os.path.join(unp, "*.cdf"))
            stats['cdf_files'] += len(cdfs)

            for dt in _dates_in(cdfs):
                trip = _pick_triplet(cdfs, dt)
                if trip is None:
                    # incomplete date: at least one instrument file is missing
                    continue
                stats['triplets'] += 1

                # read variables and detect time axis
                blk_vars, blk_tv = read_cdf_vars(trip['BLK'])
                th1_vars, th1_tv = read_cdf_vars(trip['TH1'])
                th2_vars, th2_tv = read_cdf_vars(trip['TH2'])
                # Explicit check instead of `assert`, which is removed under -O.
                if not (blk_tv == th1_tv == th2_tv):
                    raise ValueError(
                        f"Time variable mismatch for {dt}: "
                        f"BLK={blk_tv!r}, TH1={th1_tv!r}, TH2={th2_tv!r}"
                    )

                # convert epochs → UTC datetimes (BLK's time axis is the reference)
                naive = pd.to_datetime(CDFepoch.to_datetime(blk_vars[blk_tv]))
                times = naive.tz_localize('UTC')
                stats['timestamps'] += len(times)

                labels = _label_timestamps(times, cactus)
                stats['labeled'] += int((labels != 0).sum())

                # write date group
                grp = hf.require_group(dt)
                # Cast through datetime64[ms] so the stored value really is
                # milliseconds whatever resolution the index carries
                # (pandas >= 2.0 can keep a non-nanosecond unit).
                ms = naive.values.astype('datetime64[ms]').astype('i8')
                grp.create_dataset('utc_time_ms', data=ms, dtype='i8')
                grp.create_dataset('label_code', data=labels, dtype='i4')

                # write each instrument's variables (multi-D OK)
                for inst, varset in (('BLK', blk_vars),
                                     ('TH1', th1_vars),
                                     ('TH2', th2_vars)):
                    _write_instrument(grp, inst, varset)

        # attach global stats as attributes
        for k, v in stats.items():
            stat_grp.attrs[k] = v

    # final summary print
    print("\n📊 **Summary**")
    print(f"  • .cdf files scanned:    {stats['cdf_files']}")
    print(f"  • Triplets processed:    {stats['triplets']}")
    print(f"  • Timestamps stored:     {stats['timestamps']}")
    print(f"  • Timestamps labeled:    {stats['labeled']}")

# Run the full conversion only when executed as a script / notebook cell,
# not when this code is imported.
if __name__ == "__main__":
    main()


In [1]:
#!/usr/bin/env python3
import h5py

def print_datasets(grp, parent_path=""):
    """
    Walk grp depth-first and print one line per dataset found:
      <full_path>   shape=<shape>   dtype=<dtype>

    parent_path is the path prefix accumulated so far; leave it empty for
    the top-level call so paths are printed relative to grp.
    """
    prefix = f"{parent_path}/" if parent_path else ""
    for child_name, child in grp.items():
        full_path = f"{prefix}{child_name}"
        if not isinstance(child, h5py.Dataset):
            # Anything that is not a dataset is treated as a group and descended into.
            print_datasets(child, full_path)
            continue
        print(f"{full_path}   shape={child.shape}   dtype={child.dtype}")

# Inspect one day's group of the unified file and list every dataset in it.
if __name__ == "__main__":
    date = "20240801"
    fp = "sw_level2_full.h5"   # adjust if needed
    with h5py.File(fp, "r") as hf:
        if date in hf:
            print(f"Contents of /{date} (dataset paths, shapes, dtypes):\n")
            print_datasets(hf[date])
        else:
            # Fail loudly when the requested day was never written.
            raise KeyError(f"Group '{date}' not found")


Contents of /20240801 (dataset paths, shapes, dtypes):

BLK/alpha_bulk_speed   shape=(17275,)   dtype=float64
BLK/alpha_density   shape=(17275,)   dtype=float64
BLK/alpha_thermal   shape=(17275,)   dtype=float64
BLK/bulk_a_uncer   shape=(17275,)   dtype=float64
BLK/bulk_p_uncer   shape=(17275,)   dtype=float64
BLK/epoch_for_cdf_mod   shape=(17275,)   dtype=float64
BLK/numden_a_uncer   shape=(17275,)   dtype=float64
BLK/numden_p_uncer   shape=(17275,)   dtype=float64
BLK/proton_bulk_speed   shape=(17275,)   dtype=float64
BLK/proton_density   shape=(17275,)   dtype=float64
BLK/proton_thermal   shape=(17275,)   dtype=float64
BLK/proton_xvelocity   shape=(17275,)   dtype=float64
BLK/proton_yvelocity   shape=(17275,)   dtype=float64
BLK/proton_zvelocity   shape=(17275,)   dtype=float64
BLK/spacecraft_xpos   shape=(17275,)   dtype=float64
BLK/spacecraft_ypos   shape=(17275,)   dtype=float64
BLK/spacecraft_zpos   shape=(17275,)   dtype=float64
BLK/thermal_a_uncer   shape=(17275,)   dtype=floa