In [35]:
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np
from datetime import datetime
import os

In [None]:
def prepare_catalog(year, station, catalog_dir="D:/Capstone/data/catalog/csv"):
    """Load one station's earthquake catalog and keep the events of a single year.

    Parameters
    ----------
    year : int
        Calendar year to keep; compared against the year of the parsed
        ``time`` column.
    station : str
        Station code, used as the file-name prefix
        (``{station}_catalog_1960.csv``).
    catalog_dir : str, optional
        Directory that holds the catalog CSV files. Defaults to the project
        location that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The catalog rows whose ``time`` falls in ``year``, in CSV order, with
        ``time`` converted by ``pd.to_datetime``.
    """
    catalog = pd.read_csv(f"{catalog_dir}/{station}_catalog_1960.csv")
    catalog['time'] = pd.to_datetime(catalog['time'])
    catalog_year = catalog[catalog['time'].dt.year == year]
    print(f"catalog: {len(catalog_year)}")
    return catalog_year

def open_raw_data(year, data_dir="D:/Capstone/data/geomagnetic_data"):
    """Open the all-stations geomagnetic NetCDF file for ``year``.

    Parameters
    ----------
    year : int
        Year embedded in the file name (``all_stations_all{year}.netcdf``).
    data_dir : str, optional
        Directory that holds the yearly NetCDF files. Defaults to the project
        location that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    xarray.Dataset
        The dataset as returned by ``xr.open_dataset`` with the ``netcdf4``
        engine. It is returned open; nothing here closes it, so the caller
        owns the handle.
    """
    print("opening raw data...")
    ds = xr.open_dataset(f"{data_dir}/all_stations_all{year}.netcdf", engine="netcdf4")
    print("raw data access success...")
    return ds
    
def prepare_raw_data(ds, station):
    """Index the raw dataset by station id and timestamp, then keep one station.

    Parameters
    ----------
    ds : xarray.Dataset
        Raw dataset with ``block`` and ``vector`` dimensions, an ``id``
        variable, and the ``time_yr`` / ``time_mo`` / ``time_dy`` /
        ``time_hr`` / ``time_mt`` / ``time_sc`` variables read below.
    station : str
        Station id to select.

    Returns
    -------
    xarray.Dataset
        ``ds`` with ``vector`` replaced by a ``station_id`` dimension and
        ``block`` replaced by a ``time`` dimension, restricted to the entries
        whose ``station_id`` equals ``station``. The ``station_id`` dimension
        has size 0 when the station is absent.
    """
    # Station ids are taken from the first block only and applied to the
    # whole 'vector' axis.
    # NOTE(review): assumes the station order is identical in every block — confirm.
    first_block_ids = ds['id'].isel(block=0)
    by_station = ds.assign_coords(station_id=('vector', first_block_ids.data))
    by_station = by_station.swap_dims({'vector': 'station_id'})

    # Assemble one timestamp per block from the separate date/time fields.
    field_for_unit = {
        'year': 'time_yr',
        'month': 'time_mo',
        'day': 'time_dy',
        'hour': 'time_hr',
        'minute': 'time_mt',
        'second': 'time_sc',
    }
    timestamps = pd.to_datetime(
        {unit: by_station[field].values for unit, field in field_for_unit.items()}
    )

    by_time = by_station.assign_coords(time=('block', timestamps))
    by_time = by_time.swap_dims({'block': 'time'})

    # Boolean mask over station_id: yields an empty selection (rather than
    # raising) when the station is not present.
    station_mask = by_time['station_id'] == station
    station_ds = by_time.sel(station_id=station_mask)

    print(f"station ds: {station_ds.coords['station_id'].size}")
    return station_ds


In [None]:
def seg_to_tensor(segment, station):
    """Convert one window of a station's data into a stacked float32 tensor.

    Parameters
    ----------
    segment : mapping of variable name -> object supporting ``.sel(station_id=...)``
        A slice of the prepared dataset; must contain ``dbn_nez``,
        ``dbe_nez`` and ``dbz_nez``.
    station : str
        Station id passed to ``.sel(station_id=...)`` for each variable.

    Returns
    -------
    torch.Tensor
        ``float32`` tensor whose first axis has length 3, ordered
        ``dbn_nez``, ``dbe_nez``, ``dbz_nez``; the remaining axes are
        whatever each selected variable's ``.values`` provides.
    """
    # NOTE(review): presumably the north / east / vertical field
    # perturbations — confirm against the data source.
    component_names = ('dbn_nez', 'dbe_nez', 'dbz_nez')

    channels = []
    for name in component_names:
        station_series = segment[name].sel(station_id=station)
        channels.append(torch.tensor(station_series.values, dtype=torch.float32))

    return torch.stack(channels)

def build_dataset(year, station, ds, catalog, seg_len, output_dir=r"D:\Capstone\data\dataset"):
    """Cut fixed-length windows preceding earthquakes and save them as tensors.

    For every earthquake the time axis is walked backwards in ``seg_len``-hour
    steps, starting at the event time. A window is used only if it starts at
    or after the previous earthquake (or 1 January of ``year`` for the
    earliest event). The window ending exactly at the event is collected as
    "pre-EQ"; every earlier window is collected as "non-EQ".

    Parameters
    ----------
    year : int
        Year being processed; sets the lower bound for the earliest event and
        appears in the output file names.
    station : str
        Station id; forwarded to ``seg_to_tensor`` and used for the output
        sub-directory and file names.
    ds : xarray.Dataset
        Station dataset indexed by a ``time`` dimension.
    catalog : pandas.DataFrame
        Earthquake catalog with a datetime ``time`` column. Row order does not
        matter: events are processed newest first.
    seg_len : int or float
        Window length in hours; must be positive.
    output_dir : str, optional
        Root directory for the saved tensors. Defaults to the original
        project location.

    Raises
    ------
    ValueError
        If ``seg_len`` is not positive (the backward walk would never end).

    Notes
    -----
    Writes ``test_{year}_{station}_{seg_len}hr.pt`` (pre-EQ windows) and
    ``normal_{year}_{station}_{seg_len}hr.pt`` (non-EQ windows) under
    ``{output_dir}/{station}``; a file is skipped when its list is empty.
    """
    if seg_len <= 0:
        raise ValueError(f"seg_len must be a positive number of hours, got {seg_len!r}")

    print("building dataset...")
    window = pd.Timedelta(hours=seg_len)
    year_start = pd.Timestamp(datetime(year, 1, 1, 0, 0, 0))

    # The walk below treats the *next* entry as the previous earthquake, so
    # events must be newest first. Sorting here makes that explicit instead of
    # relying on the CSV order; an already-descending catalog is unchanged.
    event_times = list(catalog['time'].sort_values(ascending=False))

    pre_EQ = []
    non_EQ = []
    for idx, event_time in enumerate(event_times):
        # The dataset's time axis is tz-naive, so strip any timezone.
        eq_time = event_time.tz_localize(None)
        end_time = eq_time
        start_time = (event_time - window).tz_localize(None)

        if idx + 1 < len(event_times):
            prev_EQ_time = event_times[idx + 1].tz_localize(None)
        else:
            prev_EQ_time = year_start

        while start_time >= prev_EQ_time:
            # NOTE(review): label-based slices include both endpoints, so
            # adjacent windows presumably share one boundary sample — confirm.
            segment = ds.sel(time=slice(start_time, end_time))
            seg_tensor = seg_to_tensor(segment=segment, station=station)
            if end_time == eq_time:
                pre_EQ.append(seg_tensor)
            else:
                non_EQ.append(seg_tensor)

            start_time -= window
            end_time -= window

    print(f"Pre EQ:{len(pre_EQ)}")
    print(f"non EQ:{len(non_EQ)}")

    # Build paths with os.path.join rather than backslash-laden f-strings,
    # which contained invalid escape sequences (\C, \d, \{).
    station_directory = os.path.join(output_dir, station)
    os.makedirs(station_directory, exist_ok=True)

    for prefix, tensors in (("test", pre_EQ), ("normal", non_EQ)):
        if tensors:
            # torch.stack needs every window to have the same shape.
            file_name = f"{prefix}_{year}_{station}_{seg_len}hr.pt"
            torch.save(torch.stack(tensors), os.path.join(station_directory, file_name))



In [47]:
# Run configuration shared by the cells that follow.
year = 2022  # calendar year of raw data and catalog entries to process
stations = [
    'ANT', 'BIK', 'BLC', 'FUR', 'KAK', 'KNY', 'LMA',
    'MMB', 'PBQ', 'PEL', 'SHU', 'SIT', 'TND',
]
seg_len = 7 * 24  # window length in hours (168)


In [48]:
# Open the year's raw dataset once; the per-station loop reuses this handle.
raw_data = open_raw_data(year=year)

opening raw data...
raw data access success...


In [49]:
# Build and save the windowed dataset for every configured station,
# skipping stations that have no entry in this year's raw data.
for station in stations:
    print(f"station: {station}")
    catalog = prepare_catalog(year=year, station=station)
    ds = prepare_raw_data(ds=raw_data, station=station)

    # An empty station_id axis means the station is absent from the raw file.
    if ds.coords['station_id'].size == 0:
        print(f"{station} not available for {year}")
    else:
        build_dataset(year=year, station=station, ds=ds, catalog=catalog, seg_len=seg_len)
    print('\n')

station: ANT
catalog: 8
station ds: 0
ANT not available for 2022


station: BIK
catalog: 8
station ds: 0
BIK not available for 2022


station: BLC
catalog: 1
station ds: 1
building dataset...
Pre EQ:1
non EQ:41


station: FUR
catalog: 0
station ds: 1
building dataset...
Pre EQ:0
non EQ:0


station: KAK
catalog: 62
station ds: 1
building dataset...
Pre EQ:17
non EQ:7


station: KNY
catalog: 22
station ds: 1
building dataset...
Pre EQ:12
non EQ:30


station: LMA
catalog: 8
station ds: 0
LMA not available for 2022


station: MMB
catalog: 12
station ds: 1
building dataset...
Pre EQ:9
non EQ:33


station: PBQ
catalog: 0
station ds: 0
PBQ not available for 2022


station: PEL
catalog: 33
station ds: 1
building dataset...
Pre EQ:12
non EQ:27


station: SHU
catalog: 11
station ds: 1
building dataset...
Pre EQ:11
non EQ:35


station: SIT
catalog: 1
station ds: 1
building dataset...
Pre EQ:1
non EQ:0


station: TND
catalog: 67
station ds: 0
TND not available for 2022


