In [None]:
import glob
import os

import pandas as pd
from dask import compute
from dask.delayed import delayed
import dask.dataframe as dd

from config import NOAA_RAW_ROOT, NOAA_GOLD_ROOT

# Transform NOAA Observations

In [None]:
# Collect every gzipped file exactly one directory below the raw root
# (<NOAA_RAW_ROOT>/<subdir>/<file>.gz). The pattern contains no "**", so the
# `recursive` flag would have no effect and is omitted. Sorting makes the file
# order (and therefore the partition order and the preview below) reproducible
# across runs, since glob returns matches in arbitrary order.
filenames = sorted(glob.glob(f"{NOAA_RAW_ROOT}/*/*.gz"))
filenames[0:3]

In [None]:
def to_station(filename):
    """Extract the station id from the path of a raw observation file.

    The id is made of the first two dash-separated parts of the file's base
    name, e.g. ``.../1981/722430-12960-1981.gz`` -> ``"722430-12960"``.

    Both ``\\`` and ``/`` are accepted as directory separators, so paths
    produced by ``glob`` on Windows and on POSIX systems give the same result,
    and dashes in directory names cannot leak into the id.

    Args:
        filename: Path (or bare file name) of the observation file.

    Returns:
        The station id as ``"<part0>-<part1>"``.

    Raises:
        IndexError: If the base name contains no ``-``.
    """
    # Normalise Windows separators first, then keep only the last path segment.
    basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
    parts = basename.split("-")
    return parts[0] + "-" + parts[1]

# Try the helper on a sample Windows-style path; the cell displays the id.
result = to_station(
    '../data/raw/noaa\\1981\\722430-12960-1981.gz'
)
result

In [None]:
def read_csv(filename):
    """Load one raw file into a pandas frame tagged with its station id.

    The file is read without a header row, so its columns keep pandas' default
    integer labels. A ``station`` column holding ``to_station(filename)`` is
    added to every row.

    Args:
        filename: Path of the file to read.

    Returns:
        A pandas DataFrame with the file's contents plus a ``station`` column.
    """
    station_id = to_station(filename)
    return pd.read_csv(filename, header=None).assign(station=station_id)

# Wrap each file read in a delayed task so nothing is loaded eagerly, then
# stitch the tasks into one dask dataframe whose single data column
# (label 0 from the header-less read) is renamed to "raw".
dfs = [delayed(read_csv)(path) for path in filenames]
df = dd.from_delayed(dfs).rename(columns={0: "raw"})
df.head(3)

Source description (ISD-Lite format): https://www1.ncdc.noaa.gov/pub/data/noaa/isd-lite/isd-lite-format.txt

In [None]:
# Split the fixed-width "raw" record into typed columns.
#
# Timestamp parts sit at fixed offsets at the start of the line.
df["year"] = df["raw"].str[0:4].astype(int)
df["month"] = df["raw"].str[5:7].astype(int)
df["day"] = df["raw"].str[8:10].astype(int)
df["hour"] = df["raw"].str[11:13].astype(int)
df["datetime"] = dd.to_datetime(df[["year", "month", "day", "hour"]])
df["day_of_year"] = df["datetime"].dt.dayofyear
df["quarter"] = df["datetime"].dt.quarter

# The eight measurements follow as contiguous, right-aligned 6-character
# fields starting at offset 13. Each field is read at its FULL width: slicing
# only the last few characters turns the -9999 "missing" sentinel into 999/99
# (so it is never replaced) and drops the leading digits of large readings.
# float() ignores the padding spaces, so no stripping is needed.
FIELD_WIDTH = 6
MISSING_SENTINEL = -9999

# column name -> (start offset, divisor applied to the stored integer)
measurement_fields = {
    "temperature": (13, 10),
    "dewpoint": (19, 10),  # offsets differ from the ones printed in the source docs
    "pressure": (25, 10),
    "wind_direction": (31, 1),
    "wind_speed": (37, 10),
    "sky_condition": (43, 1),
    "precipitation_one_hour": (49, 10),
    "precipitation_six_hour": (55, 10),
}

for column, (start, divisor) in measurement_fields.items():
    values = df["raw"].str[start:start + FIELD_WIDTH].astype(float)
    values = values.replace(MISSING_SENTINEL, pd.NA)
    # Only rescale the fields that are stored multiplied by ten.
    df[column] = values / divisor if divisor != 1 else values

df.head()

In [None]:
# Write the parsed observations to the gold location as a parquet dataset,
# with one directory level per station and per year.
gold_path = f"{NOAA_GOLD_ROOT}.parquet"
df.to_parquet(
    gold_path,
    partition_on=["station", "year"],
)

Previous: [Ingest Ercot Loads](ingest_ercot_loads.ipynb)
Next: [Transform Ercot Loads](transform_ercot_loads.ipynb)