# Build interim data with cleaning and transformation

In [1]:
%load_ext autoreload
%autoreload 2


from pathlib import Path

from ecoindex.data_io import load_interim
from ecoindex.dataframe_ops import (
    add_site_block,
    align_blocks_by_index,
    concat_blocks,
    flatten_columns,
    get_block,
    wrap_columns,
)
from ecoindex.pipeline import make_interim
from ecoindex.transform import hellinger_transform, log1p_standardize

# Rebuild the interim tables: reads Excel -> cleans -> writes to data/interim/
make_interim()

# 1) Load the three cleaned site-level tables, indexed and sorted by StationID
taxa = load_interim("taxa_clean.parquet").set_index("StationID").sort_index()
chem = load_interim("chem_clean.parquet").set_index("StationID").sort_index()
env = load_interim("env_clean.parquet").set_index("StationID").sort_index()

# 2) Strict intersection (how="inner"): keep only sites present in all three
taxa, chem, env = align_blocks_by_index([taxa, chem, env], how="inner")

# Fail loudly instead of silently building an empty master table.
# NOTE(review): assumes the aligned tables support len() (DataFrames) - confirm.
assert len(taxa) > 0, "no StationID is shared by the taxa, chem and env tables"

# 3) Wrap each table into a named block; all three start in the "raw" subblock
taxa_wrapped = wrap_columns(taxa, "taxa", "raw")
chem_wrapped = wrap_columns(chem, "chemical", "raw")
env_wrapped = wrap_columns(env, "environmental", "raw")

# 4) Build master from the wrapped blocks
master = concat_blocks([taxa_wrapped, chem_wrapped, env_wrapped])

In [2]:
# NOTE(review): this cell reassigns `master`, so re-running it without first
# re-running the cell that builds `master` passes an already-augmented table
# to add_site_block - confirm that helper tolerates existing subblocks.

# Hellinger-transform the raw taxa block and add it as the taxa/hellinger subblock
taxa_hell = hellinger_transform(get_block(master, "taxa", "raw"))
master = add_site_block(master, taxa_hell, "taxa", "hellinger")

# Likewise, apply log1p_standardize to the raw chemical block and add it as
# the chemical/logz subblock
chem_logz = log1p_standardize(get_block(master, "chemical", "raw"))
master = add_site_block(master, chem_logz, "chemical", "logz")

# Export the augmented master table so the added blocks can be inspected.
# Create the output directory first: to_csv does not create missing parent
# directories, so this would otherwise fail on a fresh checkout.
master_csv = Path("../data/processed/master_example.csv")
master_csv.parent.mkdir(parents=True, exist_ok=True)
master.to_csv(master_csv)

In [4]:
# Display the master table to check the added subblocks (e.g. chemical/logz);
# as the last expression in the cell it is rendered as the cell output.
master

block,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical,chemical
subblock,raw,raw,raw,raw,raw,raw,raw,raw,raw,raw,...,logz,logz,logz,logz,logz,logz,logz,logz,logz,logz
var,1234TCB,1245TCB,Al,As,Bi,Ca,Cd,Co,Cr,Cu,...,OCS,Pb,QCB,Sb,V,Zn,mirex,ppDDD,ppDDE,total_PCB
StationID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
A10,0.835583,0.775732,3041,1.939,18.45000,28170,0.2950,2.723,8.766,17.64,...,-0.312165,0.280413,-0.737303,0.703623,-0.289330,0.099916,-0.593026,-0.190979,-0.343200,-0.397416
A23,0.639983,0.697265,4483,2.512,17.03000,42110,0.3986,4.009,10.850,17.28,...,-0.687740,-0.023640,0.328420,-0.117357,-0.041387,0.190802,0.111417,0.077590,0.137696,0.365410
A27,0.451838,0.815149,13620,2.759,0.05370,41610,0.2180,6.273,21.080,25.00,...,-0.123819,0.006849,-0.756687,-1.917417,1.440670,0.683376,1.056451,0.868995,0.149458,0.442485
A28,0.224379,0.483363,12750,2.609,0.06617,33280,0.1197,5.824,18.700,24.07,...,-0.483904,-0.088969,-0.570351,-1.917417,1.212692,0.578433,-0.593026,0.894114,0.404618,-0.504533
A29,0.299715,0.695356,23740,3.735,0.15290,40450,0.1536,9.618,44.370,44.72,...,0.223999,0.703332,-0.343399,-1.831041,2.455776,1.536552,2.136226,-0.366882,0.230192,0.256692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S99,0.655186,0.951855,2826,1.214,15.12000,14960,0.3079,4.025,12.260,12.72,...,-0.687740,0.137733,-0.452841,-0.096333,0.355380,0.467869,0.121598,-0.067086,0.571341,0.246995
UBC1,0.000000,12.135559,6757,1.581,21.01000,43310,0.5146,5.263,12.800,16.00,...,3.159716,0.382723,2.446885,0.191014,0.398688,0.387140,-0.593026,-0.912801,-0.992074,1.229426
UCC1,0.000000,4.319792,5945,2.260,23.62000,50540,0.5658,5.303,11.400,15.81,...,1.655922,0.234947,1.577347,0.282724,0.224726,0.358275,-0.593026,-0.912801,-0.992074,1.119945
UCE1,0.000000,0.552417,7050,4.475,0.00010,38090,0.3449,6.232,11.860,14.62,...,0.945726,0.942971,0.296544,-1.916620,0.635558,0.126120,-0.593026,0.215085,-0.992074,-0.032193
