In [132]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sdv
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error, r2_score, root_mean_squared_error
)
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

In [153]:
df_raw = pd.read_csv('dataset_1_item_independent.csv')
df_raw.shape

(3267, 94)

In [149]:
col_miss = df_raw.isna().sum().sort_values()
col_miss.tail(15)

emission_share_agri_waste_mgt              32
total_fdi_inflows                          32
emission_share_farmgate                    32
emission_share_land_use_change             32
emission_share_energy_use                  32
emission_share_crops                       32
emission_share_pre_and_post_production     32
value_added_aff_per_total_fdi              32
emission_share_end_to_end_agrifood         32
emission_share_ipcc_agriculture            32
total_pesticide_export_value               46
phosphorus_production                      47
potassium_agri_use                         48
emission_share_livestock                   54
aoi_credit_to_ag_forest_fish              979
dtype: int64

In [154]:
targets= col_miss.loc[col_miss > 0].keys()
targets

Index(['emission_share_agri_waste_mgt', 'total_fdi_inflows',
       'emission_share_farmgate', 'emission_share_land_use_change',
       'emission_share_energy_use', 'emission_share_crops',
       'emission_share_pre_and_post_production',
       'value_added_aff_per_total_fdi', 'emission_share_end_to_end_agrifood',
       'emission_share_ipcc_agriculture', 'total_pesticide_export_value',
       'phosphorus_production', 'potassium_agri_use',
       'emission_share_livestock', 'aoi_credit_to_ag_forest_fish'],
      dtype='object')

In [340]:

df = df_raw.copy()

# Reproducibility
SEED = 42
rng = np.random.default_rng(SEED)

# Pool of rows fully observed on all 15 TARGETS
pool_rows = df.loc[df[targets].notna().all(axis=1)]
pool_rows

Unnamed: 0,area,area_code,year_code,year,area_agri_land,area_arable_land,area_cropland,area_with_irrigation,area_permanent_crops,area_temporary_crops,...,least_developed_country,land_locked_developing_country,small_island_developing_state,low_income_food_deficit_country,net_food_importing_developing_country,temp_change_meteorological_year,temp_change_dec_jan_feb,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_mar_apr_may
29,Albania,3,2007,2007,1119.0,578.0,698.0,356.5,120.0,186.0380,...,0.0,0.0,0.0,0.0,0.0,1.389,1.741,2.519,-0.718,2.015
30,Albania,3,2008,2008,1181.0,610.0,697.0,348.0,87.0,206.0000,...,0.0,0.0,0.0,0.0,0.0,1.043,0.238,1.862,0.685,1.387
31,Albania,3,2009,2009,1201.3,609.0,696.0,339.5,87.0,202.0000,...,0.0,0.0,0.0,0.0,0.0,0.977,0.390,1.261,0.873,1.383
32,Albania,3,2010,2010,1201.3,626.0,696.0,331.0,70.0,202.0000,...,0.0,0.0,0.0,0.0,0.0,1.261,1.234,1.607,0.932,1.271
33,Albania,3,2011,2011,1201.0,622.0,696.0,332.0,74.0,205.0000,...,0.0,0.0,0.0,0.0,0.0,1.125,0.630,1.659,0.970,1.243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3262,Zambia,251,2005,2005,22762.0,2727.0,2762.0,156.0,35.0,1967.3335,...,1.0,1.0,0.0,0.0,1.0,1.151,0.986,1.288,0.904,1.426
3263,Zambia,251,2006,2006,23048.0,3013.0,3048.0,156.0,35.0,2188.7428,...,1.0,1.0,0.0,0.0,1.0,0.760,1.013,0.906,0.406,0.715
3264,Zambia,251,2007,2007,22984.0,2949.0,2984.0,156.0,35.0,2148.4400,...,1.0,1.0,0.0,0.0,1.0,0.962,0.915,1.087,0.811,1.034
3265,Zambia,251,2008,2008,23087.0,3052.0,3087.0,156.0,35.0,2250.1707,...,1.0,1.0,0.0,0.0,1.0,0.518,0.270,0.638,1.238,-0.076


In [341]:
# For each 'area', pick exactly 1 row at random
val_per_area = (
    pool_rows.groupby('area', group_keys=False).apply(lambda g: g.sample(n=1, random_state=SEED))
)
val_per_area


  pool_rows.groupby('area', group_keys=False).apply(lambda g: g.sample(n=1, random_state=SEED))


Unnamed: 0,area,area_code,year_code,year,area_agri_land,area_arable_land,area_cropland,area_with_irrigation,area_permanent_crops,area_temporary_crops,...,least_developed_country,land_locked_developing_country,small_island_developing_state,low_income_food_deficit_country,net_food_importing_developing_country,temp_change_meteorological_year,temp_change_dec_jan_feb,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_mar_apr_may
29,Albania,3,2007,2007,1119.0000,578.0000,698.0000,356.500000,120.0,186.0380,...,0.0,0.0,0.0,0.0,0.0,1.389000,1.741000,2.519000,-0.718000,2.01500
55,Angola,7,2019,2019,45877.0000,5363.0000,5680.0000,85.530000,317.0,4119.4037,...,1.0,0.0,0.0,0.0,1.0,1.785000,1.535000,1.910000,1.240000,2.45600
75,Antigua and Barbuda,8,2023,2023,9.0000,4.0000,5.0000,0.285000,1.0,2.9842,...,0.0,0.0,1.0,0.0,1.0,1.116000,0.544000,1.456000,1.755000,0.70800
100,Argentina,9,2018,2018,115930.2309,40181.2309,41249.2309,2357.000000,1068.0,34172.4713,...,0.0,0.0,0.0,0.0,0.0,0.860000,1.222000,-0.415000,1.064000,1.57000
121,Armenia,1,2016,2016,1676.8000,446.4000,504.4000,217.000000,58.0,353.5000,...,0.0,1.0,0.0,0.0,0.0,1.331000,1.986000,1.717000,-0.399000,2.01900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3171,Uruguay,234,2016,2016,14265.3000,2226.3000,2265.3000,258.000000,39.0,1508.7000,...,0.0,0.0,0.0,0.0,0.0,0.362000,1.371000,0.000000,0.462000,-0.38400
3200,Vanuatu,155,2022,2022,187.0000,20.0000,145.0000,3.820499,125.0,15.5742,...,0.0,0.0,1.0,0.0,0.0,1.285000,0.907000,1.890000,1.616000,0.72800
3223,Viet Nam,237,2022,2022,12315.0000,6754.0000,11673.0000,4585.000000,4920.0,6754.0000,...,0.0,0.0,0.0,0.0,0.0,0.945000,0.716000,0.997000,1.345000,0.72300
3234,Yemen,249,2010,2010,23579.0000,1291.0000,1579.0000,680.000000,288.0,1173.0000,...,1.0,0.0,0.0,1.0,1.0,2.416667,2.737917,2.456917,2.569417,1.90175


In [342]:
categorical_cols = ["area", "region", "sub_region"]
for c in categorical_cols:
    if c in df.columns:
        df[c] = df[c].astype('category')

# Original indices of selected validation rows (these align with the original df)
val_index = val_per_area.index

# carve out validation (clean copy) and training sets
df_val_clean = df.loc[val_index].copy()
df_train = df.drop(index=val_index).copy()

In [343]:
# Record the mask coordinates (row_id, col) for every TARGET in the val set
mask_records = []
for idx in df_val_clean.index:
    for col in targets:
        mask_records.append((int(idx), col))

# build a table with true values of 15 target cols for scoring later
y_true_df = pd.DataFrame(mask_records, columns=['row_id', 'target'])

In [344]:
r_idx = df_val_clean.index.get_indexer(y_true_df['row_id'])
c_idx = df_val_clean.columns.get_indexer(y_true_df['target'])

y_true_df['y_true'] = df_val_clean.to_numpy()[r_idx, c_idx]
y_true_df.head()

Unnamed: 0,row_id,target,y_true
0,29,emission_share_agri_waste_mgt,7.49
1,29,total_fdi_inflows,556.430175
2,29,emission_share_farmgate,53.74
3,29,emission_share_land_use_change,0.0
4,29,emission_share_energy_use,50.75


In [345]:
# Set all target cells in the validation copy to NaN
df_val_masked_lgbm = df_val_clean.copy()
df_val_masked_lgbm.loc[:, targets] = np.nan

In [346]:
import lightgbm as lgb
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

In [347]:

categorical_cols = ["area", "region", "sub_region"]
exclude_cols = ["area_code", "area_code_m49", "year_code"]


# Move categoricals to string
for c in categorical_cols:
    if c in df.columns:
        df[c] = df[c].astype('category')


# Store evaluation metrics for each imputed column
pred_records = []

# Loop over each column that needs imputation
for col in targets:
    # Features: all predictors except the target column and excluded ones
    X_cols = [c for c in df.columns if c != col and c not in exclude_cols]

    # train rows where the target column is observed
    train_mask = df_train[col].notna()
    X_train = df_train.loc[train_mask, X_cols]
    y_train = df_train.loc[train_mask, col]
    
    # validation rows where the target is not observed
    X_val = df_val_masked_lgbm.loc[:, X_cols]

    # Choose objective
    y_min = y_train.min()

    if y_min >= 0:
        obj = "tweedie"
    else:
        obj = "regression"

    model = lgb.LGBMRegressor(
        objective=obj,
        n_estimators=800,
        random_state=SEED,
        n_jobs=-1,
        verbosity=-1
    )

    model.fit(X_train, y_train)

    # predict for validation set rows
    y_pred = np.round(model.predict(X_val), 3)

    # Collect long-form predictions
    pred_records.extend([
        {"row_id": int(rid), "target": col, "method": "LightGBM", "y_pred": float(p)}
        for rid, p in zip(X_val.index, y_pred)
    ])

# Assemble predictions for downstream metrics merge
y_pred_df_lgbm = pd.DataFrame(pred_records, columns=["row_id", "target", "method", "y_pred"])
y_pred_df_lgbm.head(10)

Unnamed: 0,row_id,target,method,y_pred
0,29,emission_share_agri_waste_mgt,LightGBM,-665.731
1,55,emission_share_agri_waste_mgt,LightGBM,-637.065
2,75,emission_share_agri_waste_mgt,LightGBM,-744.993
3,100,emission_share_agri_waste_mgt,LightGBM,-521.21
4,121,emission_share_agri_waste_mgt,LightGBM,-502.367
5,144,emission_share_agri_waste_mgt,LightGBM,-607.661
6,179,emission_share_agri_waste_mgt,LightGBM,-745.079
7,213,emission_share_agri_waste_mgt,LightGBM,-623.212
8,236,emission_share_agri_waste_mgt,LightGBM,-532.167
9,244,emission_share_agri_waste_mgt,LightGBM,-466.075


In [348]:
# Merge truth and predictions (only LightGBM here)
eval_df = (
    y_true_df.merge(
        y_pred_df_lgbm[["row_id", "target", "y_pred"]],
        on=["row_id", "target"],
        how="inner"
    )
)
eval_df.head(10)

Unnamed: 0,row_id,target,y_true,y_pred
0,29,emission_share_agri_waste_mgt,7.49,-665.731
1,29,total_fdi_inflows,556.430175,-10551.934
2,29,emission_share_farmgate,53.74,-6645.764
3,29,emission_share_land_use_change,0.0,-2.265
4,29,emission_share_energy_use,50.75,-6325.301
5,29,emission_share_crops,2.83,-145.933
6,29,emission_share_pre_and_post_production,15.7,-1037.805
7,29,value_added_aff_per_total_fdi,3.003491,-473.918
8,29,emission_share_end_to_end_agrifood,69.44,-8312.298
9,29,emission_share_ipcc_agriculture,47.88,-2760.104


In [349]:
# per-target metrics table
metrics_rows = []
for col in targets:
    sub = eval_df.loc[eval_df["target"]==col]
    
    y_val = sub["y_true"].astype(float)
    val_pred = sub["y_pred"].astype(float)

    # Metrics
    rmse = root_mean_squared_error(y_val, val_pred)
    mae = mean_absolute_error(y_val, val_pred)
    r2  = r2_score(y_val, val_pred)

    # Normalizations
    mean_y = float(y_val.mean())
    std_y  = float(y_val.std(ddof=1))  # sample std (ddof=1) is typical; either is fine if consistent

    nrmse_mean = (rmse / mean_y) if mean_y != 0 else np.nan
    nrmse_std  = (rmse / std_y)  if std_y  != 0 else np.nan

    # Mean Absolute Percentage Error (ignoring inf/NaN cases)
    mape = (np.abs((y_val - val_pred)/y_val)
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
            .mean()*100)

    # n_train for this target = observed count in TRAIN
    n_train = int(df_train[col].notna().sum())
    n_val   = int(len(y_val))  

    metrics_rows.append({
        "method": "LightGBM",
        "target": col,
        "n_train": n_train,
        "n_val": n_val,
        "RMSE": np.round(rmse, 3),
        "MAE": np.round(mae, 3),
        "R2": np.round(r2, 3),
        "nRMSE_mean": np.round(nrmse_mean, 3),
        "nRMSE_std": np.round(nrmse_std, 3),
        "MAPE(%)": np.round(mape, 3)
    })

metrics_lgbm = pd.DataFrame(metrics_rows, columns=[
    "method","target","n_train","n_val","RMSE","MAE","R2","nRMSE_mean","nRMSE_std","MAPE(%)"
])
metrics_lgbm

Unnamed: 0,method,target,n_train,n_val,RMSE,MAE,R2,nRMSE_mean,nRMSE_std,MAPE(%)
0,LightGBM,emission_share_agri_waste_mgt,3123,112,676.026,633.411,-14514.482,82.772,119.941,12766.613
1,LightGBM,total_fdi_inflows,3123,112,39649.148,21683.688,0.426,2.824,0.754,9097.175
2,LightGBM,emission_share_farmgate,3123,112,5686.179,5524.954,-89667.999,224.933,298.108,109140.146
3,LightGBM,emission_share_land_use_change,3123,112,5.53,3.496,0.908,0.55,0.302,1219.733
4,LightGBM,emission_share_energy_use,3123,112,5609.277,5454.791,-36455.394,96.953,190.081,15177.845
5,LightGBM,emission_share_crops,3123,112,124.1,120.057,-5642.178,72.503,74.785,55781.214
6,LightGBM,emission_share_pre_and_post_production,3123,112,930.637,914.141,-15513.516,73.532,124.0,10757.139
7,LightGBM,value_added_aff_per_total_fdi,3123,112,755.853,570.278,-708.073,81.668,26.509,396617.415
8,LightGBM,emission_share_end_to_end_agrifood,3123,112,7230.918,7044.046,-84574.583,150.689,289.518,20891.43
9,LightGBM,emission_share_ipcc_agriculture,3123,112,2494.86,2401.434,-20438.222,123.58,142.326,118123.207


## Tabular Variational Auto-Encoder

In [257]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

In [258]:
rng = np.random.default_rng(SEED)

In [259]:
# Prepare train data
tv_train = df_train.drop(columns=[c for c in exclude_cols if c in df_train.columns], errors="ignore").copy()

tv_train = df_train.loc[df_train['aoi_credit_to_ag_forest_fish'].notna()]

# Year to numeric
tv_train["year"] = pd.to_numeric(tv_train["year"], errors="coerce")

# Ensure SDV-friendly dtypes for declared categoricals
for c in categorical_cols:
    if c in tv_train.columns:
        tv_train[c] = tv_train[c].astype("object")

# identify numeric columns
numeric_cols = tv_train.select_dtypes(include=[np.number]).columns.to_list()

# detect binary {0,1} among numeric columns
for c in numeric_cols:
    uv = pd.Series(tv_train[c]).dropna().unique()
    if len(uv) > 0 and set(uv).issubset({0, 1, 0.0, 1.0}):
        binary_cols.append(c)
        tv_train[c] = tv_train[c].astype("Int64").astype("boolean")

# Build and lock Metadata
md = SingleTableMetadata()
md.detect_from_dataframe(tv_train)  # baseline detection

# Force sdtypes per our simple rules
for c in tv_train.columns:
    if c in categorical_cols:
        md.update_column(c, sdtype="categorical")
    elif c in binary_cols:
        md.update_column(c, sdtype="boolean")
    else:
        md.update_column(c, sdtype="numerical")

md.validate()


In [None]:
# Fit TVAE on train
np.random.seed(SEED)
synth = TVAESynthesizer(
    metadata=md,
    epochs=100,            
    batch_size=512,
    embedding_dim=64,
    compress_dims=(128, 64),
    decompress_dims=(64, 128),
    l2scale=1e-5,
    verbose=False,
    cuda=None   
)

synth.fit(tv_train)

In [None]:
# Set all target cells in the validation copy to NaN
df_val_masked = df_val_clean.copy()
df_val_masked.loc[:, targets] = np.nan

# Step 3a: Build light conditions with only 1 categoricals fixed
categorical_keys = ["area"]

# Make a conditions copy the same shape as df
tv_val_masked = df_val_masked.drop(columns=[c for c in exclude_cols if c in df_val_masked.columns], errors="ignore").copy()

# Coerce 'year' numeric & categoricals to object to mirror TRAIN
if "year" in tv_val_masked.columns:
    tv_val_masked["year"] = pd.to_numeric(tv_val_masked["year"], errors="coerce")
for c in categorical_cols:
    if c in tv_val_masked.columns:
        tv_val_masked[c] = tv_val_masked[c].astype("object")


conditions_df = tv_val_masked.copy()

# 1) Work only on rows that have ANY missing among the targets
need_mask = conditions_df[targets].isna().any(axis=1)
need_idx = conditions_df.index[need_mask]

# 2) Build known-conditions for *those* rows (keys have no missing)
known = conditions_df.loc[need_idx, categorical_keys].copy()

# 3) Sample remaining columns for exactly those rows
samples = synth.sample_remaining_columns(
    known_columns=known,
    batch_size=512,          
    max_tries_per_batch=300
)


Sampling remaining columns:  76%|███████▌  | 85/112 [1:12:30<23:02, 51.19s/it]   


In [310]:
samples.index = samples.index.astype('int')
samples.head()

Unnamed: 0,area,area_code,year_code,year,area_agri_land,area_arable_land,area_cropland,area_with_irrigation,area_permanent_crops,area_temporary_crops,...,least_developed_country,land_locked_developing_country,small_island_developing_state,low_income_food_deficit_country,net_food_importing_developing_country,temp_change_meteorological_year,temp_change_dec_jan_feb,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_mar_apr_may
55,Angola,108,2023,2023,50159.239851,1025.17984,8555.77677,402.825785,12.4143,3077.35002,...,False,False,False,False,False,1.425361,0.961657,1.303624,1.566056,0.522748
75,Antigua and Barbuda,14,2002,2003,0.66,0.135,0.66,0.03,0.1,605.64271,...,False,False,True,False,True,0.452969,0.739928,0.91324,0.871514,0.0846
100,Argentina,21,2003,2003,5135.144144,7530.39253,9083.21524,1993.81586,658.4111,5040.34803,...,False,False,False,False,False,0.575749,0.437992,0.066648,0.465726,0.055489
121,Armenia,232,2022,2020,3649.959523,1731.73446,1508.96131,245.701336,47.1392,1819.62157,...,False,False,False,False,True,0.678654,1.122248,0.9764,0.66192,0.514367
144,Australia,18,2004,2005,4397.231615,3251.41275,744.43083,71.814489,291.1654,1487.66115,...,False,False,False,False,False,0.641446,0.355993,0.673555,-0.082955,-0.551


In [268]:
samples_idx = samples.index
samples_idx

Index([  55,   75,  100,  121,  144,  179,  213,  236,  244,  267,  305,  328,
        351,  374,  393,  405,  483,  512,  557,  569,  692,  759,  805,  828,
        851,  897,  922,  943,  966,  989, 1104, 1114, 1196, 1256, 1282, 1302,
       1348, 1371, 1379, 1411, 1427, 1463, 1486, 1509, 1534, 1555, 1580, 1601,
       1609, 1772, 1808, 1853, 1909, 1958, 1991, 2060, 2125, 2161, 2184, 2312,
       2358, 2442, 2473, 2522, 2542, 2574, 2625, 2635, 2720, 2810, 2823, 2864,
       2892, 2987, 3010, 3033, 3056, 3079, 3102, 3125, 3148, 3171, 3200, 3234,
       3248],
      dtype='int64')

In [271]:
for col in targets:
    na_mask = conditions_df.loc[samples_idx, col].isna()
    conditions_df.loc[samples_idx[na_mask], col] = samples.loc[samples_idx[na_mask], col].values

conditions_df[targets].isna().any(axis=1).sum()

32

In [273]:
df_val_masked = conditions_df.copy()

In [274]:
# Step 3a: Build light conditions with only 1 categoricals fixed
categorical_keys = ["sub_region"]

# Make a conditions copy the same shape as df
tv_val_masked = df_val_masked.drop(columns=[c for c in exclude_cols if c in df_val_masked.columns], errors="ignore").copy()

# Coerce 'year' numeric & categoricals to object to mirror TRAIN
if "year" in tv_val_masked.columns:
    tv_val_masked["year"] = pd.to_numeric(tv_val_masked["year"], errors="coerce")
for c in categorical_cols:
    if c in tv_val_masked.columns:
        tv_val_masked[c] = tv_val_masked[c].astype("object")


conditions_df = tv_val_masked.copy()

# 1) Work only on rows that have ANY missing among the targets
need_mask = conditions_df[targets].isna().any(axis=1)
need_idx = conditions_df.index[need_mask]

# 2) Build known-conditions for *those* rows (keys have no missing)
known = conditions_df.loc[need_idx, categorical_keys].copy()

# 3) Sample remaining columns for exactly those rows
samples_2 = synth.sample_remaining_columns(
    known_columns=known,
    batch_size=512,          
    max_tries_per_batch=300
)

Sampling remaining columns: 100%|██████████| 32/32 [04:02<00:00,  7.58s/it]


In [276]:
samples_2_idx = samples_2.index
samples_2_idx

Index([  29,  449,  527,  604,  759,  805, 1014, 1061, 1087, 1150, 1161, 1213,
       1696, 1853, 2027, 2086, 2115, 2161, 2249, 2266, 2328, 2413, 2557, 2582,
       2625, 2651, 2695, 2787, 2917, 2948, 2956, 3223],
      dtype='int64')

In [278]:
for col in targets:
    na_mask = conditions_df.loc[samples_2_idx, col].isna()
    conditions_df.loc[samples_2_idx, col] = samples_2.loc[samples_2_idx, col].values

conditions_df[targets].isna().any(axis=1).sum()

2

In [283]:
df_val_masked = conditions_df.copy()

In [285]:
# Step 3a: Build light conditions with only 1 categoricals fixed
categorical_keys = ["region"]

# Make a conditions copy the same shape as df
tv_val_masked = df_val_masked.drop(columns=[c for c in exclude_cols if c in df_val_masked.columns], errors="ignore").copy()

# Coerce 'year' numeric & categoricals to object to mirror TRAIN
if "year" in tv_val_masked.columns:
    tv_val_masked["year"] = pd.to_numeric(tv_val_masked["year"], errors="coerce")
for c in categorical_cols:
    if c in tv_val_masked.columns:
        tv_val_masked[c] = tv_val_masked[c].astype("object")


conditions_df = tv_val_masked.copy()

# 1) Work only on rows that have ANY missing among the targets
need_mask = conditions_df[targets].isna().any(axis=1)
need_idx = conditions_df.index[need_mask]

# 2) Build known-conditions for *those* rows (keys have no missing)
known = conditions_df.loc[need_idx, categorical_keys].copy()

# 3) Sample remaining columns for exactly those rows
samples_3 = synth.sample_remaining_columns(
    known_columns=known,
    batch_size=512,          
    max_tries_per_batch=300
)

Sampling remaining columns: 100%|██████████| 2/2 [00:08<00:00,  4.22s/it]


In [None]:
samples_3_idx = samples_3.index

for col in targets:
    na_mask = conditions_df.loc[samples_3_idx, col].isna()
    conditions_df.loc[samples_3_idx, col] = samples_3.loc[samples_3_idx, col].values

df_val_masked = conditions_df.copy()

0

In [292]:
y_true_df.head()

Unnamed: 0,row_id,target,y_true
0,29,emission_share_agri_waste_mgt,7.49
1,29,total_fdi_inflows,556.430175
2,29,emission_share_farmgate,53.74
3,29,emission_share_land_use_change,0.0
4,29,emission_share_energy_use,50.75


In [296]:
r_idx = df_val_masked.index.get_indexer(y_true_df['row_id'])
c_idx = df_val_masked.columns.get_indexer(y_true_df['target'])

y_true_df['y_pred'] = df_val_masked.to_numpy()[r_idx, c_idx]
eval_df = y_true_df.copy()
eval_df.head(10)

Unnamed: 0,row_id,target,y_true,y_pred
0,29,emission_share_agri_waste_mgt,7.49,5.696515
1,29,total_fdi_inflows,556.430175,762.722234
2,29,emission_share_farmgate,53.74,20.875633
3,29,emission_share_land_use_change,0.0,-0.011
4,29,emission_share_energy_use,50.75,87.52488
5,29,emission_share_crops,2.83,1.479508
6,29,emission_share_pre_and_post_production,15.7,12.266552
7,29,value_added_aff_per_total_fdi,3.003491,-3.571714
8,29,emission_share_end_to_end_agrifood,69.44,21.682
9,29,emission_share_ipcc_agriculture,47.88,17.496


In [298]:
# per-target metrics table
metrics_rows = []
for col in targets:
    sub = eval_df.loc[eval_df["target"]==col]
    
    y_val = sub["y_true"].astype(float)
    val_pred = sub["y_pred"].astype(float)

    # Metrics
    rmse = root_mean_squared_error(y_val, val_pred)
    mae = mean_absolute_error(y_val, val_pred)
    r2  = r2_score(y_val, val_pred)

    # Normalizations
    mean_y = float(y_val.mean())
    std_y  = float(y_val.std(ddof=1))  # sample std (ddof=1) is typical; either is fine if consistent

    nrmse_mean = (rmse / mean_y) if mean_y != 0 else np.nan
    nrmse_std  = (rmse / std_y)  if std_y  != 0 else np.nan

    # Mean Absolute Percentage Error (ignoring inf/NaN cases)
    mape = (np.abs((y_val - val_pred)/y_val)
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
            .mean()*100)

    # n_train for this target = observed count in TRAIN
    n_train = int(df_train[col].notna().sum())
    n_val   = int(len(y_val))  

    metrics_rows.append({
        "method": "TVAE",
        "target": col,
        "n_train": n_train,
        "n_val": n_val,
        "RMSE": np.round(rmse, 3),
        "MAE": np.round(mae, 3),
        "R2": np.round(r2, 3),
        "nRMSE_mean": np.round(nrmse_mean, 3),
        "nRMSE_std": np.round(nrmse_std, 3),
        "MAPE(%)": np.round(mape, 3)
    })

metrics_tvae = pd.DataFrame(metrics_rows, columns=[
    "method","target","n_train","n_val","RMSE","MAE","R2","nRMSE_mean","nRMSE_std","MAPE(%)"
])
metrics_tvae

Unnamed: 0,method,target,n_train,n_val,RMSE,MAE,R2,nRMSE_mean,nRMSE_std,MAPE(%)
0,TVAE,emission_share_agri_waste_mgt,3123,112,6.089,4.104,-0.178,0.746,1.08,57.954
1,TVAE,total_fdi_inflows,3123,112,49394.008,13191.742,0.109,3.519,0.94,1223.363
2,TVAE,emission_share_farmgate,3123,112,22.848,15.176,-0.448,0.904,1.198,113.784
3,TVAE,emission_share_land_use_change,3123,112,19.472,9.822,-0.142,1.938,1.064,2626.315
4,TVAE,emission_share_energy_use,3123,112,24.192,16.712,0.322,0.418,0.82,50.986
5,TVAE,emission_share_crops,3123,112,1.832,1.118,-0.23,1.071,1.104,213.841
6,TVAE,emission_share_pre_and_post_production,3123,112,7.974,6.132,-0.139,0.63,1.063,65.074
7,TVAE,value_added_aff_per_total_fdi,3123,112,29.015,12.765,-0.045,3.135,1.018,3883.647
8,TVAE,emission_share_end_to_end_agrifood,3123,112,21.88,15.618,0.226,0.456,0.876,40.158
9,TVAE,emission_share_ipcc_agriculture,3123,112,14.276,10.162,0.331,0.707,0.814,249.327


In [364]:
metrics_tvae.to_html('metrics_tvae.html', index=False)

In [311]:
metrics_lgbm

Unnamed: 0,method,target,n_train,n_val,RMSE,MAE,R2,nRMSE_mean,nRMSE_std,MAPE(%)
0,LightGBM,emission_share_agri_waste_mgt,3123,112,676.026,633.411,-14514.482,82.772,119.941,12766.613
1,LightGBM,total_fdi_inflows,3123,112,39649.148,21683.688,0.426,2.824,0.754,9097.175
2,LightGBM,emission_share_farmgate,3123,112,5686.179,5524.954,-89667.999,224.933,298.108,109140.146
3,LightGBM,emission_share_land_use_change,3123,112,5.53,3.496,0.908,0.55,0.302,1219.733
4,LightGBM,emission_share_energy_use,3123,112,5609.277,5454.791,-36455.394,96.953,190.081,15177.845
5,LightGBM,emission_share_crops,3123,112,124.1,120.057,-5642.178,72.503,74.785,55781.214
6,LightGBM,emission_share_pre_and_post_production,3123,112,930.637,914.141,-15513.516,73.532,124.0,10757.139
7,LightGBM,value_added_aff_per_total_fdi,3123,112,755.853,570.278,-708.073,81.668,26.509,396617.415
8,LightGBM,emission_share_end_to_end_agrifood,3123,112,7230.918,7044.046,-84574.583,150.689,289.518,20891.43
9,LightGBM,emission_share_ipcc_agriculture,3123,112,2494.86,2401.434,-20438.222,123.58,142.326,118123.207


In [365]:
metrics_lgbm.to_html('metrics_lgbm.html', index=False)

## K-NN Imputer

In [312]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [350]:
df_raw = pd.read_csv('dataset_1_item_independent.csv')

col_miss = df_raw.isna().sum().sort_values() 
targets= col_miss.loc[col_miss > 0].keys() 

df = df_raw.copy()  

# Pool of rows fully observed on all 15 TARGETS
pool_rows = df.loc[df[targets].notna().all(axis=1)] 

# For each 'area', pick exactly 1 row at random 
val_per_area = ( pool_rows.groupby('area', group_keys=False).apply(lambda g: g.sample(n=1, random_state=SEED)) ) 

categorical_cols = ["area", "region", "sub_region"] 
for c in categorical_cols: 
    if c in df.columns: 
        df[c] = df[c].astype('category') 
        
# Original indices of selected validation rows (these align with the original df) 
val_index = val_per_area.index 

# carve out validation (clean copy) and training sets 
df_val_clean = df.loc[val_index].copy() 
df_train = df.drop(index=val_index).copy()

  val_per_area = ( pool_rows.groupby('area', group_keys=False).apply(lambda g: g.sample(n=1, random_state=SEED)) )


In [353]:
exclude_cols = ["area_code", "area_code_m49", "year_code"]
categorical_cols = ["area", "region", "sub_region"]

# Columns used to build the KNN distance space
feature_cols = [c for c in df.columns if c not in exclude_cols]

# Identify dtypes off your working df 
num_cols = df.select_dtypes('number').columns.to_list()
num_cols = [c for c in num_cols if c not in exclude_cols and c not in categorical_cols]

cat_cols = [c for c in categorical_cols if c in feature_cols]

# encode categorical columns and scale numericals
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# --- Donor pool: fully observed across the entire distance space ---
donor_mask = df_train[feature_cols].notna().all(axis=1)
df_donors = df_train.loc[donor_mask, feature_cols].copy()

# Validation copy (for KNN): mask the same target cols as before
df_val_masked_knn = df_val_clean.copy()
df_val_masked_knn.loc[:, targets] = np.nan

print("Donors:", df_donors.shape, " | Train (post-split):", df_train.shape)

Donors: (2029, 91)  | Train (post-split): (3155, 94)


In [325]:
from sklearn.impute import KNNImputer

In [354]:
# Fit the pre-processor on donors
preprocessor.fit(df_donors)

# Transform donors and validation
X_donors = preprocessor.transform(df_donors)
X_val = preprocessor.transform(df_val_masked_knn[feature_cols])

# Fit KNNImputer on donor space, then impute the validation rows
imputer = KNNImputer(n_neighbors=5, weights="uniform")
imputer.fit(X_donors)

X_val_imp = imputer.transform(X_val)

In [356]:
# Get the names of features after preprocessing
feature_names = preprocessor.get_feature_names_out()

# Identify which positions correspond to our numeric cols
num_positions = [i for i, f in enumerate(feature_names) if f in num_cols]

# Build back a dataframe of imputed numerics
df_val_imputed_num = pd.DataFrame(
    X_val_imp[:, num_positions],
    index = df_val_masked_knn.index,
    columns = num_cols
)

# From these, extract only the target cols
df_val_imputed_targets = df_val_imputed_num[targets].copy()

In [362]:
# Collect predictions for just target columns
pred_records = []
for idx in df_val_imputed_targets.index:
    for col in targets:
        pred_records.append({
            "row_id": int(idx),
            "target": col,
            "method": "KNN",
            "y_pred": float(df_val_imputed_targets.loc[idx, col])
        })

y_pred_df_knn = pd.DataFrame(pred_records, columns=["row_id", "target", "method", "y_pred"])
y_pred_df_knn.head(10)

Unnamed: 0,row_id,target,method,y_pred
0,29,emission_share_agri_waste_mgt,KNN,-0.121991
1,29,total_fdi_inflows,KNN,-0.281669
2,29,emission_share_farmgate,KNN,0.907182
3,29,emission_share_land_use_change,KNN,-0.5881
4,29,emission_share_energy_use,KNN,-0.145293
5,29,emission_share_crops,KNN,0.939926
6,29,emission_share_pre_and_post_production,KNN,0.346702
7,29,value_added_aff_per_total_fdi,KNN,-0.040429
8,29,emission_share_end_to_end_agrifood,KNN,0.406339
9,29,emission_share_ipcc_agriculture,KNN,0.996027


In [361]:
# Build metrics_knn

# Merge truth and KNN predictions
eval_df_knn = (
    y_true_df.merge(
        y_pred_df_knn[['row_id', 'target', 'y_pred']],
        on=['row_id', 'target'],
        how='inner'
    )
)
eval_df_knn.head(10)

Unnamed: 0,row_id,target,y_true,y_pred
0,29,emission_share_agri_waste_mgt,7.49,-0.121991
1,29,total_fdi_inflows,556.430175,-0.281669
2,29,emission_share_farmgate,53.74,0.907182
3,29,emission_share_land_use_change,0.0,-0.5881
4,29,emission_share_energy_use,50.75,-0.145293
5,29,emission_share_crops,2.83,0.939926
6,29,emission_share_pre_and_post_production,15.7,0.346702
7,29,value_added_aff_per_total_fdi,3.003491,-0.040429
8,29,emission_share_end_to_end_agrifood,69.44,0.406339
9,29,emission_share_ipcc_agriculture,47.88,0.996027


In [360]:
# Per-target metrics loop (mirrors your LightGBM block)
metrics_rows = []
for col in targets:
    sub = eval_df_knn.loc[eval_df_knn["target"] == col]

    y_val   = sub["y_true"].astype(float)
    val_pred = sub["y_pred"].astype(float)

    # Metrics
    rmse = root_mean_squared_error(y_val, val_pred)
    mae  = mean_absolute_error(y_val, val_pred)
    r2   = r2_score(y_val, val_pred)

    # Normalizations
    mean_y = float(y_val.mean())
    std_y  = float(y_val.std(ddof=1))

    nrmse_mean = (rmse / mean_y) if mean_y != 0 else np.nan
    nrmse_std  = (rmse / std_y)  if std_y  != 0 else np.nan

    mape = (np.abs((y_val - val_pred)/y_val)
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
            .mean()*100)

    n_train = int(df_train[col].notna().sum())
    n_val   = int(len(y_val))

    metrics_rows.append({
        "method": "KNN",
        "target": col,
        "n_train": n_train,
        "n_val": n_val,
        "RMSE": np.round(rmse, 3),
        "MAE": np.round(mae, 3),
        "R2": np.round(r2, 3),
        "nRMSE_mean": np.round(nrmse_mean, 3),
        "nRMSE_std": np.round(nrmse_std, 3),
        "MAPE(%)": np.round(mape, 3)
    })

metrics_knn = pd.DataFrame(metrics_rows, columns=[
    "method","target","n_train","n_val","RMSE","MAE","R2","nRMSE_mean","nRMSE_std","MAPE(%)"
])
metrics_knn

Unnamed: 0,method,target,n_train,n_val,RMSE,MAE,R2,nRMSE_mean,nRMSE_std,MAPE(%)
0,KNN,emission_share_agri_waste_mgt,3123,112,9.492,8.171,-1.862,1.162,1.684,109.48
1,KNN,total_fdi_inflows,3123,112,54179.843,14321.9,-0.072,3.859,1.031,100.116
2,KNN,emission_share_farmgate,3123,112,31.151,25.325,-1.691,1.232,1.633,122.833
3,KNN,emission_share_land_use_change,3123,112,20.009,10.075,-0.206,1.991,1.093,507.881
4,KNN,emission_share_energy_use,3123,112,64.468,57.827,-3.816,1.114,2.185,101.974
5,KNN,emission_share_crops,3123,112,1.881,1.699,-0.296,1.099,1.133,670.147
6,KNN,emission_share_pre_and_post_production,3123,112,14.258,12.632,-2.642,1.127,1.9,105.589
7,KNN,value_added_aff_per_total_fdi,3123,112,29.764,11.14,-0.099,3.216,1.044,133.845
8,KNN,emission_share_end_to_end_agrifood,3123,112,53.694,48.031,-3.664,1.119,2.15,101.656
9,KNN,emission_share_ipcc_agriculture,3123,112,26.137,20.228,-1.243,1.295,1.491,146.09


In [363]:
metrics_knn.to_html('metrics_knn.html', index=False)