In [132]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sdv
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error, r2_score, root_mean_squared_error
)
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

In [153]:
df_raw = pd.read_csv('dataset_1_item_independent.csv')
df_raw.shape

(3267, 94)

In [149]:
col_miss = df_raw.isna().sum().sort_values()
col_miss.tail(15)

emission_share_agri_waste_mgt              32
total_fdi_inflows                          32
emission_share_farmgate                    32
emission_share_land_use_change             32
emission_share_energy_use                  32
emission_share_crops                       32
emission_share_pre_and_post_production     32
value_added_aff_per_total_fdi              32
emission_share_end_to_end_agrifood         32
emission_share_ipcc_agriculture            32
total_pesticide_export_value               46
phosphorus_production                      47
potassium_agri_use                         48
emission_share_livestock                   54
aoi_credit_to_ag_forest_fish              979
dtype: int64

In [154]:
targets= col_miss.loc[col_miss > 0].keys()
targets

Index(['emission_share_agri_waste_mgt', 'total_fdi_inflows',
       'emission_share_farmgate', 'emission_share_land_use_change',
       'emission_share_energy_use', 'emission_share_crops',
       'emission_share_pre_and_post_production',
       'value_added_aff_per_total_fdi', 'emission_share_end_to_end_agrifood',
       'emission_share_ipcc_agriculture', 'total_pesticide_export_value',
       'phosphorus_production', 'potassium_agri_use',
       'emission_share_livestock', 'aoi_credit_to_ag_forest_fish'],
      dtype='object')

In [155]:

df = df_raw.copy()

# Reproducibility
SEED = 42
rng = np.random.default_rng(SEED)

# Pool of rows fully observed on all 15 TARGETS
pool_rows = df.loc[df[targets].notna().all(axis=1)]
pool_rows

Unnamed: 0,area,area_code,year_code,year,area_agri_land,area_arable_land,area_cropland,area_with_irrigation,area_permanent_crops,area_temporary_crops,...,least_developed_country,land_locked_developing_country,small_island_developing_state,low_income_food_deficit_country,net_food_importing_developing_country,temp_change_meteorological_year,temp_change_dec_jan_feb,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_mar_apr_may
29,Albania,3,2007,2007,1119.0,578.0,698.0,356.5,120.0,186.0380,...,0.0,0.0,0.0,0.0,0.0,1.389,1.741,2.519,-0.718,2.015
30,Albania,3,2008,2008,1181.0,610.0,697.0,348.0,87.0,206.0000,...,0.0,0.0,0.0,0.0,0.0,1.043,0.238,1.862,0.685,1.387
31,Albania,3,2009,2009,1201.3,609.0,696.0,339.5,87.0,202.0000,...,0.0,0.0,0.0,0.0,0.0,0.977,0.390,1.261,0.873,1.383
32,Albania,3,2010,2010,1201.3,626.0,696.0,331.0,70.0,202.0000,...,0.0,0.0,0.0,0.0,0.0,1.261,1.234,1.607,0.932,1.271
33,Albania,3,2011,2011,1201.0,622.0,696.0,332.0,74.0,205.0000,...,0.0,0.0,0.0,0.0,0.0,1.125,0.630,1.659,0.970,1.243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3262,Zambia,251,2005,2005,22762.0,2727.0,2762.0,156.0,35.0,1967.3335,...,1.0,1.0,0.0,0.0,1.0,1.151,0.986,1.288,0.904,1.426
3263,Zambia,251,2006,2006,23048.0,3013.0,3048.0,156.0,35.0,2188.7428,...,1.0,1.0,0.0,0.0,1.0,0.760,1.013,0.906,0.406,0.715
3264,Zambia,251,2007,2007,22984.0,2949.0,2984.0,156.0,35.0,2148.4400,...,1.0,1.0,0.0,0.0,1.0,0.962,0.915,1.087,0.811,1.034
3265,Zambia,251,2008,2008,23087.0,3052.0,3087.0,156.0,35.0,2250.1707,...,1.0,1.0,0.0,0.0,1.0,0.518,0.270,0.638,1.238,-0.076


In [156]:
# For each 'area', pick exactly 1 row at random
val_per_area = (
    pool_rows.groupby('area', group_keys=False).apply(lambda g: g.sample(n=1, random_state=SEED))
)
val_per_area


  pool_rows.groupby('area', group_keys=False).apply(lambda g: g.sample(n=1, random_state=SEED))


Unnamed: 0,area,area_code,year_code,year,area_agri_land,area_arable_land,area_cropland,area_with_irrigation,area_permanent_crops,area_temporary_crops,...,least_developed_country,land_locked_developing_country,small_island_developing_state,low_income_food_deficit_country,net_food_importing_developing_country,temp_change_meteorological_year,temp_change_dec_jan_feb,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_mar_apr_may
29,Albania,3,2007,2007,1119.0000,578.0000,698.0000,356.500000,120.0,186.0380,...,0.0,0.0,0.0,0.0,0.0,1.389000,1.741000,2.519000,-0.718000,2.01500
55,Angola,7,2019,2019,45877.0000,5363.0000,5680.0000,85.530000,317.0,4119.4037,...,1.0,0.0,0.0,0.0,1.0,1.785000,1.535000,1.910000,1.240000,2.45600
75,Antigua and Barbuda,8,2023,2023,9.0000,4.0000,5.0000,0.285000,1.0,2.9842,...,0.0,0.0,1.0,0.0,1.0,1.116000,0.544000,1.456000,1.755000,0.70800
100,Argentina,9,2018,2018,115930.2309,40181.2309,41249.2309,2357.000000,1068.0,34172.4713,...,0.0,0.0,0.0,0.0,0.0,0.860000,1.222000,-0.415000,1.064000,1.57000
121,Armenia,1,2016,2016,1676.8000,446.4000,504.4000,217.000000,58.0,353.5000,...,0.0,1.0,0.0,0.0,0.0,1.331000,1.986000,1.717000,-0.399000,2.01900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3171,Uruguay,234,2016,2016,14265.3000,2226.3000,2265.3000,258.000000,39.0,1508.7000,...,0.0,0.0,0.0,0.0,0.0,0.362000,1.371000,0.000000,0.462000,-0.38400
3200,Vanuatu,155,2022,2022,187.0000,20.0000,145.0000,3.820499,125.0,15.5742,...,0.0,0.0,1.0,0.0,0.0,1.285000,0.907000,1.890000,1.616000,0.72800
3223,Viet Nam,237,2022,2022,12315.0000,6754.0000,11673.0000,4585.000000,4920.0,6754.0000,...,0.0,0.0,0.0,0.0,0.0,0.945000,0.716000,0.997000,1.345000,0.72300
3234,Yemen,249,2010,2010,23579.0000,1291.0000,1579.0000,680.000000,288.0,1173.0000,...,1.0,0.0,0.0,1.0,1.0,2.416667,2.737917,2.456917,2.569417,1.90175


In [157]:
categorical_cols = ["area", "region", "sub_region"]
for c in categorical_cols:
    if c in df.columns:
        df[c] = df[c].astype('category')

# Original indices of selected validation rows (these align with the original df)
val_index = val_per_area.index

# carve out validation (clean copy) and training sets
df_val_clean = df.loc[val_index].copy()
df_train = df.drop(index=val_index).copy()

# Make a masked copy for later masking step
df_val_masked = df_val_clean.copy()

In [158]:
# Record the mask coordinates (row_id, col) for every TARGET in the val set
mask_records = []
for idx in df_val_clean.index:
    for col in targets:
        mask_records.append((int(idx), col))

# build a table with true values of 15 target cols for scoring later
y_true_df = pd.DataFrame(mask_records, columns=['row_id', 'target'])

In [159]:
r_idx = df_val_clean.index.get_indexer(y_true_df['row_id'])
c_idx = df_val_clean.columns.get_indexer(y_true_df['target'])

y_true_df['y_true'] = df_val_clean.to_numpy()[r_idx, c_idx]
y_true_df.head()

Unnamed: 0,row_id,target,y_true
0,29,emission_share_agri_waste_mgt,7.49
1,29,total_fdi_inflows,556.430175
2,29,emission_share_farmgate,53.74
3,29,emission_share_land_use_change,0.0
4,29,emission_share_energy_use,50.75


In [160]:
# Set all target cells in the validation copy to NaN
df_val_masked.loc[:, targets] = np.nan

In [142]:
import lightgbm as lgb
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

In [165]:

categorical_cols = ["area", "region", "sub_region"]
exclude_cols = ["area_code", "area_code_m49", "year_code"]


# Move categoricals to string
for c in categorical_cols:
    if c in df.columns:
        df[c] = df[c].astype('category')


# Store evaluation metrics for each imputed column
pred_records = []

# Loop over each column that needs imputation
for col in targets:
    # Features: all predictors except the target column and excluded ones
    X_cols = [c for c in df.columns if c != col and c not in exclude_cols]

    # train rows where the target column is observed
    train_mask = df_train[col].notna()
    X_train = df_train.loc[train_mask, X_cols]
    y_train = df_train.loc[train_mask, col]
    
    # validation rows where the target is not observed
    X_val = df_val_masked.loc[:, X_cols]

    # Choose objective
    y_min = y_train.min()

    if y_min >= 0:
        obj = "tweedie"
    else:
        obj = "regression"

    model = lgb.LGBMRegressor(
        objective=obj,
        n_estimators=800,
        random_state=SEED,
        n_jobs=-1,
        verbosity=-1
    )

    model.fit(X_train, y_train)

    # predict for validation set rows
    y_pred = np.round(model.predict(X_val), 3)

    # Collect long-form predictions
    pred_records.extend([
        {"row_id": int(rid), "target": col, "method": "LightGBM", "y_pred": float(p)}
        for rid, p in zip(X_val.index, y_pred)
    ])

# Assemble predictions for downstream metrics merge
y_pred_df_lgbm = pd.DataFrame(pred_records, columns=["row_id", "target", "method", "y_pred"])
y_pred_df_lgbm

Unnamed: 0,row_id,target,method,y_pred
0,29,emission_share_agri_waste_mgt,LightGBM,-665.731
1,55,emission_share_agri_waste_mgt,LightGBM,-637.065
2,75,emission_share_agri_waste_mgt,LightGBM,-744.993
3,100,emission_share_agri_waste_mgt,LightGBM,-521.210
4,121,emission_share_agri_waste_mgt,LightGBM,-502.367
...,...,...,...,...
1675,3171,aoi_credit_to_ag_forest_fish,LightGBM,0.000
1676,3200,aoi_credit_to_ag_forest_fish,LightGBM,0.000
1677,3223,aoi_credit_to_ag_forest_fish,LightGBM,0.000
1678,3234,aoi_credit_to_ag_forest_fish,LightGBM,0.000


In [166]:
# Merge truth and predictions (only LightGBM here)
eval_df = (
    y_true_df.merge(
        y_pred_df_lgbm[["row_id", "target", "y_pred"]],
        on=["row_id", "target"],
        how="inner"
    )
)
eval_df

Unnamed: 0,row_id,target,y_true,y_pred
0,29,emission_share_agri_waste_mgt,7.49,-665.731
1,29,total_fdi_inflows,556.430175,-10551.934
2,29,emission_share_farmgate,53.74,-6645.764
3,29,emission_share_land_use_change,0.0,-2.265
4,29,emission_share_energy_use,50.75,-6325.301
...,...,...,...,...
1675,3248,total_pesticide_export_value,659.3456,722.248
1676,3248,phosphorus_production,0.0,0.000
1677,3248,potassium_agri_use,13148.0,11618.367
1678,3248,emission_share_livestock,6.67,67.385


In [None]:
# per-target metrics table
metrics_rows = []
for col in targets:
    sub = eval_df.loc[eval_df["target"]==col]
    
    y_val = sub["y_true"].astype(float)
    val_pred = sub["y_pred"].astype(float)

    # Metrics
    rmse = root_mean_squared_error(y_val, val_pred)
    mae = mean_absolute_error(y_val, val_pred)
    r2  = r2_score(y_val, val_pred)

    # Normalizations
    mean_y = float(y_val.mean())
    std_y  = float(y_val.std(ddof=1))  # sample std (ddof=1) is typical; either is fine if consistent

    nrmse_mean = (rmse / mean_y) if mean_y != 0 else np.nan
    nrmse_std  = (rmse / std_y)  if std_y  != 0 else np.nan

    # Mean Absolute Percentage Error (ignoring inf/NaN cases)
    mape = (np.abs((y_val - val_pred)/y_val)
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
            .mean()*100)

    # n_train for this target = observed count in TRAIN
    n_train = int(df_train[col].notna().sum())
    n_val   = int(len(y_val))  

    metrics_rows.append({
        "method": "LightGBM",
        "target": col,
        "n_train": n_train,
        "n_val": n_val,
        "RMSE": rmse,
        "MAE": mae,
        "R2": r2,
        "nRMSE_mean": nrmse_mean,
        "nRMSE_std": nrmse_std,
        "MAPE(%)": mape
    })

metrics_lgbm = pd.DataFrame(metrics_rows, columns=[
    "method","target","n_train","n_val","RMSE","MAE","R2","nRMSE_mean","nRMSE_std","MAPE(%)"
])
metrics_lgbm

Unnamed: 0,method,target,n_train,n_val,RMSE,MAE,R2,nRMSE_mean,nRMSE_std,MAPE(%)
0,LightGBM,emission_share_agri_waste_mgt,3123,112,676.0257,633.410875,-14514.48215,82.771566,119.941151,12766.613195
1,LightGBM,total_fdi_inflows,3123,112,39649.15,21683.688021,0.425938,2.824359,0.754278,9097.175441
2,LightGBM,emission_share_farmgate,3123,112,5686.179,5524.954455,-89667.998564,224.933112,298.108005,109140.145588
3,LightGBM,emission_share_land_use_change,3123,112,5.530177,3.495661,0.907874,0.550276,0.302164,1219.732821
4,LightGBM,emission_share_energy_use,3123,112,5609.277,5454.790982,-36455.394136,96.953232,190.081274,15177.844944
5,LightGBM,emission_share_crops,3123,112,124.1004,120.057223,-5642.177693,72.503319,74.784973,55781.214214
6,LightGBM,emission_share_pre_and_post_production,3123,112,930.6367,914.140759,-15513.515879,73.532304,123.999973,10757.13924
7,LightGBM,value_added_aff_per_total_fdi,3123,112,755.8535,570.278248,-708.073039,81.667594,26.509282,396617.414972
8,LightGBM,emission_share_end_to_end_agrifood,3123,112,7230.918,7044.046223,-84574.583127,150.689398,289.517606,20891.429746
9,LightGBM,emission_share_ipcc_agriculture,3123,112,2494.86,2401.433929,-20438.222454,123.579747,142.326137,118123.206663


## Tabular Variational Auto-Encoder

In [169]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

In [None]:
rng = np.random.default_rng(SEED)

In [173]:
# Prepare train data
tv_train = df_train.drop(columns=[c for c in exclude_cols if c in df_train.columns], errors="ignore").copy()

# Year to numeric
tv_train["year"] = pd.to_numeric(tv_train["year"], errors="coerce")

# Ensure SDV-friendly dtypes for declared categoricals
for c in categorical_cols:
    if c in tv_train.columns:
        tv_train[c] = tv_train[c].astype("object")

# identify numeric columns
numeric_cols = tv_train.select_dtypes(include=[np.number]).columns.to_list()

# detect binary {0,1} among numeric columns
for c in numeric_cols:
    uv = pd.Series(tv_train[c]).dropna().unique()
    if len(uv) > 0 and set(uv).issubset({0, 1, 0.0, 1.0}):
        binary_cols.append(c)
        tv_train[c] = tv_train[c].astype("Int64").astype("boolean")

# Build and lock Metadata
md = SingleTableMetadata()
md.detect_from_dataframe(tv_train)  # baseline detection

# Force sdtypes per our simple rules
for c in tv_train.columns:
    if c in categorical_cols:
        md.update_column(c, sdtype="categorical")
    elif c in binary_cols:
        md.update_column(c, sdtype="boolean")
    else:
        md.update_column(c, sdtype="numerical")

md.validate()


In [None]:
# Fit TVAE on train
np.random.seed(SEED)
synth = TVAESynthesizer(
    metadata=md,
    epochs=100,            
    batch_size=512,
    embedding_dim=64,
    compress_dims=(128, 64),
    decompress_dims=(64, 128),
    l2scale=1e-5,
    verbose=True,
    cuda=None   
)

synth.fit(tv_train)

In [None]:
# Step 3a: Build light conditions with only 1 categoricals fixed
categorical_keys = ["area"]

# Make a conditions copy the same shape as df
tv_val_masked = df_val_masked.drop(columns=[c for c in exclude_cols if c in df_val_masked.columns], errors="ignore").copy()

# Coerce 'year' numeric & categoricals to object to mirror TRAIN
if "year" in tv_val_masked.columns:
    tv_val_masked["year"] = pd.to_numeric(tv_val_masked["year"], errors="coerce")
for c in categorical_cols:
    if c in tv_val_masked.columns:
        tv_val_masked[c] = tv_val_masked[c].astype("object")

        
conditions_df = tv_val_masked.copy()

# 1) Work only on rows that have ANY missing among the targets
need_mask = conditions_df[targets].isna().any(axis=1)
need_idx = conditions_df.index[need_mask]

# 2) Build known-conditions for *those* rows (keys have no missing)
known = conditions_df.loc[need_idx, categorical_keys].copy()

# 3) Sample remaining columns for exactly those rows
samples = synth.sample_remaining_columns(
    known_columns=known,
    batch_size=1024,          
    max_tries_per_batch=500
)


Sampling remaining columns:   5%|▌         | 6/112 [08:57<1:44:20, 59.06s/it] 