In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sdv
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error, r2_score, root_mean_squared_error
)
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

In [15]:
# Load the item-independent dataset (path is relative to the notebook's
# working directory) into the raw, never-mutated source frame.
df_raw = pd.read_csv("dataset_1_item_independent.csv")

# Cell output: (number of rows, number of columns).
df_raw.shape

(3267, 94)

In [None]:
# Count missing values per column and order the counts ascending, so the
# columns with the most gaps end up at the bottom of the Series.
col_miss = (
    df_raw
    .isna()
    .sum()
    .sort_values()
)

# Cell output: the 15 columns with the highest number of missing values.
col_miss.tail(n=15)

emission_share_agri_waste_mgt              32
total_fdi_inflows                          32
emission_share_farmgate                    32
emission_share_land_use_change             32
emission_share_energy_use                  32
emission_share_crops                       32
emission_share_pre_and_post_production     32
value_added_aff_per_total_fdi              32
emission_share_end_to_end_agrifood         32
emission_share_ipcc_agriculture            32
total_pesticide_export_value               46
phosphorus_production                      47
potassium_agri_use                         48
emission_share_livestock                   54
aoi_credit_to_ag_forest_fish              979
dtype: int64

In [45]:
# Step 1: create and lock metadata
# Work on a copy so df_raw stays untouched and this cell can be re-run as-is.
df = df_raw.copy()

categorical_cols = ["area", "region", "sub_region", "year"]
exclude_cols = ["area_code", "area_code_m49", "year_code"]

# `columns=` already selects the axis, so no extra `axis` argument is needed.
df = df.drop(columns=exclude_cols)

# Turn year labels into text before the object cast below; going through the
# nullable 'string' dtype keeps missing entries missing instead of "nan".
# Guarded like the loop below so a dataset without `year` does not crash here.
if 'year' in df.columns:
    df['year'] = df['year'].astype('string')
for c in categorical_cols:
    if c in df.columns:
        df[c] = df[c].astype('object')

# identify numeric and binary columns
# (numeric_cols is computed before the boolean mapping below, so it still
# contains the binary columns afterwards)
numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
numeric_cols = [c for c in numeric_cols if c not in categorical_cols]

binary_cols = []
for c in numeric_cols:
    unique_vals = df[c].dropna().unique()
    # A column is binary when its non-missing values are only 0 and/or 1;
    # all-missing columns (no unique values) stay numeric.
    if len(unique_vals) > 0 and set(unique_vals).issubset({0, 1}):
        binary_cols.append(c)
        df[c] = df[c].map({0: False, 1: True})

# build metadata
md = SingleTableMetadata()
md.detect_from_dataframe(df)  # baseline detection

# Override the detected sdtypes with the intended ones.
# Only columns that really hold numeric data are forced to "numerical"; any
# other column (e.g. an unexpected text column) keeps its detected sdtype
# instead of being mislabelled as a number.
for c in df.columns:
    if c in categorical_cols:
        md.update_column(c, sdtype="categorical")
    elif c in binary_cols:
        md.update_column(c, sdtype="boolean")
    elif c in numeric_cols:
        md.update_column(c, sdtype="numerical")

md.validate()

print("Locked sdtypes:")
for c in df.columns:
    print(f"{c} -> {md.columns[c]['sdtype']}")

Locked sdtypes:
area -> categorical
year -> categorical
area_agri_land -> numerical
area_arable_land -> numerical
area_cropland -> numerical
area_with_irrigation -> numerical
area_permanent_crops -> numerical
area_temporary_crops -> numerical
value_per_unit_agri_land -> numerical
cropland_area_per_capita -> numerical
gross_domestic_product -> numerical
gross_fixed_capital_formation -> numerical
value_added_ag_forest_fish -> numerical
gdp_annual_growth -> numerical
gfcf_annual_growth -> numerical
value_added_ag_forest_fish_annual_growth -> numerical
gfcf_share_in_total_gdp -> numerical
ag_forest_fish_share_in_total_gdp -> numerical
total_fdi_inflows -> numerical
agri_orientation_index_govt_expenditure -> numerical
govt_expenditure_on_ag_forest_fish -> numerical
total_govt_expenditure -> numerical
credit_to_ag_forest_fish -> numerical
credit_to_ag_forest_fish_share_total_credit -> numerical
aoi_credit_to_ag_forest_fish -> numerical
total_credit -> numerical
emission_share_farmgate -> num

In [None]:
# Step 2: Fit a tiny TVAE (smoke test)
# Seed both random number generators in play: NumPy's global RNG and
# PyTorch's. Seeding NumPy alone leaves the torch-side randomness unseeded,
# so repeated runs of this cell would not be reproducible.
np.random.seed(42)
torch.manual_seed(42)

# Deliberately small network and few epochs: this run only checks that the
# metadata and data can be fitted end to end, not model quality.
synth = TVAESynthesizer(
    metadata=md,
    epochs=30,
    batch_size=512,
    embedding_dim=64,
    compress_dims=(128, 64),
    decompress_dims=(64, 128),
    l2scale=1e-5,
    verbose=True,
    # NOTE(review): how `None` is interpreted for `cuda` depends on the
    # installed SDV version — confirm the intended device (CPU vs. GPU).
    cuda=None
)

synth.fit(df)
print("TVAE smoke-fit complete.")

In [None]:
# Step 3a: Build light conditions with only 4 categoricals fixed
categorical_keys = ["area", "region", "sub_region", "year"]

# Start from a full copy of df so the conditions frame has the same rows
# and columns as the training data.
conditions_df = df.copy()

# Every column outside the key set is overwritten with pd.NA: those are the
# values the synthesizer is expected to generate.
for col in [name for name in conditions_df.columns if name not in categorical_keys]:
    conditions_df[col] = pd.NA

print("Conditions DataFrame ready. Columns fixed (non-NaN):", categorical_keys)
print(conditions_df[categorical_keys].head())

Conditions DataFrame ready. Columns fixed (non-NaN): ['area', 'region', 'sub_region', 'year']
          area region  sub_region  year
0  Afghanistan   Asia  South Asia  2001
1  Afghanistan   Asia  South Asia  2002
2  Afghanistan   Asia  South Asia  2003
3  Afghanistan   Asia  South Asia  2004
4  Afghanistan   Asia  South Asia  2005


In [None]:
# Step 3b: Sample remaining columns from TVAE using only the 4 keys

# Pass just the key columns as the known values; the fitted synthesizer
# generates the rest of each row while the keys are kept as given.
sampled_full = synth.sample_remaining_columns(
    known_columns=conditions_df[categorical_keys]
)

# Cell output: first rows of the sampled table.
sampled_full.head()