## Data Imputation (item-dependent columns)

In [9]:
# Load the libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error, r2_score, root_mean_squared_error)
from lightgbm import LGBMRegressor

In [10]:
# Read dataset_1_filtered.csv into a DataFrame and preview the first rows
dataset_1_path = '/Users/gurjitsingh/Desktop/MS Data Science/MS_Project_Python/cleaned_datasets/dataset_1_filtered.csv'
df = pd.read_csv(dataset_1_path)
df.head()

Unnamed: 0,area,item,area_code,year_code,year,item_code,producer_price_index,area_harvested,production,yield,...,value_added_aff_per_total_fdi,area_code_m49,region,sub_region,european_union_country,least_developed_country,land_locked_developing_country,small_island_developing_state,low_income_food_deficit_country,net_food_importing_developing_country
0,Afghanistan,"Almonds, in shell",2,2001,2001,221,38.92,9000.0,15000.0,1666.7,...,3039.575733,4,Asia,South Asia,0.0,1.0,1.0,0.0,1.0,1.0
1,Afghanistan,"Almonds, in shell",2,2002,2002,221,35.17,5500.0,11773.99,2140.7,...,39.664672,4,Asia,South Asia,0.0,1.0,1.0,0.0,1.0,1.0
2,Afghanistan,"Almonds, in shell",2,2003,2003,221,41.73,5700.0,14000.0,2456.1,...,37.788767,4,Asia,South Asia,0.0,1.0,1.0,0.0,1.0,1.0
3,Afghanistan,"Almonds, in shell",2,2004,2004,221,42.67,12000.0,14700.0,1225.0,...,11.065925,4,Asia,South Asia,0.0,1.0,1.0,0.0,1.0,1.0
4,Afghanistan,"Almonds, in shell",2,2005,2005,221,42.29,11768.0,15630.0,1328.1,...,9.244645,4,Asia,South Asia,0.0,1.0,1.0,0.0,1.0,1.0


In [11]:
# (rows, columns) of the dataset as loaded from disk
df.shape

(137356, 106)

In [13]:
# Columns whose values depend on both the area and the item
cols_item_dependent = [
    'item',
    'export_quantity', 'export_value',
    'import_quantity', 'import_value',
    'area_harvested', 'production', 'yield',
    'gross_production_value', 'gross_production_index',
    'producer_price_index',
]

# Keep the area/year keys alongside the item-dependent columns and preview
# the first 20 rows
key_cols = ['area', 'year']
df_item_depend = df.loc[:, key_cols + cols_item_dependent]
df_item_depend.head(20)

Unnamed: 0,area,year,item,export_quantity,export_value,import_quantity,import_value,area_harvested,production,yield,gross_production_value,gross_production_index,producer_price_index
0,Afghanistan,2001,"Almonds, in shell",10900.0,3000.0,,,9000.0,15000.0,1666.7,,53.26,38.92
1,Afghanistan,2002,"Almonds, in shell",7355.0,2041.0,,,5500.0,11773.99,2140.7,,41.81,35.17
2,Afghanistan,2003,"Almonds, in shell",7395.0,3219.0,,,5700.0,14000.0,2456.1,,49.71,41.73
3,Afghanistan,2004,"Almonds, in shell",6163.0,3342.0,,,12000.0,14700.0,1225.0,,52.2,42.67
4,Afghanistan,2005,"Almonds, in shell",4719.0,5888.0,,,11768.0,15630.0,1328.1,,55.5,42.29
5,Afghanistan,2006,"Almonds, in shell",2871.0,1729.0,,,12000.0,20000.0,1666.7,,71.02,61.61
6,Afghanistan,2007,"Almonds, in shell",3150.0,4315.0,,,12000.0,31481.0,2623.4,,111.78,80.93
7,Afghanistan,2008,"Almonds, in shell",3285.0,6440.0,,,12000.0,42000.0,3500.0,,149.13,100.25
8,Afghanistan,2009,"Almonds, in shell",11065.0,23966.0,,,11029.0,43183.0,3915.4,,153.33,87.21
9,Afghanistan,2010,"Almonds, in shell",778.0,1915.0,,,11210.0,56000.0,4995.5,,198.84,101.14


In [14]:
# (rows, columns) of the item-dependent subset
df_item_depend.shape

(137356, 13)

In [15]:
# Percentage of missing values in each item-dependent column
item_depend_missing_fraction = df_item_depend.isna().mean()
item_depend_missing_fraction.mul(100)

area                       0.000000
year                       0.000000
item                       0.000000
export_quantity           22.853024
export_value              22.762748
import_quantity           20.511663
import_value              20.491278
area_harvested             1.333760
production                 1.301727
yield                      1.721803
gross_production_value    18.595475
gross_production_index     3.774134
producer_price_index       0.000000
dtype: float64

The columns 'gross_production_value' (GPV) and 'gross_production_index' (GPI) 
carry essentially the same information. GPI expresses the relative change in gross 
production value over time as an index rather than in dollar amounts; it is 
basically GPV rescaled relative to the 2014-2016 base period. GPV is also missing 
far more often than GPI (about 18.6% versus 3.8% of rows). We can therefore drop 
'gross_production_value': keeping both columns would most likely add 
multicollinearity to the dataset. 

In [16]:
# Drop GPV and keep GPI. This cell reassigns `df_item_depend`, so re-running it
# would otherwise raise a KeyError once the column is already gone;
# errors='ignore' makes the cell idempotent.
df_item_depend = df_item_depend.drop(columns='gross_production_value', errors='ignore')
df_item_depend.shape

(137356, 12)

Let's explore some items to understand the missingness pattern in these columns.

In [17]:
# Percentage of missing values per column for 'Oilcrops, Oil Equivalent'
is_oilcrops = df_item_depend['item'].eq('Oilcrops, Oil Equivalent')
df_item_depend.loc[is_oilcrops].isna().mean().mul(100)

area                        0.000000
year                        0.000000
item                        0.000000
export_quantity           100.000000
export_value              100.000000
import_quantity           100.000000
import_value              100.000000
area_harvested              5.169575
production                  5.268357
yield                       5.334211
gross_production_index    100.000000
producer_price_index        0.000000
dtype: float64

In [18]:
# Percentage of missing values per column for 'Seed cotton, unginned'
is_seed_cotton = df_item_depend['item'].eq('Seed cotton, unginned')
df_item_depend.loc[is_seed_cotton].isna().mean().mul(100)

area                        0.000000
year                        0.000000
item                        0.000000
export_quantity           100.000000
export_value              100.000000
import_quantity           100.000000
import_value              100.000000
area_harvested              0.375375
production                  0.375375
yield                       0.600601
gross_production_index      0.600601
producer_price_index        0.000000
dtype: float64

In [19]:
# First 20 rows for 'Seed cotton, unginned'
seed_cotton_rows = df_item_depend.loc[df_item_depend['item'].eq('Seed cotton, unginned')]
seed_cotton_rows.head(20)

Unnamed: 0,area,year,item,export_quantity,export_value,import_quantity,import_value,area_harvested,production,yield,gross_production_index,producer_price_index
460,Afghanistan,2001,"Seed cotton, unginned",,,,,50000.0,57000.0,1140.0,112.27,48.48
461,Afghanistan,2002,"Seed cotton, unginned",,,,,50000.0,57000.0,1140.0,112.27,49.07
462,Afghanistan,2003,"Seed cotton, unginned",,,,,30000.0,33000.0,1100.0,65.0,49.07
463,Afghanistan,2004,"Seed cotton, unginned",,,,,25500.0,28000.0,1098.0,55.15,58.02
464,Afghanistan,2005,"Seed cotton, unginned",,,,,30000.0,33000.0,1100.0,65.0,61.3
465,Afghanistan,2006,"Seed cotton, unginned",,,,,31950.0,32000.0,1001.6,63.03,66.76
466,Afghanistan,2007,"Seed cotton, unginned",,,,,35000.0,35051.0,1001.5,69.04,85.52
467,Afghanistan,2008,"Seed cotton, unginned",,,,,35000.0,35000.0,1000.0,68.94,86.23
468,Afghanistan,2009,"Seed cotton, unginned",,,,,33000.0,42872.0,1299.2,84.45,86.23
469,Afghanistan,2010,"Seed cotton, unginned",,,,,33000.0,33000.0,1000.0,65.0,105.21


In [20]:
# Percentage of missing values per column for 'Cassava, fresh'
is_cassava = df_item_depend['item'].eq('Cassava, fresh')
df_item_depend.loc[is_cassava].isna().mean().mul(100)

area                       0.000000
year                       0.000000
item                       0.000000
export_quantity           93.829787
export_value              92.907801
import_quantity           95.531915
import_value              95.531915
area_harvested             0.425532
production                 0.212766
yield                      0.496454
gross_production_index     0.496454
producer_price_index       0.000000
dtype: float64

In [21]:
# Percentage of missing values per column for 'Groundnuts, excluding shelled'
is_groundnuts = df_item_depend['item'].eq('Groundnuts, excluding shelled')
df_item_depend.loc[is_groundnuts].isna().mean().mul(100)

area                       0.000000
year                       0.000000
item                       0.000000
export_quantity           67.178276
export_value              67.178276
import_quantity           63.164109
import_value              63.164109
area_harvested             1.416765
production                 1.416765
yield                      1.475797
gross_production_index     1.416765
producer_price_index       0.000000
dtype: float64

In [22]:
# Percentage of missing values per column for 'Sugar cane'
is_sugar_cane = df_item_depend['item'].eq('Sugar cane')
df_item_depend.loc[is_sugar_cane].isna().mean().mul(100)

area                       0.000000
year                       0.000000
item                       0.000000
export_quantity           77.388767
export_value              77.388767
import_quantity           77.097009
import_value              77.097009
area_harvested             0.437637
production                 0.437637
yield                      0.437637
gross_production_index     0.437637
producer_price_index       0.000000
dtype: float64

In [23]:
# Percentage of missing values per column for 'Raspberries'
is_raspberries = df_item_depend['item'].eq('Raspberries')
df_item_depend.loc[is_raspberries].isna().mean().mul(100)

area                       0.000000
year                       0.000000
item                       0.000000
export_quantity           59.546539
export_value              59.546539
import_quantity           61.097852
import_value              61.097852
area_harvested             0.357995
production                 0.357995
yield                      1.431981
gross_production_index     1.312649
producer_price_index       0.000000
dtype: float64

Some items show entire columns of missing data, especially for trade variables 
such as imports and exports. Imputing a full time series in those cases would be 
unreliable, so we remove every item that is missing more than 50% of its values in 
either the export-quantity or the import-quantity column before proceeding.

In [None]:
# Filtering out items that are missing over 50% of data in trade related columns.
# An item is removed when EITHER checked column exceeds the threshold.
MAX_TRADE_MISSING_FRACTION = 0.5
trade_cols_checked = ['export_quantity', 'import_quantity']

# Fraction of missing values per item for each checked column, computed in one
# grouped pass instead of re-scanning the whole frame twice per item.
# sort=False keeps the items in order of first appearance, i.e. the same order
# as df_item_depend['item'].unique().
trade_missing_by_item = (
    df_item_depend[trade_cols_checked]
    .isna()
    .groupby(df_item_depend['item'], sort=False)
    .mean()
)

# All distinct items, and the subset breaching the threshold in any checked column
commodities = trade_missing_by_item.index.tolist()
exceeds_threshold = (trade_missing_by_item > MAX_TRADE_MISSING_FRACTION).any(axis=1)
commodities_to_remove = trade_missing_by_item.index[exceeds_threshold.to_numpy()].tolist()

print(f"There are {len(commodities_to_remove)} items to be removed.")
commodities_to_remove

There are 30 items to be removed.


['Oilcrops, Oil Equivalent',
 'Seed cotton, unginned',
 'Sugar cane',
 'Broad beans and horse beans, green',
 'Okra',
 'Vetches',
 'Cassava, fresh',
 'Groundnuts, excluding shelled',
 'Yams',
 'Lupins',
 'Safflower seed',
 'Green corn (maize)',
 'Persimmons',
 'Raspberries',
 'Gooseberries',
 'Mixed grain',
 'Castor oil seeds',
 'Kenaf, and other textile bast fibres, raw or retted',
 'Pigeon peas, dry',
 'Taro',
 'Chicory roots',
 'Plantains and cooking bananas',
 'Yautia',
 'Cow peas, dry',
 'Sisal, raw',
 'Bambara beans, dry',
 'Fonio',
 'Melonseed',
 'Locust beans (carobs)',
 'Pyrethrum, dried flowers']

In [25]:
# Keep only the rows whose item is not in the removal list
is_removed_item = df_item_depend['item'].isin(commodities_to_remove)
df_item_depend_filtered = df_item_depend.loc[~is_removed_item]
df_item_depend_filtered.shape

(117207, 12)

In [27]:
# Importing the already-imputed item-independent dataset and previewing it
item_independent_path = '/Users/gurjitsingh/Desktop/MS Data Science/MS_Project_Python/dataset_1_item_independent_imputed.csv'
df_item_independent = pd.read_csv(item_independent_path)
df_item_independent.head()

Unnamed: 0,area,area_code,year_code,year,area_agri_land,area_arable_land,area_cropland,area_with_irrigation,area_permanent_crops,area_temporary_crops,...,least_developed_country,land_locked_developing_country,small_island_developing_state,low_income_food_deficit_country,net_food_importing_developing_country,temp_change_meteorological_year,temp_change_dec_jan_feb,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_mar_apr_may
0,Afghanistan,2,2001,2001,37795.0,7683.0,7795.0,3203.0,112.0,2502.0,...,1.0,1.0,0.0,1.0,1.0,1.377,0.433,1.09,1.209,2.778
1,Afghanistan,2,2002,2002,37790.0,7678.0,7790.0,3208.0,112.0,2111.0,...,1.0,1.0,0.0,1.0,1.0,1.457,1.722,0.991,1.589,1.524
2,Afghanistan,2,2003,2003,37884.0,7772.0,7884.0,3208.0,112.0,3761.0,...,1.0,1.0,0.0,1.0,1.0,0.71,1.105,0.947,0.787,0.002
3,Afghanistan,2,2004,2004,37928.0,7816.0,7928.0,3208.0,112.0,2934.0,...,1.0,1.0,0.0,1.0,1.0,1.482,1.959,0.789,1.182,1.999
4,Afghanistan,2,2005,2005,37917.0,7805.0,7917.0,3208.0,112.0,3385.0,...,1.0,1.0,0.0,1.0,1.0,0.513,-0.305,0.702,1.306,0.348


In [28]:
# (rows, columns) of the item-independent dataset
df_item_independent.shape

(3267, 93)

In [29]:
# Merging the item-independent and item-dependent datasets: every
# item-dependent row is kept and enriched with the columns for its (area, year).
df_merged = df_item_depend_filtered.merge(
    df_item_independent,
    on=['area', 'year'],
    how='left',
)

# A left join silently duplicates left rows whenever a key matches more than
# one row on the right. Fail loudly instead of imputing on a duplicated frame.
assert len(df_merged) == len(df_item_depend_filtered), (
    "Merge changed the row count: ('area', 'year') matches multiple rows "
    "in df_item_independent"
)

df_merged.shape

(117207, 103)

In [None]:
# Percentage of missing values per column of the merged dataset, sorted
# ascending; the tail therefore shows the 20 columns with the most missing data
merged_missing_fraction = df_merged.isna().mean()
miss_prop = merged_missing_fraction.mul(100).sort_values()
miss_prop.tail(20)

emission_share_pre_and_post_production          0.000000
emission_share_ipcc_agriculture                 0.000000
emission_share_farmgate                         0.000000
agri_orientation_index_govt_expenditure         0.000000
govt_expenditure_on_ag_forest_fish              0.000000
emission_share_land_use_change                  0.000000
total_govt_expenditure                          0.000000
temp_change_mar_apr_may                         0.000000
credit_to_ag_forest_fish_share_total_credit     0.000000
aoi_credit_to_ag_forest_fish                    0.000000
total_credit                                    0.000000
credit_to_ag_forest_fish                        0.000000
production                                      1.058810
area_harvested                                  1.094645
gross_production_index                          1.414591
yield                                           1.468342
import_value                                   10.147858
import_quantity                

In [31]:
# Names of the columns that still contain missing values, in ascending order
# of missingness (the order of miss_prop)
has_missing = miss_prop > 0
cols_to_impute = miss_prop[has_missing].index.tolist()
cols_to_impute

['production',
 'area_harvested',
 'gross_production_index',
 'yield',
 'import_value',
 'import_quantity',
 'export_value',
 'export_quantity']

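We use LightGBM to impute the missing values in the item-dependent columns: for each column with missing values, a regressor is trained on the rows where that column is observed and then used to predict the missing entries.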

In [33]:
# Work on a copy of the original dataset
df_merged_imputed = df_merged.copy()

# Identify columns with missing values that need imputation.
# Columns are processed in this order, and values imputed for an earlier column
# are visible as features when later columns are modelled.
cols_to_impute = [c for c in miss_prop.loc[miss_prop > 0].index]

# Tunable settings
TRAIN_YEARS = list(range(2001, 2022))  # training rows are restricted to 2001–2021 inclusive
MIN_ROWS_FOR_VALIDATION = 20           # below this, train on everything and skip validation
VAL_FRACTION = 0.1                     # share of observed rows held out for validation
METRIC_NAMES = ['RMSE', 'MAE', 'R2', 'nRMSE_mean', 'nRMSE_std', 'MAPE(%)']

# List of all potential predictor columns
predictors = df_merged_imputed.columns.to_list()

# Columns to exclude from predictors (likely identifiers/ not useful for ML)
exclude_cols = ['area_code', 'area_code_m49', 'year_code']

# Columns that should be explicitly treated as categorical variables
categoricals = ['area', 'item', 'region', 'sub_region']
for c in categoricals:
    if c in df_merged_imputed.columns:
        df_merged_imputed[c] = df_merged_imputed[c].astype('category')


def _validation_metrics(y_true, y_hat):
    """Return hold-out error metrics for one imputed column.

    Parameters
    ----------
    y_true : pandas.Series
        Observed target values of the validation rows.
    y_hat : numpy.ndarray
        Model predictions for the same rows, in the same order.

    Returns
    -------
    dict
        Keys 'RMSE', 'MAE', 'R2', 'nRMSE_mean', 'nRMSE_std' and 'MAPE(%)'.
        The normalised RMSE values are NaN when the mean / standard deviation
        of `y_true` is zero. MAPE ignores rows whose percentage error is
        infinite or NaN (i.e. rows where the observed value is zero).
    """
    rmse = root_mean_squared_error(y_true, y_hat)
    mean_y = y_true.mean()
    std_y = y_true.std()

    # Absolute percentage error per row, dropping division-by-zero artefacts
    abs_pct_err = (np.abs((y_true - y_hat) / y_true)
                   .replace([np.inf, -np.inf], np.nan)
                   .dropna())

    return {
        'RMSE': rmse,
        'MAE': mean_absolute_error(y_true, y_hat),
        'R2': r2_score(y_true, y_hat),
        'nRMSE_mean': rmse / mean_y if mean_y != 0 else np.nan,
        'nRMSE_std': rmse / std_y if std_y != 0 else np.nan,
        'MAPE(%)': abs_pct_err.mean() * 100,
    }


# 'year' is never imputed, so the training-year mask is the same for every column
in_train_years = df_merged_imputed['year'].isin(TRAIN_YEARS)

# Store evaluation metrics for each imputed column
metrics_rows = []

# Loop over each column that needs imputation
for col in cols_to_impute:
    # Mask for rows where the column is observed (non-missing) within the training years
    X_train_mask = df_merged_imputed[col].notna() & in_train_years
    # Mask for rows where the column is missing (to be predicted)
    pred_mask = df_merged_imputed[col].isna()

    # Features: all predictors except the target column and excluded ones
    X_cols = [c for c in predictors if c != col and c not in exclude_cols]

    # Training data (non-missing rows)
    X_train = df_merged_imputed.loc[X_train_mask, X_cols]
    y_train = df_merged_imputed.loc[X_train_mask, col]

    # Data for which predictions are needed
    X_pred = df_merged_imputed.loc[pred_mask, X_cols]

    # If enough data exists, hold out a validation split; otherwise train on everything
    if len(y_train) >= MIN_ROWS_FOR_VALIDATION:
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train, test_size=VAL_FRACTION, random_state=1
        )
    else:
        X_tr, y_tr = X_train, y_train
        X_val, y_val = None, None

    # LightGBM regressor with the Tweedie objective (targets are treated as
    # skewed, non-negative quantities)
    model = LGBMRegressor(
        objective='tweedie',
        n_estimators=800,
        random_state=2,
        n_jobs=-1,
        verbosity=-1
    )

    # Train the model on the training split only
    model.fit(X_tr, y_tr, categorical_feature=[c for c in categoricals if c in X_tr.columns])

    # One metrics row per column; metrics stay NaN when there is no validation set
    row = {'column': col, 'n_train': len(y_tr), 'n_val': 0,
           **dict.fromkeys(METRIC_NAMES, np.nan)}
    if X_val is not None and len(X_val) > 0:
        # Predictions clipped at 0 to enforce non-negativity
        val_pred = np.clip(model.predict(X_val), 0, None)
        row['n_val'] = len(y_val)
        row.update(_validation_metrics(y_val, val_pred))
    metrics_rows.append(row)

    # Predict and fill every row where the column is missing, regardless of
    # year (the year restriction above applies to training rows only)
    if pred_mask.any():
        y_pred = np.clip(model.predict(X_pred), 0, None)
        df_merged_imputed.loc[pred_mask, col] = y_pred

# Collect metrics into a DataFrame for inspection
impute_metrics = pd.DataFrame(metrics_rows).sort_values('column').reset_index(drop=True)
impute_metrics

Unnamed: 0,column,n_train,n_val,RMSE,MAE,R2,nRMSE_mean,nRMSE_std,MAPE(%)
0,area_harvested,95246,10583,101675.663156,10259.548773,0.995597,0.502734,0.06635,4.044125
1,export_quantity,83879,9320,349635.17187,17670.131397,0.943786,2.86583,0.237082,54.477809
2,export_value,83980,9332,109640.923476,7582.046139,0.958535,1.885035,0.20362,42.332058
3,gross_production_index,94967,10552,361.081086,25.729706,0.892116,2.536507,0.328442,18.04995
4,import_quantity,86361,9596,303464.714672,14643.053635,0.956276,2.949271,0.209092,95.799495
5,import_value,86386,9599,46159.929613,5321.396003,0.940091,1.179805,0.24475,38.345396
6,production,95274,10587,498547.016722,43388.618258,0.995334,0.513507,0.068304,4.747042
7,yield,94939,10549,3359.883483,699.104392,0.986019,0.253628,0.118234,5.738313


In [35]:
# Verify the imputation: show the ten columns with the highest remaining
# percentage of missing values (all should be 0)
remaining_missing_pct = df_merged_imputed.isna().mean().mul(100)
remaining_missing_pct.sort_values(ascending=False).head(10)

area                                         0.0
phosphorus_production                        0.0
cropland_nitrogen_per_unit_area              0.0
potassium_use_per_value_of_ag_production     0.0
potassium_use_per_capita                     0.0
potassium_use_per_area_of_cropland           0.0
potassium_import_quantity                    0.0
potassium_export_quantity                    0.0
potassium_agri_use                           0.0
phosphorus_use_per_value_of_ag_production    0.0
dtype: float64

In [36]:
# (rows, columns) of the imputed dataset
df_merged_imputed.shape

(117207, 103)

In [37]:
# Write the fully imputed dataset to a CSV file, leaving out the row index
output_csv_path = 'fully_imputed.csv'
df_merged_imputed.to_csv(output_csv_path, index=False)

All missing values in the dataset have now been imputed. The final, fully 
imputed dataset has 117207 rows and 103 columns.