In [184]:
# loading required libraries
import pandas as pd
import matplotlib.pyplot as plt

In [185]:
# Read the pre-processed dataset from disk and preview its first rows
csv_path = "pre_processed_data.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,area_code,area,year_code,year,item_code,item,producer_price,producer_price_index,area_harvested,production,...,total_pesticide_use_per_value_of_agri_production,total_pesticide_export_quantity,total_pesticide_export_value,total_pesticide_import_quantity,total_pesticide_import_value,temp_change_dec_jan_feb,temp_change_mar_apr_may,temp_change_jun_jul_aug,temp_change_sep_oct_nov,temp_change_meteorological_year
0,1,Armenia,1997,1997,15,Wheat,225.3,89.4,108000.0,183700.0,...,0.05,3.604,3.687,117.861,298.379,2.166,-0.319,0.87,0.184,0.725
1,1,Armenia,1998,1998,15,Wheat,205.6,83.91,118300.0,244300.0,...,0.06,3.604,3.687,207.171,340.124,-0.259,1.483,2.145,2.027,1.349
2,1,Armenia,1999,1999,15,Wheat,177.5,76.79,110129.0,214380.0,...,0.07,3.604,3.687,296.481,381.869,3.535,0.814,1.611,0.264,1.556
3,1,Armenia,2000,2000,15,Wheat,163.5,71.3,106581.0,177762.0,...,0.08,3.604,3.687,160.902,482.955,1.148,0.723,2.166,0.364,1.1
4,1,Armenia,2001,2001,15,Wheat,166.0,74.51,108554.0,241679.0,...,0.09,1.145,9.068,200.884,698.022,1.818,2.072,1.578,0.409,1.469


In [186]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple
data.shape

(189304, 98)

In [187]:
# Percentage of missing values per column, ordered from highest to lowest;
# only columns with more than 10% missing values are kept
pct_missing = data.isna().mean().mul(100)
prop_na = pct_missing.sort_values(ascending=False)
prop_na = prop_na[prop_na > 10]
prop_na

fdi_ag_forest_fish_share                      57.006191
fdi_ag_forest_fish                            57.006191
govt_expenditure_on_ag_forest_fish            47.790327
producer_price                                44.675760
agri_orientation_index_2015_usd               39.907767
credit_to_ag_forest_fish_share_totalcredit    39.907767
credit_to_ag_forest_fish_2015_usd             39.907767
afs_employment_share_in_total_employment      38.795271
total_employment_afs                          37.964333
total_govt_expenditure                        32.366458
agri_orientation_index_govt_expenditure       32.339517
area_temporary_crops                          32.057960
phosphorus_production                         27.906436
export_quantity                               26.466953
export_value                                  26.245087
gross_production_value                        26.147361
import_quantity                               25.503951
import_value                                  25

In [188]:
# Names of the columns kept in prop_na (i.e. those above the missingness cut-off)
feature_list = prop_na.index.tolist()

# Matching percentages of missing values, rounded to three decimals
prop_list = [round(val, 3) for val in prop_na.values]

# For every such column, count the distinct countries and items that have
# at least one non-missing observation
num_countries, num_items = [], []
for feature in feature_list:
    has_value = data[feature].notna()
    num_countries.append(len(data.loc[has_value, 'area'].unique()))
    num_items.append(len(data.loc[has_value, 'item'].unique()))

# One row per column: coverage by country / item alongside the % missing
df = pd.DataFrame(
    {
        'Feature': feature_list,
        'Number_of_countries': num_countries,
        'Number_of_items': num_items,
        'Percent_NA': prop_list,
    }
)

df


Unnamed: 0,Feature,Number_of_countries,Number_of_items,Percent_NA
0,fdi_ag_forest_fish_share,95,133,57.006
1,fdi_ag_forest_fish,95,133,57.006
2,govt_expenditure_on_ag_forest_fish,121,134,47.79
3,producer_price,122,133,44.676
4,agri_orientation_index_2015_usd,103,134,39.908
5,credit_to_ag_forest_fish_share_totalcredit,103,134,39.908
6,credit_to_ag_forest_fish_2015_usd,103,134,39.908
7,afs_employment_share_in_total_employment,113,134,38.795
8,total_employment_afs,116,134,37.964
9,total_govt_expenditure,124,134,32.366


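Collapsing closely related columns (for example, `fdi_ag_forest_fish` and `fdi_ag_forest_fish_share`) 
into a single representative, the five features with the highest proportion of missing data are: 
- fdi_ag_forest_fish, 
- govt_expenditure_on_ag_forest_fish, 
- producer_price, 
- credit_to_ag_forest_fish_2015_usd, and 
- total_employment_afs 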

Among these, the missing values in the **govt_expenditure_on_ag_forest_fish** column 
can be partially addressed by leveraging the following related variables: 
**agri_orientation_index_govt_expenditure, ag_forest_fish_share_in_total_gdp, and total_govt_expenditure**.

The Agricultural Orientation Index (AOI) for governmental expenditure is defined as:
$$
AOI = \frac{\text{Agriculture's share of government expenditure}}{\text{Agriculture's share of GDP}}
$$

And, 
$$
\text{Agriculture's share of government expenditure} = \frac{\text{govt\_expenditure\_on\_ag\_forest\_fish}}{\text{total\_govt\_expenditure}}
$$

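Hence, substituting the second expression into the first and solving for the expenditure, 
$$
\begin{aligned}
\text{govt\_expenditure\_on\_ag\_forest\_fish} = {} & \text{agri\_orientation\_index\_govt\_expenditure} \\
& \times \text{ag\_forest\_fish\_share\_in\_total\_gdp} \\
& \times \text{total\_govt\_expenditure}
\end{aligned}
$$

Note that this identity requires agriculture's share of GDP to be expressed as a fraction 
(not a percentage); otherwise the result is off by a factor of 100.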


In [189]:
# Columns involved in the AOI identity described above
target_col = 'govt_expenditure_on_ag_forest_fish'
aoi_col = 'agri_orientation_index_govt_expenditure'
gdp_share_col = 'ag_forest_fish_share_in_total_gdp'
total_exp_col = 'total_govt_expenditure'

# Row-wise estimate: AOI x agriculture's GDP share x total government expenditure
estimated_govt_expenditure = data[aoi_col] * data[gdp_share_col] * data[total_exp_col]

# The estimate is only kept on rows where all three inputs are observed
inputs_present = data[[aoi_col, gdp_share_col, total_exp_col]].notna().all(axis=1)
usable_estimate = estimated_govt_expenditure.where(inputs_present)

# Only gaps in the target column are filled; existing values stay untouched
data[target_col] = data[target_col].fillna(usable_estimate)

# Recompute the % missing per column (highest first) and keep columns above 10%
prop_na = data.isna().mean().mul(100).sort_values(ascending=False)
prop_na = prop_na[prop_na > 10]

# Column names and their rounded missing-value percentages as plain lists
feature_list = prop_na.index.tolist()
prop_list = [round(val, 3) for val in prop_na.values]

# Distinct countries and items with at least one non-missing value per column
num_countries, num_items = [], []
for feature in feature_list:
    has_value = data[feature].notna()
    num_countries.append(len(data.loc[has_value, 'area'].unique()))
    num_items.append(len(data.loc[has_value, 'item'].unique()))

# Summary table of the missing-data pattern after the fill
df = pd.DataFrame(
    {
        'Feature': feature_list,
        'Number_of_countries': num_countries,
        'Number_of_items': num_items,
        'Percent_NA': prop_list,
    }
)

df

Unnamed: 0,Feature,Number_of_countries,Number_of_items,Percent_NA
0,fdi_ag_forest_fish_share,95,133,57.006
1,fdi_ag_forest_fish,95,133,57.006
2,producer_price,122,133,44.676
3,credit_to_ag_forest_fish_share_totalcredit,103,134,39.908
4,agri_orientation_index_2015_usd,103,134,39.908
5,credit_to_ag_forest_fish_2015_usd,103,134,39.908
6,afs_employment_share_in_total_employment,113,134,38.795
7,total_employment_afs,116,134,37.964
8,total_govt_expenditure,124,134,32.366
9,agri_orientation_index_govt_expenditure,124,134,32.34


As a result, the share of missing values in 'govt_expenditure_on_ag_forest_fish' 
drops from 47.790% to 32.340%. 

Next, the remaining columns with a high share of missing values have data for only 
95, 103, and 116 countries, out of 124 in total. To reduce the missingness in these columns, 
we can check whether the same countries lack data in several of them at once, 
since such countries would be natural candidates for removal.

In [196]:
# Countries (areas) that have at least one non-missing value in each column
area_fdi = list(
    data.loc[data['fdi_ag_forest_fish'].notna(), 'area'].unique()
)
area_credit = list(
    data.loc[data['credit_to_ag_forest_fish_2015_usd'].notna(), 'area'].unique()
)
area_afs = list(
    data.loc[data['afs_employment_share_in_total_employment'].notna(), 'area'].unique()
)

# Every country that appears anywhere in the dataset
area_total = list(data['area'].unique())

# Countries with no observation at all for a given column
# (full country list minus the countries that do have data)
missing_credit = set(area_total).difference(area_credit)
missing_fdi = set(area_total).difference(area_fdi)
missing_afs = set(area_total).difference(area_afs)

# Pairwise overlap: countries lacking both credit and FDI data
common_area = missing_credit & missing_fdi
print(f"Number of common countries that are missing in both 'fdi_ag_forest_fish' and 'credit_to_ag_forest_fish_2015_usd': {len(common_area)}")

# Pairwise overlap: countries lacking both credit and AFS employment share data
common_area = missing_credit & missing_afs
print(f"Number of common countries that are missing in both 'afs_employment_share_in_total_employment' and 'credit_to_ag_forest_fish_2015_usd': {len(common_area)}")

# Pairwise overlap: countries lacking both FDI and AFS employment share data
common_area = missing_afs & missing_fdi
print(f"Number of common countries that are missing in both 'fdi_ag_forest_fish' and 'afs_employment_share_in_total_employment': {len(common_area)}")

Number of common countries that are missing in both 'fdi_ag_forest_fish' and 'credit_to_ag_forest_fish_2015_usd': 0
Number of common countries that are missing in both 'afs_employment_share_in_total_employment' and 'credit_to_ag_forest_fish_2015_usd': 0
Number of common countries that are missing in both 'fdi_ag_forest_fish' and 'afs_employment_share_in_total_employment': 0


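Unfortunately, there is no overlap among these columns in terms of countries 
with missing data: each of the three pairwise comparisons above finds 0 countries in common, 
so no country lacks data entirely for more than one of these columns.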