Import Packages

In [64]:
import pandas as pd
import numpy as np
from pyampute.exploration.mcar_statistical_tests import MCARTest
from sklearn.impute import KNNImputer

Read Data and format numbers

In [65]:
# Column names of the emissions data set, in file order.
# The first two entries identify the row (country and year); the rest are measurements.
columns = [
    'Entity',
    'Year',
    'Access to electricity (% of population)',
    'Access to clean fuels for cooking',
    'Renewable-electricity-generating-capacity-per-capita',
    'Financial flows to developing countries (US $)',
    'Renewable energy share in the total final energy consumption (%)',
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Low-carbon electricity (% electricity)',
    'Primary energy consumption per capita (kWh/person)',
    'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
    'Value_co2_emissions_kt_by_country',
    'Renewables (% equivalent primary energy)',
    'gdp_growth',
    'gdp_per_capita',
    'Density\\n(P/Km2)',
    'Land Area(Km2)',
    'Latitude',
    'Longitude',
]

def convert_to_float(value):
    """Parse a raw CSV cell into a float, accepting ',' as the decimal mark.

    When used as a ``pd.read_csv`` converter, every cell arrives as a string,
    so *all* numeric strings are converted here (not only those containing a
    comma); otherwise the column ends up as object dtype mixing floats and
    strings.

    Args:
        value: The raw cell. Non-string values are returned unchanged.

    Returns:
        A float for numeric strings, NaN for blank strings, and the original
        value for non-strings or for comma-less strings that are not numeric.

    Raises:
        ValueError: If a string containing a comma is still not a valid number
            after the comma is replaced (e.g. ``'1,234,567'``).
    """
    if not isinstance(value, str):
        return value

    text = value.strip()
    if not text:
        # Blank cells mean "missing"; NaN keeps the column numeric.
        return float('nan')

    has_comma = ',' in text
    # NOTE(review): the comma is treated as a decimal mark ('1,5' -> 1.5).
    # If the file uses it as a thousands separator instead, this is wrong -- confirm.
    try:
        return float(text.replace(',', '.'))
    except ValueError:
        if has_comma:
            # Same failure mode as before for malformed comma numbers.
            raise
        # Non-numeric text without a comma was always passed through untouched.
        return value

# Load the raw emissions table; the density column is routed through
# convert_to_float while it is being parsed.
data = pd.read_csv(
    './co2_emissions/emission_data.csv',
    converters={'Density\\n(P/Km2)': convert_to_float},
)
data

Determine the percentage of zeros per column

In [66]:
# Percentage of rows in each column whose value is exactly zero.
zero_mask = data.eq(0)
row_count = len(data)
num_zeros = zero_mask.sum() * 100 / row_count
print(num_zeros)

Determine the percentage of NaNs per column

In [67]:
# Percentage of rows in each column that are missing (NaN).
missing_mask = data.isnull()
nan_counts = missing_mask.sum() * 100 / len(data)
print(nan_counts)

Remove columns with more than 20% zeros, as they cannot be a good reference for the entire set

In [68]:
# Column selected for removal in this step.
columns_to_drop = ['Electricity from nuclear (TWh)']

# DataFrame.drop returns a new frame instead of modifying `data`, so the result
# has to be assigned back -- otherwise the column is never actually removed.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(columns=columns_to_drop, errors='ignore')
data

Unnamed: 0,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from renewables (TWh),Low-carbon electricity (% electricity),Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density\n(P/Km2),Land Area(Km2),Latitude,Longitude
0,1.61,6.2,9.22,20000.0,44.99,0.16,0.31,65.96,302.59,1.64,760.0,,,,60.00,652230.0,33.94,67.71
1,4.07,7.2,8.86,130000.0,45.60,0.09,0.50,84.75,236.89,1.74,730.0,,,,60.00,652230.0,33.94,67.71
2,9.41,8.2,8.47,3950000.0,37.83,0.13,0.56,81.16,210.86,1.40,1030.0,,,179.43,60.00,652230.0,33.94,67.71
3,14.74,9.5,8.09,25970000.0,36.66,0.31,0.63,67.02,229.97,1.40,1220.0,,8.83,190.68,60.00,652230.0,33.94,67.71
4,20.06,10.9,7.75,,44.24,0.33,0.56,62.92,204.23,1.20,1030.0,,1.41,211.38,60.00,652230.0,33.94,67.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,42.56,29.8,62.88,30000.0,81.90,3.50,3.32,48.68,3227.68,10.00,11020.0,,0.76,1464.59,38.00,390757.0,-19.02,29.15
3645,44.18,29.8,62.33,5570000.0,82.46,3.05,4.30,58.50,3068.01,9.51,10340.0,,4.71,1235.19,38.00,390757.0,-19.02,29.15
3646,45.57,29.9,82.53,10000.0,80.23,3.73,5.46,59.41,3441.99,9.83,12380.0,,4.82,1254.64,38.00,390757.0,-19.02,29.15
3647,46.78,30.1,81.40,250000.0,81.50,3.66,4.58,55.58,3003.66,10.47,11760.0,,-6.14,1316.74,38.00,390757.0,-19.02,29.15


Can MCAR Methods be used?

In [69]:
# Load the variant of the data set stored in emission_data_modified.csv and
# run Little's MCAR test on it; the result is compared against the 0.05 level.
data_no_countries = pd.read_csv('./co2_emissions/emission_data_modified.csv')
mt = MCARTest(method="little")
p_value = mt.little_mcar_test(data_no_countries)
print('Is MCAR' if p_value > 0.05 else 'Not MCAR')

Not MCAR


The data is not MCAR, so KNN imputation is not strictly justified, but it is still worth trying on this dataset.

In [70]:
# Fill every missing value with the value of the single nearest neighbour row.
knn_imputer = KNNImputer(n_neighbors=1)
imputed_data = knn_imputer.fit_transform(data_no_countries)

# Label the result with the columns of the frame that was actually imputed,
# rather than a slice of a hard-coded name list that can drift out of sync
# with the file (wrong or shifted labels). With its default settings the
# imputer drops columns that contain no observed value at all, so those
# labels are skipped as well.
kept_columns = data_no_countries.columns[data_no_countries.notna().any()]
pd.DataFrame(imputed_data, columns=kept_columns, index=data_no_countries.index)

Strategy #1: Replace each NaN with the average of the other values for that country; if a country's values are entirely NaN, use the global average

In [None]:
# Mean of every numeric column for each country (one row per 'Entity').
# NOTE(review): this only computes the averages -- no NaN is filled here yet,
# and zeros are not turned into NaN first (a `data.replace(0, np.nan)` step
# was left disabled).
grouped_by_entity = data.groupby('Entity')
average_per_column = grouped_by_entity.mean(numeric_only=True)
average_per_column

Strategy #2: Replace each NaN with the average of that column for the same year

In [None]:
# Mean of every numeric column for each year (one row per 'Year').
# NOTE(review): this only computes the averages -- no NaN is filled here yet,
# and zeros are not turned into NaN first (a `data.replace(0, np.nan)` step
# was left disabled).
grouped_by_year = data.groupby('Year')
average_per_column = grouped_by_year.mean(numeric_only=True)
average_per_column