Import Packages

In [12]:
import pandas as pd
import numpy as np
from pyampute.exploration.mcar_statistical_tests import MCARTest
from sklearn.impute import KNNImputer
import scipy as sp

Read Data and format numbers

In [13]:
columns = ['Entity', 'Year', 'Access to electricity (% of population)', 'Access to clean fuels for cooking',
           'Renewable-electricity-generating-capacity-per-capita', 'Financial flows to developing countries (US $)',
           'Renewable energy share in the total final energy consumption (%)', 'Electricity from fossil fuels (TWh)',
           'Electricity from nuclear (TWh)', 'Electricity from renewables (TWh)',
           'Low-carbon electricity (% electricity)', 'Primary energy consumption per capita (kWh/person)',
           'Energy intensity level of primary energy (MJ/$2017 PPP GDP)', 'Value_co2_emissions_kt_by_country',
           'Renewables (% equivalent primary energy)', 'gdp_growth', 'gdp_per_capita', 'Density\\n(P/Km2)',
           'Land Area(Km2)', 'Latitude', 'Longitude']


def convert_to_float(value):
    if isinstance(value, str) and ',' in value:
        return float(value.replace(',', '.'))
    return value


data = pd.read_csv('./co2_emissions/emission_data.csv')
pd.DataFrame(data)

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),...,Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density\n(P/Km2),Land Area(Km2),Latitude,Longitude
0,Afghanistan,2000,1.61,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,...,302.59,1.64,760.0,,,,60.0,652230.0,33.939110,67.709953
1,Afghanistan,2001,4.07,7.2,8.86,130000.0,45.60,0.09,0.0,0.50,...,236.89,1.74,730.0,,,,60.0,652230.0,33.939110,67.709953
2,Afghanistan,2002,9.41,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,...,210.86,1.40,1030.0,,,179.43,60.0,652230.0,33.939110,67.709953
3,Afghanistan,2003,14.74,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,...,229.97,1.40,1220.0,,8.83,190.68,60.0,652230.0,33.939110,67.709953
4,Afghanistan,2004,20.06,10.9,7.75,,44.24,0.33,0.0,0.56,...,204.23,1.20,1030.0,,1.41,211.38,60.0,652230.0,33.939110,67.709953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,Zimbabwe,2016,42.56,29.8,62.88,30000.0,81.90,3.50,0.0,3.32,...,3227.68,10.00,11020.0,,0.76,1464.59,38.0,390757.0,-19.015438,29.154857
3645,Zimbabwe,2017,44.18,29.8,62.33,5570000.0,82.46,3.05,0.0,4.30,...,3068.01,9.51,10340.0,,4.71,1235.19,38.0,390757.0,-19.015438,29.154857
3646,Zimbabwe,2018,45.57,29.9,82.53,10000.0,80.23,3.73,0.0,5.46,...,3441.99,9.83,12380.0,,4.82,1254.64,38.0,390757.0,-19.015438,29.154857
3647,Zimbabwe,2019,46.78,30.1,81.40,250000.0,81.50,3.66,0.0,4.58,...,3003.66,10.47,11760.0,,-6.14,1316.74,38.0,390757.0,-19.015438,29.154857


Determine the number of NaNs

In [14]:
nan_counts = data.isna().sum() * 100 / len(data)
print(nan_counts)

Entity                                                               0.000000
Year                                                                 0.000000
Access to electricity (% of population)                              0.274048
Access to clean fuels for cooking                                    4.631406
Renewable-electricity-generating-capacity-per-capita                25.513839
Financial flows to developing countries (US $)                      57.248561
Renewable energy share in the total final energy consumption (%)     5.316525
Electricity from fossil fuels (TWh)                                  0.575500
Electricity from nuclear (TWh)                                       3.453001
Electricity from renewables (TWh)                                    0.575500
Low-carbon electricity (% electricity)                               1.151000
Primary energy consumption per capita (kWh/person)                   0.000000
Energy intensity level of primary energy (MJ/$2017 PPP GDP)     

Can MCAR Methods be used?

In [15]:
data_no_countries = pd.read_table('./co2_emissions/emission_data_modified.csv', sep=',')
mt = MCARTest(method="little")
if mt.little_mcar_test(data_no_countries) > 0.05:
    print('Is MCAR')
else:
    print('Not MCAR')

Not MCAR


Data is not MCAR so KNN can't really be used, but it can be a solution to try with the dataset anyways

In [16]:
knn_imputer = KNNImputer(n_neighbors=1)
imputed_data = knn_imputer.fit_transform(data_no_countries)
pd.DataFrame(imputed_data, columns=columns[2:])

Unnamed: 0,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),Low-carbon electricity (% electricity),Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density\n(P/Km2),Land Area(Km2),Latitude,Longitude
0,1.61,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,65.96,302.59,1.64,760.0,14.79,1.41,211.38,60.0,652230.0,33.94,67.71
1,4.07,7.2,8.86,130000.0,45.60,0.09,0.0,0.50,84.75,236.89,1.74,730.0,14.79,1.41,211.38,60.0,652230.0,33.94,67.71
2,9.41,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,81.16,210.86,1.40,1030.0,4.90,1.41,179.43,60.0,652230.0,33.94,67.71
3,14.74,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,67.02,229.97,1.40,1220.0,4.90,8.83,190.68,60.0,652230.0,33.94,67.71
4,20.06,10.9,7.75,3950000.0,44.24,0.33,0.0,0.56,62.92,204.23,1.20,1030.0,4.90,1.41,211.38,60.0,652230.0,33.94,67.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,42.56,29.8,62.88,30000.0,81.90,3.50,0.0,3.32,48.68,3227.68,10.00,11020.0,1.65,0.76,1464.59,38.0,390757.0,-19.02,29.15
3645,44.18,29.8,62.33,5570000.0,82.46,3.05,0.0,4.30,58.50,3068.01,9.51,10340.0,1.65,4.71,1235.19,38.0,390757.0,-19.02,29.15
3646,45.57,29.9,82.53,10000.0,80.23,3.73,0.0,5.46,59.41,3441.99,9.83,12380.0,1.65,4.82,1254.64,38.0,390757.0,-19.02,29.15
3647,46.78,30.1,81.40,250000.0,81.50,3.66,0.0,4.58,55.58,3003.66,10.47,11760.0,1.65,-6.14,1316.74,38.0,390757.0,-19.02,29.15


Pipeline v2
Because intellij lost all my code :((

Set all groups with all NaNs to 0

In [17]:
groups_with_all_nans = data.groupby("Entity").apply(lambda grp: grp.isna().all())
pd.DataFrame(groups_with_all_nans)
axis_a, axis_b = groups_with_all_nans.axes
for axi_a in axis_a:
    for axi_b in axis_b[1:]:
        if groups_with_all_nans.loc[axi_a, axi_b]:
            data.loc[data["Entity"] == axi_a, axi_b] = 0
pd.DataFrame(data)

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),...,Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density\n(P/Km2),Land Area(Km2),Latitude,Longitude
0,Afghanistan,2000,1.61,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,...,302.59,1.64,760.0,0.0,,,60.0,652230.0,33.939110,67.709953
1,Afghanistan,2001,4.07,7.2,8.86,130000.0,45.60,0.09,0.0,0.50,...,236.89,1.74,730.0,0.0,,,60.0,652230.0,33.939110,67.709953
2,Afghanistan,2002,9.41,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,...,210.86,1.40,1030.0,0.0,,179.43,60.0,652230.0,33.939110,67.709953
3,Afghanistan,2003,14.74,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,...,229.97,1.40,1220.0,0.0,8.83,190.68,60.0,652230.0,33.939110,67.709953
4,Afghanistan,2004,20.06,10.9,7.75,,44.24,0.33,0.0,0.56,...,204.23,1.20,1030.0,0.0,1.41,211.38,60.0,652230.0,33.939110,67.709953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,Zimbabwe,2016,42.56,29.8,62.88,30000.0,81.90,3.50,0.0,3.32,...,3227.68,10.00,11020.0,0.0,0.76,1464.59,38.0,390757.0,-19.015438,29.154857
3645,Zimbabwe,2017,44.18,29.8,62.33,5570000.0,82.46,3.05,0.0,4.30,...,3068.01,9.51,10340.0,0.0,4.71,1235.19,38.0,390757.0,-19.015438,29.154857
3646,Zimbabwe,2018,45.57,29.9,82.53,10000.0,80.23,3.73,0.0,5.46,...,3441.99,9.83,12380.0,0.0,4.82,1254.64,38.0,390757.0,-19.015438,29.154857
3647,Zimbabwe,2019,46.78,30.1,81.40,250000.0,81.50,3.66,0.0,4.58,...,3003.66,10.47,11760.0,0.0,-6.14,1316.74,38.0,390757.0,-19.015438,29.154857


Interpolate / Extrapolate missing values

In [18]:
def interpolate_group(group):
    for col in columns:
        if group[col].isna().sum() == 0:
            return group
        x_values = np.arange(len(group[col]))
        # Filter NaN values and interpolate using cubic spline
        mask = group[col].isna()
        x = x_values[~mask]
        y = group[col].dropna().values
        try:
            cs = sp.interpolate.CubicSpline(x, y, bc_type='not-a-knot', extrapolate=True)
        
            # Interpolate missing values and replace them in the group
            group.loc[mask, col].values = cs(x_values[mask])
        except:
            print("pain")


    return group


data = data.groupby("Entity").apply(interpolate_group).reset_index(drop=True)
pd.DataFrame(data)
    

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),...,Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density\n(P/Km2),Land Area(Km2),Latitude,Longitude
0,Afghanistan,2000,1.61,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,...,302.59,1.64,760.0,0.0,,,60.0,652230.0,33.939110,67.709953
1,Afghanistan,2001,4.07,7.2,8.86,130000.0,45.60,0.09,0.0,0.50,...,236.89,1.74,730.0,0.0,,,60.0,652230.0,33.939110,67.709953
2,Afghanistan,2002,9.41,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,...,210.86,1.40,1030.0,0.0,,179.43,60.0,652230.0,33.939110,67.709953
3,Afghanistan,2003,14.74,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,...,229.97,1.40,1220.0,0.0,8.83,190.68,60.0,652230.0,33.939110,67.709953
4,Afghanistan,2004,20.06,10.9,7.75,,44.24,0.33,0.0,0.56,...,204.23,1.20,1030.0,0.0,1.41,211.38,60.0,652230.0,33.939110,67.709953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,Zimbabwe,2016,42.56,29.8,62.88,30000.0,81.90,3.50,0.0,3.32,...,3227.68,10.00,11020.0,0.0,0.76,1464.59,38.0,390757.0,-19.015438,29.154857
3645,Zimbabwe,2017,44.18,29.8,62.33,5570000.0,82.46,3.05,0.0,4.30,...,3068.01,9.51,10340.0,0.0,4.71,1235.19,38.0,390757.0,-19.015438,29.154857
3646,Zimbabwe,2018,45.57,29.9,82.53,10000.0,80.23,3.73,0.0,5.46,...,3441.99,9.83,12380.0,0.0,4.82,1254.64,38.0,390757.0,-19.015438,29.154857
3647,Zimbabwe,2019,46.78,30.1,81.40,250000.0,81.50,3.66,0.0,4.58,...,3003.66,10.47,11760.0,0.0,-6.14,1316.74,38.0,390757.0,-19.015438,29.154857


Count the number of zeros

In [None]:
num_zeros = (data == 0).sum() * 100 / len(data)
print(num_zeros)

Remove Columns with more than 20% zeros as they cannot be good reference for the entire set

In [None]:
data.drop('Electricity from nuclear (TWh)', axis=1)
pd.DataFrame(data)