In [111]:
import pandas as pd
import warnings
# configs
# Silence every warning category for the remainder of the session. Note that
# this is global: it also hides deprecation notices raised by libraries.
warnings.simplefilter("ignore")

In [112]:
# Load the dataset; the relative path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer='global-data-on-sustainable-energy.csv')

#### Data Quality Report

##### Continuous features report 

In [113]:
def build_continuous_features_report(data_df):
    """Summarise the numeric columns of ``data_df`` as a data-quality table.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input data. Only the columns selected by ``select_dtypes("number")``
        are reported; every other column is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column and one column per statistic: ``Count``
        (number of rows, missing values included), ``Miss %``, ``Card.``
        (distinct non-null values), ``Min``, ``1st Qrt.``, ``Mean``,
        ``Median``, ``3rd Qrt``, ``Max`` and ``Std. Dev.``.
    """
    numeric_cols = data_df.select_dtypes("number").columns
    numeric_df = data_df[numeric_cols]

    def _quartile(q):
        # Factory so that both quartile entries share one definition.
        return lambda frame: frame.quantile(q)

    # (label, callable) pairs. Each callable receives the numeric frame and
    # returns either a scalar (broadcast to every row of the report) or a
    # Series indexed by column name.
    summaries = (
        ("Count", len),
        ("Miss %", lambda frame: frame.isna().sum() / len(frame) * 100),
        ("Card.", lambda frame: frame.nunique()),
        ("Min", lambda frame: frame.min()),
        ("1st Qrt.", _quartile(0.25)),
        ("Mean", lambda frame: frame.mean()),
        ("Median", lambda frame: frame.median()),
        ("3rd Qrt", _quartile(0.75)),
        ("Max", lambda frame: frame.max()),
        ("Std. Dev.", lambda frame: frame.std()),
    )

    report_df = pd.DataFrame(
        index=numeric_cols,
        columns=[label for label, _ in summaries],
    )

    # Reductions over empty features may emit RuntimeWarnings; they are
    # silenced only while the statistics are being computed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for label, compute in summaries:
            report_df[label] = compute(numeric_df)

    return report_df


In [114]:
# Summarise the numeric columns of `data`; the bare name on the last line
# makes the resulting table the cell's displayed output.
con_report_df = build_continuous_features_report(data_df=data)
con_report_df

Unnamed: 0,Count,Miss %,Card.,Min,1st Qrt.,Mean,Median,3rd Qrt,Max,Std. Dev.
Year,3649,0.0,21,2000.0,2005.0,2010.038,2010.0,2015.0,2020.0,6.054228
Access to electricity (% of population),3649,0.274048,2040,1.252269,59.80089,78.9337,98.36157,100.0,100.0,30.27554
Access to clean fuels for cooking,3649,4.631406,896,0.0,23.175,63.25529,83.15,100.0,100.0,39.04366
Renewable-electricity-generating-capacity-per-capita,3649,25.513839,2110,0.0,3.54,113.1375,32.91,112.21,3060.19,244.1673
Financial flows to developing countries (US $),3649,57.248561,1017,0.0,260000.0,94224000.0,5665000.0,55347500.0,5202310000.0,298154400.0
Renewable energy share in the total final energy consumption (%),3649,5.316525,2587,0.0,6.515,32.63816,23.3,55.245,96.04,29.8949
Electricity from fossil fuels (TWh),3649,0.5755,1859,0.0,0.29,70.365,2.97,26.8375,5184.13,348.0519
Electricity from nuclear (TWh),3649,3.453001,547,0.0,0.0,13.45019,0.0,0.0,809.41,73.00662
Electricity from renewables (TWh),3649,0.5755,1533,0.0,0.04,23.96801,1.47,9.6,2184.94,104.4311
Low-carbon electricity (% electricity),3649,1.151,2647,0.0,2.877847,36.80118,27.86507,64.40379,100.0,34.31488


In [115]:
# Every column whose dtype is not numeric is treated as categorical.
print("categorical features are :")
data.select_dtypes(exclude=["number"]).columns

categorical features are :


Index(['Entity', 'Density\n(P/Km2)'], dtype='object')

##### Categorical features report

In [116]:
def build_categorical_features_report(data_df):
    """Build a tabular data-quality report for the non-numeric features.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input data. Only the columns selected by
        ``select_dtypes(exclude="number")`` are reported (this includes
        object/string columns as well as bool and datetime columns).

    Returns
    -------
    pandas.DataFrame
        One row per non-numeric column with the columns:

        * ``Count`` - number of rows (missing values included).
        * ``Miss %`` - percentage of missing values.
        * ``Card.`` - number of distinct non-null values.
        * ``Mode`` - the most frequent value; tied modes are all listed,
          joined by ``","``.
        * ``Mode Freq`` / ``Mode %`` - number / percentage of rows holding
          any of the mode values.
        * ``2nd Mode``, ``2nd Mode Freq``, ``2nd Mode %`` - the same three
          statistics computed after removing the rows that hold a mode value.
    """

    def _join(values):
        # str() every value so that bool / datetime / mixed-object categories
        # can be joined; for plain strings this changes nothing.
        return ",".join(str(value) for value in values)

    def _mode_summary(ft):
        """Return (mode, mode freq, 2nd mode, 2nd mode freq) for one feature."""
        modes = ft.mode()
        # Count by membership rather than ``value_counts()[modes]``: indexing
        # with bool or integer mode values would be interpreted as a boolean
        # mask or a positional lookup instead of a label lookup.
        is_mode = ft.isin(modes)
        remainder = ft[~is_mode]
        second_modes = remainder.mode()
        return (
            _join(modes),
            int(is_mode.sum()),
            _join(second_modes),
            int(remainder.isin(second_modes).sum()),
        )

    cat_feat_names = data_df.select_dtypes(exclude="number").columns
    cat_data_df = data_df[cat_feat_names]
    n_rows = len(cat_data_df)

    def _as_column(summaries, position, dtype):
        # Turn one field of the per-feature tuples into a report column.
        return pd.Series(
            [summary[position] for summary in summaries],
            index=cat_feat_names,
            dtype=dtype,
        )

    # NOTE: ignore warnings for empty features
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)

        # Each feature is summarised exactly once; the frequency and
        # percentage columns below reuse these results.
        per_feature = [_mode_summary(ft) for _, ft in cat_data_df.items()]
        mode_freq = _as_column(per_feature, 1, "int64")
        second_mode_freq = _as_column(per_feature, 3, "int64")

        report_df = pd.DataFrame(index=cat_feat_names)
        report_df["Count"] = n_rows
        report_df["Miss %"] = cat_data_df.isna().sum() / n_rows * 100
        report_df["Card."] = cat_data_df.nunique()
        report_df["Mode"] = _as_column(per_feature, 0, object)
        report_df["Mode Freq"] = mode_freq
        report_df["Mode %"] = mode_freq / n_rows * 100
        report_df["2nd Mode"] = _as_column(per_feature, 2, object)
        report_df["2nd Mode Freq"] = second_mode_freq
        report_df["2nd Mode %"] = second_mode_freq / n_rows * 100

    return report_df


In [117]:
# Summarise the non-numeric columns of `data`; the bare name on the last line
# makes the resulting table the cell's displayed output.
cat_report_df = build_categorical_features_report(data_df=data)
cat_report_df

Unnamed: 0,Count,Miss %,Card.,Mode,Mode Freq,Mode %,2nd Mode,2nd Mode Freq,2nd Mode %
Entity,3649,0.0,176,"Afghanistan,Albania,Algeria,Angola,Antigua and...",3612,98.986024,"Montenegro,Serbia",28,0.767334
Density\n(P/Km2),3649,0.027405,124,25,147,4.028501,18,113,3.096739


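Convert the columns to their intended types: `Entity` to string, and the density column to float after removing the commas from its values.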

In [118]:
def remove_functuation(density):
    """Return ``density`` with every ``","`` character removed.

    The name is a misspelling of "punctuation"; it is kept because the
    conversion code that follows calls the function under this name.
    """
    without_commas = density.replace(",", "")
    return without_commas
# Entity is kept as text.
data['Entity'] = data['Entity'].astype(str)
# The density values are cast to text, stripped of commas and then parsed as
# floats (a missing value becomes the text "nan" and parses back to NaN).
data['Density\\n(P/Km2)'] = (
    data['Density\\n(P/Km2)']
    .astype(str)
    .apply(remove_functuation)
    .astype(float)
)

In [119]:
# Fill the missing values of the columns in `feature_with_null` with the column
# mean, drop every row that still contains a missing value, then print the
# remaining per-column count of missing values.
# References:
#https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns
#https://www.geeksforgeeks.org/remove-multiple-elements-from-a-list-in-python/
columns = list(data.columns)
feature_with_null = list(
    filter(
        lambda name: name not in ('Entity', 'Year', 'Primary energy consumption per capita (kWh/person)'),
        columns,
    )
)
data.fillna(value=data[feature_with_null].mean(), inplace=True)
data.dropna(inplace=True)
missing_values = data.isna().sum()
print(missing_values)

Entity                                                              0
Year                                                                0
Access to electricity (% of population)                             0
Access to clean fuels for cooking                                   0
Renewable-electricity-generating-capacity-per-capita                0
Financial flows to developing countries (US $)                      0
Renewable energy share in the total final energy consumption (%)    0
Electricity from fossil fuels (TWh)                                 0
Electricity from nuclear (TWh)                                      0
Electricity from renewables (TWh)                                   0
Low-carbon electricity (% electricity)                              0
Primary energy consumption per capita (kWh/person)                  0
Energy intensity level of primary energy (MJ/$2017 PPP GDP)         0
Value_co2_emissions_kt_by_country                                   0
Renewables (% equiva