In [1]:
# Imports
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# import the world bank utility functions. It requires changing the path to import properly in Python
import sys
sys.path.append('../utility_functions')

from world_bank_utility_functions import (get_world_bank_indicator_name_from_code, 
                                          get_world_bank_indicator_definition_from_code)

# for displaying text with formatting later
from IPython.core.display import display, HTML

In [2]:
# Select data
datasets = ["../OECD/Cleaned/HEALTH_MERGED_Threshold_80_n3.csv", "../WorldBankDatasets/Cleaned/AllMerged_Threshold_85_n3_MMRTNE.csv"]

OECD = False  # False for WB data

if OECD:
    df = pd.read_csv(datasets[0])
    mm_ind = "MATIMATM"
else:
    df = pd.read_csv(datasets[1])
    mm_ind = "SH.STA.MMRT.NE"

In [3]:
df

Unnamed: 0,Country,Year,SH.ANM.ALLW.ZS,SH.ANM.CHLD.ZS,SH.ANM.NPRG.ZS,SH.DTH.IMRT,SH.DTH.IMRT.FE,SH.DTH.IMRT.MA,SH.DTH.MORT,SH.DTH.MORT.FE,...,SI.DST.50MD,SI.DST.FRST.10,SI.DST.FRST.20,SI.POV.DDAY,SI.POV.GAPS,SI.POV.GINI,SI.POV.LMIC,SI.POV.LMIC.GP,SI.POV.UMIC,SI.POV.UMIC.GP
0,AUS,1997,0.011645,0.194260,0.015942,0.012698,0.012426,0.012914,0.012039,0.011477,...,0.457778,0.520000,0.540816,0.021807,0.028986,0.338596,0.018565,0.022807,0.015423,0.017691
1,AUS,2000,0.026201,0.198675,0.030435,0.011791,0.011541,0.011989,0.011114,0.010627,...,0.306667,0.620000,0.653061,0.017134,0.026087,0.265789,0.012658,0.017544,0.013433,0.013035
2,AUS,2003,0.008734,0.192053,0.013043,0.011370,0.011162,0.011535,0.010641,0.010209,...,0.377778,0.520000,0.561224,0.046729,0.069565,0.336842,0.030380,0.047368,0.025373,0.030726
3,AUS,2006,0.000000,0.192053,0.004348,0.011501,0.011289,0.011670,0.010634,0.010209,...,0.306667,0.620000,0.653061,0.017134,0.026087,0.265789,0.012658,0.017544,0.013433,0.013035
4,AUS,2011,0.000000,0.192053,0.000000,0.010594,0.010467,0.010695,0.009775,0.009439,...,0.465185,0.513333,0.540816,0.021807,0.031884,0.333333,0.018565,0.024561,0.016915,0.018622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150,USA,2010,0.091703,0.000000,0.100000,0.238710,0.237590,0.239604,0.219547,0.213272,...,0.711111,0.340000,0.346939,0.046729,0.078261,0.507895,0.030380,0.052632,0.029851,0.036313
1151,USA,2011,0.104803,0.000000,0.117391,0.229902,0.229566,0.230170,0.211664,0.206145,...,0.711111,0.340000,0.336735,0.046729,0.069565,0.531579,0.030380,0.047368,0.029851,0.036313
1152,USA,2014,0.135371,0.000000,0.147826,0.218682,0.218740,0.218635,0.199531,0.194587,...,0.666667,0.320000,0.336735,0.056075,0.086957,0.547368,0.037975,0.063158,0.029851,0.039106
1153,USA,2015,0.144105,0.000000,0.156522,0.216840,0.217097,0.216634,0.197406,0.192613,...,0.675556,0.340000,0.346939,0.056075,0.078261,0.539474,0.030380,0.052632,0.032836,0.039106


In [4]:
# mapping world bank codes to definitions
mapping_world_bank_to_names_and_definitions_df = pd.read_csv('../WorldBankDatasets/Cleaned/World_Bank_Indicator_Definition_Info.csv',
                                                             delimiter='\t')

## Data Preparation

In [6]:
# Reduce to only gender indicators

#if not OECD:
#    gender_ind = pd.read_csv('../WorldBankDatasets/Gender_WorldBankData.csv').columns
#    df = df.drop(columns=[col for col in df if col not in gender_ind])

#df.head()

In [7]:
# Select year
year = 2015
df = df[df["Year"] == year]
df.drop("Year", axis=1, inplace=True)

# Scale predictor, which is currently unscaled
scaler = MinMaxScaler()
df[mm_ind] = scaler.fit_transform(df[mm_ind].to_numpy().reshape(-1, 1))

# Drop country
X = df.drop(columns=['Country'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[mm_ind] = scaler.fit_transform(df[mm_ind].to_numpy().reshape(-1, 1))


## Dimensionality Reduction

# PCA

In [8]:
pca = PCA(n_components=2, random_state=42)
pca.fit(X)

print("Number of PCs: {}".format(len(pca.explained_variance_ratio_)))
print("Explained variation per PC: {}".format(pca.explained_variance_ratio_))
print("Sum of explained variation: {}".format(pca.explained_variance_ratio_.sum()))

Number of PCs: 2
Explained variation per PC: [0.423584   0.10788109]
Sum of explained variation: 0.53146508400629


# Now to show you detailed information of the features that contribute to the PC analysis

In [9]:
def sort_loadings_for_principal_component(loadings, X=X):
    feature_names = X.columns
    
    # make a series of the current pca loadings. Index is the column name (feature) and value is the loading value
    feature_names_series = pd.Series(loadings, index=feature_names)

    # take the absolute value of the series
    feature_names_series_abs = feature_names_series.abs()

    # finally sort descending. The higher the number, the higher the correlatino and explanatory value of the feature for the principal component
    pca_component_explanatory_features_sorted = feature_names_series_abs.sort_values(ascending=False)
    
    return pca_component_explanatory_features_sorted

In [10]:
def get_most_important_features_from_pca_loadings (pca_loadings, n_most_important_features_from_loading = 5, 
                                                   includeName = True, includeDefintion = True,
                                                   mapping_df = mapping_world_bank_to_names_and_definitions_df):
    pca_loadings_explanations = {}

    cur_loadings_sorted_with_n_most_important_features = sort_loadings_for_principal_component(pca_loadings).iloc[
        0:n_most_important_features_from_loading]

    feature_codes = []
    feature_values = []

    feature_names = []
    feature_definitions = []

    for feature_code_index, feature_value in zip (cur_loadings_sorted_with_n_most_important_features.index,
                                                  cur_loadings_sorted_with_n_most_important_features.values):
        feature_codes.append(feature_code_index)
        feature_values.append(feature_value)

        if (includeName):
            # cur_name = get_world_bank_indicator_name_from_code(feature_code_index)
            cur_name = get_world_bank_indicator_name_from_code(feature_code_index, mapping_df)
            # print('cur_name is', cur_name)
            feature_names.append(cur_name)
        
        if (includeDefintion):
            # cur_definition  = get_world_bank_indicator_definition_from_code(feature_code_index)
            cur_definition  = get_world_bank_indicator_definition_from_code(feature_code_index, mapping_df)
            feature_definitions.append(cur_definition)

    # pca_loadings_explanations[f"top_{n_most_important_features_from_loading}_most_important_feature_values"] = feature_values
    pca_loadings_explanations["most_important_features_values"] = feature_values
    pca_loadings_explanations["most_important_features_world_bank_codes"] = feature_codes
    pca_loadings_explanations["most_important_features_world_bank_names"] = feature_names
    pca_loadings_explanations["most_important_features_world_bank_definitions"] = feature_definitions

    return pca_loadings_explanations

In [11]:
# this function will show the top most important components explaining the variance for each of the pcs 
# give a given percentage of variance to explain
def pca_with_detailed_variance_and_components_info(percentage_of_variance_to_explain, X=X, n_most_important_features_from_loading=5, 
                                                   includeValues = False, includeDefinition=False):
    pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42)
    pca.fit(X)
    pca_loadings = pca.components_

    loadings_information = []
    for loading in pca_loadings:
        cur_loading_features_info = get_most_important_features_from_pca_loadings(loading, n_most_important_features_from_loading)
        loadings_information.append(cur_loading_features_info)
    
    num_principal_components = len(pca.explained_variance_ratio_)

    print(f'in order to explain {percentage_of_variance_to_explain * 100} percentage of the variance with pca it requires')
    print(f'{num_principal_components} principal components')
    print(f'the most important feature names, for the top {n_most_important_features_from_loading} from each PC, are\n')

    # print the most important value names
    for i in range(0, len(loadings_information)):
        # print('\n')
        print('---------')
        print(f'For Principal Component {i+1}, the most important features names are:\n')
        # print('\n')
        cur_loadings_info = loadings_information[i]

        cur_loadings_names = cur_loadings_info['most_important_features_world_bank_names']
        cur_loadings_values = cur_loadings_info['most_important_features_values']
        cur_loadings_codes = cur_loadings_info['most_important_features_world_bank_codes']
        cur_loadings_definitions = cur_loadings_info['most_important_features_world_bank_definitions']

        for code, value, name, definition in zip(cur_loadings_codes, cur_loadings_values, cur_loadings_names, cur_loadings_definitions):
            print(name)
            
            # uncomment for testing to debug
            # print(code)

            if (includeValues):
                print(f"with value {value}")

            if (includeDefinition):
                print(definition)
        print('\n')

## Modify variance_percentage_to_explain below for detailed information based on what percentage of the variance in principal components you want to explain maternal mortality. (see section below for just the top feature per PC)

In [12]:
# pca_with_detailed_variance_and_components_info(0.8)
# pca_with_detailed_variance_and_components_info(0.9, includeValues=True)
num_top_features_per_pc = 5
pca_with_detailed_variance_and_components_info(0.9, includeValues=True, 
                                               n_most_important_features_from_loading = num_top_features_per_pc)

in order to explain 90.0 percentage of the variance with pca it requires
18 principal components
the most important feature names, for the top 5 from each PC, are

---------
For Principal Component 1, the most important features names are:

There are periods of absence due to childcare accounted for in pension benefits (1=yes; 0=no)
with value 0.07151985342348977
Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
with value 0.06792436978253891
Secondary education, vocational pupils
with value 0.06718282519487066
Enrolment in lower secondary education, female (number)
with value 0.06649493079538346
Enrolment in lower secondary education, both sexes (number)
with value 0.0654514707178874


---------
For Principal Component 2, the most important features names are:

The age at which men and women can retire with full pension benefits is the same (1=yes; 0=no)
with value 0.15122940803154433
Domestic general government health expenditure per capita, 

In [13]:
# returns a list of the top feature name for each principal component
def get_top_feature_for_each_pc(percentage_of_variance_to_explain, X=X, n_most_important_features_from_loading=5):
    pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42)
    pca.fit(X)
    pca_loadings = pca.components_

    loadings_information = []
    for loading in pca_loadings:
        cur_loading_features_info = get_most_important_features_from_pca_loadings(loading, n_most_important_features_from_loading)
        loadings_information.append(cur_loading_features_info)
    
    top_feature_name_per_pc = []
    for i in range(0, len(loadings_information)):
        cur_loadings_info = loadings_information[i]

        cur_features_names = cur_loadings_info['most_important_features_world_bank_names']
        cur_top_feature = cur_features_names[0]
        top_feature_name_per_pc.append(cur_top_feature)
        # cur_loadings_values = cur_loadings_info['most_important_features_values']
        # cur_loadings_codes = cur_loadings_info['most_important_features_world_bank_codes']
        # cur_loadings_definitions = cur_loadings_info['most_important_features_world_bank_definitions']
    
    return top_feature_name_per_pc

## Modify variance_percentage_to_explain below based on what percentage of the variance in principal components. You may find the results interesting!

In [14]:
# below starts to use html formatting

In [15]:
# variance_percentage_to_explain = 0.2
# variance_percentage_to_explain = 0.3
# variance_percentage_to_explain = 0.5
# variance_percentage_to_explain = 0.8
variance_percentage_to_explain = 0.85
# variance_percentage_to_explain = 0.9
# variance_percentage_to_explain = 0.95

top_feature_names_for_pc_loadings = get_top_feature_for_each_pc(variance_percentage_to_explain)

# print(f"""Assuming you want to explain {variance_percentage_to_explain * 100} percent of the PC variation, 
# for maternal morality, the top feature names that explain this per principal component are:\n""")

display( HTML(f"""<h2>Assuming you want to explain {variance_percentage_to_explain * 100} percent of the PC variation, 
              the top feature names that explain this per principal component are:</h2>\n"""))

for i in range(0, len(top_feature_names_for_pc_loadings)):
    #print(f"for PC {i+1} the top explaining feature is: {top_feature_names_for_pc_loadings[i]}")
    # print(f"PC {i+1}: {top_feature_names_for_pc_loadings[i]}")
    display( HTML( f"<strong>PC {i+1}:</strong> {top_feature_names_for_pc_loadings[i]}" ) )
    # print("\n")