In [35]:
import pandas as pd
from sklearn.decomposition import PCA

# Used for displaying HTML
from IPython.display import display, HTML

In [36]:
# Import utility functions
import sys

sys.path.append("../utility_functions")
from ind_utility_functions import (
    get_ind_name,
    get_ind_info,
)

In [37]:
# Candidate input files; position 0 is the OECD extract, position 1 the World Bank extract
datasets = [
    "../OECD/Cleaned/HEALTH_MERGED_Threshold_80_n3.csv",
    "../WorldBankDatasets/Cleaned/AllMerged_Threshold_85_n3_MMRTNE.csv",
]

# Source switch: True -> OECD file, False -> World Bank file
OECD = False

# Load the chosen file and remember the indicator code that is later split off as `y`
df = pd.read_csv(datasets[0] if OECD else datasets[1])
mm_ind = "MATIMATM" if OECD else "SH.STA.MMRT.NE"

## Data Preparation

In [38]:
# Reduce to only gender-themed indicators for WB.
# Only the column labels of the gender file are used, so nrows=0 reads the
# header row without parsing any of the data rows.
if not OECD:
    gender_ind = pd.read_csv(
        "../WorldBankDatasets/Gender_WorldBankData.csv", nrows=0
    ).columns
    # Keep exactly the columns of df that also appear in the gender file
    df = df.loc[:, df.columns.isin(gender_ind)]

# Select year
year = 2015
df = df[df["Year"] == year]

# Scale predictor, which is currently unscaled
# (disabled; NOTE(review): MinMaxScaler is not imported in the cells shown, so
# re-enabling these lines also needs that import)
# scaler = MinMaxScaler()
# df[mm_ind] = scaler.fit_transform(df[mm_ind].to_numpy().reshape(-1, 1))

In [39]:
# Target series: the indicator column selected by `mm_ind`
y = df.loc[:, mm_ind]

# Feature matrix: everything except the identifier columns and the target itself
X = df.drop(["Country", "Year", mm_ind], axis=1)

# PCA

In [40]:
# A fractional n_components asks PCA for enough components to cover that share
# of the variance (here 95 %); fit() returns the fitted estimator itself.
pca = PCA(n_components=0.95, random_state=42).fit(X)

# One explained-variance ratio per retained component, so its size is the PC count
print("Number of PCs: {}".format(pca.explained_variance_ratio_.size))
print("Explained variation per PC: {}".format(pca.explained_variance_ratio_))
print("Sum of explained variation: {}".format(pca.explained_variance_ratio_.sum()))

Number of PCs: 22
Explained variation per PC: [0.19456131 0.13225224 0.09247485 0.08903783 0.06641937 0.0528512
 0.0457819  0.04365733 0.03245155 0.02615284 0.02435279 0.02258434
 0.02018065 0.01793234 0.01682751 0.01452492 0.01348546 0.01139115
 0.01037036 0.00911611 0.00767807 0.00622649]
Sum of explained variation: 0.9503106136546872


# Detailed information of the features that contribute to PCs

In [41]:
def sort_loading_set_for_principal_component(loadings, X=X):
    """Rank features by the magnitude of their loading on one principal component.

    Works on a single loading vector at a time, e.g. ``pca_loadings[0]``.

    Parameters
    ----------
    loadings : array-like
        One loading vector with one entry per column of ``X``.
    X : pandas.DataFrame
        Frame whose column labels name the features. The default is the
        notebook-level ``X`` as it existed when this cell was executed.

    Returns
    -------
    pandas.Series
        Absolute loading values indexed by feature label, largest first, so
        the features that weigh most on the component come at the top.
    """
    # Sign is discarded: only the size of the contribution matters for the ranking
    absolute_loadings = pd.Series(loadings, index=X.columns).abs()
    return absolute_loadings.sort_values(ascending=False)

In [42]:
# Quick check of sort_loading_set_for_principal_component on the first component

percentage_of_variance_to_explain = 0.8
pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42).fit(X)
pca_loadings = pca.components_

# Last expression of the cell, so the ranked Series is displayed as the output
sort_loading_set_for_principal_component(pca_loadings[0])

SG.ABS.PENB             0.255515
SH.PAR.LEVE.AL          0.239041
SG.LAW.EQRM.WK          0.224057
SH.HIV.1524.MA.ZS       0.217053
SH.HIV.INCD.YG.MA.P3    0.182648
                          ...   
SG.LOC.LIVE.EQ          0.000000
SG.NGT.WORK.EQ          0.000000
SG.OPN.BANK.EQ          0.000000
SH.HIV.1524.FE.ZS       0.000000
SG.GET.JOBS.EQ          0.000000
Length: 161, dtype: float64

In [43]:
def get_most_important_features_from_pca_loading_set(
    pca_loading_set,
    n_most_important_features_from_loading=5,
    includeName=True,
    includeDefinitionOrAdditionalInfo=True,
    X=None,
):
    """Describe the features with the largest absolute loadings on one component.

    Parameters
    ----------
    pca_loading_set : array-like
        A single loading vector, e.g. ``pca.components_[0]``.
    n_most_important_features_from_loading : int
        How many of the top-ranked features to keep.
    includeName : bool
        When True, resolve each feature code with ``get_ind_name``. When False
        the ``most_important_feature_names`` list is left empty.
    includeDefinitionOrAdditionalInfo : bool
        When True, resolve each feature code with ``get_ind_info``. When False
        the ``most_important_definitions_or_additional_info`` list is left empty.
    X : pandas.DataFrame, optional
        Frame whose columns label the loadings. When omitted, the default of
        ``sort_loading_set_for_principal_component`` is used, as before.

    Returns
    -------
    dict
        Keys ``most_important_feature_values``, ``most_important_codes``,
        ``most_important_feature_names`` and
        ``most_important_definitions_or_additional_info``; each maps to a list
        ordered from the largest absolute loading downwards.
    """
    # Forward X only when the caller supplied one, so existing calls behave as before
    if X is None:
        ranked_loadings = sort_loading_set_for_principal_component(pca_loading_set)
    else:
        ranked_loadings = sort_loading_set_for_principal_component(pca_loading_set, X)

    top_loadings = ranked_loadings.iloc[:n_most_important_features_from_loading]

    feature_codes = list(top_loadings.index)
    feature_values = list(top_loadings.values)
    feature_names = []
    feature_definitions_or_additional_info = []

    # Name and info are looked up per feature, in ranking order
    for feature_code in feature_codes:
        if includeName:
            feature_names.append(get_ind_name(feature_code))

        if includeDefinitionOrAdditionalInfo:
            feature_definitions_or_additional_info.append(get_ind_info(feature_code))

    return {
        "most_important_feature_values": feature_values,
        "most_important_codes": feature_codes,
        "most_important_feature_names": feature_names,
        "most_important_definitions_or_additional_info": feature_definitions_or_additional_info,
    }

In [44]:
# Quick check of get_most_important_features_from_pca_loading_set on the first component

percentage_of_variance_to_explain = 0.8
pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42).fit(X)
pca_loadings = pca.components_

# Last expression of the cell, so the resulting dict is displayed as the output
get_most_important_features_from_pca_loading_set(pca_loadings[0])

{'most_important_feature_values': [0.2555150590681781,
  0.23904125608925914,
  0.2240565441305023,
  0.21705336176274082,
  0.18264835997444911],
 'most_important_codes': ['SG.ABS.PENB',
  'SH.PAR.LEVE.AL',
  'SG.LAW.EQRM.WK',
  'SH.HIV.1524.MA.ZS',
  'SH.HIV.INCD.YG.MA.P3'],
 'most_important_feature_names': ['There are periods of absence due to childcare accounted for in pension benefits (1=yes; 0=no)',
  'There is paid parental leave (1=yes; 0=no)',
  'Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)',
  'Prevalence of HIV, male (% ages 15-24)',
  'Incidence of HIV, ages 15-24, male (per 1,000 uninfected male population ages 15-24)'],
 'most_important_definitions_or_additional_info': ['The indicator measures whether pension contributions are paid or credited during maternity or parental leave, or the leave period is considered a qualifying period of employment used for the purpose of calculating pension benefits; or there are mechanisms to 

In [45]:
def pca_with_detailed_variance_and_components_info(
    percentage_of_variance_to_explain,
    X=X,
    n_most_important_features_from_loading=5,
    includeValues=False,
    includeDefinitionOrAdditionalInfo=False,
):
    """Fit a PCA and print, per principal component, its highest-loading features.

    Parameters
    ----------
    percentage_of_variance_to_explain : float
        Passed to ``PCA`` as ``n_components``.
    X : pandas.DataFrame
        Data to fit. Its column labels are also used to label the loadings, so
        a frame other than the notebook-level default is reported correctly.
    n_most_important_features_from_loading : int
        Number of features listed for each component.
    includeValues : bool
        Also print the absolute loading value of each listed feature.
    includeDefinitionOrAdditionalInfo : bool
        Also print the text returned by ``get_ind_info`` for each listed
        feature; the lookup is skipped entirely when this is False.

    Returns
    -------
    None
        The report is written with ``print``.
    """
    pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42)
    pca.fit(X)

    num_principal_components = len(pca.explained_variance_ratio_)

    print(
        f"in order to explain {percentage_of_variance_to_explain * 100} percentage of the variance with pca it requires"
    )
    print(f"{num_principal_components} principal components")
    print(
        f"the most important feature names, for the top {n_most_important_features_from_loading} from each PC, are\n"
    )

    for pc_number, loading in enumerate(pca.components_, start=1):
        print("---------")
        print(
            f"For Principal Component {pc_number}, the most important features names are:\n"
        )

        # Rank against the frame that was actually fitted: pass X explicitly
        # instead of relying on the helper's notebook-level default.
        top_loadings = sort_loading_set_for_principal_component(loading, X).iloc[
            :n_most_important_features_from_loading
        ]

        for code, value in zip(top_loadings.index, top_loadings.values):
            print(get_ind_name(code))

            if includeValues:
                print(f"with value {value}")

            if includeDefinitionOrAdditionalInfo:
                print(get_ind_info(code))
        print("\n")

In [46]:
# Report settings: variance share to cover and how many features to list per component
variance_percentage_to_explain = 0.8
num_top_features_per_pc = 5

# Every argument is passed by keyword so the call reads without the signature at hand
pca_with_detailed_variance_and_components_info(
    percentage_of_variance_to_explain=variance_percentage_to_explain,
    n_most_important_features_from_loading=num_top_features_per_pc,
    includeValues=True,
)

in order to explain 80.0 percentage of the variance with pca it requires
12 principal components
the most important feature names, for the top 5 from each PC, are

---------
For Principal Component 1, the most important features names are:

There are periods of absence due to childcare accounted for in pension benefits (1=yes; 0=no)
with value 0.2555150590681781
There is paid parental leave (1=yes; 0=no)
with value 0.23904125608925914
Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
with value 0.2240565441305023
Prevalence of HIV, male (% ages 15-24)
with value 0.21705336176274082
Incidence of HIV, ages 15-24, male (per 1,000 uninfected male population ages 15-24)
with value 0.18264835997444911


---------
For Principal Component 2, the most important features names are:

The age at which men and women can retire with full pension benefits is the same (1=yes; 0=no)
with value 0.413640411932588
The age at which men and women can retire with par

In [47]:
def get_top_feature_for_each_pc(
    percentage_of_variance_to_explain, X=X, n_most_important_features_from_loading=5
):
    """Return the name of the highest-loading feature of each principal component.

    Parameters
    ----------
    percentage_of_variance_to_explain : float
        Passed to ``PCA`` as ``n_components``.
    X : pandas.DataFrame
        Data to fit. Its column labels are also used to label the loadings, so
        a frame other than the notebook-level default is reported correctly.
    n_most_important_features_from_loading : int
        Kept so existing calls keep working. It no longer affects the result:
        only the single top-ranked feature of each component is looked up.

    Returns
    -------
    list
        One entry per principal component, in component order, holding the
        value ``get_ind_name`` returns for that component's top feature code.
    """
    pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42)
    pca.fit(X)

    top_feature_name_per_pc = []
    for loading in pca.components_:
        # Rank against the frame that was actually fitted (X is passed explicitly);
        # the first index label is the feature with the largest absolute loading.
        top_feature_code = sort_loading_set_for_principal_component(loading, X).index[0]
        top_feature_name_per_pc.append(get_ind_name(top_feature_code))

    return top_feature_name_per_pc

In [48]:
# Variance share the components must cover for this summary
variance_percentage_to_explain = 0.95

top_feature_names_for_pc_loadings = get_top_feature_for_each_pc(
    variance_percentage_to_explain
)

# Human-readable label of the data source chosen by the OECD switch
dataset_using = "OECD" if OECD else "World Bank Dataset"

# Heading first, then one HTML line per principal component
heading_html = f"""<h2>Assuming you want to explain {variance_percentage_to_explain * 100} percent of the PC variation in the {dataset_using}, 
              the top feature names that explain this per principal component are:</h2>\n"""
display(HTML(heading_html))

for pc_number, top_feature_name in enumerate(top_feature_names_for_pc_loadings, start=1):
    display(HTML(f"<strong>PC {pc_number}:</strong> {top_feature_name}"))