In [78]:
import pandas as pd
from sklearn.decomposition import PCA

# Used for displaying HTML
from IPython.display import display, HTML

In [79]:
# Import utility functions
import sys

sys.path.append("../utility_functions")
from ind_utility_functions import (
    get_ind_name,
    get_ind_info,
)

In [80]:
# Candidate input files: position 0 is the OECD file, position 1 the World Bank file.
datasets = [
    "../OECD/Cleaned/HEALTH_MERGED_Threshold_80_n3.csv",
    "../WorldBankDatasets/Cleaned/AllMerged_Threshold_85_n3_MMRTNE.csv",
]

# Source toggle: True loads datasets[0] (OECD), False loads datasets[1] (World Bank).
OECD = False

# Load the chosen file, then record the indicator code that source uses for the
# column this notebook singles out (mm_ind).
df = pd.read_csv(datasets[0] if OECD else datasets[1])
mm_ind = "MATIMATM" if OECD else "SH.STA.MMRT.NE"

## Data Preparation

In [81]:
# Reduce to only gender indicators

# if not OECD:
#    gender_ind = pd.read_csv('../WorldBankDatasets/Gender_WorldBankData.csv').columns
#    df = df.drop(columns=[col for col in df if col not in gender_ind])

# Select year
# year = 2015
# df = df[df["Year"] == year]
# df.drop("Year", axis=1, inplace=True)

# Scale predictor, which is currently unscaled
# scaler = MinMaxScaler()
# df[mm_ind] = scaler.fit_transform(df[mm_ind].to_numpy().reshape(-1, 1))

In [82]:
# Isolate the mm_ind indicator column as y
y = df[mm_ind]

# X keeps every remaining column except Country, Year and the mm_ind column
X = df.drop(["Country", "Year", mm_ind], axis="columns")

# PCA

In [83]:
# Fit a two-component PCA on X (fit returns the estimator itself) and report
# how much of the variance those components capture.
pca = PCA(n_components=2, random_state=42).fit(X)

print(f"Number of PCs: {len(pca.explained_variance_ratio_)}")
print(f"Explained variation per PC: {pca.explained_variance_ratio_}")
print(f"Sum of explained variation: {pca.explained_variance_ratio_.sum()}")

Number of PCs: 2
Explained variation per PC: [0.31839939 0.15997576]
Sum of explained variation: 0.47837515227490146


# Detailed information of the features that contribute to PCs

In [84]:
# note this only works one loading set at a time - Ex: pca_loadings[0]
def sort_loading_set_for_principal_component(loadings, X=X):
    """Rank the features of a single principal component by absolute loading.

    Works on one loading vector per call (for example ``pca_loadings[0]``),
    not on the whole components matrix.

    Parameters
    ----------
    loadings : sequence of loading values, one per column of ``X``.
    X : object whose ``columns`` supply the feature labels. The default is the
        notebook-level ``X`` as it existed when this cell was executed.

    Returns
    -------
    pandas.Series
        Absolute loading values indexed by feature label, largest first. A
        larger value means the feature weighs more heavily in the component.
    """
    # Label every loading with its feature and discard the sign
    loading_magnitudes = pd.Series(loadings, index=X.columns).abs()

    # Strongest contributors come first
    return loading_magnitudes.sort_values(ascending=False)

In [85]:
# Quick check of sort_loading_set_for_principal_component on the first component

percentage_of_variance_to_explain = 0.8
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of explained variance.
pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42).fit(X)
pca_loadings = pca.components_

sort_loading_set_for_principal_component(pca_loadings[0])

SH.MMR.LEVE.AL            0.067942
SG.ABS.PENB               0.065640
UIS.E.2.F                 0.063285
SE.PRM.ENRL               0.063041
SE.PRM.ENRL.FE            0.062892
                            ...   
SG.HME.TRVL.EQ            0.000197
SP.POP.3034.MA.5Y         0.000150
SL.TLF.ACTI.1524.FE.ZS    0.000114
SG.LOC.LIVE.EQ            0.000000
SG.CTR.TRVL.EQ            0.000000
Length: 777, dtype: float64

In [86]:
def get_most_important_features_from_pca_loading_set(
    pca_loading_set,
    n_most_important_features_from_loading=5,
    includeName=True,
    includeDefinitionOrAdditionalInfo=True,
    X=None,
):
    """Describe the strongest features of one principal component.

    Parameters
    ----------
    pca_loading_set : a single loading vector (one row of ``pca.components_``).
    n_most_important_features_from_loading : how many top-ranked features to keep.
    includeName : when True, look up each feature's name with ``get_ind_name``.
    includeDefinitionOrAdditionalInfo : when True, look up each feature's extra
        information with ``get_ind_info``.
    X : optional frame whose columns label the loadings. When omitted, the
        default of ``sort_loading_set_for_principal_component`` is used, which
        matches the behaviour before this parameter existed. Pass the frame the
        PCA was fitted on whenever it is not the notebook-level ``X``.

    Returns
    -------
    dict
        Four lists, all ordered by descending absolute loading, under the keys
        ``most_important_feature_values``, ``most_important_codes``,
        ``most_important_feature_names`` and
        ``most_important_definitions_or_additional_info``. The last two lists
        are empty when the matching ``include*`` flag is False.
    """
    # Forward X only when the caller supplied one, so the helper's own default
    # still applies otherwise.
    if X is None:
        ranked_loadings = sort_loading_set_for_principal_component(pca_loading_set)
    else:
        ranked_loadings = sort_loading_set_for_principal_component(pca_loading_set, X)

    # Keep just the top n_most_important_features_from_loading features
    top_loadings = ranked_loadings.iloc[0:n_most_important_features_from_loading]

    feature_codes = []
    feature_values = []
    feature_names = []
    feature_definitions_or_additional_info = []

    for feature_code, feature_value in zip(top_loadings.index, top_loadings.values):
        feature_codes.append(feature_code)
        feature_values.append(feature_value)

        if includeName:
            feature_names.append(get_ind_name(feature_code))

        if includeDefinitionOrAdditionalInfo:
            feature_definitions_or_additional_info.append(get_ind_info(feature_code))

    return {
        "most_important_feature_values": feature_values,
        "most_important_codes": feature_codes,
        "most_important_feature_names": feature_names,
        "most_important_definitions_or_additional_info": feature_definitions_or_additional_info,
    }

In [87]:
# Quick check of get_most_important_features_from_pca_loading_set on the first component

percentage_of_variance_to_explain = 0.8
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of explained variance.
pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42).fit(X)
pca_loadings = pca.components_

get_most_important_features_from_pca_loading_set(pca_loadings[0])

{'most_important_feature_values': [0.06794199650536144,
  0.06563954219032192,
  0.06328470775380518,
  0.06304078994328643,
  0.06289246667113423],
 'most_important_codes': ['SH.MMR.LEVE.AL',
  'SG.ABS.PENB',
  'UIS.E.2.F',
  'SE.PRM.ENRL',
  'SE.PRM.ENRL.FE'],
 'most_important_feature_names': ['Paid leave of at least 14 weeks available to mothers (1=yes; 0=no)',
  'There are periods of absence due to childcare accounted for in pension benefits (1=yes; 0=no)',
  'Enrolment in lower secondary education, female (number)',
  'Primary education, pupils',
  'Enrolment in primary education, female (number)'],
 'most_important_definitions_or_additional_info': ['The indicator measures  whether women are legally entitled to at least 14 weeks (98 calendar days) of paid leave for the birth of a child through maternity leave, parental leave or a combination of both.',
  'The indicator measures whether pension contributions are paid or credited during maternity or parental leave, or the leave peri

In [88]:
# this function prints, for each PC, the features with the largest absolute loadings
def pca_with_detailed_variance_and_components_info(
    percentage_of_variance_to_explain,
    X=X,
    n_most_important_features_from_loading=5,
    includeValues=False,
    includeDefinitionOrAdditionalInfo=False,
):
    """Fit a PCA and print the strongest features of every retained component.

    Parameters
    ----------
    percentage_of_variance_to_explain : passed to ``PCA`` as ``n_components``;
        the printed summary multiplies it by 100, so a fraction such as 0.8 is
        expected.
    X : frame to fit. Its columns also label the loadings. The default is the
        notebook-level ``X`` as it existed when this cell was executed.
    n_most_important_features_from_loading : number of features listed per PC.
    includeValues : when True, print each feature's absolute loading.
    includeDefinitionOrAdditionalInfo : when True, print the text returned by
        ``get_ind_info`` for each feature.

    Returns
    -------
    None. Everything is written to stdout.
    """
    pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42)
    pca.fit(X)

    print(
        f"in order to explain {percentage_of_variance_to_explain * 100} percentage of the variance with pca it requires"
    )
    print(f"{len(pca.explained_variance_ratio_)} principal components")
    print(
        f"the most important feature names, for the top {n_most_important_features_from_loading} from each PC, are\n"
    )

    for pc_number, loading in enumerate(pca.components_, start=1):
        print("---------")
        print(
            f"For Principal Component {pc_number}, the most important features names are:\n"
        )

        # Rank against the columns of the X that was fitted above, so the
        # feature labels always line up with these loadings even when the
        # caller passes a frame other than the notebook-level X.
        top_loadings = sort_loading_set_for_principal_component(loading, X).iloc[
            0:n_most_important_features_from_loading
        ]

        for code, value in zip(top_loadings.index, top_loadings.values):
            print(get_ind_name(code))

            if includeValues:
                print(f"with value {value}")

            # Definitions are looked up only when they will actually be shown
            if includeDefinitionOrAdditionalInfo:
                print(get_ind_info(code))
        print("\n")

In [89]:
# Report the strongest features (with their loading magnitudes) for every PC
# needed to reach the requested share of explained variance.
variance_percentage_to_explain = 0.8
num_top_features_per_pc = 5

pca_with_detailed_variance_and_components_info(
    percentage_of_variance_to_explain=variance_percentage_to_explain,
    n_most_important_features_from_loading=num_top_features_per_pc,
    includeValues=True,
)

in order to explain 80.0 percentage of the variance with pca it requires
17 principal components
the most important feature names, for the top 5 from each PC, are

---------
For Principal Component 1, the most important features names are:

Paid leave of at least 14 weeks available to mothers (1=yes; 0=no)
with value 0.06794199650536144
There are periods of absence due to childcare accounted for in pension benefits (1=yes; 0=no)
with value 0.06563954219032192
Enrolment in lower secondary education, female (number)
with value 0.06328470775380518
Primary education, pupils
with value 0.06304078994328643
Enrolment in primary education, female (number)
with value 0.06289246667113423


---------
For Principal Component 2, the most important features names are:

A woman can sign a contract in the same way as a man (1=yes; 0=no)
with value 0.13351260165312756
The law prohibits discrimination in employment based on gender (1=yes; 0=no)
with value 0.12596759346794192
There is legislation on sexu

In [90]:
# returns a list with the name of the top feature of each principal component
def get_top_feature_for_each_pc(
    percentage_of_variance_to_explain, X=X, n_most_important_features_from_loading=5
):
    """Return the name of the highest-|loading| feature of every retained PC.

    Parameters
    ----------
    percentage_of_variance_to_explain : passed to ``PCA`` as ``n_components``.
    X : frame to fit. Its columns also label the loadings. The default is the
        notebook-level ``X`` as it existed when this cell was executed.
    n_most_important_features_from_loading : accepted for backward
        compatibility only. The function always returns exactly one feature per
        component, so this value does not affect the result.

    Returns
    -------
    list
        One entry per principal component, in component order, holding what
        ``get_ind_name`` returns for that component's top-ranked feature code.
    """
    pca = PCA(n_components=percentage_of_variance_to_explain, random_state=42)
    pca.fit(X)

    top_feature_name_per_pc = []
    for loading in pca.components_:
        # Rank against the columns of the X that was fitted above, so labels
        # and loadings always belong to the same frame.
        ranked_loadings = sort_loading_set_for_principal_component(loading, X)

        # Only the first-ranked code is needed, so resolve just that one name
        top_feature_name_per_pc.append(get_ind_name(ranked_loadings.index[0]))

    return top_feature_name_per_pc

In [91]:
variance_percentage_to_explain = 0.95

top_feature_names_for_pc_loadings = get_top_feature_for_each_pc(
    variance_percentage_to_explain
)

# Label for the data source selected through the OECD flag
dataset_using = "OECD" if OECD else "World Bank Dataset"

display(
    HTML(
        f"""<h2>Assuming you want to explain {variance_percentage_to_explain * 100} percent of the PC variation in the {dataset_using}, 
              the top feature names that explain this per principal component are:</h2>\n"""
    )
)

# One bold "PC k:" line per principal component, numbered from 1
for i, top_feature_name in enumerate(top_feature_names_for_pc_loadings):
    display(HTML(f"<strong>PC {i+1}:</strong> {top_feature_name}"))