In [18]:
import pandas as pd
from sklearn.decomposition import PCA

In [19]:
# Input files, the indicator column to use in each one (paired by position
# with `datasets`), and the fraction of variance the PCA must explain.

variance_explain = 0.8
indicator = ["MATIMATM", "SH.STA.MMRT.NE"]
datasets = [
    "../OECD/Cleaned/HEALTH_MERGED_Threshold_80_n3.csv",
    "../WorldBankDatasets/Cleaned/AllMerged_Threshold_85_n3_MMRTNE.csv",
]

# Suffix used in the output file names: the characters of the fraction after
# the leading "0.", with a lone digit padded by a trailing zero (0.8 -> "80").
decimal_digits = str(variance_explain)[2:]
str_variance_explain = (
    decimal_digits + "0" if len(decimal_digits) == 1 else decimal_digits
)

In [20]:
# Run the PCA reduction on every (dataset, indicator) pair.
#
# For each pair: read the CSV, set the indicator column aside, fit a PCA on
# the remaining non-identifier columns, print a short summary, and write the
# identifier columns + principal components + indicator to
# "../<source folder>/Reduced/Reduced_<dataset name>_VAR_<suffix>.csv".

# Fail before any file is written if the two lists do not pair up one-to-one.
if len(datasets) != len(indicator):
    raise ValueError(
        "datasets and indicator must have the same length: "
        "got {} datasets and {} indicators".format(len(datasets), len(indicator))
    )

# Identifier columns, always referenced by name so the result does not depend
# on where they sit in the input file.
id_columns = ["Country", "Year"]

for dataset_path, mm_ind in zip(datasets, indicator):
    df = pd.read_csv(dataset_path)

    # Set the indicator column aside; it is re-attached after the reduction
    y = df[mm_ind]

    # Everything except the identifiers and the indicator goes into the PCA
    X = df.drop(columns=id_columns + [mm_ind])

    # A float n_components in (0, 1) asks scikit-learn to keep as many
    # components as needed to explain more than that fraction of the variance
    pca = PCA(n_components=variance_explain, random_state=42)
    pca.fit(X)

    # Print stats
    print("Number of PCs: {}".format(len(pca.explained_variance_ratio_)))
    print("Explained variation per PC: {}".format(pca.explained_variance_ratio_))
    print("Sum of explained variation: {}".format(pca.explained_variance_ratio_.sum()))
    print("\n")

    # Project the data onto the retained components
    PCA_data = pd.DataFrame(pca.transform(X))

    # Reduced dataset: identifiers, then components, then the indicator
    reduced_data = pd.concat([df[id_columns], PCA_data], axis=1)
    reduced_data[mm_ind] = y

    # Build the output path from the input path: the second path segment is
    # the source folder, the last segment minus its 4-character extension is
    # the dataset name.
    path_parts = dataset_path.split("/")
    output_path = (
        "../"
        + path_parts[1]  # Folder
        + "/Reduced/Reduced_"
        + path_parts[-1][:-4]  # Name of dataset
        + "_VAR_"
        + str_variance_explain  # Percent of variance explained
        + ".csv"
    )

    # NOTE(review): to_csv also writes the row index as an unnamed first
    # column (pandas default); left unchanged in case downstream readers
    # rely on it -- confirm before passing index=False.
    reduced_data.to_csv(output_path)

Number of PCs: 10
Explained variation per PC: [0.24473793 0.12906961 0.10025901 0.08537377 0.06885357 0.05243722
 0.04031638 0.03297561 0.02999361 0.02344255]
Sum of explained variation: 0.807459264221974


Number of PCs: 17
Explained variation per PC: [0.31810795 0.16028914 0.06789467 0.04488062 0.03199522 0.02573008
 0.02106988 0.01800833 0.01781576 0.01647727 0.01603059 0.01414032
 0.01269869 0.01238702 0.01143714 0.01035387 0.00993645]
Sum of explained variation: 0.8092530048097448


