In [101]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Select data
datasets = ['HEALTH_MERGED.csv', 'Gender_WorldBankData.csv']
OECD = True  # set to False to analyse the World Bank (WB) file instead

# Each source labels its maternal-mortality indicator with a different code,
# so the file to read and the indicator name are chosen together.
source_file, mm_ind = (
    (datasets[0], 'MATIMATM') if OECD else (datasets[1], 'SH.STA.MMRT')
)
df = pd.read_csv(source_file)


## Data Preparation

In [102]:
# Encode the country

# A single LabelEncoder turns the country names into integer codes; the
# fitted encoder is kept so codes can be mapped back to names later.
country_encoder = LabelEncoder()
country_codes = country_encoder.fit_transform(df['Country'])

# Build the working frame without touching the raw `df`:
# swap in the encoded country column, then discard features with no data.
data_encoded = (
    df.assign(Country=country_codes)
      .dropna(axis=1, how='all')
)
data_encoded.head()

Unnamed: 0,Country,Year,ACOLALCT,ADMDEALL_F_TOTAL_CRUDE_RATE_MORT,ADMDEALL_M_TOTAL_CRUDE_RATE_MORT,ADMDEALL_T_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_F_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_M_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_T_TOTAL_CRUDE_RATE_MORT,ADMDECAN_F_TOTAL_CRUDE_RATE_MORT,...,SRHSTGHB,SRHSTGHC,SRHSTGHD,SRHSTGHE,STRUPP80,STRUSFPL,TOBATBCT,TPRIBASI,VAPEVAPA,VAPEVAPY
0,0,2010,8.2,,,,,,,,...,,,,,501.375,2074.125,28.433333,,,
1,0,2011,7.7,,,,,,,,...,,,,,513.025,2116.825,,,,
2,0,2012,8.0,,,,,,,,...,,,,,524.475,2163.3,,,,
3,0,2013,8.3,,,,,,,,...,,,,,535.825,2213.025,,,,
4,0,2014,7.9,,,,,,,,...,,,,,547.2,2265.425,,,,


In [103]:
# Replace missing data with the per-column median
# Further investigation required: the median is computed over every row of
# the frame (all countries and years pooled), so a gap is filled with the
# same value whichever country or year it belongs to.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# Impute from data_encoded itself rather than from the raw df. The all-NaN
# columns have already been dropped from data_encoded, so the imputer's output
# lines up with the columns being assigned by construction, instead of
# depending on SimpleImputer discarding empty columns from df.
# Columns 0 and 1 are skipped -- assumes they are 'Country' and 'Year', as in
# the OECD frame's head() output; TODO confirm for the World Bank file.
data_encoded.iloc[:, 2:] = imputer.fit_transform(data_encoded.iloc[:, 2:])

data_encoded.head()

Unnamed: 0,Country,Year,ACOLALCT,ADMDEALL_F_TOTAL_CRUDE_RATE_MORT,ADMDEALL_M_TOTAL_CRUDE_RATE_MORT,ADMDEALL_T_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_F_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_M_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_T_TOTAL_CRUDE_RATE_MORT,ADMDECAN_F_TOTAL_CRUDE_RATE_MORT,...,SRHSTGHB,SRHSTGHC,SRHSTGHD,SRHSTGHE,STRUPP80,STRUSFPL,TOBATBCT,TPRIBASI,VAPEVAPA,VAPEVAPY
0,0,2010,8.2,31.5,38.25,34.65,11.6,17.65,14.2,52.2,...,85.75,65.6,40.3,69.45,501.375,2074.125,28.433333,100.0,3.0,3.833333
1,0,2011,7.7,31.5,38.25,34.65,11.6,17.65,14.2,52.2,...,85.75,65.6,40.3,69.45,513.025,2116.825,117.3875,100.0,3.0,3.833333
2,0,2012,8.0,31.5,38.25,34.65,11.6,17.65,14.2,52.2,...,85.75,65.6,40.3,69.45,524.475,2163.3,117.3875,100.0,3.0,3.833333
3,0,2013,8.3,31.5,38.25,34.65,11.6,17.65,14.2,52.2,...,85.75,65.6,40.3,69.45,535.825,2213.025,117.3875,100.0,3.0,3.833333
4,0,2014,7.9,31.5,38.25,34.65,11.6,17.65,14.2,52.2,...,85.75,65.6,40.3,69.45,547.2,2265.425,117.3875,100.0,3.0,3.833333


In [104]:
# Select year
data_2019 = data_encoded[data_encoded['Year'] == 2019]
data_2019.drop('Year', axis=1, inplace=True)

data_2019.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019.drop('Year', axis=1, inplace=True)


Unnamed: 0,Country,ACOLALCT,ADMDEALL_F_TOTAL_CRUDE_RATE_MORT,ADMDEALL_M_TOTAL_CRUDE_RATE_MORT,ADMDEALL_T_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_F_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_M_TOTAL_CRUDE_RATE_MORT,ADMDEALZ_T_TOTAL_CRUDE_RATE_MORT,ADMDECAN_F_TOTAL_CRUDE_RATE_MORT,ADMDECAN_M_TOTAL_CRUDE_RATE_MORT,...,SRHSTGHB,SRHSTGHC,SRHSTGHD,SRHSTGHE,STRUPP80,STRUSFPL,TOBATBCT,TPRIBASI,VAPEVAPA,VAPEVAPY
9,0,8.0,31.5,38.25,34.65,11.6,17.65,14.2,52.2,55.5,...,85.75,65.6,40.3,69.45,608.775,2554.825,24.466667,100.0,3.0,3.833333
22,1,9.1,31.5,38.25,34.65,11.6,17.65,14.2,52.2,55.5,...,85.75,65.6,40.3,69.45,506.475,2024.05,94.925,100.0,2.1,3.166667
35,2,11.6,31.5,38.25,34.65,11.6,17.65,14.2,52.2,55.5,...,85.75,67.8,44.9,71.2,222.5,838.975,19.257143,99.9,0.7,0.5
48,3,9.2,31.5,38.25,34.65,11.6,17.65,14.2,52.2,55.5,...,85.75,69.1,54.8,74.0,324.9,1087.45,117.3875,98.6,3.0,3.833333
61,4,11.2,31.5,38.25,34.65,11.6,17.65,14.2,52.2,55.5,...,85.75,67.3,25.4,67.1,170.525,751.9,29.0,100.0,3.0,3.833333


In [105]:
# Split the 2019 frame: the maternal-mortality indicator is the target (y),
# everything else except the encoded 'Country' id forms the feature matrix.
y = data_2019[mm_ind]
features_2019 = data_2019.drop(columns=[mm_ind, 'Country'])

# Scale data: standardise the features; X holds the scaled values that the
# dimensionality-reduction step is fitted on.
scaler = StandardScaler()
X = scaler.fit_transform(features_2019)

## Dimensionality Reduction

In [106]:
# Fit a two-component PCA on the scaled features (fit() returns the estimator)
pca = PCA(n_components=2, random_state=42).fit(X)

explained_ratio = pca.explained_variance_ratio_
print('Explained variation per PC: {}'.format(explained_ratio))

# NOTE: the explained variation is low for both datasets -- the preparation
# steps may need re-examining, or another dimensionality-reduction (DR) method tried.


Explained variation per PC: [0.12038912 0.10701917]


## K-Means Clustering

## Association Mining

## Latent Variable Modeling