In [28]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the jupyter_black extension for this session (presumably it auto-formats
# code cells with Black -- confirm against the jupyter_black documentation).
import jupyter_black
jupyter_black.load()

In [31]:
# Read the EPI spreadsheet and display its (rows, columns) shape.
df_epi = pd.read_excel("epi_data2.xlsx")
df_epi.shape

(180, 8)

In [36]:
# Alphabetically ordered country names as spelled in the EPI data.
epi_list = sorted(df_epi["country"].tolist())

In [42]:
# Print the alphabetically sorted EPI country names.
print(epi_list)

['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Dem. Rep. Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaic

In [32]:
# Read the consolidated COVID / socioeconomic CSV and display its (rows, columns) shape.
df_covid_socio = pd.read_csv("Consolidated_COVID_Socioeconomics.csv")
df_covid_socio.shape

(79, 11)

In [37]:
# Alphabetically ordered country names as spelled in the COVID/socioeconomic data.
covid_list = sorted(df_covid_socio["Country"].tolist())

In [40]:
# Countries in the COVID data whose exact spelling does not occur in the EPI data.
# Membership is tested against a set so each lookup is a hash probe rather than
# a scan of the whole EPI list.
epi_names = set(epi_list)
not_in = [country for country in covid_list if country not in epi_names]
not_in

['Egypt, Arab Rep.',
 'Iran, Islamic Rep.',
 'Russian Federation',
 'Turkiye',
 'United States']

In [146]:
def load_and_merge_data(
    epi_path="epi_data2.xlsx",
    covid_path="Consolidated_COVID_Socioeconomics.csv",
):
    """
    Load the EPI and the COVID/socioeconomic datasets and join them by country.

    Parameters
    ----------
    epi_path : str, default "epi_data2.xlsx"
        Excel file with the EPI data; it must contain a "country" column.
    covid_path : str, default "Consolidated_COVID_Socioeconomics.csv"
        CSV file with the COVID and socioeconomic data. Its column names are
        lower-cased after loading, so it must contain a column that becomes
        "country".

    Returns
    -------
    pandas.DataFrame
        Inner join of both datasets on "country". Countries that are missing
        from either file (after the name harmonisation below) are dropped.
    """
    # Import EPI data
    df_epi = pd.read_excel(epi_path)

    # Import COVID and socioeconomic data; lower-case its column names so the
    # join key is spelled "country" in both frames.
    df_covid_socio = pd.read_csv(covid_path).rename(columns=str.lower)

    # Country spellings used by the COVID file -> spellings used by the EPI file.
    country_map = {
        "Egypt, Arab Rep.": "Egypt",
        "Iran, Islamic Rep.": "Iran",
        "Russian Federation": "Russia",
        "Turkiye": "Turkey",
        "United States": "United States of America",
    }
    # Apply all renames in one pass; names that are not in the map stay as-is.
    df_covid_socio["country"] = df_covid_socio["country"].replace(country_map)

    # Merge the dataframes together (default inner join on the shared key)
    df_data = df_epi.merge(df_covid_socio, on="country")

    return df_data

In [147]:
# Build the merged dataset and preview its first rows.
df_data = load_and_merge_data()
df_data.head()

Unnamed: 0,country,pm2.5_exposure,overall_epi,environ_health,air_quality,solid_fuels,sanitation_water,unsafe_water,gdp,che_2020,che_2019,cum_cases,cum_cases_100k,cum_deaths,cum_deaths_100k,lexp_avg,smoking_prev,alcohol
0,Afghanistan,16.0,43.6,16.0,15.5,7.4,28.1,27.8,1883.120964,80.29,74.23,225786,580.004,7946,20.412,62.1,7.8,24.4
1,Albania,36.7,47.1,40.0,37.5,34.5,54.1,50.3,29206.92036,-,-,334090,11609.215,3604,125.235,76.65,20.1,13.8
2,Algeria,12.1,29.6,42.0,39.4,78.4,53.3,49.1,19522.1696,214.85,250.56,-,-,-,-,76.45,15.1,25.8
3,Argentina,48.2,41.1,56.3,52.0,60.5,64.8,59.5,54577.38948,863.71,958.52,10044957,22225.434,130472,288.682,75.4,22.5,14.6
4,Armenia,12.4,48.3,40.7,32.1,56.7,57.3,61.7,25707.01597,551.54,549.47,449465,15168.056,8751,295.319,72.0,23.9,21.2


In [60]:
# Look up the row whose country was renamed to "Russia" during loading.
df_data[df_data["country"].eq("Russia")]

Unnamed: 0,country,pm2.5_exposure,overall_epi,environ_health,air_quality,solid_fuels,sanitation_water,unsafe_water,gdp,che_2020,che_2019,cum_cases,cum_cases_100k,cum_deaths,cum_deaths_100k,lexp_avg,smoking_prev,alcohol
62,Russia,39.6,37.5,50.6,48.8,63.4,55.5,55.8,60956.30859,773.88,654,23014969,15770.757,400023,274.111,69.5,25.4,20.8


In [57]:
# (rows, columns) of the merged frame.
df_data.shape

(79, 18)

In [148]:
def data_preprocessing(df):
    """
    Turn the merged dataset into standardised numeric features.

    Steps: drop the unused columns, convert the "-" missing-value marker to
    NaN, cast every feature to float, fill NaN with the column mean and
    standardise each feature column with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "country", "cum_cases", "cum_deaths" and "che_2019";
        every other column is treated as a numeric feature.

    Returns
    -------
    pandas.DataFrame
        "country" followed by the scaled feature columns, carrying the same
        row index as the input.
    """
    # Drop columns that we don't need
    df = df.drop(["cum_cases", "cum_deaths", "che_2019"], axis="columns")

    # Every remaining column except the country name is a feature
    col_names = list(df.columns)
    col_names.remove("country")

    # "-" marks a missing value: turn it into NaN, then cast all features to float
    features = df[col_names].replace("-", np.nan).astype(float)

    # Replace NaN with the mean of each column (the mean skips NaN)
    features = features.fillna(features.mean())

    # Do standard scaling
    scaled = StandardScaler().fit_transform(features)

    # Reuse the input index: the scaler returns a bare array, and a fresh
    # 0..n-1 index would misalign with the country column whenever the input
    # index is not itself 0..n-1 (e.g. after filtering rows).
    df_scaled = pd.DataFrame(scaled, columns=col_names, index=df.index)

    # Create output dataframe
    return pd.concat([df["country"], df_scaled], axis="columns")

In [149]:
# Run the preprocessing on the merged data and preview the result.
df_preprocessed = data_preprocessing(df_data)
df_preprocessed.head()

Unnamed: 0,country,pm2.5_exposure,overall_epi,environ_health,air_quality,solid_fuels,sanitation_water,unsafe_water,gdp,che_2020,cum_cases_100k,cum_deaths_100k,lexp_avg,smoking_prev,alcohol
0,Afghanistan,-0.873488,-0.055078,-1.393668,-1.21266,-1.584896,-1.083314,-1.0015,-0.835658,-0.7062674,-1.176035,-1.172313,-1.562401,-1.133309,1.769052
1,Albania,-0.106044,0.202604,-0.377245,-0.296152,-0.739347,-0.149916,-0.201677,-0.62811,-8.856398e-17,-0.54469,-0.340946,0.411334,0.335265,0.026634
2,Algeria,-1.018079,-1.085805,-0.292543,-0.216999,0.63038,-0.178636,-0.244334,-0.701674,-0.6538551,0.0,0.0,0.384204,-0.261717,1.999183
3,Argentina,0.320315,-0.239136,0.313075,0.307911,0.071881,0.234213,0.125362,-0.435399,-0.4011186,0.063013,0.955375,0.24177,0.621816,0.158137
4,Armenia,-1.006957,0.290952,-0.347599,-0.521113,-0.046683,-0.035036,0.203567,-0.654695,-0.5227115,-0.340972,1.008014,-0.219447,0.78897,1.243039


In [150]:
def pdist2(X, Y):
    """
    Pairwise Euclidean distance between all points of two datasets.

    Parameters
    ----------
    X : array-like of shape (mx, n)
        First dataset: mx observations with n features each.
    Y : array-like of shape (my, n)
        Second dataset: my observations with the same n features.

    Returns
    -------
    numpy.ndarray of shape (mx, my)
        Entry [i, j] is the Euclidean distance between X[i] and Y[j].
    """
    # Work in float so that plain lists are accepted and integer (in
    # particular unsigned) inputs cannot wrap around when subtracted.
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    # Broadcast to an (mx, my, ...) array holding every X[i] - Y[j]. This
    # replaces the Python double loop at the cost of one temporary array.
    diff = X[:, np.newaxis] - Y[np.newaxis]

    # Sum the squares over everything except the two observation axes. For
    # 1-D inputs there is nothing to sum over and the empty tuple leaves the
    # squared differences untouched.
    feature_axes = tuple(range(2, diff.ndim))
    return np.sqrt(np.sum(diff**2, axis=feature_axes))

In [151]:
def kmeans(data, k=4, max_iter=1000, seed=None):
    """
    Cluster the data into k groups with k-means (alternating assign/update).

    Parameters
    ----------
    data : array-like of shape (n, d)
        n observations with d features each.
    k : int, default 4
        Number of clusters.
    max_iter : int, default 1000
        Upper bound on the number of assign/update rounds.
    seed : int or None, default None
        Seed for choosing the initial centroids. With None the global
        ``np.random`` state is used, so a prior ``np.random.seed`` call
        still controls the result.

    Returns
    -------
    labels : numpy.ndarray of shape (n,)
        Index (0 .. k-1) of the centroid closest to each observation.
    centroids : numpy.ndarray of shape (k, d)
        Final centroid coordinates.

    Raises
    ------
    ValueError
        If data contains no observations.
    """
    # Work in float so centroid means are not truncated for integer input.
    data = np.asarray(data, dtype=float)
    n_obs = data.shape[0]
    if n_obs == 0:
        raise ValueError("kmeans needs at least one observation")

    # Initial centroids are k distinct observations. Sampling with
    # replacement (only used when k > n) can pick the same point twice,
    # which immediately produces an empty cluster.
    rng = np.random if seed is None else np.random.default_rng(seed)
    start = rng.choice(n_obs, size=k, replace=n_obs < k)
    centroids = data[start]  # fancy indexing copies, so data is never modified

    labels = np.zeros(n_obs, dtype=int)
    tol = np.finfo(float).eps

    # Bounded loop to avoid iterating forever; at least one round always runs,
    # so the returned labels are always real assignments.
    for _ in range(max_iter):
        # Remember the current centroids for the convergence check.
        old_centroids = centroids.copy()

        # Labels are the index of the closest centroid for every observation.
        dist = pdist2(centroids, data)
        labels = np.argmin(dist, axis=0)
        # Distance of every observation to its own (closest) centroid.
        nearest = dist.min(axis=0)

        # Update centroids using the new cluster labels.
        for label in range(k):
            members = data[labels == label]
            if len(members) > 0:
                # Cluster is not empty, move its centroid to the new mean.
                centroids[label] = members.mean(axis=0)
            else:
                # Cluster is empty: restart it at the observation that lies
                # furthest from its closest centroid.
                outlier = np.argmax(nearest)
                centroids[label] = data[outlier]
                # Do not hand the same outlier to a second empty cluster.
                nearest[outlier] = -np.inf

        # Stop as soon as no centroid coordinate moved by more than eps.
        if not np.any(np.abs(centroids - old_centroids) > tol):
            break

    # Return labels and centroids for pretty plotting.
    return (labels, centroids)

In [152]:
# kmeans draws its starting centroids from NumPy's global random state, so fix
# the seed to make the clustering below reproducible across kernel restarts.
np.random.seed(42)
data = df_preprocessed.drop("country", axis="columns").to_numpy()
labels, centroids = kmeans(data, k=4)

In [153]:
# Print the cluster index assigned to each row, followed by the centroid coordinates.
print(labels, centroids)

[1 0 0 0 0 2 3 3 1 0 2 1 0 0 0 2 0 3 0 0 0 2 1 1 0 3 1 2 2 1 1 1 0 2 1 1 0
 3 2 2 2 2 0 3 2 1 3 1 0 1 1 2 1 2 3 1 0 1 0 2 3 0 0 3 1 0 2 1 2 1 2 2 0 0
 3 2 2 1 1] [[-0.27179802  0.05413028 -0.09084679 -0.16613922 -0.1840978   0.00400466
  -0.03371405 -0.47637612 -0.463584   -0.20393696  0.8433213  -0.03999085
   0.49296719  0.58141316]
 [-0.59610426 -0.90430765 -1.09039812 -0.98135973 -1.1776539  -1.2058418
  -1.15172716 -0.74984647 -0.68752255 -0.71782301 -0.65658512 -1.1389313
  -0.62002988 -0.12130764]
 [ 1.39547858  1.22351526  1.4607859   1.47854301  1.22676608  1.34420384
   1.4125263   1.35416621  1.45941175  1.29049764  0.21678248  1.16032696
   0.18004953 -0.15183519]
 [-0.82468177 -0.56307921 -0.31063857 -0.40825348  0.52202721 -0.05363917
  -0.21492641  0.02200049 -0.33714657 -0.51782128 -0.8809714   0.25348493
  -0.12286959 -0.72502737]]


In [159]:
# Show every row when a frame is displayed (None removes the row limit).
pd.set_option("display.max_rows", None)

# Pair each country name with the cluster index it was assigned.
country_names = list(df_preprocessed["country"].values)
df_output = pd.DataFrame(
    list(zip(country_names, labels)), columns=["country", "cluster"]
)
df_output.sort_values("cluster")

Unnamed: 0,country,cluster
61,Romania,0
42,Kazakhstan,0
32,Hungary,0
48,Mexico,0
24,Ecuador,0
56,Panama,0
58,Poland,0
20,Croatia,0
19,Costa Rica,0
18,Colombia,0
