# Data Mining project

### Clustering of countries for COVID-19 cases based on disease prevalence, health systems and environmental indicators

Authors: Inga Wohlert, Nicolas Pablo Viola, Jakob Nyström

In [3]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

In [3]:
import jupyter_black
jupyter_black.load()

## 1. Data loading and preprocessing

In [11]:
def load_and_merge_data():
    """
    Read the three source files and join them into a single dataframe.

    The EPI data is read from an Excel file, the COVID/socioeconomic data
    and the disease data from CSV files. Column names of the
    COVID/socioeconomic data are lower-cased, a handful of country names
    are rewritten to a common spelling in the COVID/socioeconomic and
    disease data, and the disease data's "location_name" column is renamed
    to "country" so that all three frames share the same join key.

    Returns:
        df_data: Dataframe holding the columns of all three datasets,
            restricted to the countries present in every one of them
            (the merges use pandas' default inner join on "country").
    """
    # Alternative spellings -> the spelling used as join key
    name_aliases = {
        "Egypt, Arab Rep.": "Egypt",
        "Iran, Islamic Rep.": "Iran",
        "Iran (Islamic Republic of)": "Iran",
        "Russian Federation": "Russia",
        "Turkiye": "Turkey",
        "United States": "United States of America",
    }

    def harmonize(frame, column):
        # Overwrite every alias found in `column` with its common spelling
        for alias, common_name in name_aliases.items():
            frame.loc[frame[column] == alias, column] = common_name

    # EPI data
    epi = pd.read_excel("data/epi_data2.xlsx")

    # COVID and socioeconomic data, with lower-cased column names
    covid_socio = pd.read_csv("data/Consolidated_COVID_Socioeconomics.csv")
    covid_socio = covid_socio.rename(columns=str.lower)

    # Disease data
    disease = pd.read_csv("data/diseases.csv")

    # Make the country spelling consistent before joining
    harmonize(covid_socio, "country")
    harmonize(disease, "location_name")
    disease = disease.rename(columns={"location_name": "country"})

    # Join everything on the country name
    return epi.merge(covid_socio, on="country").merge(disease, on="country")

In [14]:
def data_preprocessing(df):
    """
    Basic preprocessing of the data incl. dropping columns, reformatting,
    replacing NaNs and standardizing data. The logic for replacing NaNs
    is to use the mean of each column. We don't want to drop these rows
    (countries) altogether, and this is the least "biased", simple approach.

    Args:
        df: Dataframe containing the joined data from all sources, but
            which has not been preprocessed. Must contain a "country"
            column and the columns listed in `unused_columns` below.

    Returns:
        df: Dataframe with the "country" column first, followed by the
            standardized feature columns. The row index of the input is
            preserved.
    """

    # Drop columns that will not be used in the clustering
    unused_columns = [
        "cum_cases",
        "cum_deaths",
        "che_2019",
        "Vitamin A deficiency",
        "Dietary iron deficiency",
        "Chronic kidney disease due to diabetes mellitus type 1",
        "Chronic kidney disease due to diabetes mellitus type 2",
    ]
    df = df.drop(unused_columns, axis="columns")

    # Create list with column names except "country"
    col_names = list(df.columns)
    col_names.remove("country")

    # Change format of missing values from "-" to np.nan
    df = df.replace("-", np.nan)

    # Cast every feature column to float and replace its NaNs with the
    # column mean (computed while ignoring NaNs)
    for col in col_names:
        values = df[col].astype(float)
        df[col] = values.fillna(np.nanmean(values))

    # Do standard scaling of all feature columns
    countries = df["country"]  # Save column for later use
    scaled = StandardScaler().fit_transform(df.drop("country", axis="columns"))

    # Rebuild the frame on the ORIGINAL index: pd.concat aligns on index
    # labels, so a fresh 0..n-1 index would misalign the country labels
    # (and introduce NaN rows) whenever the input index is not 0..n-1.
    df_scaled = pd.DataFrame(scaled, columns=col_names, index=df.index)

    # Join standardized data with country labels
    return pd.concat([countries, df_scaled], axis="columns")

In [15]:
# Load and merge the raw datasets, then clean and standardize the features
df_data = load_and_merge_data()
df_preprocessed = data_preprocessing(df_data)
# Last expression of the cell: shows the first rows as a visual sanity check
df_preprocessed.head()

Unnamed: 0,country,pm2.5_exposure,overall_epi,environ_health,air_quality,solid_fuels,sanitation_water,unsafe_water,gdp,che_2020,cum_cases_100k,cum_deaths_100k,lexp_avg,smoking_prev,alcohol,Chronic kidney disease due to hypertension,Diabetes and kidney diseases,Lower respiratory infections,Nutritional deficiencies,Respiratory infections and tuberculosis
0,Afghanistan,-0.876231,-0.058749,-1.394648,-1.215604,-1.59554,-1.087721,-1.005544,-0.842364,-0.7120921,-1.180212,-1.162468,-1.587326,-1.141526,1.754219,-0.301219,-0.26936,-0.132512,-0.159508,-0.132512
1,Albania,-0.112437,0.197457,-0.382678,-0.302744,-0.750666,-0.157708,-0.208826,-0.635441,-8.823636000000001e-17,-0.551338,-0.3361936,0.396329,0.323736,0.019097,-0.402806,-0.407195,-0.28179,-0.246332,-0.28179
2,Algeria,-1.020134,-1.083575,-0.298347,-0.223906,0.617968,-0.186324,-0.251318,-0.708784,-0.6598738,0.0,-2.240361e-16,0.369062,-0.271899,1.983386,-0.113897,-0.071885,-0.176403,-0.165931,-0.176403
3,Argentina,0.311893,-0.241754,0.304618,0.298914,0.059914,0.225028,0.116943,-0.443312,-0.4080722,0.053988,0.9521885,0.225912,0.60964,0.15005,-0.17806,-0.20212,-0.152762,-0.160323,-0.152762
4,Armenia,-1.009064,0.2853,-0.353162,-0.526809,-0.058555,-0.043245,0.194844,-0.661946,-0.5292153,-0.348417,1.004505,-0.237623,0.776418,1.230409,-0.397942,-0.402284,-0.27911,-0.245055,-0.27911


## 2. Implementation of K-means algorithm

In [16]:
class KMeans:
    """
    Performs K-means clustering using Lloyd's algorithm.

    Attributes:
        n_clusters: Number of clusters (K) to form.
        max_iter: Upper bound on the number of assign/update iterations
            performed by `fit_predict`.
        tol: Convergence threshold. Iteration stops as soon as no centroid
            coordinate moves by more than this amount in one update.
        shape: Shape of the data passed to the latest `fit_predict` call
            (None before the first call).
        labels: Array with the cluster index assigned to each data point
            (None before the first call to `fit_predict`).
        centroids: Array of shape (n_clusters, n_features) holding the
            current centroids.
        old_centroids: Centroids as they were before the latest update.

    Methods:
        initialize_centroids: Picks data points as starting centroids.
        calculate_centroids: Moves each centroid to the mean of its cluster.
        assign_clusters: Labels each point with its closest centroid.
        find_largest_outlier: Index of the point furthest from a centroid.
        fit_predict: Runs the full algorithm and returns the labels.
    """

    def __init__(self, n_clusters=4, max_iter=500, tol=0.0001):
        # Hyperparameters
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.shape = None  # Placeholder for shape of input data

        # Arrays for labels and centroids
        self.labels = None
        self.centroids = None
        self.old_centroids = None

    def initialize_centroids(self, data):
        """
        For the first iteration, centroids are initialized randomly as
        one of the points among the input data.

        Points are drawn without replacement so that two clusters never
        start from the same row. Only when there are fewer points than
        clusters is drawing with replacement used as a fallback.
        """
        # Work on floats so that later mean updates are not truncated
        data = np.asarray(data, dtype=float)
        n_samples, n_features = data.shape

        # Select K points from the data as initial centroids
        chosen = np.random.choice(
            n_samples, size=self.n_clusters, replace=n_samples < self.n_clusters
        )
        self.centroids = data[chosen]  # Fancy indexing returns a copy

        # Initialize array for storing centroids from the last iteration
        self.old_centroids = np.zeros((self.n_clusters, n_features))

    def calculate_centroids(self, data):
        """
        Calculates new centroids given the latest cluster assignments.
        Each centroid is the mean of all the points in its cluster. If
        there are no points assigned to a cluster, this cluster centroid
        is set to the point furthest away from the current centroid.
        """
        # Iterate through each centroid
        for label in range(self.n_clusters):
            members = data[self.labels == label]

            # If the cluster is not empty, use mean as the new centroid
            if len(members) > 0:
                self.centroids[label, :] = members.mean(axis=0)

            # Otherwise, use the outlier logic described above
            else:
                outlier_idx = self.find_largest_outlier(data, self.centroids[label, :])
                self.centroids[label, :] = data[outlier_idx]

    def assign_clusters(self, data):
        """
        Calculate the distance between each data point and each centroid.
        Assign labels to each point based on the closest centroid.
        """
        # Broadcast to (n_points, n_clusters, n_features) so that all
        # point-to-centroid differences are computed in one step
        offsets = data[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]

        # Euclidean distances: one row per point, one column per centroid
        distance_matrix = np.sqrt(np.sum(np.power(offsets, 2), axis=2))

        # Find the label of the closest centroid
        self.labels = np.argmin(distance_matrix, axis=1)

    @staticmethod
    def find_largest_outlier(data, centroid):
        """Find the point furthest away from a given cluster centroid."""
        distances = np.sqrt(np.sum(np.power(data - centroid, 2), axis=1))
        outlier_idx = np.argmax(distances)
        return outlier_idx

    def fit_predict(self, data):
        """
        Runs the clustering algorithm on the input data and returns
        cluster labels.

        Args:
            data: 2-D numpy array or dataframe with one row per data point
                and one column per feature.

        Returns:
            Array with the cluster index (0 .. n_clusters - 1) of each row.

        Raises:
            ValueError: If data is not 2-dimensional or has no rows.
        """
        # If data comes as a dataframe, convert to numpy array
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()

        # Use floats throughout so integer input does not truncate centroids
        data = np.asarray(data, dtype=float)
        if data.ndim != 2 or data.shape[0] == 0:
            raise ValueError(
                "data must be a non-empty 2-D array of shape (n_samples, n_features)"
            )

        # Initialize centroids and labels
        self.shape = data.shape
        self.labels = None  # Discard labels from any earlier fit
        self.initialize_centroids(data)

        # Loop until tolerance is met, or until reaching max iterations
        for _ in range(self.max_iter):
            # Assign each point to a cluster
            self.assign_clusters(data)

            # Re-calculate the centroids
            self.old_centroids = np.copy(self.centroids)
            self.calculate_centroids(data)

            # Converged once no centroid coordinate moved more than tol
            if np.all(np.abs(self.centroids - self.old_centroids) <= self.tol):
                break

        # With max_iter < 1 the loop never ran; still label every point
        if self.labels is None:
            self.assign_clusters(data)

        # Return the final labels
        return self.labels

## 3. Performing clustering and analyzing the results

In [17]:
# Select the data to be used: every column except the "country" label,
# converted to a numpy array
data = df_preprocessed.drop("country", axis="columns").to_numpy()

# Instantiate model object with hyperparameters
kmeans = KMeans(n_clusters=4, max_iter=1000, tol=0.0001)

# Do fit and predict cluster labels (one label per row of `data`)
result = kmeans.fit_predict(data)
# Last expression of the cell: displays the label array
result

array([2, 0, 0, 3, 3, 1, 0, 0, 2, 0, 1, 3, 0, 3, 1, 3, 2, 0, 0, 3, 1, 2,
       0, 0, 0, 2, 1, 1, 2, 2, 2, 3, 1, 2, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       2, 0, 2, 0, 0, 2, 1, 2, 1, 0, 2, 0, 2, 3, 1, 0, 3, 3, 0, 2, 3, 1,
       0, 1, 2, 1, 1, 3, 3, 0, 1, 1, 0, 2], dtype=int64)

In [18]:
# Show all rows when a dataframe is displayed (notebook-wide setting)
pd.set_option("display.max_rows", None)

# Create output dataframe: one row per country with its cluster label,
# sorted alphabetically by country
df_output = pd.DataFrame(
    {"country": df_preprocessed["country"].to_numpy(), "cluster": result}
).sort_values("country")

# List with all countries per cluster. The number of clusters is read from
# the fitted model so this stays correct if n_clusters is changed above.
for label in range(kmeans.n_clusters):
    country_list = df_output.loc[df_output["cluster"] == label, "country"].tolist()
    print(f"{label}: {country_list}\n")

0: ['Albania', 'Algeria', 'Azerbaijan', 'Bahrain', 'Belarus', 'Brazil', 'Colombia', 'Costa Rica', 'Dominican Republic', 'Ecuador', 'Egypt', 'Iran', 'Iraq', 'Kazakhstan', 'Kuwait', 'Malaysia', 'Mexico', 'Morocco', 'Oman', 'Panama', 'Qatar', 'Saudi Arabia', 'South Africa', 'United Arab Emirates', 'Uzbekistan']

1: ['Austria', 'Belgium', 'Canada', 'Denmark', 'France', 'Germany', 'Iceland', 'Ireland', 'Israel', 'Italy', 'Japan', 'Luxembourg', 'Netherlands', 'Norway', 'Portugal', 'Singapore', 'Spain', 'Sweden', 'Switzerland', 'United Kingdom', 'United States of America']

2: ['Afghanistan', 'Bangladesh', 'China', 'Djibouti', 'Ethiopia', 'Ghana', 'Guatemala', 'Honduras', 'India', 'Indonesia', 'Madagascar', 'Mauritania', 'Nepal', 'Nigeria', 'Pakistan', 'Philippines', 'Senegal', 'Sudan', 'Zambia']

3: ['Argentina', 'Armenia', 'Bosnia and Herzegovina', 'Bulgaria', 'Chile', 'Croatia', 'Hungary', 'Poland', 'Romania', 'Russia', 'Serbia', 'Turkey', 'Ukraine']



### 3.1 Benchmarking against sklearn

In [19]:
# Import sklearn's implementation under an alias so that it does not
# shadow the KMeans class defined in this notebook
from sklearn.cluster import KMeans as SKLKMeans

# Same n_clusters, max_iter and tol as the custom model above, with random
# initialization. No random_state is set, so labels can differ between runs.
kmeans_comp = SKLKMeans(
    n_clusters=4, init="random", n_init="auto", max_iter=1000, tol=0.0001
)
res_comp = kmeans_comp.fit_predict(data)
# Last expression of the cell: displays the label array
res_comp

array([1, 3, 3, 3, 3, 0, 1, 3, 1, 3, 0, 3, 3, 3, 0, 3, 2, 3, 3, 3, 0, 1,
       1, 3, 1, 1, 0, 0, 1, 1, 1, 3, 0, 2, 1, 3, 1, 0, 0, 0, 0, 3, 3, 0,
       1, 3, 1, 3, 1, 1, 0, 1, 0, 1, 1, 3, 1, 3, 0, 3, 3, 3, 3, 1, 3, 0,
       1, 0, 1, 0, 0, 3, 3, 3, 0, 0, 1, 1])

In [20]:
# Create output dataframe: one row per country with its sklearn cluster
# label, sorted alphabetically by country
df_comp = pd.DataFrame(
    {"country": df_preprocessed["country"].to_numpy(), "cluster": res_comp}
).sort_values("country")

# List with all countries per cluster. The number of clusters is read from
# the sklearn model so this stays correct if n_clusters is changed above.
for label in range(kmeans_comp.n_clusters):
    country_list = df_comp.loc[df_comp["cluster"] == label, "country"].tolist()
    print(f"{label}: {country_list}\n")

0: ['Austria', 'Belgium', 'Canada', 'Denmark', 'France', 'Germany', 'Iceland', 'Ireland', 'Israel', 'Italy', 'Japan', 'Luxembourg', 'Netherlands', 'Norway', 'Portugal', 'Singapore', 'Spain', 'Sweden', 'Switzerland', 'United Kingdom', 'United States of America']

1: ['Afghanistan', 'Azerbaijan', 'Bangladesh', 'Djibouti', 'Dominican Republic', 'Egypt', 'Ethiopia', 'Ghana', 'Guatemala', 'Honduras', 'Indonesia', 'Iraq', 'Madagascar', 'Mauritania', 'Morocco', 'Nepal', 'Nigeria', 'Oman', 'Pakistan', 'Philippines', 'Senegal', 'South Africa', 'Sudan', 'Uzbekistan', 'Zambia']

2: ['China', 'India']

3: ['Albania', 'Algeria', 'Argentina', 'Armenia', 'Bahrain', 'Belarus', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Ecuador', 'Hungary', 'Iran', 'Kazakhstan', 'Kuwait', 'Malaysia', 'Mexico', 'Panama', 'Poland', 'Qatar', 'Romania', 'Russia', 'Saudi Arabia', 'Serbia', 'Turkey', 'Ukraine', 'United Arab Emirates']

