In [441]:
import pandas as pd
import numpy as np
from functools import reduce
from IPython.display import display

from sklearn.pipeline import Pipeline

import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# The missingpy library is about to be updated, and it sends warnings not to leave some parameters at the default value
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [442]:
# Data engineering part

In [443]:
# Loading 2 data sources and 1 table used for aggregation by iso codes
# The na_filter parameter is used because Namibia, has the iso2 code "NA" and the function reads this as nan

vaccinationsDF = pd.read_csv('dataFiles/vaccinations.csv', sep=',', header=0, na_filter=False)
casesDF = pd.read_csv('dataFiles/covidCases.csv', sep=',', header=0, na_filter=False)
iso_codesDF = pd.read_csv('dataFiles/iso_codes.csv', sep=',', header=1, na_filter=False)

In [444]:
# Normalize the names of columns linking data
# Filter iso codes by length for error elimination
# Resetting an index after deleting columns

casesDF.rename(columns={'Date_reported':'date', 'Country':'location', 'Country_code': 'iso2Code'}, inplace=True)

vaccinationsDF.rename(columns={'iso_code': 'iso3Code'}, inplace=True)
vaccinationsDF.drop(vaccinationsDF[(vaccinationsDF['iso3Code'].apply(len) != 3)].index, inplace=True)
vaccinationsDF.reset_index(drop=True)

iso_codesDF.drop(iso_codesDF.columns[[1, 2, 5, 6, 7]], axis=1, inplace=True)
iso_codesDF.rename(columns={'Country name':'location', 'Alpha-2 code':'iso2Code', 'Alpha-3 code':'iso3Code'}, inplace=True)
iso_codesDF = iso_codesDF.astype(str)
iso_codesDF.drop(iso_codesDF[(iso_codesDF['iso2Code'].apply(len) != 2) | (iso_codesDF['iso3Code'].apply(len) != 3)].index, inplace=True)
iso_codesDF = iso_codesDF.reset_index(drop=True)

In [445]:
# Merging tables by iso codes.
# The function is designed to merge multiple df's at once, but due to differences between the types of iso codes in the cases and vaccinations, I could not use its full potential.

tmp_df = reduce(lambda left, right: pd.merge(left, right, on=['iso3Code'], how='inner'), [vaccinationsDF, iso_codesDF])
df = reduce(lambda left, right: pd.merge(left, right, on=['iso2Code', 'date'], how='outer'), [tmp_df, casesDF])

In [446]:
# Converting date format to int
df['date'] = pd.to_datetime(df['date'], format='%Y.%m.%d').apply(lambda x: x.toordinal())

In [447]:
# Removal of unnecessary columns after merging
df = df.drop(['location_x', 'location_y', 'WHO_region', 'iso3Code', 'total_vaccinations_per_hundred', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred', 'daily_vaccinations_per_million', 'daily_people_vaccinated_per_hundred', 'daily_vaccinations_raw'], axis=1)

In [448]:
# Removal of rows where the location is blank, this is due to writing the aggregation table in html form from the website and having records long for the entire table reporting on being an exclave or island of a country
# Removal of the location column, from now on the model uses iso 2 code

df = df.dropna(subset=['location'])
df = df.drop(['location'], axis=1)
df = df.reset_index(drop=True)

In [449]:
# replace empty values with nan
df = df.replace(r'^\s*$', np.nan, regex=True)

In [450]:
# Conversion of columns to numeric type with the ability to store nan
# Somehow function "remove()" does not work

df_numeric_col = [x for x in df.columns.tolist() if x != 'iso2Code']
df[df_numeric_col] = df[df_numeric_col].astype('Int64').astype('Float64')

In [451]:
# Function converting df_dummies back to regular df

def undummify(df, prefix_sep="_"):
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            undummified = (
                df.filter(like=col)
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [452]:
# Selection of sample European countries due to the amount of data and limited CPU power of my computer
selected_countries = ['PL', 'DE', 'NO', 'BE', 'GB', 'UA']
df_trimmed = df[df['iso2Code'].isin(selected_countries)]

# For operations on full data, please comment the two lines above and uncomment the one below.

# df_trimmed = df

In [453]:
# Transformation to dummies
df_dummies = pd.get_dummies(df_trimmed)

# Filling nan data as -1, the numerical interval of the data is in the interval of natural numbers. This is due to a problem with the MissForrest class and its method of type conversion
df_dummies = df_dummies.fillna(-1)

In [454]:
# Preprocessing

In [455]:
# Saving dummies columns and transformation df_dummies int an array
df_dummies_columns = df_dummies.columns.tolist()
df_array = df_dummies.to_numpy()

In [456]:
# A Pipeline to supplement -1(as nan) values and standardize the dataset

pipeline = Pipeline([
    ('imputer', MissForest(random_state=0, missing_values=-1, max_features='sqrt', criterion='squared_error', class_weight='balanced_subsample')),
    ('std_scaler', StandardScaler())
])

In [457]:
# Pipeline execution
df_array_transformed = pipeline.fit_transform(df_array)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6


In [458]:
# Save dataframe from transformed matrix using undummify function

df_transformed = pd.DataFrame(data=df_array_transformed, columns=df_dummies_columns)
df_transformed = undummify(df_transformed, prefix_sep='Code_')
df_transformed.rename(columns={'iso2': 'iso2Code'}, inplace=True)

In [459]:
# Calculation of mortality rate based on the amount of total daily mortality and infectiousness
df_transformed['mortality_rate'] = (df.Cumulative_deaths / df.Cumulative_cases) * 100

In [460]:
# Re-conversion to dummy
df_transformed = pd.get_dummies(df_transformed)

In [461]:
# Training of the machine learning model

In [462]:
X = np.array((df_transformed.drop(['mortality_rate'], axis=1)))
y = np.array((df_transformed['mortality_rate']))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [463]:
forest_reg = RandomForestRegressor(n_estimators=50, random_state=0)
forest_reg.fit(X_train, y_train)

In [464]:
# Prediction of mortality rate
predicted_indicator = forest_reg.predict(X_test)

In [465]:
# Error prediction rate
print(f'Error score: {np.sqrt(mean_squared_error(predicted_indicator, y_test, squared=True))}')

Error score: 0.0920323594959131
