In [None]:
import pandas as pd
import numpy as np
from functools import reduce
from IPython.display import display

from sklearn.pipeline import Pipeline
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Options shared by all three readers. na_filter=False turns off missing-value
# detection, so empty fields are loaded as empty strings rather than NaN.
csv_options = {'sep': ',', 'na_filter': False}

vaccinationsDF = pd.read_csv('dataFiles/vaccinations.csv', header=0, **csv_options)
casesDF = pd.read_csv('dataFiles/covidCases.csv', header=0, **csv_options)
# header=1: the second line of this file holds the column names.
iso_codesDF = pd.read_csv('dataFiles/iso_codes.csv', header=1, **csv_options)

In [None]:
# Give the three sources common column names so they can be joined on
# 'date', 'iso2Code' and 'iso3Code'.
casesDF = casesDF.rename(
    columns={'Date_reported': 'date', 'Country': 'location', 'Country_code': 'iso2Code'}
)

vaccinationsDF = vaccinationsDF.rename(columns={'iso_code': 'iso3Code'})
# Keep only rows whose code is exactly three characters long
# (presumably this removes aggregate / non-country entries - verify against the data).
# The result of reset_index must be assigned; calling it without assignment is a no-op.
vaccinationsDF = vaccinationsDF[vaccinationsDF['iso3Code'].str.len() == 3].reset_index(drop=True)

# Drop the columns at positions 1, 2, 5, 6 and 7, then rename what is left.
iso_codesDF = iso_codesDF.drop(columns=iso_codesDF.columns[[1, 2, 5, 6, 7]])
iso_codesDF = iso_codesDF.rename(
    columns={'Country name': 'location', 'Alpha-2 code': 'iso2Code', 'Alpha-3 code': 'iso3Code'}
)
iso_codesDF = iso_codesDF.astype(str)
# Keep only rows with a well-formed 2-letter and 3-letter code.
has_valid_codes = (iso_codesDF['iso2Code'].str.len() == 2) & (iso_codesDF['iso3Code'].str.len() == 3)
iso_codesDF = iso_codesDF[has_valid_codes].reset_index(drop=True)

In [None]:
# Inner join: only vaccination rows whose 3-letter code appears in the ISO table survive.
tmp_df = vaccinationsDF.merge(iso_codesDF, on=['iso3Code'], how='inner')
# Outer join with the case counts: rows without a partner on either side are kept.
df = tmp_df.merge(casesDF, on=['iso2Code', 'date'], how='outer')

In [None]:
# Parse the date strings, then replace each timestamp by its proleptic Gregorian
# ordinal so the column becomes a plain integer feature.
# NOTE(review): the format expects dot-separated dates (YYYY.MM.DD) - confirm both CSVs use it.
parsed_dates = pd.to_datetime(df['date'], format='%Y.%m.%d')
df['date'] = parsed_dates.apply(lambda stamp: stamp.toordinal())

In [None]:
# Columns not used downstream: duplicated location names, identifiers that are no
# longer needed after the joins, and the per-capita / raw vaccination variants.
# (The original list named 'total_vaccinations_per_hundred' twice.)
columns_to_drop = [
    'location_x',
    'location_y',
    'WHO_region',
    'iso3Code',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'total_boosters_per_hundred',
    'daily_vaccinations_per_million',
    'daily_people_vaccinated_per_hundred',
    'daily_vaccinations_raw',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Discard rows with no 'location' value, then remove the column itself and
# renumber the rows from zero.
df = (
    df
    .dropna(subset=['location'])
    .drop(['location'], axis=1)
    .reset_index(drop=True)
)

In [None]:
# Cells that are empty or contain only whitespace become NaN.
df = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [None]:
# Cast every column except the country code to a nullable numeric dtype.
# The column list is built with a comprehension; list.remove() mutates the list in
# place and returns None, which is presumably why using its result "did not work".
df_numeric_col = [x for x in df.columns.tolist() if x != 'iso2Code']
# Two-step cast: first to nullable integers, then to nullable floats.
df[df_numeric_col] = df[df_numeric_col].astype('Int64').astype('Float64')

In [None]:
def undummify(df, prefix_sep="_"):
    """Collapse one-hot encoded column groups back into single label columns.

    A column whose name contains ``prefix_sep`` is treated as a dummy column
    named ``<prefix><prefix_sep><label>``. All dummy columns sharing a prefix are
    replaced by one column named ``<prefix>`` whose value, per row, is the label
    of the dummy column holding the row's largest value. Columns without
    ``prefix_sep`` in their name are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names.
    prefix_sep : str, default "_"
        Separator between the prefix and the label in dummy column names.

    Returns
    -------
    pandas.DataFrame
        One column per prefix / pass-through column, in order of first appearance.
    """
    # prefix -> does this prefix denote a dummy group? (last occurrence wins)
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            dummy_prefix = col + prefix_sep
            # Select the group by exact "<prefix><sep>" match. A substring match
            # (filter(like=...)) would also pull in unrelated columns that merely
            # contain the prefix, corrupting idxmax and breaking the label split.
            dummy_cols = [name for name in df.columns if name.startswith(dummy_prefix)]
            undummified = (
                df[dummy_cols]
                .idxmax(axis=1)
                .apply(lambda name: name[len(dummy_prefix):])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [None]:
# Restrict the analysis to these ISO alpha-2 country codes.
selected_countries = ['PL', 'DE', 'NO', 'BE', 'GB', 'UA']
is_selected = df['iso2Code'].isin(selected_countries)
df_trimmed = df.loc[is_selected]

In [None]:
# One-hot encode the non-numeric columns, then mark missing entries with -1,
# the sentinel the imputer is configured to look for.
# NOTE(review): a genuine -1 in the data would also be treated as missing - confirm none occur.
df_dummies = pd.get_dummies(df_trimmed).fillna(-1)

Imputation of missing values using machine learning (MissForest)

In [None]:
# The pipeline works on a bare array, so keep the column labels aside to
# rebuild a DataFrame afterwards.
df_array = df_dummies.to_numpy()
df_dummies_columns = df_dummies.columns.to_list()

In [None]:
# Step 1: fill entries equal to the -1 sentinel with MissForest.
imputer = MissForest(
    random_state=0,
    missing_values=-1,
    max_features='sqrt',
    criterion='squared_error',
    class_weight='balanced_subsample',
)
# Step 2: standardise every column of the imputed array.
pipeline = Pipeline(steps=[('imputer', imputer), ('std_scaler', StandardScaler())])

In [None]:
# Fit the imputer and the scaler on the array and return the transformed copy.
df_array_transformed = pipeline.fit_transform(df_array)

In [None]:
# Inspect the output of the imputation + scaling pipeline.
display(df_array_transformed)

In [None]:
# RF_transformator = MissForest(random_state=0, missing_values=-1, max_features='sqrt', criterion='squared_error', class_weight='balanced_subsample')
# df_array_transformed = RF_transformator.fit_transform(df_array)

In [None]:
# Re-attach the column labels saved before the array conversion.
df_transformed = pd.DataFrame(df_array_transformed, columns=df_dummies_columns)

In [None]:
# Splitting on 'Code_' turns the 'iso2Code_<XX>' dummies into one 'iso2' column,
# which is then renamed back to 'iso2Code'.
df_transformed = (
    undummify(df_transformed, prefix_sep='Code_')
    .rename(columns={'iso2': 'iso2Code'})
)

In [None]:
# Preview the first 15 rows after collapsing the country dummies.
display(df_transformed.head(15))

In [None]:
# df_transformed was built from df_trimmed (selected countries only) and carries a
# fresh 0..n-1 index. The counts therefore have to come from df_trimmed and be
# assigned by position; taking them from the full `df` aligns on index labels and
# pairs each row with an unrelated record.
# The raw counts are used because the columns in df_transformed are standardised.
cumulative_deaths = df_trimmed['Cumulative_deaths'].to_numpy(dtype=float, na_value=np.nan)
cumulative_cases = df_trimmed['Cumulative_cases'].to_numpy(dtype=float, na_value=np.nan)
# Rows with a missing or zero case count get NaN instead of inf.
mortality_ratio = np.divide(
    cumulative_deaths,
    cumulative_cases,
    out=np.full_like(cumulative_deaths, np.nan),
    where=cumulative_cases > 0,
)
# Deaths per 100 confirmed cases.
df_transformed['mortality_rate'] = mortality_ratio * 100

In [None]:
# Summary statistics of the target column.
display(df_transformed['mortality_rate'].describe())

In [None]:
# Features: every column except the target. The country code is the only
# non-numeric column; it is one-hot encoded as floats because the random forest
# cannot fit on string values.
# NOTE(review): the features still contain the (scaled) cumulative case and death
# counts from which the target is derived - check for target leakage.
features = pd.get_dummies(df_transformed.drop(['mortality_rate'], axis=1), dtype=float)
X = np.array(features)
# Target: mortality rate, as a plain float array with NaN for missing values.
y = df_transformed['mortality_rate'].to_numpy(dtype=float, na_value=np.nan)
# A fixed seed makes the 75/25 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# Train on the split created above; the previous mpg_* names were never defined
# in this notebook and raised NameError.
forest_reg = RandomForestRegressor(n_estimators=50, random_state=0)
forest_reg.fit(X_train, y_train)
y_test_predicted = forest_reg.predict(X_test)
# Root mean squared error on the held-out data. mean_squared_error expects
# (y_true, y_pred) and returns the MSE by default, so the square root is taken here.
np.sqrt(mean_squared_error(y_test, y_test_predicted))

In [None]:
# Re-evaluate the fitted forest on the held-out split (RMSE). Uses the X_test / y_test
# arrays from the split cell instead of the undefined mpg_* names.
y_test_predicted = forest_reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_test_predicted))

In [None]:
"""linear.fit(x_train, y_train)
        acc = linear.score(x_test, y_test)
        print(acc)"""

In [None]:
# df.to_csv('merged.csv', header=True, sep=';')

In [None]:
# df_transformed = pd.Dataframe(df_array_transformed).transpose()
# df_transformed.columns = df_dummies_columns

In [None]:
"""

# display(casesDF['location'].where(casesDF['iso2Code'].isnull()).unique())
# display(casesDF['iso2Code'].where(casesDF['location'].isnull()).unique())
# display(df.loc[df['iso2Code'] == 'XA'])
# display(df.isnull().sum())
# display(df.count())
# display(df[df['iso3Code'].isna()])
display(df.date)

display(len(df.location_x.unique()))
display(len(df.location_y.unique()))
display(len(df.location.unique()))
# display(casesDF.tail(5))
# display(vaccinationsDF.tail(5))

# display(vaccinationsDF)
display(iso_codesDF.location.unique().size)
display(len(vaccinationsDF.iso3Code.unique()))
# display(df['location'].where(['iso2Code'].isnull()))
display(df.loc[df['location_x'].isnull()])
display(casesDF.count())


def common_member(a, b):
    result = [i for i in a if i not in b]
    return len(result)

# display(len(common_member(casesDF.location.unique(), vaccinationsDF.location.unique())))

# display(common_member(casesDF.iso2Code.unique(), iso_codesDF.iso2Code.unique()))
# display(common_member(vaccinationsDF.iso3Code.unique(), iso_codesDF.iso3Code.unique()))
# display(casesDF.iso2Code.unique())
# display(iso_codesDF.loc[iso_codesDF['location'] == 'Namibia'])
display(common_member(df.location.unique(), df.location_x.unique()))
"""