In [15]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")
from sklearn.tree import DecisionTreeRegressor

In [3]:
def pre_series(series, countries):
    """Aggregate a wide per-region table into one date-indexed column per country.

    Rows of ``series`` are summed per ``'Country/Region'``, the ``'Lat'`` and
    ``'Long'`` columns are discarded, and the result is transposed so that
    every remaining original column label becomes a row (parsed with
    ``pd.to_datetime``) and every requested country becomes a column.

    Parameters
    ----------
    series : pandas.DataFrame
        Table containing a ``'Country/Region'`` column, ``'Lat'`` and
        ``'Long'`` columns, and numeric columns whose labels are date strings.
    countries : list
        Country names to keep, in the desired column order. Each name must
        occur in ``series['Country/Region']``; otherwise ``.loc`` raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by the parsed dates, one column per entry of
        ``countries``, with missing values replaced by 0.
    """
    # numeric_only=True keeps text columns (e.g. a province/state label) out
    # of the aggregation. Older pandas dropped them silently; newer pandas
    # would try to sum them, leaving a non-date label in the transposed index
    # that pd.to_datetime cannot parse.
    per_country = series.groupby('Country/Region').sum(numeric_only=True)
    # .loc already restricts and orders the rows, so no second selection is
    # needed after transposing.
    process_series = per_country.drop(['Lat', 'Long'], axis=1).loc[countries, :].T.fillna(0)
    process_series.index = pd.to_datetime(process_series.index)
    return process_series

In [4]:
# COVID-19 global time series, one CSV per kind: 'confirmed', 'deaths', 'recovered'
covid_prefix = 'raw_data/COVID/time_series_covid19_'
covid_suffix = '_global.csv'
confirmed, deaths, recovered = (
    pd.read_csv(covid_prefix + kind + covid_suffix)
    for kind in ('confirmed', 'deaths', 'recovered')
)

# Policy table, and the indicator table restricted to a single year column.
policies = pd.read_excel('raw_data/policies_all_countries.xlsx')
indicator_year = 2019
indicator_columns = ['Country', 'Indicator', 'Unit', indicator_year]
indicators = pd.read_excel('raw_data/indicators_all_countries.xlsx')[indicator_columns]

In [5]:
# Keep only the countries present in all three sources
# (case counts, policies, indicators), in alphabetical order.
countries = sorted(
    set(confirmed['Country/Region'])
    & set(policies.entity)
    & set(indicators['Country'])
)
len(countries)

163

In [6]:
# Replace each raw table by its per-country, date-indexed form.
# NOTE: the raw frames are overwritten, so this cell must run exactly once
# after loading; the processed frames no longer have a 'Country/Region'
# column for pre_series to group on.
confirmed, deaths, recovered = (
    pre_series(raw_table, countries)
    for raw_table in (confirmed, deaths, recovered)
)

In [7]:
# Mortality rate and recovery rate: element-wise ratio to the confirmed counts.
# Dates on which every country is NaN are dropped, then the table is reshaped
# to long form with one row per (date, country) and the rate in column 'y'.
rate_columns = ['date', 'country', 'y']

mortality = deaths.div(confirmed).dropna(how='all').stack().reset_index()
mortality.columns = rate_columns

recovery = recovered.div(confirmed).dropna(how='all').stack().reset_index()
recovery.columns = rate_columns

In [23]:
def policy_indicators(policies, rates, top_n=10, random_state=0):
    """Rank country indicators by importance for predicting a rate, per policy.

    For every policy column, a ``DecisionTreeRegressor`` is fitted on the
    country indicators plus that policy's value, with ``rates['y']`` as the
    target. The indicators are then ranked by scikit-learn's impurity-based
    feature importance (also referred to as Gini importance); the policy's own
    column is removed from the ranking before the top entries are taken.

    Besides its arguments, this function reads the module-level names
    ``indicators``, ``indicator_year`` and ``countries``.

    Parameters
    ----------
    policies : pandas.DataFrame
        Table with ``'entity'`` and ``'date'`` columns (and optionally
        ``'iso'``); every other column is treated as a policy.
    rates : pandas.DataFrame
        Long-form table with columns ``'date'``, ``'country'`` and ``'y'``.
        It is not modified.
    top_n : int or None, default 10
        Number of indicators kept per policy; ``None`` keeps all of them.
    random_state : int or None, default 0
        Seed passed to ``DecisionTreeRegressor`` so that repeated runs give
        the same ranking. Pass ``None`` for an unseeded tree.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Policy'``, ``'Indicator'`` and ``'importance'``. Each
        ``'Indicator'`` entry is an ``(Indicator, Unit)`` pair. The frame is
        empty when ``policies`` has no policy columns.
    """
    policy_names = [col for col in policies.columns if col not in ('entity', 'iso', 'date')]

    # Attach every policy value to the rate rows. `rates` is the caller's
    # table (previously it was replaced by the global `mortality`); merge
    # returns new frames, so the caller's object is left untouched. The inner
    # join plus dropna keeps only (date, country) rows that have a value for
    # every policy.
    for policy_name in policy_names:
        policy_values = (
            policies[['entity', 'date', policy_name]]
            .pivot_table(index=['date', 'entity'], values=policy_name)
            .reset_index()
        )
        policy_values.columns = ['date', 'country', policy_name]
        policy_values['date'] = pd.to_datetime(policy_values['date'])
        rates = rates.merge(policy_values, on=['date', 'country'], how='inner').dropna()

    # One row per country, one column per (Indicator, Unit) pair.
    indicators_values = (
        indicators
        .pivot_table(index=['Indicator', 'Unit'], columns='Country', values=indicator_year)[countries]
        .fillna(0)
        .T
    )
    # Flatten the two-level columns to plain tuples: merging a two-level
    # column index with single-level columns is deprecated in pandas 1.x and
    # rejected by newer releases.
    indicators_values.columns = indicators_values.columns.to_flat_index()

    per_policy = []
    for policy_name in policy_names:
        train = indicators_values.merge(
            rates[['date', 'country', 'y', policy_name]],
            left_index=True,
            right_on='country',
        )
        feature_names = [col for col in train.columns if col not in ('date', 'country', 'y')]
        # Fit on plain arrays so the mixed tuple/str column labels never reach
        # scikit-learn's feature-name validation.
        model = DecisionTreeRegressor(random_state=random_state).fit(
            train[feature_names].to_numpy(), train['y'].to_numpy()
        )
        importance = pd.DataFrame(
            {'Indicator': feature_names, 'importance': model.feature_importances_}
        ).sort_values('importance', ascending=False)
        # The policy itself is a model input but not an indicator to report.
        importance = importance[importance['Indicator'] != policy_name]
        ranked = importance.iloc[:top_n].assign(Policy=policy_name)
        per_policy.append(ranked[['Policy', 'Indicator', 'importance']])

    if not per_policy:
        return pd.DataFrame(columns=['Policy', 'Indicator', 'importance'])
    return pd.concat(per_policy)

In [24]:
# Build the policy/indicator importance table from the recovery-rate input
# and save it as CSV without the index column.
policy_indicator_table = policy_indicators(policies, recovery)
policy_indicator_table.to_csv('output/step_two/policy_indicator.csv', index=False)