In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor

In [2]:
def pre_series(series, countries):
    """Collapse a wide per-region time series into one column per country.

    Parameters
    ----------
    series : pandas.DataFrame
        Wide table with a 'Country/Region' column, 'Lat' and 'Long' columns,
        and one numeric column per date (column label = date string).
        Non-numeric columns (e.g. a province/state label) are ignored.
    countries : list
        Country labels to keep, in the desired column order. Labels are
        looked up with ``.loc``, so recent pandas versions raise ``KeyError``
        for a label that is absent from ``series``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by date (``DatetimeIndex``), one column per requested
        country, values summed over all rows of that country, NaN -> 0.
    """
    # numeric_only=True keeps the result independent of the pandas version:
    # since pandas 2.0 the default would try to "sum" string columns, leaving
    # a non-date label in the transposed index and breaking to_datetime below.
    per_country = series.groupby('Country/Region').sum(numeric_only=True)
    per_country = per_country.drop(['Lat', 'Long'], axis=1)

    # Select the countries row-wise, then flip so that dates become the index.
    process_series = per_country.loc[countries, :].T.fillna(0)
    process_series.index = pd.to_datetime(process_series.index)
    return process_series

In [3]:
# Load the three global COVID-19 time series (confirmed, deaths, recovered);
# the files differ only in the metric name embedded in the file name.
confirmed, deaths, recovered = [
    pd.read_csv('raw_data/COVID/time_series_covid19_{}_global.csv'.format(metric))
    for metric in ('confirmed', 'deaths', 'recovered')
]

# Policy table; later cells read its 'entity' and 'date' columns.
policies = pd.read_excel('raw_data/policies.xlsx')

In [4]:
# Countries that appear in both the case data and the policy table, sorted by name.
countries = sorted(set(confirmed['Country/Region']) & set(policies.entity))
len(countries)

42

In [5]:
# Reshape each raw table to a date-indexed frame with one column per country.
# NOTE: the raw names are rebound here, so re-running this cell without
# reloading the CSVs will likely fail (the 'Country/Region' column is gone).
confirmed, deaths, recovered = [
    pre_series(raw_frame, countries)
    for raw_frame in (confirmed, deaths, recovered)
]

In [6]:
# Mortality and recovery rates: deaths (resp. recoveries) divided by confirmed
# cases, reshaped to long format with one row per (date, country).
mortality = deaths.div(confirmed).dropna(how='all').stack().reset_index()
mortality.columns = ['date', 'country', 'y']

recovery = recovered.div(confirmed).dropna(how='all').stack().reset_index()
recovery.columns = ['date', 'country', 'y']

In [12]:
def policy_influence(policies, rates, min_samples=50, top_n=10, random_state=0):
    """Rank policies per country by decision-tree feature importance.

    Every column of ``policies`` other than 'entity', 'iso' and 'date' is
    treated as a policy feature. Each feature is averaged per (date, entity),
    inner-joined onto ``rates`` and rows with missing values are dropped.
    For every country with more than ``min_samples`` remaining rows a
    ``DecisionTreeRegressor`` is fitted on the policy features to predict
    ``y``, and the ``top_n`` features by impurity-based importance
    (scikit-learn's "Gini importance") are kept.

    Parameters
    ----------
    policies : pandas.DataFrame
        Must contain 'entity' and 'date' columns plus the policy columns.
    rates : pandas.DataFrame
        Long-format table with columns 'date', 'country' and 'y'.
        The caller's frame is not modified.
    min_samples : int, default 50
        A country is modelled only if it has strictly more rows than this.
    top_n : int, default 10
        Number of highest-ranked policies reported per country.
    random_state : int or None, default 0
        Seed passed to the tree so that rankings are reproducible; pass
        ``None`` for scikit-learn's unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns 'Country', 'Policy', 'importance'. Empty (with those columns)
        when no country has enough rows, so callers can always use ``.to_csv``.
    """
    policy_names = [col for col in policies.columns if col not in ('entity', 'iso', 'date')]

    # Attach one policy column at a time; the inner join plus dropna keeps
    # only (date, country) pairs for which every policy value is known.
    for policy_name in policy_names:
        daily = policies[['entity', 'date', policy_name]].pivot_table(
            index=['date', 'entity'], values=policy_name).reset_index()
        daily.columns = ['date', 'country', policy_name]
        daily['date'] = pd.to_datetime(daily['date'])
        rates = rates.merge(daily, on=['date', 'country'], how='inner').dropna()

    output_columns = ['Country', 'Policy', 'importance']
    rankings = []
    for country, rate in rates.groupby('country'):
        if len(rate) <= min_samples:
            continue  # too few observations for a meaningful fit
        model = DecisionTreeRegressor(random_state=random_state).fit(rate[policy_names], rate['y'])
        importance = pd.DataFrame({
            'Policy': policy_names,
            # Public equivalent of tree_.compute_feature_importances(normalize=True).
            'importance': model.feature_importances_,
        }).sort_values('importance', ascending=False)
        importance['Country'] = country
        rankings.append(importance[output_columns][:top_n])

    if not rankings:
        return pd.DataFrame(columns=output_columns)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(rankings)

In [13]:
# Write the per-country policy rankings for each rate table to its own CSV.
for rate_table, csv_name in ((mortality, 'policy_mortality.csv'),
                             (recovery, 'policy_recovery.csv')):
    policy_influence(policies, rate_table).to_csv(
        'output/step_two/country_policy_influence/' + csv_name, index=False)