In [None]:
import pandas as pd
import numpy as np 
from datetime import datetime

In [None]:
# Mount Google Drive at /content/drive; the read_csv calls in the following
# cells load their files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder that holds every raw dataset. Edit this single constant instead
# of touching each individual path below.
DATA_DIR = "/content/drive/My Drive/CHANGE_THIS_TO_YOUR_DATASET_LOCATION"

# government response data (Oxford Covid-19 Government Response Tracker)
oxford = pd.read_csv(DATA_DIR + "/Oxford Covid-19 Government Response Tracker/OxCGRT_latest.csv")
# political data (file is latin-1 encoded)
pol = pd.read_csv(DATA_DIR + "/political2.csv", encoding='latin-1')
# cultural data (semicolon-separated file)
cul = pd.read_csv(DATA_DIR + "/Hofstede national culture dimensions (2015)/6-dimensions-for-website-2015-08-16.csv", sep = ";")
# health data
health = pd.read_csv(DATA_DIR + "/World development indicators - health systems/2.12_Health_systems.csv")
def convertDateV2(str):
    """Reformat a compact ``YYYYMMDD`` date string as ``YYYY-MM-DD``.

    Raises ``ValueError`` (from ``datetime.strptime``) when the input does not
    match the ``%Y%m%d`` layout.
    """
    # NOTE: the parameter name shadows the built-in ``str`` inside this
    # function; it is kept unchanged so existing callers are unaffected.
    parsed = datetime.strptime(str, "%Y%m%d")
    return parsed.strftime("%Y-%m-%d")
# Rewrite the YYYYMMDD values of the Date column as "YYYY-MM-DD" strings in a
# single vectorized pass (pandas datetime parsing + .dt.strftime) rather than
# calling a Python function once per row.
oxford["Date"] = pd.to_datetime(oxford["Date"].astype("str"), format="%Y%m%d").dt.strftime("%Y-%m-%d").values

In [None]:
from operator import add

# Governance indicator names; np.repeat below pairs each name with six
# consecutive value columns of `pol`.
l = ["VoiceAndAccountability", "PoliticalStabilityNoViolence", "GovernmentEffectiveness", "RegulatoryQuality", "RuleofLaw", "ControlofCorruption"]

# Existing headers of every column except the two identifier columns.
pol_value_columns = pol.drop(["Country", "Code"], axis = 1).columns.values

# New header = indicator name + existing header, position by position.
pol.columns = ["Country", "Code"] + [
    add(indicator, existing)
    for indicator, existing in zip(np.repeat(l, 6), pol_value_columns)
]


In [None]:
# Overwrite the `country` label of selected rows of `cul` before it is joined
# to `health` on country name.
# NOTE(review): rows are addressed by hard-coded index labels, which assumes the
# row order of the cultural CSV never changes — TODO confirm.
country_name_fixes = {
    103: "United States",
    83: "Russian Federation",
    45: "hong kong sar, china",
    50: "iran, islamic rep.",
    42: "united kingdom",
    107: "venezuela, rb",
}
for row_label, fixed_name in country_name_fixes.items():
    cul.at[row_label, "country"] = fixed_name

# Keep only countries present in both the cultural and the health tables.
cul_health = cul.merge(health, how = "inner", left_on = "country", right_on = "Country_Region")

# Attach the political columns to every Oxford row by country code; the
# duplicate key column coming from `pol` is dropped afterwards.
df = oxford.merge(pol, how = "left", left_on = "CountryCode", right_on = "Code").drop("Code", axis = 1)
##### Merges

def count(x,y, string):
    """Compare the distinct values of two Series-like objects.

    Parameters
    ----------
    x, y : objects exposing ``.unique()`` (e.g. ``pandas.Series``)
        The two collections of keys to compare.
    string : str
        Comparison mode:
        ``"common"``  - number of distinct values present in both ``x`` and ``y``;
        ``"diff"``    - prints, then counts, the values present in exactly one of them;
        ``"leftdif"`` - prints, then counts, the values present in ``y`` but not in ``x``.

    Returns
    -------
    int
        The number of distinct values selected by the mode.

    Raises
    ------
    ValueError
        If ``string`` is not one of the three modes above (previously such a
        call fell through and silently returned ``None``).
    """
    x_values, y_values = x.unique(), y.unique()

    if string == "common":
        return len(np.intersect1d(x_values, y_values))

    # Both remaining modes print the offending values and return their count,
    # so compute the set operation once and share the reporting code.
    if string == "diff":
        mismatched = np.setxor1d(x_values, y_values)
    elif string == "leftdif":
        mismatched = np.setdiff1d(y_values, x_values)
    else:
        raise ValueError(
            "unknown mode %r: expected 'common', 'diff' or 'leftdif'" % (string,)
        )
    print(mismatched)
    return len(mismatched)
# Left join: every row of `df` is kept and gains the cultural/health columns of
# the `cul_health` row whose `country` equals its `CountryName`.
df = df.merge(cul_health, how = "left", left_on = "CountryName", right_on = "country")

!pip install pycountry
import pycountry
# gmr['country_region_code'].values
def countrycodeconv(str):
    """Look up a country by its alpha-2 code and return its alpha-3 code.

    Returns the ``alpha_3`` attribute of the matching ``pycountry`` record, or
    ``None`` when the lookup does not produce a record.
    """
    # NOTE: the parameter name shadows the built-in ``str`` inside this
    # function; it is kept unchanged so existing callers are unaffected.
    try:
        return pycountry.countries.get(alpha_2=str).alpha_3
    except (AttributeError, LookupError, TypeError):
        # Best-effort conversion: an unresolved code yields None instead of
        # aborting the whole column conversion. Presumably the lookup either
        # returns None (-> AttributeError on .alpha_3) or raises a
        # LookupError/KeyError depending on the pycountry version — verify.
        # Unlike a bare `except`, this no longer swallows KeyboardInterrupt or
        # SystemExit.
        return None
# NOTE(review): `gmr` is not created in any cell visible above — the cell that
# loads it must be run first (TODO confirm it exists in the notebook).
# Collapse to one row per (country code, date) using the median of the numeric
# columns. numeric_only=True is explicit because recent pandas versions raise
# on non-numeric columns instead of silently dropping them.
gmr = gmr.groupby(by = ["country_region_code","date"]).median(numeric_only=True).reset_index()
# `df` is keyed by CountryCode, so derive a matching key for `gmr` via
# countrycodeconv (None where the code cannot be resolved).
gmr['a3country_code'] = gmr["country_region_code"].apply(countrycodeconv)
df= pd.merge(df,gmr, how = "left", left_on = ["CountryCode","Date"],right_on = ["a3country_code","date"])

In [None]:
# Load the combined CSV that the remaining cells work from.
# NOTE(review): the name `all` shadows Python's built-in all() from here on; it
# is left unchanged because the following cells reference it by this name.
all = pd.read_csv("/content/drive/My Drive/AtlassianDataSci/COMBINEDWITHPOP.csv")

In [None]:
# Policy indicator columns (C*, E* and H* codes).
actions_adopted = all[[
    'C1_School closing',
    'C2_Workplace closing',
    'C3_Cancel public events',
    'C4_Restrictions on gatherings',
    'C5_Close public transport',
    'C6_Stay at home requirements',
    'C7_Restrictions on internal movement',
    'C8_International travel controls',
    'E1_Income support',
    'E2_Debt/contract relief',
    'E3_Fiscal measures',
    'E4_International support',
    'H1_Public information campaigns',
    'H2_Testing policy',
    'H3_Contact tracing',
    'H4_Emergency investment in healthcare',
    'H5_Investment in vaccines',
]]

# *_Flag columns accompanying some of the indicators above.
scopes_of_actions = all[[
    'C1_Flag',
    'C2_Flag',
    'C3_Flag',
    'C4_Flag',
    'C5_Flag',
    'C6_Flag',
    'C7_Flag',
    'E1_Flag',
    'H1_Flag',
]]

# Aggregate index columns.
characteristics = all[[
    'StringencyIndex',
    'GovernmentResponseIndex',
    'ContainmentHealthIndex',
    'EconomicSupportIndex',
]]

# Governance "Estimate" columns, one per indicator.
governance_indicators = all[[
    'VoiceAndAccountabilityEstimate',
    'PoliticalStabilityNoViolenceEstimate.1',
    'GovernmentEffectivenessEstimate.2',
    'RegulatoryQualityEstimate.3',
    'RuleofLawEstimate.4',
    'ControlofCorruptionEstimate.5',
]]

# Cultural dimension columns.
cultural_indicators = all[['pdi', 'idv', 'mas', 'ltowvs', 'ivr', 'uai']]

# Country name and code.
country_identifier = all[['CountryName', 'CountryCode']]

# Population and density; density is also kept as its own one-column frame.
population_related = all[['Population (2020)', 'Density (P/Km²)']]
density_related = all[['Density (P/Km²)']]
# relevant_features = all[['Date','CountryCode','retail_and_recreation_percent_change_from_baseline','grocery_and_pharmacy_percent_change_from_baseline','parks_percent_change_from_baseline','transit_stations_percent_change_from_baseline','workplaces_percent_change_from_baseline','residential_percent_change_from_baseline','pdi','idv','mas','uai','ltowvs','ivr','Health_exp_pct_GDP_2016','Health_exp_public_pct_2016','Health_exp_out_of_pocket_pct_2016','Health_exp_per_capita_USD_2016','per_capita_exp_PPP_2016','External_health_exp_pct_2016','VoiceAndAccountabilityEstimate','PoliticalStabilityNoViolenceEstimate.1','GovernmentEffectivenessEstimate.2','RegulatoryQualityEstimate.3','RuleofLawEstimate.4','ControlofCorruptionEstimate.5']]
# Date kept as a one-column DataFrame (double brackets) rather than a Series.
date = all[['Date']]
# Dependent variable: ConfirmedCases divided element-wise by 'Density (P/Km²)'.
# NOTE(review): the next cell labels this column 'CasesPerPerson', yet the
# divisor here is population density, not 'Population (2020)' — confirm which
# denominator is intended.
dep_var = all['ConfirmedCases'].divide(all['Density (P/Km²)'])

In [None]:
# Column groups placed side by side, in this order, to form the modelling table.
feature_frames = [
    country_identifier,
    date,
    actions_adopted,
    characteristics,
    governance_indicators,
    density_related,
    dep_var,
]
dataset = (
    pd.concat(feature_frames, axis=1)
    # dep_var shows up under the column label 0; give it a readable name
    .rename(columns={0: 'CasesPerPerson'})
    # every row except the last one
    .iloc[:-1, :]
)

# Missing values of the dependent variable are treated as zero.
dataset['CasesPerPerson'] = dataset['CasesPerPerson'].fillna(0)

# Row-to-row change of CasesPerPerson within each country; the first row of a
# country has no predecessor, so its change is set to zero.
per_country_change = dataset.groupby('CountryName')['CasesPerPerson'].diff()
dataset['CasesPerPersonInInterval'] = per_country_change.fillna(0)

# Parse the ISO date strings, then keep rows dated on or before the cut-off.
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%Y-%m-%d')
valid_date = '2020-09-28'
on_or_before_cutoff = dataset['Date'] <= valid_date
dataset = dataset.loc[on_or_before_cutoff]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Restrict the modelling table to the rows of a single country.
peru = dataset.loc[dataset['CountryName'].eq('Peru')]

# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize': (11, 4)})

# Thin line plot of the per-interval change, drawn against the frame's index.
peru['CasesPerPersonInInterval'].plot(linewidth=0.5)

In [None]:
def create_features(df, label=None):
    """
    Creates calendar features from the datetime column ``Date``.

    The feature columns (hour, dayofweek, quarter, month, year, dayofyear,
    dayofmonth, weekofyear) are added to ``df`` in place and also returned as
    a separate frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named ``Date``. It is modified in
        place: the eight feature columns are added or overwritten.
    label : hashable, optional
        Name of the target column. When given, the target is returned too.

    Returns
    -------
    X : pandas.DataFrame
        The eight calendar feature columns.
    (X, y) : tuple
        Returned instead of ``X`` when ``label`` is not None; ``y`` is
        ``df[label]``.
    """
    dates = df['Date'].dt
    df['hour'] = dates.hour
    df['dayofweek'] = dates.dayofweek
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    df['year'] = dates.year
    df['dayofyear'] = dates.dayofyear
    df['dayofmonth'] = dates.day
    # Series.dt.weekofyear no longer exists in current pandas; the ISO week
    # from isocalendar() is its replacement. isocalendar() yields a nullable
    # unsigned dtype, so cast back to a plain numpy dtype (float only when
    # missing dates force NaN) to keep the column usable by the estimators.
    iso_week = dates.isocalendar().week
    df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    # Compare against None so that falsy column labels (e.g. 0) are honoured.
    if label is not None:
        y = df[label]
        return X, y
    return X
# Chronological hold-out: rows dated on or before split_date are used for
# training, later rows for testing. The two no-op expression statements that
# used to sit here (leftover inspection of the Date column and of the mask)
# were removed; they produced no output and assigned nothing.
split_date = '2020-08-01'
# .copy() gives independent frames, so create_features can add its feature
# columns without writing into a slice of `peru`.
peru_train = peru.loc[peru['Date'] <= split_date].copy()
peru_test = peru.loc[peru['Date'] > split_date].copy()
peru_X_train, peru_y_train = create_features(peru_train, label='CasesPerPersonInInterval')
peru_X_test, peru_y_test = create_features(peru_test, label='CasesPerPersonInInterval')

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot
plt.style.use('fivethirtyeight')

# Random forest on the calendar features. random_state is fixed so that the
# reported RMSE is identical on every Restart & Run All.
rf_exp = RandomForestRegressor(n_estimators= 1000, random_state=42)
rf_exp.fit(peru_X_train, peru_y_train)

# Predictions on the held-out rows and on the training rows.
predictions_rf = rf_exp.predict(peru_X_test)
train_predictions_rf = rf_exp.predict(peru_X_train)

# Root-mean-squared error on the held-out rows.
mse_rf = mean_squared_error(peru_y_test, predictions_rf)
rmse_rf = np.sqrt(mse_rf)
print(rmse_rf)

In [None]:
# NOTE(review): this cell repeats imports and the style call that already
# appear in the random-forest cell; it adds nothing new and could be removed.
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
plt.style.use('fivethirtyeight')

## XGBoost


In [None]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
plt.style.use('fivethirtyeight')

# early_stopping_rounds is configured on the estimator: passing it to fit() is
# deprecated since xgboost 1.6 and rejected by later releases.
reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=50)
# NOTE(review): with several eval_set entries, early stopping monitors the last
# one — here the test split — so the test RMSE below is optimistic. Consider
# carving a separate validation split out of the training data.
reg.fit(peru_X_train, peru_y_train,
        eval_set=[(peru_X_train, peru_y_train), (peru_X_test, peru_y_test)],
        verbose=False)

# Held-out predictions and their root-mean-squared error.
preds = reg.predict(peru_X_test)
rmse = np.sqrt(mean_squared_error(peru_y_test, preds))

# In-sample predictions.
pred_train = reg.predict(peru_X_train)

# Rebinds peru_y_train to an (n, 1) numpy array; cells run after this one see
# the array, not the original Series.
peru_y_train = np.array(peru_y_train).reshape((np.array(peru_y_train).shape[0],1))