In [74]:
import pandas as pd
import fbprophet as ph

In this section, we calculate the response variable used in the analysis. The response variable is the difference between the projected EDI (assuming COVID had never happened) and the actual EDI (observed in the presence of COVID); in the code below it is computed as projected EDI minus actual EDI.

The actual EDI is imported as 'cal_edi'. This data offers daily EDI values from 2020-02-01 to 2020-07-01. The projected EDI will be computed using historical values (from 2010-01-01 onward) from the EDI file imported as 'edis'. This data offers monthly EDI values from 2008-01-01 to 2020-07-01.

Note that 'cal_edi' was previously interpolated from 'edis', and only the data between 2020-02-01 and 2020-07-01 were kept; this was done in the 'Response Processing.ipynb' file. Additionally, we will use the interval from 2010-01-01 to 2020-02-14 of 'edis' to compute the projected EDI (2020-02-15 to 2020-07-01).

In [75]:
# load the actual EDI series (daily values, already interpolated in an earlier notebook)
interpolated_edi_path = './interpolated_edi.csv'
cal_edi = pd.read_csv(interpolated_edi_path)

In [76]:
# remove the leftover index column written by an earlier to_csv call;
# a non-mutating drop with errors='ignore' lets this cell be re-run without a KeyError
cal_edi = cal_edi.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [77]:
# preview the actual EDI data (one row per country and day)
cal_edi

Unnamed: 0,country,EDI,index
0,Brazil,4.678294,2020-02-01
1,Brazil,4.653117,2020-02-02
2,Brazil,4.627939,2020-02-03
3,Brazil,4.602762,2020-02-04
4,Brazil,4.577584,2020-02-05
...,...,...,...
1363,Untied States,10.175226,2020-06-27
1364,Untied States,10.178105,2020-06-28
1365,Untied States,10.180984,2020-06-29
1366,Untied States,10.183863,2020-06-30


In [78]:
# load the historical EDI series that the projection is fitted on
edi_indicators_path = './EDI_indicators.csv'
edis = pd.read_csv(edi_indicators_path)

In [79]:
# harmonise the country labels of 'edis' with the labels used in 'cal_edi'
# NOTE: the 'Untied States' spelling is kept on purpose - it is the label that
# appears in 'cal_edi', and the two frames are later matched on country name
country_renames = {'UnitedKingdom':'United Kingdom',
                   'UnitedStates':'Untied States',
                   'SouthAfrica':'South Africa'}
edis.replace(country_renames, inplace = True)
# exclude Hong Kong from the analysis
is_not_hong_kong = edis['country'] != 'HongKong'
edis = edis.loc[is_not_hong_kong, ]

In [80]:
# preview the historical EDI data after the country names were harmonised
edis

Unnamed: 0,country,index,EDI
0,Brazil,2008-01-01,2.375691
1,Brazil,2008-02-01,2.556433
2,Brazil,2008-03-01,2.623345
3,Brazil,2008-04-01,2.859230
4,Brazil,2008-05-01,3.352902
...,...,...,...
1505,Untied States,2020-03-01,10.195001
1506,Untied States,2020-04-01,8.744228
1507,Untied States,2020-05-01,9.243445
1508,Untied States,2020-06-01,10.100372


Three time ranges will be used:

* timeindex_2010_2020: used on 'edis' historical EDI data to project data for the prediction_interval.

* timeindex_0215_0701: used to subset 'cal_edi', which originally starts at 2020-02-01.

* prediction_interval: the number of daily periods to forecast beyond the end of timeindex_2010_2020 (a count of days, not a date index).

In [None]:
# define time index of different ranges (daily frequency, the pd.date_range default)
# training window for the projection: 2010-01-01 to 2020-02-14
timeindex_2010_2020 = pd.date_range(start = '2010-01-01', end = '2020-02-14')
# evaluation window with actual EDI: 2020-02-15 to 2020-07-01
timeindex_0215_0701 = pd.date_range(start = '2020-02-15', end = '2020-07-01')
# number of daily steps to forecast after the training window.
# The forecast starts the day AFTER the last training day (2020-02-14), so the
# count must cover 2020-02-15..2020-07-01; counting from 2020-02-14 would be one
# step too many and would project through 2020-07-02.
prediction_interval = len(timeindex_0215_0701)

In [82]:
# check the training window: daily dates from 2010-01-01 to 2020-02-14
timeindex_2010_2020

DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
               '2010-01-05', '2010-01-06', '2010-01-07', '2010-01-08',
               '2010-01-09', '2010-01-10',
               ...
               '2020-02-05', '2020-02-06', '2020-02-07', '2020-02-08',
               '2020-02-09', '2020-02-10', '2020-02-11', '2020-02-12',
               '2020-02-13', '2020-02-14'],
              dtype='datetime64[ns]', length=3697, freq='D')

In [83]:
def forecast_economic_indicator(df, f_interval, freq = 'D'):
    '''
    Fit a Prophet model on a two-column time series and forecast it forward.

    The dataframe passed in by the caller is not modified: the renaming to the
    'ds' / 'y' column names that Prophet expects is done on a copy.

    input: df - a dataframe with exactly two columns, in this order:
                the date and the economic indicator value
           f_interval - prediction interval (number of periods to forecast
                        after the last date in df)
           freq - prediction frequency; default daily forecast
    output: df - copy of the original dataframe with columns renamed to
                 ['ds', 'y'] and 'ds' converted to datetime
            hist_predict - prediction on historical data in a dataframe
                           (columns 'ds', 'yhat')
            future_predict - forecast on prediction interval in a dataframe
                             (columns 'ds', 'yhat'); empty when f_interval is 0
            df_names - column names of original dataframe
    '''
    # remember the caller's column names before renaming
    df_names = df.columns
    # work on a copy so that the caller's dataframe keeps its own column names
    df = df.copy()
    # Prophet requires the columns to be called 'ds' (date) and 'y' (value)
    df.columns = ['ds', 'y']
    df['ds'] = pd.to_datetime(df['ds'])

    # fit a Prophet model with default settings
    m = ph.Prophet()
    m.fit(df)
    # history dates followed by f_interval future dates
    future = m.make_future_dataframe(periods = f_interval, freq = freq)
    forecast = m.predict(future)[['ds', 'yhat']]

    # split on the number of historical rows instead of slicing with
    # -f_interval: negative-zero slicing ([:-0] / [-0:]) would swap the two
    # parts when f_interval == 0
    n_hist = len(forecast) - f_interval
    hist_predict = forecast.iloc[:n_hist]
    future_predict = forecast.iloc[n_hist:]

    return df, hist_predict, future_predict, df_names

In [84]:
# empty container that will hold, per country and day, the actual EDI ('EDI')
# next to the projected EDI ('yhat')
edi_diff_columns = ['index', 'EDI', 'yhat', 'country']
edi_diff = pd.DataFrame(columns = edi_diff_columns)

In [85]:
# Build the table of actual vs. projected EDI, one country at a time.
# The per-country frames are collected in a list and concatenated once at the
# end: this avoids DataFrame.append (deprecated, removed in pandas 2.0), avoids
# re-copying the growing frame on every iteration, and makes the cell safe to
# re-run (edi_diff is rebuilt from scratch instead of being appended to again).
country_frames = []
for country in edis.country.unique():
    # --- history used to fit the projection ---
    # keep only the date and EDI columns so interpolation runs on numeric data;
    # .copy() so the following assignment does not write into a slice of 'edis'
    temp_edi = edis.loc[edis['country'] == country, ['index', 'EDI']].copy()
    temp_edi['index'] = pd.to_datetime(temp_edi['index'])
    # put the history on the daily grid 2010-01-01 .. 2020-02-14;
    # days without an observation become NaN
    temp_edi = temp_edi.set_index('index').reindex(timeindex_2010_2020)
    # perform interpolation and forward fill missing values
    temp_edi = temp_edi.interpolate().ffill()
    temp_edi = temp_edi.reset_index()

    # --- actual EDI over the evaluation window ---
    temp_cal_edi = cal_edi.loc[cal_edi['country'] == country, ['index', 'EDI']].copy()
    temp_cal_edi['index'] = pd.to_datetime(temp_cal_edi['index'])
    # subset data to 2020-02-15 to 2020-07-01
    temp_cal_edi = temp_cal_edi.set_index('index').reindex(timeindex_0215_0701)
    temp_cal_edi = temp_cal_edi.reset_index()

    # compute projected EDI
    _, _, predicted_edi, _ = forecast_economic_indicator(temp_edi, prediction_interval)
    # left merge: keep every day of the evaluation window and attach the
    # projection made for that day
    combined = temp_cal_edi.merge(predicted_edi, left_on = 'index', right_on = 'ds', how = 'left')
    # add country name back to dataframe
    combined['country'] = country
    country_frames.append(combined)

# combine all countries; the row index is intentionally not reset (it restarts
# at 0 for every country), and the column order matches the previous layout
edi_diff_columns = ['index', 'EDI', 'yhat', 'country', 'ds']
if country_frames:
    edi_diff = pd.concat(country_frames)[edi_diff_columns]
else:
    # no countries to process: keep an empty frame with the expected columns
    edi_diff = pd.DataFrame(columns = edi_diff_columns)

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


In [86]:
# response = projected EDI ('yhat') minus actual EDI ('EDI')
projected_edi = edi_diff['yhat']
actual_edi = edi_diff['EDI']
edi_diff['response'] = projected_edi - actual_edi

In [87]:
# preview the combined table (actual EDI, projected EDI and response)
edi_diff

Unnamed: 0,index,EDI,yhat,country,ds,response
0,2020-02-15,4.325809,4.909036,Brazil,2020-02-15,0.583227
1,2020-02-16,4.300631,4.912205,Brazil,2020-02-16,0.611573
2,2020-02-17,4.275454,4.915322,Brazil,2020-02-17,0.639868
3,2020-02-18,4.250276,4.918377,Brazil,2020-02-18,0.668101
4,2020-02-19,4.225099,4.921313,Brazil,2020-02-19,0.696215
...,...,...,...,...,...,...
133,2020-06-27,10.175226,12.320775,Untied States,2020-06-27,2.145549
134,2020-06-28,10.178105,12.324533,Untied States,2020-06-28,2.146428
135,2020-06-29,10.180984,12.327755,Untied States,2020-06-29,2.146771
136,2020-06-30,10.183863,12.330399,Untied States,2020-06-30,2.146536


In [69]:
# drop the helper columns so only date, country and response remain;
# a non-mutating drop with errors='ignore' lets this cell be re-run without a KeyError
edi_diff = edi_diff.drop(columns = ['EDI', 'yhat', 'ds'], errors = 'ignore')

In [71]:
# save the response table to disk (the dataframe index is written as the first column)
response_path = 'response.csv'
edi_diff.to_csv(response_path)