In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# https://gist.github.com/rogerallen/1583593
# ^ Use this in case we need state code translation

from sklearn.preprocessing import StandardScaler

import math
import matplotlib.pyplot as plt
class utils:
    def __init__(self):
        pass
    
    def split(self, dataframe, test=0.2):
        size = dataframe.shape
        train_size = size[0] - math.floor(size[0] * test)
        return dataframe[:train_size], dataframe[train_size:]
    
class CoreData(object):
    def __init__(self, data_path = None, debug=False):
        if not data_path:
            raise Exception("Input file path!")
        self.data_path = data_path
        self.df = None
        self.states = []
        self.state_df = {}
        self.debug = debug
        
    def load(self, normalize=True):
        '''
        DataProcessor.load()
        Pre-load data state-by-state to a dictionary.
        '''
        # Change this line to modify dropped data series.
        dropped_col = ['ID', 'Province_State', 'Date', 'Incident_Rate', 'Recovered', 'People_Tested', 'People_Hospitalized', 'Mortality_Rate', 'Testing_Rate', 'Hospitalization_Rate' ]
        
        self.df = pd.read_csv(train_data_path)
        self.states = list(np.unique(self.df['Province_State']))
        self.state_df = dict.fromkeys(self.states, None)
        self.mean = dict.fromkeys(self.states, None)
        self.std = dict.fromkeys(self.states, None)
        
        for s in self.states:
            df_filter = self.df['Province_State'] == s
            tmp_state_df = self.df[df_filter]
            self.state_df[s] = tmp_state_df.drop(dropped_col, 1)
            
            # Normalize
            if normalize:
                self.mean[s] = self.state_df[s].mean()
                self.std[s] = self.state_df[s].std()
                self.state_df[s] = (self.state_df[s] - self.mean[s])/self.std[s]
    
    def access(self, state=None):
        if not state:
            raise Exception('Enter state name! i.e. self.access("California")')
        elif state not in self.states:
            raise Exception('Check your spelling of the state.')
        return self.state_df[state]
    
    def access_split(self, state=None, test_portion=0.2):
        df = self.access(state)
        return utils().split(dataframe=df, test=test_portion)


## Round 1 Forecasting

### Loading Round 1 Data 

In [3]:
import math
from pandas.plotting import lag_plot
from datetime import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from pandas.plotting import autocorrelation_plot
import warnings
warnings.filterwarnings('ignore')

In [4]:
data_dir = "data"
train_data_path = os.path.join(data_dir, "train.csv")
test_data_path  = os.path.join(data_dir, "test.csv")
graph_data_path = os.path.join(data_dir, "graph.csv")
dp = CoreData(train_data_path)
dp.load(False)

### Forecasting Confirmed Cases

In [5]:
N_test = 26

result_confirmed = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Confirmed'].values
    testing = test_data['Confirmed'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 1500:
                model = ARIMA(history, order=(8,2,1))
            else:
                model = ARIMA(history, order=(1,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(1,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 0.5:
            yhat[0]=0.5
        if state == 'Hawaii' and yhat[0] > 120:
            yhat[0] = 120
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Confirmed'].values[0])
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
        if i > 20 and 10000 < M[i] < 50000:
            M[i] += M[i] / 50
    print(f"Done: {state}...{M}")
    
    result_confirmed[state] = M

# N_test = 26
# result_confirmed = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Confirmed'].values
#     testing = test_data['Confirmed'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         # These threadhold may need to be replaced in round2
#         # if 200 < yhat[0]-history[-1] < 300 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         # el
#         if 500 < yhat[0]-history[-1] < 700 and history[-1]>6000:
#             yhat[0] = history[-1]+700
#         elif yhat[0]-history[-1] > 3000:
#             yhat[0] = history[-1]+3000
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_confirmed[state] = M

6609, 76683.88306784809, 77394.78260660151, 78108.6527166346, 78825.49329331382, 79545.30417130476, 80268.08577358155, 80993.83807693988, 81722.5605195465, 82454.25349699635, 83188.91700239675]
Processing: Kansas...
292.5886524822695
Done: Kansas...[43035.1008292906, 43673.35350426799, 44217.7176930893, 44813.28773390899, 45388.9450556241, 45979.54994324108, 46568.01354115854, 47162.71261411926, 47759.54038229899, 48360.51019495667, 48964.63450578612, 49572.39786344899, 50183.5620577421, 50798.244070033536, 51416.386498353095, 52038.017245530435, 52663.12300268771, 53291.70976573465, 53923.77485505826, 54559.31951727175, 55198.34311748096, 55840.84549225563, 56486.82718906825, 57136.28780230937, 57789.227376373754, 58445.64601451998]
Processing: Kentucky...
329.31205673758865
Done: Kentucky...[48810.33639157419, 49438.16217951695, 50051.02730315871, 50669.01697430225, 51290.3632849134, 51915.220333276106, 52543.57299834105, 53175.42203326808, 53810.766179510225, 54449.60507698157, 5509

### Forecasting Deaths

In [6]:
N_test = 26

result_death = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Deaths'].values
    testing = test_data['Deaths'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 30:
                model = ARIMA(history, order=(7,2,1))
            else:
                model = ARIMA(history, order=(3,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(1,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 0.5:
            if training_data['Deaths'].values[-1] < 1000:
                yhat[0]=0.5
            else:
                yhat[0]=10
        if state == 'Hawaii':
            if time < 20:
                if yhat[0] > 2:
                    yhat[0] = 2
            else:
                yhat[0] = 5
        if state == 'Vermont':
            yhat[0] = 0
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Deaths'].values[0])
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")
    
    result_death[state] = M

# N_test = 26
# result_death = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Deaths'].values
#     testing = test_data['Deaths'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         #These threadhold may need to be replaced in round2
#         # if yhat[0]-history[-1] < 50 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_death[state] = M

, 8449.187146340262, 8453.077053852776, 8455.668086120924, 8465.668086120924, 8475.668086120924]
Processing: Indiana...
20.94326241134752
Done: Indiana...[3303.811934208743, 3312.3557800268095, 3320.2156528052615, 3327.1641128690308, 3333.756653830362, 3340.3234602347366, 3346.819096375273, 3353.0874744037255, 3359.0704545692674, 3364.8006459746657, 3370.3172438436504, 3375.626730673806, 3380.716846794294, 3385.578986083362, 3390.2137504152242, 3394.624843399075, 3398.813870614149, 3402.7801627603367, 3406.5227590290024, 3410.04145178127, 3413.3365200726403, 3416.408182018843, 3419.256439016064, 3421.8812026217074, 3424.282427148434, 3426.460123920508]
Processing: Iowa...
7.652482269503546
Done: Iowa...[1127.390623360119, 1142.5613121711576, 1154.6223758439992, 1157.7741889517015, 1168.7952344849182, 1177.2346862162196, 1184.8754355146639, 1197.0920807943533, 1207.52635923405, 1214.6620566429199, 1225.0452044914127, 1234.1431846621786, 1242.6369535094575, 1253.478547523606, 1263.424112

In [7]:
forecastID = [x for x in range(N_test*50)]
deaths = []
confirmed = []

for i in range(N_test):
    for s in states:
        print(f"Day {i}/{N_test}, {s}")
        confirmed.append(result_confirmed[s][i])
        deaths.append(result_death[s][i])



Day 6/26, Indiana
Day 6/26, Iowa
Day 6/26, Kansas
Day 6/26, Kentucky
Day 6/26, Louisiana
Day 6/26, Maine
Day 6/26, Maryland
Day 6/26, Massachusetts
Day 6/26, Michigan
Day 6/26, Minnesota
Day 6/26, Mississippi
Day 6/26, Missouri
Day 6/26, Montana
Day 6/26, Nebraska
Day 6/26, Nevada
Day 6/26, New Hampshire
Day 6/26, New Jersey
Day 6/26, New Mexico
Day 6/26, New York
Day 6/26, North Carolina
Day 6/26, North Dakota
Day 6/26, Ohio
Day 6/26, Oklahoma
Day 6/26, Oregon
Day 6/26, Pennsylvania
Day 6/26, Rhode Island
Day 6/26, South Carolina
Day 6/26, South Dakota
Day 6/26, Tennessee
Day 6/26, Texas
Day 6/26, Utah
Day 6/26, Vermont
Day 6/26, Virginia
Day 6/26, Washington
Day 6/26, West Virginia
Day 6/26, Wisconsin
Day 6/26, Wyoming
Day 7/26, Alabama
Day 7/26, Alaska
Day 7/26, Arizona
Day 7/26, Arkansas
Day 7/26, California
Day 7/26, Colorado
Day 7/26, Connecticut
Day 7/26, Delaware
Day 7/26, Florida
Day 7/26, Georgia
Day 7/26, Hawaii
Day 7/26, Idaho
Day 7/26, Illinois
Day 7/26, Indiana
Day 7/26,

In [8]:
final = pd.DataFrame(list(zip(forecastID, confirmed, deaths)), 
               columns =['ForecastID', 'Confirmed', 'Deaths']) 
final.to_csv("submission_round1.csv", index=False)

## Round 2 Forecasting

In [9]:
import math
from pandas.plotting import lag_plot
from datetime import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from pandas.plotting import autocorrelation_plot
import warnings
warnings.filterwarnings('ignore')

In [10]:
data_dir = "data"
train_data_path = os.path.join(data_dir, "train_round2.csv")
test_data_path  = os.path.join(data_dir, "test_round2.csv")
graph_data_path = os.path.join(data_dir, "graph_round2.csv")
dp = CoreData(train_data_path)
dp.load(False)

## Forecasting Confirmed Cases

In [11]:
N_test = 21

result_confirmed = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Confirmed'].values
    testing = test_data['Confirmed'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 1500:
                model = ARIMA(history, order=(8,2,1))
            else:
                model = ARIMA(history, order=(2,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(5,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 1:
            yhat[0]=1
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Confirmed'].values[0])
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")
    
    result_confirmed[state] = M[-7:]

# N_test = 26
# result_confirmed = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Confirmed'].values
#     testing = test_data['Confirmed'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         # These threadhold may need to be replaced in round2
#         # if 200 < yhat[0]-history[-1] < 300 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         # el
#         if 500 < yhat[0]-history[-1] < 700 and history[-1]>6000:
#             yhat[0] = history[-1]+700
#         elif yhat[0]-history[-1] > 3000:
#             yhat[0] = history[-1]+3000
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_confirmed[state] = M

.70491816465, 33613.61824420735, 34070.6452725473, 34529.1356457277, 34989.046817926224, 35450.350613519026, 35913.052407712625, 36377.15448884225, 36842.65634808785, 37309.55780151463, 37777.85909927661, 38247.559941007785, 38718.66059766668, 39191.16102105847, 39665.06119262067, 40140.361125495496, 40617.06082984574, 41095.16032386319, 41574.65957019451]
Processing: Florida...
4100.53125
Done: Florida...[946866.6413333782, 954920.546984299, 964443.4988216378, 974178.6581665325, 984249.0016873642, 993768.288913079, 1003091.4704188631, 1012590.2957921826, 1022798.3183890563, 1033358.1635105452, 1044462.7875798171, 1055609.4854734733, 1066741.2107499193, 1077790.8563508168, 1089099.2561752913, 1100793.8679142056, 1112914.221691967, 1125321.5035151192, 1137929.2596523154, 1150592.0673603013, 1163367.426242276]
Processing: Georgia...
1949.4642857142858
Done: Georgia...[452313.78059201455, 455999.84492923354, 459435.38855226204, 463074.4165363957, 466774.5911190339, 470808.0562104887, 4744

## Forecasting Deaths

In [12]:
N_test = 21

result_death = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Deaths'].values
    testing = test_data['Deaths'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 30:
                model = ARIMA(history, order=(8,2,1))
            else:
                model = ARIMA(history, order=(2,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(5,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 1:
            yhat[0]=1
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Deaths'].values[0])   
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")
    
    result_death[state] = M[-7:]

# N_test = 26
# result_death = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Deaths'].values
#     testing = test_data['Deaths'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         #These threadhold may need to be replaced in round2
#         # if yhat[0]-history[-1] < 50 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_death[state] = M

Processing: Alabama...
15.017857142857142
Done: Alabama...[3487.2478553517453, 3524.177083753607, 3572.966278480174, 3611.2422563634836, 3635.29528999245, 3650.113417097067, 3675.23055297779, 3706.958848764934, 3745.266984732033, 3781.894698232999, 3812.5590571656903, 3836.6991203668267, 3862.875601987392, 3892.768968371137, 3926.536459014033, 3960.95512352953, 3993.4124176908813, 4022.3636104315724, 4050.8517104248976, 4080.8244407695543, 4112.936065128302]
Processing: Alaska...
0.42410714285714285
Done: Alaska...[104.0, 105.00298782914358, 106.01054334581593, 107.02301829678375, 108.04053520235358, 109.06312731229094, 110.09080499509801, 111.12357134223238, 112.16142779228284, 113.20437359959647, 114.2524091807936, 115.30553457543904, 116.36374979586856, 117.42705488733104, 118.49544983749968, 119.56893464850666, 120.64750942871046, 121.73117398152633, 122.819928435155, 123.91377274425608, 125.01270707729223]
Processing: Arizona...
28.34375
Done: Arizona...[6486.040839781463, 6512.63

In [16]:
forecastID = [x for x in range(N_test*50)]
deaths = []
confirmed = []
for i in range(7):
    for s in states:
        print(f"Day {i}/{7}, {s}")
        confirmed.append(result_confirmed[s][i])
        deaths.append(result_death[s][i])


Day 0/7, Alabama
Day 0/7, Alaska
Day 0/7, Arizona
Day 0/7, Arkansas
Day 0/7, California
Day 0/7, Colorado
Day 0/7, Connecticut
Day 0/7, Delaware
Day 0/7, Florida
Day 0/7, Georgia
Day 0/7, Hawaii
Day 0/7, Idaho
Day 0/7, Illinois
Day 0/7, Indiana
Day 0/7, Iowa
Day 0/7, Kansas
Day 0/7, Kentucky
Day 0/7, Louisiana
Day 0/7, Maine
Day 0/7, Maryland
Day 0/7, Massachusetts
Day 0/7, Michigan
Day 0/7, Minnesota
Day 0/7, Mississippi
Day 0/7, Missouri
Day 0/7, Montana
Day 0/7, Nebraska
Day 0/7, Nevada
Day 0/7, New Hampshire
Day 0/7, New Jersey
Day 0/7, New Mexico
Day 0/7, New York
Day 0/7, North Carolina
Day 0/7, North Dakota
Day 0/7, Ohio
Day 0/7, Oklahoma
Day 0/7, Oregon
Day 0/7, Pennsylvania
Day 0/7, Rhode Island
Day 0/7, South Carolina
Day 0/7, South Dakota
Day 0/7, Tennessee
Day 0/7, Texas
Day 0/7, Utah
Day 0/7, Vermont
Day 0/7, Virginia
Day 0/7, Washington
Day 0/7, West Virginia
Day 0/7, Wisconsin
Day 0/7, Wyoming
Day 1/7, Alabama
Day 1/7, Alaska
Day 1/7, Arizona
Day 1/7, Arkansas
Day 1/7, C

In [17]:
final = pd.DataFrame(list(zip(forecastID, confirmed, deaths)), 
               columns =['ForecastID', 'Confirmed', 'Deaths']) 
final.to_csv("submission_round2.csv", index=False)