In [11]:
import os
import numpy as np
import pandas as pd

In [12]:
# https://gist.github.com/rogerallen/1583593
# ^ Use this in case we need state code translation

from sklearn.preprocessing import StandardScaler

import math
import matplotlib.pyplot as plt
class utils:
    """Namespace for small, stateless helpers used by the data loader."""

    def __init__(self):
        """Nothing to initialise; the class only groups helper methods."""

    def split(self, dataframe, test=0.2):
        """Cut ``dataframe`` into a leading part and a trailing part, in order.

        Parameters
        ----------
        dataframe : object exposing ``.shape`` and positional slicing
            The rows to split; row order is preserved (no shuffling).
        test : float, default 0.2
            Fraction of rows assigned to the trailing part. The trailing
            part holds ``floor(n_rows * test)`` rows.

        Returns
        -------
        tuple
            ``(leading_rows, trailing_rows)``.
        """
        n_rows = dataframe.shape[0]
        n_held_out = math.floor(n_rows * test)
        cut = n_rows - n_held_out
        leading = dataframe[:cut]
        trailing = dataframe[cut:]
        return leading, trailing
    
class CoreData(object):
    """Load a CSV of per-state records and serve it one state at a time.

    The CSV must contain a 'Province_State' column plus every column listed
    in ``dropped_col`` inside :meth:`load`; whatever columns remain after the
    drop are what :meth:`access` returns.

    Attributes
    ----------
    data_path : path of the CSV file read by :meth:`load`.
    df : the full DataFrame as read from disk (``None`` until loaded).
    states : sorted list of the unique 'Province_State' values.
    state_df : dict mapping state name -> that state's DataFrame.
    mean, std : dicts mapping state name -> per-column mean / std used for
        normalisation (values stay ``None`` when loading without it).
    debug : flag stored as given; not read anywhere in this class.
    """

    def __init__(self, data_path = None, debug=False):
        """Remember the CSV location; nothing is read until :meth:`load`.

        Raises
        ------
        ValueError
            If ``data_path`` is missing or empty.
        """
        if not data_path:
            raise ValueError("Input file path!")
        self.data_path = data_path
        self.df = None
        self.states = []
        self.state_df = {}
        # Defined up front so the attributes exist even before load().
        self.mean = {}
        self.std = {}
        self.debug = debug

    def load(self, normalize=True):
        '''
        CoreData.load()
        Pre-load data state-by-state to a dictionary.

        Parameters
        ----------
        normalize : bool, default True
            When true, each state's table is standardised column-wise with
            that state's own mean and standard deviation, which are kept in
            ``self.mean`` / ``self.std``.
        '''
        # Change this line to modify dropped data series.
        dropped_col = ['ID', 'Province_State', 'Date', 'Incident_Rate', 'Recovered', 'People_Tested', 'People_Hospitalized', 'Mortality_Rate', 'Testing_Rate', 'Hospitalization_Rate' ]

        # Read the file this instance was constructed with, not a notebook global.
        self.df = pd.read_csv(self.data_path)
        self.states = list(np.unique(self.df['Province_State']))
        self.state_df = dict.fromkeys(self.states, None)
        self.mean = dict.fromkeys(self.states, None)
        self.std = dict.fromkeys(self.states, None)

        for s in self.states:
            df_filter = self.df['Province_State'] == s
            tmp_state_df = self.df[df_filter]
            # Keyword form: the positional axis argument is gone in pandas 2.0.
            self.state_df[s] = tmp_state_df.drop(columns=dropped_col)

            # Normalize
            if normalize:
                self.mean[s] = self.state_df[s].mean()
                self.std[s] = self.state_df[s].std()
                self.state_df[s] = (self.state_df[s] - self.mean[s])/self.std[s]

    def access(self, state=None):
        """Return the pre-loaded DataFrame of a single state.

        Raises
        ------
        ValueError
            If ``state`` is empty or is not one of ``self.states``.
        """
        if not state:
            raise ValueError('Enter state name! i.e. self.access("California")')
        elif state not in self.states:
            raise ValueError('Check your spelling of the state.')
        return self.state_df[state]

    def access_split(self, state=None, test_portion=0.2):
        """Return ``(train, test)`` for one state via an ordered split.

        ``test_portion`` is the fraction of rows handed to ``utils.split``
        as the trailing (test) part.
        """
        df = self.access(state)
        return utils().split(dataframe=df, test=test_portion)


## Round 1 Forecasting

### Loading Round 1 Data 

In [13]:
import math
from pandas.plotting import lag_plot
from datetime import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from pandas.plotting import autocorrelation_plot
import warnings
warnings.filterwarnings('ignore')

In [14]:
# All Round 1 inputs sit in one folder; derive every path from it.
data_dir = "data"
train_data_path, test_data_path, graph_data_path = (
    os.path.join(data_dir, file_name)
    for file_name in ("train.csv", "test.csv", "graph.csv")
)

# Load the training CSV split per state, keeping the raw (un-normalized) values.
dp = CoreData(data_path=train_data_path)
dp.load(normalize=False)

### Forecasting Confirmed Cases

In [5]:
# Round 1, confirmed cases.
# Per state: difference the cumulative 'Confirmed' series into daily increases,
# roll a one-step ARIMA forecast forward N_test times (each prediction is fed
# back into the history as if observed), then add the predicted increases back
# onto the last observed cumulative value.
N_test = 26  # number of one-step-ahead forecasts produced per state
# Hand-picked states whose totals are adjusted from forecast index 16 onward
# (outliers1 raised, outliers2 lowered) in the accumulation loop below.
outliers1 = ['West Virginia', 'North Dakota','Wisconsin','Utah', 'Wyoming','New Hampshire', 'Montana'] #may remove New Hampshire
outliers2 = ['Alabama', 'Vermont']
result_confirmed = {}  # state name -> list of N_test forecast cumulative values
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    # A test portion of 0 puts every row in training_data; test_data is empty.
    training_data, test_data = dp.access_split(state, 0)
    cumulative = training_data['Confirmed'].values
    # The model is fit on day-over-day increases, not on the running total.
    daily_new = [cumulative[i] - cumulative[i-1] for i in range(1, len(cumulative))]
    history = list(daily_new)
    # Loop-invariant: mean observed daily increase, used to choose the ARIMA order.
    mean_daily = sum(daily_new) / len(daily_new)
    print(mean_daily)
    M = []
    for day in range(N_test):
        # High-volume states always use (8,2,1); the others use (1,1,1) for
        # the first 14 forecast steps and (8,2,1) afterwards.
        if mean_daily > 1500 or day >= 14:
            order = (8,2,1)
        else:
            order = (1,1,1)
        try:
            model_fit = ARIMA(history, order=order).fit(disp=0)
        except Exception:
            # Fall back to a small model when the preferred order cannot be fit.
            # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
            model_fit = ARIMA(history, order=(1,1,0)).fit(disp=0)
        # First element of forecast() is indexed as an array holding the
        # next-step prediction -- NOTE(review): relies on the legacy
        # statsmodels.tsa.arima_model API; confirm before upgrading statsmodels.
        yhat = model_fit.forecast()[0]
        # Floor the predicted daily increase at 0.5.
        if yhat[0] < 0.5:
            yhat[0]=0.5
        # Manual override: Hawaii's daily increase is fixed by hand.
        if state == 'Hawaii':
            if day < 14:
                yhat[0] = 150
            else:
                yhat[0] = 100

        # Store a plain float. Appending the 1-element array itself would leave
        # `history` as a mix of scalars and arrays for the next ARIMA fit.
        step = float(yhat[0])
        M.append(step)
        history.append(step)
    # Turn predicted daily increases back into cumulative totals, starting
    # from the last observed cumulative count.
    M[0] = cumulative[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
        # The adjustment is applied inside the running sum, so it carries
        # forward into every later day as well.
        if i >=16:
            if state in outliers1:
                M[i] = M[i] + M[0] / 50
            elif state in outliers2:
                M[i] = M[i] - M[0] / 100
    print(f"Done: {state}...{M}")

    result_confirmed[state] = M


65.07914532922, 76665.15789932669, 77366.95551105398, 78070.91182625205, 78777.0253924424, 79484.31238212512, 80192.27177495798, 80901.26696641433, 81611.54600998765, 82323.29084951432, 83036.6240227166]
Processing: Kansas...
292.5886524822695
Done: Kansas...[43035.1008292906, 43673.35350426799, 44217.7176930893, 44813.28773390899, 45388.9450556241, 45979.54994324108, 46568.01354115854, 47162.71261411926, 47759.54038229899, 48360.51019495667, 48964.63450578612, 49572.39786344899, 50183.5620577421, 50798.244070033536, 51411.974395074336, 52029.28874307741, 52649.394237673485, 53271.80754515277, 53897.37786988077, 54525.130789570925, 55155.50942759949, 55786.11392472996, 56419.555197955284, 57054.954122649884, 57692.12454032123, 58331.75998014855]
Processing: Kentucky...
329.31205673758865
Done: Kentucky...[48810.33639157419, 49438.16217951695, 50051.02730315871, 50669.01697430225, 51290.3632849134, 51915.220333276106, 52543.57299834105, 53175.42203326808, 53810.766179510225, 54449.60507

### Forecasting Deaths

In [15]:
# Round 1, deaths.
# Per state: difference the cumulative 'Deaths' series into daily increases,
# roll a one-step ARIMA forecast forward N_test times (each prediction is fed
# back into the history as if observed), then add the predicted increases back
# onto the last observed cumulative value.
N_test = 26  # number of one-step-ahead forecasts produced per state
# Hand-picked states whose predicted daily increases are scaled while
# accumulating (see the final loop).
outliers1 = ['Missouri', 'Kansas', 'North Dakota', 'Montana', 'Ohio', 'Kentucky', 'Arizona', 'Florida', 'South Dakota', 'Utah']
outliers2 = ['Mississippi', 'Alabama', 'Idaho', 'Georgia', 'Virginia','New York', 'Indiana', 'Oregon', 'Nebraska','Michigan','South Carolina']
result_death = {}  # state name -> list of N_test forecast cumulative values
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    # A test portion of 0 puts every row in training_data; test_data is empty.
    training_data, test_data = dp.access_split(state, 0)
    cumulative = training_data['Deaths'].values
    # The model is fit on day-over-day increases, not on the running total.
    daily_new = [cumulative[i] - cumulative[i-1] for i in range(1, len(cumulative))]
    history = list(daily_new)
    # Loop-invariant: mean observed daily increase, used to choose the ARIMA order.
    mean_daily = sum(daily_new) / len(daily_new)
    print(mean_daily)
    M = []
    for day in range(N_test):
        # High-volume states always use (6,2,1); the others use (1,1,1) for
        # the first 14 forecast steps and (6,2,1) afterwards.
        if mean_daily > 30 or day >= 14:
            order = (6,2,1)
        else:
            order = (1,1,1)
        try:
            model_fit = ARIMA(history, order=order).fit(disp=0)
        except Exception:
            # Fall back to a small model when the preferred order cannot be fit.
            # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
            model_fit = ARIMA(history, order=(1,1,0)).fit(disp=0)
        # First element of forecast() is indexed as an array holding the
        # next-step prediction -- NOTE(review): relies on the legacy
        # statsmodels.tsa.arima_model API; confirm before upgrading statsmodels.
        yhat = model_fit.forecast()[0]
        # Floor the predicted daily increase at 0.5.
        if yhat[0] < 0.5:
            yhat[0]=0.5
        # Manual overrides: Hawaii is capped at 2 per day for the first 18
        # steps and pinned to 3 afterwards; Vermont is pinned to 0.
        if state == 'Hawaii':
            if day < 18:
                if yhat[0] > 2:
                    yhat[0] = 2
            else:
                yhat[0] = 3
        if state == 'Vermont':
            yhat[0] = 0
        # Store a plain float. Appending the 1-element array itself would leave
        # `history` as a mix of scalars and arrays for the next ARIMA fit.
        step = float(yhat[0])
        M.append(step)
        history.append(step)
    # Turn predicted daily increases back into cumulative totals, starting
    # from the last observed cumulative count.
    M[0] = cumulative[-1] + M[0]
    for i in range(1, len(M)):
        # NOTE(review): because of `+=`, the net effect is
        # M[i-1] + 2.05 * increase for outliers1 and M[i-1] + 1.1 * increase
        # for outliers2. Kept exactly as written -- confirm 2.05x is intended.
        if state in outliers1:
            M[i] += M[i-1] + M[i] * 1.05
        elif state in outliers2:
            M[i] += M[i-1] + M[i] * 0.1
        else:
            M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")

    result_death[state] = M


5, 8446.18209288335, 8450.265473211075, 8453.790903931444, 8455.497483970197, 8455.997483970197, 8456.497483970197]
Processing: Indiana...
20.94326241134752
Done: Indiana...[3298.495867978936, 3300.394316458219, 3301.870284367521, 3303.021856979297, 3303.8716672535434, 3304.424937624143, 3304.974937624143, 3305.524937624143, 3306.0749376241433, 3306.6249376241435, 3307.1749376241437, 3307.724937624144, 3308.274937624144, 3308.8249376241442, 3309.3749376241444, 3309.9249376241446, 3310.474937624145, 3311.024937624145, 3311.574937624145, 3312.1249376241453, 3312.6749376241455, 3313.2249376241457, 3313.774937624146, 3314.324937624146, 3314.8749376241462, 3315.4249376241464]
Processing: Iowa...
7.652482269503546
Done: Iowa...[1129.738500094612, 1138.3234525750793, 1147.4884262095186, 1156.4612273939426, 1165.5858345170218, 1174.7848796323765, 1184.0174816847589, 1193.3018542699397, 1202.629882088944, 1211.9483341029863, 1221.2949718694608, 1230.668749963929, 1240.0696946759358, 1249.497803

In [16]:
# Flatten the per-state forecasts into submission columns, day-major:
# all states for day 0, then all states for day 1, and so on.
# One ID per (day, state) pair; sized from the actual number of states so the
# later zip() cannot silently truncate rows if the state list is not 50 long.
forecastID = list(range(N_test * len(states)))
deaths = []
confirmed = []

for i in range(N_test):
    for s in states:
        print(f"Day {i}/{N_test}, {s}")
        confirmed.append(result_confirmed[s][i])
        deaths.append(result_death[s][i])



Day 6/26, Indiana
Day 6/26, Iowa
Day 6/26, Kansas
Day 6/26, Kentucky
Day 6/26, Louisiana
Day 6/26, Maine
Day 6/26, Maryland
Day 6/26, Massachusetts
Day 6/26, Michigan
Day 6/26, Minnesota
Day 6/26, Mississippi
Day 6/26, Missouri
Day 6/26, Montana
Day 6/26, Nebraska
Day 6/26, Nevada
Day 6/26, New Hampshire
Day 6/26, New Jersey
Day 6/26, New Mexico
Day 6/26, New York
Day 6/26, North Carolina
Day 6/26, North Dakota
Day 6/26, Ohio
Day 6/26, Oklahoma
Day 6/26, Oregon
Day 6/26, Pennsylvania
Day 6/26, Rhode Island
Day 6/26, South Carolina
Day 6/26, South Dakota
Day 6/26, Tennessee
Day 6/26, Texas
Day 6/26, Utah
Day 6/26, Vermont
Day 6/26, Virginia
Day 6/26, Washington
Day 6/26, West Virginia
Day 6/26, Wisconsin
Day 6/26, Wyoming
Day 7/26, Alabama
Day 7/26, Alaska
Day 7/26, Arizona
Day 7/26, Arkansas
Day 7/26, California
Day 7/26, Colorado
Day 7/26, Connecticut
Day 7/26, Delaware
Day 7/26, Florida
Day 7/26, Georgia
Day 7/26, Hawaii
Day 7/26, Idaho
Day 7/26, Illinois
Day 7/26, Indiana
Day 7/26,

In [17]:
# Assemble one row per forecast ID and write the Round 1 submission file.
# zip() stops at the shortest of the three lists.
submission_columns = ['ForecastID', 'Confirmed', 'Deaths']
final = pd.DataFrame(
    list(zip(forecastID, confirmed, deaths)),
    columns=submission_columns,
)
final.to_csv("submission_round1.csv", index=False)

## Round 2 Forecasting

In [9]:
import math
from pandas.plotting import lag_plot
from datetime import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from pandas.plotting import autocorrelation_plot
import warnings
warnings.filterwarnings('ignore')

In [10]:
# All Round 2 inputs sit in one folder; derive every path from it.
data_dir = "data"
train_data_path, test_data_path, graph_data_path = (
    os.path.join(data_dir, file_name)
    for file_name in ("train_round2-1205.csv", "test_round2.csv", "graph_round2.csv")
)

# Load the training CSV split per state, keeping the raw (un-normalized) values.
dp = CoreData(data_path=train_data_path)
dp.load(normalize=False)

FileNotFoundError: [Errno 2] File data/train_round2-1205.csv does not exist: 'data/train_round2-1205.csv'

### Forecasting Confirmed Cases

In [11]:
# Round 2, confirmed cases.
# Per state: difference the cumulative 'Confirmed' series into daily increases,
# roll a one-step ARIMA forecast forward N_test times (each prediction is fed
# back into the history as if observed), then add the predicted increases back
# onto the last observed cumulative value.
N_test = 7  # number of one-step-ahead forecasts produced per state

result_confirmed = {}  # state name -> last 7 forecast cumulative values
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    # A test portion of 0 puts every row in training_data; test_data is empty.
    training_data, test_data = dp.access_split(state, 0)
    cumulative = training_data['Confirmed'].values
    # The model is fit on day-over-day increases, not on the running total.
    daily_new = [cumulative[i] - cumulative[i-1] for i in range(1, len(cumulative))]
    history = list(daily_new)
    # Loop-invariant: mean observed daily increase, used to choose the ARIMA order.
    mean_daily = sum(daily_new) / len(daily_new)
    print(mean_daily)
    # High-volume states get the larger model.
    order = (8,2,1) if mean_daily > 1500 else (2,1,1)
    M = []
    for day in range(N_test):
        try:
            model_fit = ARIMA(history, order=order).fit(disp=0)
        except Exception:
            # Fall back to a pure AR model when the preferred order cannot be fit.
            # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
            model_fit = ARIMA(history, order=(5,1,0)).fit(disp=0)
        # First element of forecast() is indexed as an array holding the
        # next-step prediction -- NOTE(review): relies on the legacy
        # statsmodels.tsa.arima_model API; confirm before upgrading statsmodels.
        yhat = model_fit.forecast()[0]
        # Floor the predicted daily increase at 1.
        if yhat[0] < 1:
            yhat[0]=1
        # Store a plain float. Appending the 1-element array itself would leave
        # `history` as a mix of scalars and arrays for the next ARIMA fit.
        step = float(yhat[0])
        M.append(step)
        history.append(step)
    # Turn predicted daily increases back into cumulative totals, starting
    # from the last observed cumulative count.
    M[0] = cumulative[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")

    # Only the final 7 forecast days are kept, so N_test may exceed 7 to
    # skip over leading days.
    result_confirmed[state] = M[-7:]

# N_test = 26
# result_confirmed = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Confirmed'].values
#     testing = test_data['Confirmed'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         # These thresholds may need to be replaced in round 2
#         # if 200 < yhat[0]-history[-1] < 300 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         # el
#         if 500 < yhat[0]-history[-1] < 700 and history[-1]>6000:
#             yhat[0] = history[-1]+700
#         elif yhat[0]-history[-1] > 3000:
#             yhat[0] = history[-1]+3000
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_confirmed[state] = M

Processing: Alabama...
1022.0669642857143
Done: Alabama...[234561.43243418977, 236674.47724028322, 238795.33009982202, 240923.312938898, 243059.80828494835, 245205.01306506482, 247358.91598334443, 249521.51238924943, 251692.79888167346, 253872.7811301316, 256061.4544614028, 258258.8214545198, 260464.88078760344, 262679.6335276582, 264903.07709079224, 267135.215308953, 269376.04925610864, 271625.5773605985, 273883.7974491351, 276150.7114816917, 278426.32246711035]
Processing: Alaska...
122.77232142857143
Done: Alaska...[28365.33874105172, 28945.40369353879, 29532.468682255512, 30120.482617019075, 30711.582741327795, 31305.11316182614, 31901.311874374733, 32500.10819465698, 33101.52897072475, 33705.566495433966, 34312.223926627434, 34921.50039676699, 35533.3962994827, 36147.9115922616, 36765.04636832166, 37384.80063690121, 38007.17432701985, 38632.167575260144, 39259.78026851634, 39890.01243441071, 40522.86413564125]
Processing: Arizona...
1321.9776785714287
Done: Arizona...[303595.24086

### Forecasting Deaths

In [12]:
# Round 2, deaths.
# Per state: difference the cumulative 'Deaths' series into daily increases,
# roll a one-step ARIMA forecast forward N_test times (each prediction is fed
# back into the history as if observed), then add the predicted increases back
# onto the last observed cumulative value.
N_test = 7  # number of one-step-ahead forecasts produced per state

result_death = {}  # state name -> last 7 forecast cumulative values
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    # A test portion of 0 puts every row in training_data; test_data is empty.
    training_data, test_data = dp.access_split(state, 0)
    cumulative = training_data['Deaths'].values
    # The model is fit on day-over-day increases, not on the running total.
    daily_new = [cumulative[i] - cumulative[i-1] for i in range(1, len(cumulative))]
    history = list(daily_new)
    # Loop-invariant: mean observed daily increase, used to choose the ARIMA order.
    mean_daily = sum(daily_new) / len(daily_new)
    print(mean_daily)
    # High-volume states get the larger model.
    order = (8,2,1) if mean_daily > 30 else (2,1,1)
    M = []
    for day in range(N_test):
        try:
            model_fit = ARIMA(history, order=order).fit(disp=0)
        except Exception:
            # Fall back to a pure AR model when the preferred order cannot be fit.
            # (Exception, not a bare except, so Ctrl-C still interrupts the run.)
            model_fit = ARIMA(history, order=(5,1,0)).fit(disp=0)
        # First element of forecast() is indexed as an array holding the
        # next-step prediction -- NOTE(review): relies on the legacy
        # statsmodels.tsa.arima_model API; confirm before upgrading statsmodels.
        yhat = model_fit.forecast()[0]
        # Floor the predicted daily increase at 1.
        if yhat[0] < 1:
            yhat[0]=1
        # Store a plain float. Appending the 1-element array itself would leave
        # `history` as a mix of scalars and arrays for the next ARIMA fit.
        step = float(yhat[0])
        M.append(step)
        history.append(step)
    # Turn predicted daily increases back into cumulative totals, starting
    # from the last observed cumulative count.
    M[0] = cumulative[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")

    # Only the final 7 forecast days are kept, so N_test may exceed 7 to
    # skip over leading days.
    result_death[state] = M[-7:]

# N_test = 26
# result_death = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Deaths'].values
#     testing = test_data['Deaths'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         # These thresholds may need to be replaced in round 2
#         # if yhat[0]-history[-1] < 50 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_death[state] = M

Delaware...
3.1830357142857144
Done: Delaware...[749.2092072516272, 750.3969123325302, 751.5625976281927, 752.7028732438342, 753.8178226100242, 754.9074972115128, 755.9719203633302, 757.0111192896123, 758.025118812529, 759.025118812529, 760.025118812529, 761.025118812529, 762.025118812529, 763.025118812529, 764.025118812529, 765.025118812529, 766.025118812529, 767.025118812529, 768.025118812529, 769.025118812529, 770.025118812529]
Processing: Florida...
78.25892857142857
Done: Florida...[18047.99180197623, 18125.44048081371, 18207.93729165752, 18289.523819944825, 18358.442423293956, 18414.148597019437, 18470.14651074653, 18533.581322783935, 18606.863178837848, 18686.719753798345, 18764.789302314788, 18833.26528053271, 18892.375745008416, 18950.612745307713, 19014.888251316, 19086.876635833945, 19163.74026809598, 19239.16785416775, 19307.329483880116, 19368.470418129473, 19428.594096880173]
Processing: Georgia...
39.129464285714285
Done: Georgia...[9221.928464816949, 9254.269560884964, 

In [13]:
# Flatten the per-state forecasts into submission columns, day-major:
# all states for day 0, then all states for day 1, and so on.
# Each state's result list holds 7 days, independent of N_test.
n_days = 7
# One ID per (day, state) pair; sized from the actual number of states so the
# later zip() cannot silently truncate rows if the state list is not 50 long.
forecastID = list(range(n_days * len(states)))
deaths = []
confirmed = []
for i in range(n_days):
    for s in states:
        print(f"Day {i}/{n_days}, {s}")
        confirmed.append(result_confirmed[s][i])
        deaths.append(result_death[s][i])


Day 0/7, Alabama
Day 0/7, Alaska
Day 0/7, Arizona
Day 0/7, Arkansas
Day 0/7, California
Day 0/7, Colorado
Day 0/7, Connecticut
Day 0/7, Delaware
Day 0/7, Florida
Day 0/7, Georgia
Day 0/7, Hawaii
Day 0/7, Idaho
Day 0/7, Illinois
Day 0/7, Indiana
Day 0/7, Iowa
Day 0/7, Kansas
Day 0/7, Kentucky
Day 0/7, Louisiana
Day 0/7, Maine
Day 0/7, Maryland
Day 0/7, Massachusetts
Day 0/7, Michigan
Day 0/7, Minnesota
Day 0/7, Mississippi
Day 0/7, Missouri
Day 0/7, Montana
Day 0/7, Nebraska
Day 0/7, Nevada
Day 0/7, New Hampshire
Day 0/7, New Jersey
Day 0/7, New Mexico
Day 0/7, New York
Day 0/7, North Carolina
Day 0/7, North Dakota
Day 0/7, Ohio
Day 0/7, Oklahoma
Day 0/7, Oregon
Day 0/7, Pennsylvania
Day 0/7, Rhode Island
Day 0/7, South Carolina
Day 0/7, South Dakota
Day 0/7, Tennessee
Day 0/7, Texas
Day 0/7, Utah
Day 0/7, Vermont
Day 0/7, Virginia
Day 0/7, Washington
Day 0/7, West Virginia
Day 0/7, Wisconsin
Day 0/7, Wyoming
Day 1/7, Alabama
Day 1/7, Alaska
Day 1/7, Arizona
Day 1/7, Arkansas
Day 1/7, C

In [14]:
# Assemble one row per forecast ID and write the Round 2 submission file.
# zip() stops at the shortest of the three lists.
submission_columns = ['ForecastID', 'Confirmed', 'Deaths']
final = pd.DataFrame(
    list(zip(forecastID, confirmed, deaths)),
    columns=submission_columns,
)
final.to_csv("team4_round2.csv", index=False)