In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# https://gist.github.com/rogerallen/1583593
# ^ Use this in case we need state code translation

from sklearn.preprocessing import StandardScaler

import math
import matplotlib.pyplot as plt
class utils:
    def __init__(self):
        pass
    
    def split(self, dataframe, test=0.2):
        size = dataframe.shape
        train_size = size[0] - math.floor(size[0] * test)
        return dataframe[:train_size], dataframe[train_size:]
    
class CoreData(object):
    def __init__(self, data_path = None, debug=False):
        if not data_path:
            raise Exception("Input file path!")
        self.data_path = data_path
        self.df = None
        self.states = []
        self.state_df = {}
        self.debug = debug
        
    def load(self, normalize=True):
        '''
        DataProcessor.load()
        Pre-load data state-by-state to a dictionary.
        '''
        # Change this line to modify dropped data series.
        dropped_col = ['ID', 'Province_State', 'Date', 'Incident_Rate', 'Recovered', 'People_Tested', 'People_Hospitalized', 'Mortality_Rate', 'Testing_Rate', 'Hospitalization_Rate' ]
        
        self.df = pd.read_csv(train_data_path)
        self.states = list(np.unique(self.df['Province_State']))
        self.state_df = dict.fromkeys(self.states, None)
        self.mean = dict.fromkeys(self.states, None)
        self.std = dict.fromkeys(self.states, None)
        
        for s in self.states:
            df_filter = self.df['Province_State'] == s
            tmp_state_df = self.df[df_filter]
            self.state_df[s] = tmp_state_df.drop(dropped_col, 1)
            
            # Normalize
            if normalize:
                self.mean[s] = self.state_df[s].mean()
                self.std[s] = self.state_df[s].std()
                self.state_df[s] = (self.state_df[s] - self.mean[s])/self.std[s]
    
    def access(self, state=None):
        if not state:
            raise Exception('Enter state name! i.e. self.access("California")')
        elif state not in self.states:
            raise Exception('Check your spelling of the state.')
        return self.state_df[state]
    
    def access_split(self, state=None, test_portion=0.2):
        df = self.access(state)
        return utils().split(dataframe=df, test=test_portion)


## Round 1 Forecasting

### Loading Round 1 Data 

In [3]:
import math
from pandas.plotting import lag_plot
from datetime import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from pandas.plotting import autocorrelation_plot
import warnings
warnings.filterwarnings('ignore')

In [4]:
data_dir = "data"
train_data_path = os.path.join(data_dir, "train.csv")
test_data_path  = os.path.join(data_dir, "test.csv")
graph_data_path = os.path.join(data_dir, "graph.csv")
dp = CoreData(train_data_path)
dp.load(False)

### Forecasting Confirmed Cases

In [5]:
N_test = 26

result_confirmed = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Confirmed'].values
    testing = test_data['Confirmed'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 1500:
                model = ARIMA(history, order=(8,2,1))
            else:
                model = ARIMA(history, order=(1,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(5,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 0:
            yhat[0]=1
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Confirmed'].values[0])
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")
    
    result_confirmed[state] = M

# N_test = 26
# result_confirmed = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Confirmed'].values
#     testing = test_data['Confirmed'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         # These threadhold may need to be replaced in round2
#         # if 200 < yhat[0]-history[-1] < 300 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         # el
#         if 500 < yhat[0]-history[-1] < 700 and history[-1]>6000:
#             yhat[0] = history[-1]+700
#         elif yhat[0]-history[-1] > 3000:
#             yhat[0] = history[-1]+3000
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_confirmed[state] = M

0821186995, 75270.99595048597, 75975.9542476609, 76683.88306784809, 77394.78260660151, 78108.6527166346, 78825.49329331382, 79545.30417130476, 80268.08577358155, 80993.83807693988, 81722.5605195465, 82454.25349699635, 83188.91700239675]
Processing: Kansas...
292.5886524822695
Done: Kansas...[43035.1008292906, 43673.35350426799, 44217.7176930893, 44813.28773390899, 45388.9450556241, 45979.54994324108, 46568.01354115854, 47162.71261411926, 47759.54038229899, 48360.51019495667, 48964.63450578612, 49572.39786344899, 50183.5620577421, 50798.244070033536, 51416.386498353095, 52038.017245530435, 52663.12300268771, 53291.70976573465, 53923.77485505826, 54559.31951727175, 55198.34311748096, 55840.84549225563, 56486.82718906825, 57136.28780230937, 57789.227376373754, 58445.64601451998]
Processing: Kentucky...
329.31205673758865
Done: Kentucky...[49080.42616498898, 49767.385840961, 50427.38519067536, 51036.368538084884, 51604.717261065, 52212.07155027619, 52873.47869764943, 53523.92555716924, 541

### Forecasting Deaths

In [6]:
N_test = 26

result_death = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Deaths'].values
    testing = test_data['Deaths'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 30:
                model = ARIMA(history, order=(8,2,1))
            else:
                model = ARIMA(history, order=(1,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(5,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 0:
            yhat[0]=1
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Deaths'].values[0])
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")
    
    result_death[state] = M

# N_test = 26
# result_death = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Deaths'].values
#     testing = test_data['Deaths'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         #These threadhold may need to be replaced in round2
#         # if yhat[0]-history[-1] < 50 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_death[state] = M

9209658, 1279.1938991860186, 1288.6499005791302, 1298.1323930594676, 1307.6413742490254, 1317.176841705108, 1326.7387931461144, 1336.3272266531942, 1345.9421399717337, 1355.5835313940559, 1365.2513993656355]
Processing: Kansas...
2.801418439716312
Done: Kansas...[453.7706468637584, 456.4969079950478, 459.21024437121326, 461.90944089397925, 464.5945445536833, 467.26555366481637, 469.92246865060247, 472.56528999844807, 475.19401813663103, 477.8086541167675, 480.4091985380368, 482.9956522746173, 485.5680163179086, 488.126291824023, 490.6704795788899, 493.2005808786671, 495.71659678922856, 498.2185284622842, 500.7063771677236, 503.18014402012255, 505.63983031010054, 508.0854371961426, 510.51696594302484, 512.9344177658783, 515.3377939936827, 517.7270957517218]
Processing: Kentucky...
5.929078014184397
Done: Kentucky...[938.8704142236944, 944.9829232770671, 951.1201412395493, 957.2531285339355, 963.3781130607879, 969.4946786514279, 975.6028505631293, 981.7027075976957, 987.7943297579785, 99

In [7]:
forecastID = [x for x in range(N_test*50)]
deaths = []
confirmed = []

for i in range(N_test):
    for s in states:
        print(f"Day {i}/{N_test}, {s}")
        confirmed.append(result_confirmed[s][i])
        deaths.append(result_death[s][i])



Day 6/26, Indiana
Day 6/26, Iowa
Day 6/26, Kansas
Day 6/26, Kentucky
Day 6/26, Louisiana
Day 6/26, Maine
Day 6/26, Maryland
Day 6/26, Massachusetts
Day 6/26, Michigan
Day 6/26, Minnesota
Day 6/26, Mississippi
Day 6/26, Missouri
Day 6/26, Montana
Day 6/26, Nebraska
Day 6/26, Nevada
Day 6/26, New Hampshire
Day 6/26, New Jersey
Day 6/26, New Mexico
Day 6/26, New York
Day 6/26, North Carolina
Day 6/26, North Dakota
Day 6/26, Ohio
Day 6/26, Oklahoma
Day 6/26, Oregon
Day 6/26, Pennsylvania
Day 6/26, Rhode Island
Day 6/26, South Carolina
Day 6/26, South Dakota
Day 6/26, Tennessee
Day 6/26, Texas
Day 6/26, Utah
Day 6/26, Vermont
Day 6/26, Virginia
Day 6/26, Washington
Day 6/26, West Virginia
Day 6/26, Wisconsin
Day 6/26, Wyoming
Day 7/26, Alabama
Day 7/26, Alaska
Day 7/26, Arizona
Day 7/26, Arkansas
Day 7/26, California
Day 7/26, Colorado
Day 7/26, Connecticut
Day 7/26, Delaware
Day 7/26, Florida
Day 7/26, Georgia
Day 7/26, Hawaii
Day 7/26, Idaho
Day 7/26, Illinois
Day 7/26, Indiana
Day 7/26,

In [8]:
final = pd.DataFrame(list(zip(forecastID, confirmed, deaths)), 
               columns =['ForecastID', 'Confirmed', 'Deaths']) 
final.to_csv("submission_round1.csv", index=False)

## Round 2 Forecasting

In [3]:
import math
from pandas.plotting import lag_plot
from datetime import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from pandas.plotting import autocorrelation_plot
import warnings
warnings.filterwarnings('ignore')

In [4]:
data_dir = "data"
train_data_path = os.path.join(data_dir, "train_round2.csv")
test_data_path  = os.path.join(data_dir, "test_round2.csv")
graph_data_path = os.path.join(data_dir, "graph_round2.csv")
dp = CoreData(train_data_path)
dp.load(False)

## Forecasting Confirmed Cases

In [11]:
N_test = 21

result_confirmed = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Confirmed'].values
    testing = test_data['Confirmed'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 1500:
                model = ARIMA(history, order=(8,2,1))
            else:
                model = ARIMA(history, order=(1,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(5,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 0:
            yhat[0]=1
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Confirmed'].values[0])
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")
    
    result_confirmed[state] = M[-7:]

# N_test = 26
# result_confirmed = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Confirmed'].values
#     testing = test_data['Confirmed'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         # These threadhold may need to be replaced in round2
#         # if 200 < yhat[0]-history[-1] < 300 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         # el
#         if 500 < yhat[0]-history[-1] < 700 and history[-1]>6000:
#             yhat[0] = history[-1]+700
#         elif yhat[0]-history[-1] > 3000:
#             yhat[0] = history[-1]+3000
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_confirmed[state] = M

3, 270116.07324382017, 273724.1883916249, 277346.69553364284, 280983.5934117812, 284634.8893211617, 288300.5756200706, 291980.6544976978, 295675.1268686304, 299383.99355447077, 303107.25332709093]
Processing: Kansas...
605.9866071428571
Done: Kansas...[142950.38678059517, 143653.92976949617, 146291.4549283067, 148660.4322839876, 150031.78211860638, 153981.7584522088, 155474.25332528137, 157644.74856271816, 160582.23786572507, 162428.01312154406, 165359.52186096387, 167498.47565994068, 169733.18710769908, 172572.91733993537, 174705.79127200533, 177294.1421292464, 179753.5072839023, 182090.44241523958, 184774.66151521914, 187125.7230726106, 189645.6961773163, 192216.65804962703, 194650.18735143292, 197264.48741476564, 199760.8566739272, 202299.87606634354]
Processing: Kentucky...
697.0401785714286
Done: Kentucky...[160759.18804642596, 163542.43985031807, 166365.40958035248, 169207.3648100481, 172063.21126107097, 174931.69690450933, 177812.51599912922, 180705.593287236, 183610.91133954903

## Forecasting Deaths

In [12]:
N_test = 21

result_death = {}
states = dp.states
for state in states:
    print(f"Processing: {state}...")
    training_data, test_data = dp.access_split(state, 0)
    data = dp.access(state)
    training = training_data['Deaths'].values
    testing = test_data['Deaths'].values
    temp_data = []
    for i in range(1, len(training)):
        temp_data.append(training[i] - training[i-1])
    training = temp_data
    history = [x for x in training]
    model_predictions = []
    print(sum(training) / len(training))
    for time in range(N_test):
        try:
            if sum(training) / len(training) > 30:
                model = ARIMA(history, order=(8,2,1))
            else:
                model = ARIMA(history, order=(1,1,1))
            model_fit = model.fit(disp=0)
        except:
            model = ARIMA(history, order=(5,1,0))
            model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        if yhat[0] < 0:
            yhat[0]=1
        model_predictions.append(yhat)
        true_test_value = yhat
        history.append(true_test_value)
    M = []
    for m in model_predictions:
        M.append(m.tolist()[0])
    training.insert(0, training_data['Deaths'].values[0])   
    for i in range(1, len(training)):
        training[i] = training[i-1] + training[i]
    M[0] = training[-1] + M[0]
    for i in range(1, len(M)):
        M[i] = M[i-1] + M[i]
    print(f"Done: {state}...{M}")
    
    result_death[state] = M[-7:]

# N_test = 26
# result_death = {}
# states = dp.states
# for state in states:
#     print(f"Processing: {state}...")
#     training_data, test_data = dp.access_split(state, 0)
#     data = dp.access(state)
#     training = training_data['Deaths'].values
#     testing = test_data['Deaths'].values
#     history = [x for x in training]
#     model_predictions = []
#     for time in range(N_test):
#         model = ARIMA(history, order=(5,1,0))        
#         model_fit = model.fit(disp=0)
#         output = model_fit.forecast()
#         yhat = output[0]
#         #These threadhold may need to be replaced in round2
#         # if yhat[0]-history[-1] < 50 and history[-1]>6000:
#         #     yhat[0] = history[-1]+500
#         model_predictions.append(yhat)
#         true_test_value = yhat
#         history.append(true_test_value)
#     M = []
#     for m in model_predictions:
#         M.append(m.tolist()[0])
#     print(f"Done: {state}...{M}")
    
#     result_death[state] = M

842, 2585.0370427686044, 2611.1305969750215, 2637.3115520958654, 2663.5799112682907, 2689.935676907378, 2716.3788513783034, 2742.9094360924532, 2769.527433090506, 2796.2328431773435, 2823.0256675793676, 2849.905907382566, 2876.873562907379]
Processing: Kansas...
6.044642857142857
Done: Kansas...[1432.4859351379446, 1448.5513313419344, 1466.5549804091568, 1484.0797789549754, 1501.8247671910237, 1519.5878261650607, 1537.427449695688, 1555.326757156242, 1573.290667538754, 1591.3177835037181, 1609.4085402192234, 1627.5628365968737, 1645.780728061697, 1664.0622185691172, 1682.4073314018997, 1700.8160790591205, 1719.288476091127, 1737.8245343021265, 1756.4242666806954, 1775.0876851969113, 1793.8148007893524, 1812.6056223800592, 1831.4601606362817, 1850.3784238635515, 1869.36041989871, 1888.4061570300692]
Processing: Kentucky...
7.544642857142857
Done: Kentucky...[1799.2730765217705, 1813.0096274065543, 1827.0263485792273, 1841.1176453650817, 1855.2478217823534, 1869.4106996516334, 1883.60523

In [14]:
forecastID = [x for x in range(N_test*50)]
deaths = []
confirmed = []

for i in range(N_test):
    for s in states:
        print(f"Day {i}/{N_test}, {s}")
        confirmed.append(result_confirmed[s][i])
        deaths.append(result_death[s][i])



Day 6/26, Indiana
Day 6/26, Iowa
Day 6/26, Kansas
Day 6/26, Kentucky
Day 6/26, Louisiana
Day 6/26, Maine
Day 6/26, Maryland
Day 6/26, Massachusetts
Day 6/26, Michigan
Day 6/26, Minnesota
Day 6/26, Mississippi
Day 6/26, Missouri
Day 6/26, Montana
Day 6/26, Nebraska
Day 6/26, Nevada
Day 6/26, New Hampshire
Day 6/26, New Jersey
Day 6/26, New Mexico
Day 6/26, New York
Day 6/26, North Carolina
Day 6/26, North Dakota
Day 6/26, Ohio
Day 6/26, Oklahoma
Day 6/26, Oregon
Day 6/26, Pennsylvania
Day 6/26, Rhode Island
Day 6/26, South Carolina
Day 6/26, South Dakota
Day 6/26, Tennessee
Day 6/26, Texas
Day 6/26, Utah
Day 6/26, Vermont
Day 6/26, Virginia
Day 6/26, Washington
Day 6/26, West Virginia
Day 6/26, Wisconsin
Day 6/26, Wyoming
Day 7/26, Alabama
Day 7/26, Alaska
Day 7/26, Arizona
Day 7/26, Arkansas
Day 7/26, California
Day 7/26, Colorado
Day 7/26, Connecticut
Day 7/26, Delaware
Day 7/26, Florida
Day 7/26, Georgia
Day 7/26, Hawaii
Day 7/26, Idaho
Day 7/26, Illinois
Day 7/26, Indiana
Day 7/26,

In [15]:
final = pd.DataFrame(list(zip(forecastID, confirmed, deaths)), 
               columns =['ForecastID', 'Confirmed', 'Deaths']) 
final.to_csv("submission_round2.csv", index=False)