In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# https://gist.github.com/rogerallen/1583593
# ^ Use this in case we need state code translation

from sklearn.preprocessing import StandardScaler

import math
import matplotlib.pyplot as plt
class utils:
    '''Stateless collection of helper routines.'''

    def split(self, dataframe, test=0.2):
        '''
        Cut `dataframe` into a leading and a trailing part by row position.

        dataframe: any object exposing `.shape` and supporting slice indexing.
        test: fraction of the rows assigned to the trailing part. The row
            count is rounded down, so the leading part keeps any remainder.

        Returns a tuple (leading_rows, trailing_rows).
        '''
        n_rows = dataframe.shape[0]
        n_trailing = math.floor(n_rows * test)
        cut = n_rows - n_trailing
        leading, trailing = dataframe[:cut], dataframe[cut:]
        return leading, trailing
    
class CoreData(object):
    '''
    Reads a CSV of per-state records and serves it back one state at a time.

    Typical use: load() the file, filter() it into per-state frames, then
    read those frames with access() or access_split().
    '''

    def __init__(self, data_path = None, debug=False):
        '''
        data_path: path of the CSV file that load() reads; required.
        debug: stored on the instance; no method of this class reads it.

        Raises ValueError when data_path is missing or empty.
        '''
        if not data_path:
            raise ValueError("Input file path!")
        self.data_path = data_path
        self.df = None
        self.states = []
        self.state_df = {}
        # Per-state column statistics; filled by filter(normalize=True).
        self.mean = {}
        self.std = {}
        self.debug = debug

    def load(self):
        '''
        Read the CSV at self.data_path into self.df.
        '''
        # Read this instance's own path. Relying on a module-level variable
        # would make every instance load whatever that variable points at
        # when load() happens to be called.
        self.df = pd.read_csv(self.data_path)

    def filter(self, normalize=True, dropped_col=None):
        '''
        Split self.df into one frame per distinct 'Province_State' value.

        normalize: when True, each per-state frame is standardised column by
            column (subtract the state's mean, divide by the state's std) and
            the statistics are kept in self.mean[state] / self.std[state].
            When False, self.mean / self.std hold None for every state.
        dropped_col: column label(s) removed from every per-state frame;
            None (the default) removes nothing.

        load() must have been called first.
        '''
        if dropped_col is None:
            dropped_col = []
        self.states = list(np.unique(self.df['Province_State']))
        self.state_df = dict.fromkeys(self.states, None)
        self.mean = dict.fromkeys(self.states, None)
        self.std = dict.fromkeys(self.states, None)
        for s in self.states:
            state_rows = self.df[self.df['Province_State'] == s]
            # Keyword form: pandas >= 2.0 no longer accepts the axis as a
            # positional argument of DataFrame.drop.
            self.state_df[s] = state_rows.drop(columns=dropped_col)

            # Normalize
            if normalize:
                self.mean[s] = self.state_df[s].mean()
                self.std[s] = self.state_df[s].std()
                self.state_df[s] = (self.state_df[s] - self.mean[s])/self.std[s]

    def access(self, state=None):
        '''
        Return the per-state frame built by filter() for `state`.

        Raises ValueError when `state` is missing or is not one of the
        states found by filter().
        '''
        if not state:
            raise ValueError('Enter state name! i.e. self.access("California")')
        elif state not in self.states:
            raise ValueError('Check your spelling of the state.')
        return self.state_df[state]

    def access_split(self, state=None, test_portion=0.2):
        '''
        Return the frame for `state` cut into (leading, trailing) parts.

        test_portion: fraction of rows placed in the trailing part; passed
            to utils.split as its `test` argument.
        '''
        df = self.access(state)
        return utils().split(dataframe=df, test=test_portion)


In [3]:
data_dir = "other_data"

# Ground-truth table: only the columns not listed below are kept in the
# per-state frames.
train_data_path = os.path.join(data_dir, "validation_round1.csv")
dp = CoreData(train_data_path)
dp.load()
dp.filter(
    False,
    [
        'ID',
        'Province_State',
        'Date',
        'Incident_Rate',
        'Recovered',
        'People_Tested',
        'People_Hospitalized',
        'Mortality_Rate',
        'Testing_Rate',
        'Hospitalization_Rate',
    ],
)
print(dp.df)

# Forecast table: it gets its 'Province_State' column copied from the
# ground-truth table (aligned on the row index) so it can be split per state.
train_data_path = os.path.join(data_dir, "team4_round1.csv")
dp2 = CoreData(train_data_path)
dp2.load()
dp2.df['Province_State'] = pd.Series(dp.df['Province_State'])
dp2.filter(False, ['ForecastID'])

print(dp2.df)



        ID Province_State        Date  Confirmed  Deaths  Recovered    Active  \
0     7100        Alabama  09-01-2020     127616    2200    48028.0   77388.0   
1     7101         Alaska  09-01-2020       5297      39     2246.0    3012.0   
2     7102        Arizona  09-01-2020     202342    5044    30841.0  166457.0   
3     7103       Arkansas  09-01-2020      61497     814    55647.0    5036.0   
4     7104     California  09-01-2020     715617   13150        NaN  702467.0   
...    ...            ...         ...        ...     ...        ...       ...   
1295  8395       Virginia  09-26-2020     145267    3142    17255.0  124870.0   
1296  8396     Washington  09-26-2020      85830    2100        NaN   83730.0   
1297  8397  West Virginia  09-26-2020      15167     337    11121.0    3709.0   
1298  8398      Wisconsin  09-26-2020     113645    1281    94094.0   18270.0   
1299  8399        Wyoming  09-26-2020       5465      50     4479.0     936.0   

      Incident_Rate  People

In [4]:
# Absolute percentage error of the forecast against the ground truth,
# stored per state as one array entry per row.
states = dp.states
confirmed_diff = {}
deaths_diff = {}
for state in states:
    # A test portion of 0 puts every row in the first element of the pair;
    # the second element is empty.
    truth, test = dp.access_split(state, 0)
    forecast, test = dp2.access_split(state, 0)

    truth_confirmed = truth['Confirmed'].values
    truth_deaths = truth['Deaths'].values
    confirmed_gap = truth_confirmed - forecast['Confirmed'].values
    deaths_gap = truth_deaths - forecast['Deaths'].values

    confirmed_diff[state] = abs(confirmed_gap / truth_confirmed) * 100
    deaths_diff[state] = abs(deaths_gap / truth_deaths) * 100
    

In [5]:

# A row is flagged when either of its percentage errors exceeds this value.
BAD_ERROR_THRESHOLD_PCT = 5

# Number of rows per state, read from the last state's ground-truth frame
# instead of from a loop variable left over from an earlier cell.
num_days = len(dp.access(states[-1]))

# Sequential ids, one per (day, state) row. The count follows the actual
# number of states: a hard-coded figure would make the id list a different
# length from the other columns whenever the state count changes.
forecastID = list(range(num_days * len(states)))
deaths = []
confirmed = []
label = []
truthConfirmed = []
truthDeaths = []
forecastConfirmed = []
forecastDeaths = []
state_name = []

# Fetch every state's frames once; they do not change between days.
truth_by_state = {s: dp.access(s) for s in states}
forecast_by_state = {s: dp2.access(s) for s in states}

# Day-major, state-minor order: all states for day 0, then day 1, ...
for i in range(num_days):
    for s in states:
        truth = truth_by_state[s]
        forecast = forecast_by_state[s]

        print(f"Day {i}/{len(truth['Confirmed'].values)}, {s}")
        confirmed.append(confirmed_diff[s][i])
        deaths.append(deaths_diff[s][i])
        truthConfirmed.append((truth['Confirmed'].values)[i])
        truthDeaths.append((truth['Deaths'].values)[i])
        forecastConfirmed.append((forecast['Confirmed'].values)[i])
        forecastDeaths.append((forecast['Deaths'].values)[i])
        state_name.append(s)

        if confirmed[-1] > BAD_ERROR_THRESHOLD_PCT or deaths[-1] > BAD_ERROR_THRESHOLD_PCT:
            label.append("---BAD!---")
        else:
            label.append("----------")


Day 0/26, Alabama
Day 0/26, Alaska
Day 0/26, Arizona
Day 0/26, Arkansas
Day 0/26, California
Day 0/26, Colorado
Day 0/26, Connecticut
Day 0/26, Delaware
Day 0/26, Florida
Day 0/26, Georgia
Day 0/26, Hawaii
Day 0/26, Idaho
Day 0/26, Illinois
Day 0/26, Indiana
Day 0/26, Iowa
Day 0/26, Kansas
Day 0/26, Kentucky
Day 0/26, Louisiana
Day 0/26, Maine
Day 0/26, Maryland
Day 0/26, Massachusetts
Day 0/26, Michigan
Day 0/26, Minnesota
Day 0/26, Mississippi
Day 0/26, Missouri
Day 0/26, Montana
Day 0/26, Nebraska
Day 0/26, Nevada
Day 0/26, New Hampshire
Day 0/26, New Jersey
Day 0/26, New Mexico
Day 0/26, New York
Day 0/26, North Carolina
Day 0/26, North Dakota
Day 0/26, Ohio
Day 0/26, Oklahoma
Day 0/26, Oregon
Day 0/26, Pennsylvania
Day 0/26, Rhode Island
Day 0/26, South Carolina
Day 0/26, South Dakota
Day 0/26, Tennessee
Day 0/26, Texas
Day 0/26, Utah
Day 0/26, Vermont
Day 0/26, Virginia
Day 0/26, Washington
Day 0/26, West Virginia
Day 0/26, Wisconsin
Day 0/26, Wyoming
Day 1/26, Alabama
Day 1/26, 

Day 10/26, Utah
Day 10/26, Vermont
Day 10/26, Virginia
Day 10/26, Washington
Day 10/26, West Virginia
Day 10/26, Wisconsin
Day 10/26, Wyoming
Day 11/26, Alabama
Day 11/26, Alaska
Day 11/26, Arizona
Day 11/26, Arkansas
Day 11/26, California
Day 11/26, Colorado
Day 11/26, Connecticut
Day 11/26, Delaware
Day 11/26, Florida
Day 11/26, Georgia
Day 11/26, Hawaii
Day 11/26, Idaho
Day 11/26, Illinois
Day 11/26, Indiana
Day 11/26, Iowa
Day 11/26, Kansas
Day 11/26, Kentucky
Day 11/26, Louisiana
Day 11/26, Maine
Day 11/26, Maryland
Day 11/26, Massachusetts
Day 11/26, Michigan
Day 11/26, Minnesota
Day 11/26, Mississippi
Day 11/26, Missouri
Day 11/26, Montana
Day 11/26, Nebraska
Day 11/26, Nevada
Day 11/26, New Hampshire
Day 11/26, New Jersey
Day 11/26, New Mexico
Day 11/26, New York
Day 11/26, North Carolina
Day 11/26, North Dakota
Day 11/26, Ohio
Day 11/26, Oklahoma
Day 11/26, Oregon
Day 11/26, Pennsylvania
Day 11/26, Rhode Island
Day 11/26, South Carolina
Day 11/26, South Dakota
Day 11/26, Tenne

Day 19/26, Tennessee
Day 19/26, Texas
Day 19/26, Utah
Day 19/26, Vermont
Day 19/26, Virginia
Day 19/26, Washington
Day 19/26, West Virginia
Day 19/26, Wisconsin
Day 19/26, Wyoming
Day 20/26, Alabama
Day 20/26, Alaska
Day 20/26, Arizona
Day 20/26, Arkansas
Day 20/26, California
Day 20/26, Colorado
Day 20/26, Connecticut
Day 20/26, Delaware
Day 20/26, Florida
Day 20/26, Georgia
Day 20/26, Hawaii
Day 20/26, Idaho
Day 20/26, Illinois
Day 20/26, Indiana
Day 20/26, Iowa
Day 20/26, Kansas
Day 20/26, Kentucky
Day 20/26, Louisiana
Day 20/26, Maine
Day 20/26, Maryland
Day 20/26, Massachusetts
Day 20/26, Michigan
Day 20/26, Minnesota
Day 20/26, Mississippi
Day 20/26, Missouri
Day 20/26, Montana
Day 20/26, Nebraska
Day 20/26, Nevada
Day 20/26, New Hampshire
Day 20/26, New Jersey
Day 20/26, New Mexico
Day 20/26, New York
Day 20/26, North Carolina
Day 20/26, North Dakota
Day 20/26, Ohio
Day 20/26, Oklahoma
Day 20/26, Oregon
Day 20/26, Pennsylvania
Day 20/26, Rhode Island
Day 20/26, South Carolina
Da

In [7]:
# Output column name -> list of values, in the column order of the CSV.
report_columns = {
    'ForecastID': forecastID,
    'StateName': state_name,
    'Confirmed error%': confirmed,
    'Deaths error%': deaths,
    'Label': label,
    'truthConfirmed': truthConfirmed,
    'truthDeaths': truthDeaths,
    'forecastConfirmed': forecastConfirmed,
    'forecastDeaths': forecastDeaths,
}

# zip() pairs the lists up row by row and stops at the shortest one.
report_rows = list(zip(*report_columns.values()))
final = pd.DataFrame(report_rows, columns=list(report_columns))
final.to_csv("debug_output.csv", index=False)

# Score: mean over every confirmed and every deaths error value.
error_total = sum(confirmed) + sum(deaths)
error_count = len(confirmed) + len(deaths)
print('Score:', error_total / error_count)

Score: 2.2616576971066813
