In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df = pd.read_csv("datasets/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59580 entries, 0 to 59579
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   submission_date  59580 non-null  object 
 1   state            59580 non-null  object 
 2   tot_cases        59580 non-null  int64  
 3   conf_cases       33738 non-null  float64
 4   prob_cases       33666 non-null  float64
 5   new_case         59580 non-null  int64  
 6   pnew_case        56054 non-null  float64
 7   tot_death        59580 non-null  int64  
 8   conf_death       32993 non-null  float64
 9   prob_death       32993 non-null  float64
 10  new_death        59580 non-null  int64  
 11  pnew_death       56086 non-null  float64
 12  created_at       59580 non-null  object 
 13  consent_cases    49645 non-null  object 
 14  consent_deaths   50643 non-null  object 
dtypes: float64(6), int64(4), object(5)
memory usage: 6.8+ MB


In [4]:
df.columns

Index(['date', 'location_key', 'new_confirmed', 'new_deceased',
       'new_recovered', 'new_tested', 'cumulative_confirmed',
       'cumulative_deceased', 'cumulative_recovered', 'cumulative_tested'],
      dtype='object')

In [7]:
df['state'].unique()

array(['KS', 'UT', 'AS', 'ND', 'IN', 'AR', 'NY', 'PW', 'NM', 'SD', 'OH',
       'AK', 'CO', 'OK', 'GU', 'FL', 'TN', 'WA', 'AL', 'NYC', 'GA', 'MS',
       'IA', 'WV', 'TX', 'PA', 'MA', 'MP', 'PR', 'RMI', 'HI', 'OR', 'MD',
       'VT', 'DC', 'NJ', 'NE', 'DE', 'VA', 'MN', 'KY', 'FSM', 'AZ', 'WY',
       'ME', 'SC', 'CT', 'WI', 'MI', 'NV', 'CA', 'IL', 'MO', 'LA', 'ID',
       'MT', 'NC', 'RI', 'NH', 'VI'], dtype=object)

In [None]:
### Data Cleaning 

#TODO: Stratify the data by state
#TODO: For each datasets take the new cases and new deaths and aggregate them by week
#TODO: Plot the missing data and see imputation methods for missing data

In [None]:
# Stratify the data by state
dfs = [df[df['state'] == region] for region in df['state'].unique()]

for df in dfs:
    df['date'] = pd.to_datetime(df['date'])
    # Aggregate the data by week
    df = df.resample('W', on='date').sum()
    # Select the columns we want 
    df = df.loc[:, ['date', 'new_case', 'new_death', 'state']]

    # Plot the missing data
    ax, fig = plt.subplots(figsize=(10, 5))
    sns.heatmap(df.isnull(), cbar=False)
    plt.show()
    

In [3]:
neighbors = [
  {
    "code": "AK",
    "Neighborcodes": [ "WA" ]
  },
  {
    "code": "AL",
    "Neighborcodes": [ "FL", "GA", "MS", "TN" ]
  },
  {
    "code": "AR",
    "Neighborcodes": [ "LA", "MO", "MS", "OK", "TN", "TX" ]
  },
  {
    "code": "AZ",
    "Neighborcodes": [ "CA", "CO", "NM", "NV", "UT" ]
  },
  {
    "code": "CA",
    "Neighborcodes": [ "AZ", "HI", "NV", "OR" ]
  },
  {
    "code": "CO",
    "Neighborcodes": [ "AZ", "KS", "NE", "NM", "OK", "UT", "WY" ]
  },
  {
    "code": "CT",
    "Neighborcodes": [ "MA", "NY", "RI" ]
  },
  {
    "code": "DC",
    "Neighborcodes": [ "MD", "VA" ]
  },
  {
    "code": "DE",
    "Neighborcodes": [ "MD", "NJ", "PA" ]
  },
  {
    "code": "FL",
    "Neighborcodes": [ "AL", "GA" ]
  },
  {
    "code": "GA",
    "Neighborcodes": [ "AL", "FL", "NC", "SC", "TN" ]
  },
  {
    "code": "HI",
    "Neighborcodes": [ "CA" ]
  },
  {
    "code": "IA",
    "Neighborcodes": [ "IL", "MN", "MO", "NE", "SD", "WI" ]
  },
  {
    "code": "ID",
    "Neighborcodes": [ "MT", "NV", "OR", "UT", "WA", "WY" ]
  },
  {
    "code": "IL",
    "Neighborcodes": [ "IA", "IN", "KY", "MO", "WI" ]
  },
  {
    "code": "IN",
    "Neighborcodes": [ "IL", "KY", "MO", "WI" ]
  },
  {
    "code": "KS",
    "Neighborcodes": [ "CO", "MO", "NE", "OK" ]
  },
  {
    "code": "KY",
    "Neighborcodes": [ "IL", "IN", "MO", "OH", "TN", "VA", "WV" ]
  },
  {
    "code": "LA",
    "Neighborcodes": [ "AR", "MS", "TX" ]
  },
  {
    "code": "MA",
    "Neighborcodes": [ "CT", "NH", "NY", "RI", "VT" ]
  },
  {
    "code": "MD",
    "Neighborcodes": [ "DC", "DE", "PA", "VA", "WV" ]
  },
  {
    "code": "ME",
    "Neighborcodes": [ "NH" ]
  },
  {
    "code": "MI",
    "Neighborcodes": [ "IN", "OH", "WI" ]
  },
  {
    "code": "MN",
    "Neighborcodes": [ "IA", "ND", "SD", "WI" ]
  },
  {
    "code": "MO",
    "Neighborcodes": [ "AR", "IA", "IL", "KS", "KY", "NE", "OK", "TN" ]
  },
  {
    "code": "MS",
    "Neighborcodes": [ "AL", "AR", "LA", "TN" ]
  },
  {
    "code": "MT",
    "Neighborcodes": [ "ID", "ND", "SD", "WY" ]
  },
  {
    "code": "NC",
    "Neighborcodes": [ "GA", "SC", "TN", "VA" ]
  },
  {
    "code": "ND",
    "Neighborcodes": [ "MN", "MT", "SD" ]
  },
  {
    "code": "NE",
    "Neighborcodes": [ "CO", "IA", "KS", "MO", "SD", "WY" ]
  },
  {
    "code": "NH",
    "Neighborcodes": [ "MA", "ME", "VT" ]
  },
  {
    "code": "NJ",
    "Neighborcodes": [ "DE", "NY", "PA" ]
  },
  {
    "code": "NM",
    "Neighborcodes": [ "AZ", "CO", "OK", "TX", "UT" ]
  },
  {
    "code": "NV",
    "Neighborcodes": [ "AZ", "CA", "ID", "OR", "UT" ]
  },
  {
    "code": "NY",
    "Neighborcodes": [ "CT", "MA", "NJ", "PA", "VT" ]
  },
  {
    "code": "OH",
    "Neighborcodes": [ "IN", "KY", "MI", "PA", "WV" ]
  },
  {
    "code": "OK",
    "Neighborcodes": [ "AR", "CO", "KS", "MO", "NM", "TX" ]
  },
  {
    "code": "OR",
    "Neighborcodes": [ "CA", "ID", "NV", "WA" ]
  },
  {
    "code": "PA",
    "Neighborcodes": [ "DE", "MD", "NJ", "NY", "OH", "WV" ]
  },
  {
    "code": "RI",
    "Neighborcodes": [ "CT", "MA" ]
  },
  {
    "code": "SC",
    "Neighborcodes": [ "GA", "NC" ]
  },
  {
    "code": "SD",
    "Neighborcodes": [ "IA", "MN", "MT", "ND", "NE", "WY" ]
  },
  {
    "code": "TN",
    "Neighborcodes": [ "AL", "AR", "GA", "KY", "MO", "MS", "NC", "VA" ]
  },
  {
    "code": "TX",
    "Neighborcodes": [ "AR", "LA", "NM", "OK" ]
  },
  {
    "code": "UT",
    "Neighborcodes": [ "AZ", "CO", "ID", "NM", "NV", "WY" ]
  },
  {
    "code": "VA",
    "Neighborcodes": [ "DC", "KY", "MD", "NC", "TN", "WV" ]
  },
  {
    "code": "VT",
    "Neighborcodes": [ "MA", "NH", "NY" ]
  },
  {
    "code": "WA",
    "Neighborcodes": [ "AK", "ID", "OR" ]
  },
  {
    "code": "WI",
    "Neighborcodes": [ "IA", "IL", "MI", "MN" ]
  },
  {
    "code": "WV",
    "Neighborcodes": [ "KY", "MD", "OH", "PA", "VA" ]
  },
  {
    "code": "WY",
    "Neighborcodes": [ "CO", "ID", "MT", "NE", "SD", "UT" ]
  }
]

In [4]:
variwant = {}
for i in range(len(neighbors)):
    variwant[neighbors[i]['code']] = neighbors[i]['Neighborcodes']

In [5]:
print(variwant)

{'AK': ['WA'], 'AL': ['FL', 'GA', 'MS', 'TN'], 'AR': ['LA', 'MO', 'MS', 'OK', 'TN', 'TX'], 'AZ': ['CA', 'CO', 'NM', 'NV', 'UT'], 'CA': ['AZ', 'HI', 'NV', 'OR'], 'CO': ['AZ', 'KS', 'NE', 'NM', 'OK', 'UT', 'WY'], 'CT': ['MA', 'NY', 'RI'], 'DC': ['MD', 'VA'], 'DE': ['MD', 'NJ', 'PA'], 'FL': ['AL', 'GA'], 'GA': ['AL', 'FL', 'NC', 'SC', 'TN'], 'HI': ['CA'], 'IA': ['IL', 'MN', 'MO', 'NE', 'SD', 'WI'], 'ID': ['MT', 'NV', 'OR', 'UT', 'WA', 'WY'], 'IL': ['IA', 'IN', 'KY', 'MO', 'WI'], 'IN': ['IL', 'KY', 'MO', 'WI'], 'KS': ['CO', 'MO', 'NE', 'OK'], 'KY': ['IL', 'IN', 'MO', 'OH', 'TN', 'VA', 'WV'], 'LA': ['AR', 'MS', 'TX'], 'MA': ['CT', 'NH', 'NY', 'RI', 'VT'], 'MD': ['DC', 'DE', 'PA', 'VA', 'WV'], 'ME': ['NH'], 'MI': ['IN', 'OH', 'WI'], 'MN': ['IA', 'ND', 'SD', 'WI'], 'MO': ['AR', 'IA', 'IL', 'KS', 'KY', 'NE', 'OK', 'TN'], 'MS': ['AL', 'AR', 'LA', 'TN'], 'MT': ['ID', 'ND', 'SD', 'WY'], 'NC': ['GA', 'SC', 'TN', 'VA'], 'ND': ['MN', 'MT', 'SD'], 'NE': ['CO', 'IA', 'KS', 'MO', 'SD', 'WY'], 'NH': ['M

In [6]:
results = pd.read_csv('datasets/allstates_correlation_features.csv')

In [10]:
results.shape

(21522, 9)