In [437]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [438]:
# Load the combined ("super") Covid-19 CSV; the file's first column is used as
# the row index.
USCovid = pd.read_csv(
    "../../../data/output/covid.csv",
    index_col=0,
)

### Clean Data

#### Drop unnecessary data

In [439]:
# Clean: remove identifier columns the analysis does not use (silently skipping
# any that are absent) and the "Statewide Unallocated" pseudo-county rows.
#
# Rows are filtered with a boolean mask rather than drop(<index labels>): a
# label-based drop removes *every* row that shares a label, so it would delete
# unrelated rows whenever the index is not unique. Rebinding USCovid instead of
# using inplace=True also keeps this cell safe to re-run.
USCovid = USCovid.drop(columns=["Unnamed: 0", "countyFIPS", "stateFIPS"],
                       errors='ignore')
USCovid = USCovid[USCovid['County Name'] != "Statewide Unallocated"]

# Select the date columns by suffix: labels ending in "_x" go to USCases and
# labels ending in "_y" go to USDeaths. Both frames also keep the
# County Name / State / population columns.
regex_cases = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[x])|^County Name$|^State$|^population$'
regex_deaths = '(^[0-9]+[/]+[0-9]+[/]+[0-9]+[_]+[y])|^County Name$|^State$|^population$'
USCases = USCovid.filter(regex=regex_cases)
USDeaths = USCovid.filter(regex=regex_deaths)

#### Find new cases

In [441]:
def diff(df, n_keep=4):
    """Turn a running-total row into per-step increments.

    The first ``n_keep`` elements are copied through unchanged; every later
    element is replaced by its difference from the element just before it.

    Parameters
    ----------
    df : pandas.Series
        One record, accessed purely by position. (The name is historical:
        this is a Series, e.g. one column handed over by ``DataFrame.apply``.)
    n_keep : int, default 4
        Number of leading elements to copy unchanged. Must be at least 1,
        because the first differenced element needs a predecessor.

    Returns
    -------
    pandas.Series
        Same length as ``df``, with a fresh 0..n-1 integer index (the labels
        of ``df`` are not carried over).

    Raises
    ------
    ValueError
        If ``n_keep`` is smaller than 1.
    """
    if n_keep < 1:
        raise ValueError("n_keep must be at least 1")

    size = len(df)
    # Inputs shorter than n_keep are simply copied through.
    n_keep = min(n_keep, size)

    # Use .iloc throughout: plain df[i] is a *label* lookup, which only worked
    # on string-labelled Series through a deprecated positional fallback and
    # fails on integer labels that do not start at 0.
    new = [df.iloc[i] for i in range(n_keep)]
    new.extend(df.iloc[i] - df.iloc[i - 1] for i in range(n_keep, size))
    return pd.Series(new)

In [442]:
# diff() returns a Series with a plain integer index, so the column labels are
# lost when it is applied; remember them here and put them back afterwards.
columns_D, columns_C = USDeaths.columns, USCases.columns

# Each row is handed to diff() as a column of the transposed frame, then the
# result is transposed back to one row per record.
USDeaths = USDeaths.reset_index(drop=True).transpose().apply(diff).transpose()
USCases = USCases.reset_index(drop=True).transpose().apply(diff).transpose()

USDeaths.columns = columns_D
USCases.columns = columns_C

#### Convert date column labels from strings to datetime Timestamps

In [443]:
# Sum the rows of each state, then shorten every column label to the text
# before its first underscore (this strips the "_x" / "_y" suffix from the
# date labels and leaves labels without an underscore untouched).
USDeaths_byStates = (
    USDeaths
    .groupby('State')
    .sum()
    .rename(columns=lambda label: label.partition('_')[0], errors='raise')
)
USCases_byStates = (
    USCases
    .groupby('State')
    .sum()
    .rename(columns=lambda label: label.partition('_')[0], errors='raise')
)

In [444]:
# Temporarily split the frame in two: the descriptive columns
# (County Name, State, population) and everything else (the date columns).
USCases_byStates_SP = USCases_byStates.reset_index().loc[
    :, ['County Name', 'State', 'population']
]
USCases_byStates_Date = USCases_byStates.reset_index().drop(
    columns=['County Name', 'State', 'population'])

In [445]:
# Convert the column labels from date strings to Timestamps.
# The labels are parsed strictly: one that cannot be parsed raises here,
# instead of silently staying a string as it did with errors='ignore' (an
# option pandas has deprecated). Only date labels are expected at this point,
# since the descriptive columns were split off beforehand.
USCases_byStates_Date = USCases_byStates_Date.rename(columns=pd.to_datetime)

##### Resample dates from days to weeks and take the mean of each week

In [446]:
# Bucket the date columns into weeks, average each bucket, and round the
# averages up to whole numbers.
USCases_byStates_Date_weeks = (
    USCases_byStates_Date
    .resample('W', axis='columns')
    .mean()
    .pipe(np.ceil)
)

In [447]:
# Re-attach the descriptive columns to the weekly figures, matching rows by
# index. The right-hand frame must be the weekly frame built in this notebook;
# the name used here before (USCases_byStates_x) is never defined, so the cell
# failed with a NameError on a fresh kernel.
USCases_States_weeks = pd.merge(USCases_byStates_SP,
                                USCases_byStates_Date_weeks,
                                left_index=True,
                                right_index=True)

In [448]:
# Preview the first five rows of the merged weekly frame.
USCases_States_weeks.head(n=5)

Unnamed: 0,County Name,State,population,2020-01-26 00:00:00,2020-02-02 00:00:00,2020-02-09 00:00:00,2020-02-16 00:00:00,2020-02-23 00:00:00,2020-03-01 00:00:00,2020-03-08 00:00:00,...,2020-08-16 00:00:00,2020-08-23 00:00:00,2020-08-30 00:00:00,2020-09-06 00:00:00,2020-09-13 00:00:00,2020-09-20 00:00:00,2020-09-27 00:00:00,2020-10-04 00:00:00,2020-10-11 00:00:00,2020-10-18 00:00:00
0,Aleutians East BoroughAleutians West Census Ar...,AK,731545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,78.0,69.0,69.0,77.0,73.0,80.0,92.0,132.0,183.0,173.0
1,Autauga CountyBaldwin CountyBarbour CountyBibb...,AL,4903185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1015.0,947.0,1454.0,1011.0,920.0,887.0,1051.0,979.0,882.0,1006.0
2,Arkansas CountyAshley CountyBaxter CountyBento...,AR,3017804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,617.0,550.0,598.0,600.0,632.0,779.0,777.0,792.0,836.0,938.0
3,Apache CountyCochise CountyCoconino CountyGila...,AZ,7278717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,945.0,648.0,512.0,586.0,392.0,786.0,459.0,503.0,689.0,782.0
4,Grand Princess Cruise ShipAlameda CountyAlpine...,CA,39512223,0.0,0.0,0.0,0.0,1.0,1.0,16.0,...,9123.0,6281.0,5163.0,4778.0,3263.0,3573.0,3421.0,3169.0,3250.0,3424.0


#### Find Mean, Median and Mode for the weeks

In [464]:
# Summary statistics across the weekly columns, one value per row of the
# weekly frame.
USCases_means = USCases_byStates_Date_weeks.T.mean()
USCases_median = USCases_byStates_Date_weeks.T.median()
# mode() can return several rows when values tie; only the first row (index 0)
# is used below.
USCases_mode = USCases_byStates_Date_weeks.T.mode()
USCases_mode = USCases_mode.values.tolist()
USCases_list = [
    USCases_byStates_SP['State'], USCases_means, USCases_median, pd.Series(USCases_mode[0])
]

# Put the Series side by side as columns. Building the frame row-wise and
# transposing it (the previous approach) forced every column to object dtype
# because the State strings shared rows with the numbers; concatenating
# column-wise lets each column keep its own dtype.
result = pd.concat(USCases_list, axis=1)
result.columns = ['State', 'Mean', 'Median', 'Mode']

In [465]:
# Preview the first five rows of the summary table.
result.head(n=5)

Unnamed: 0,State,Mean,Median,Mode
0,AK,40.3333,13,0
1,AL,631.769,478,0
2,AR,357.282,300,0
3,AZ,846.744,460,0
4,CA,3217.77,2664,1
