In [1]:
import pandas as pd
import numpy as np
from path import Path

import os,sys
parentdir = Path(os.path.abspath("../.."))
sys.path.insert(0,parentdir)

# Get the DB password
from config import db_password

from sqlalchemy import create_engine
import psycopg2
# import the psycopg2 database adapter for PostgreSQL
from psycopg2 import connect, extensions, sql

from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [2]:
# Read the raw covid data file
file_path = Path("../../data/raw/covid/RAW_us_confirmed_cases.csv")
covid_df = pd.read_csv(file_path)
print(covid_df.shape)
covid_df.head()

(3340, 332)


Unnamed: 0,Province_State,Admin2,UID,iso2,iso3,code3,FIPS,Country_Region,Lat,Long_,...,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20,12/4/20,12/5/20,12/6/20,12/7/20
0,Alabama,Autauga,84001001,US,USA,840,1001.0,US,32.539527,-86.644082,...,2735,2751,2780,2818,2873,2893,2945,2979,3005,3043
1,Alabama,Baldwin,84001003,US,USA,840,1003.0,US,30.72775,-87.722071,...,8733,8820,8890,9051,9163,9341,9501,9626,9728,9821
2,Alabama,Barbour,84001005,US,USA,840,1005.0,US,31.868263,-85.387129,...,1173,1175,1178,1189,1206,1214,1217,1219,1223,1224
3,Alabama,Bibb,84001007,US,USA,840,1007.0,US,32.996421,-87.125115,...,1179,1188,1196,1204,1239,1252,1270,1283,1293,1299
4,Alabama,Blount,84001009,US,USA,840,1009.0,US,33.982109,-86.567906,...,2922,2946,2997,3061,3100,3158,3231,3281,3299,3324


## ETL - Transform Process
Keep only FL and CA data and then drop unnecessary columns

In [3]:
# Check the column names
print(covid_df.columns.values)

['Province_State' 'Admin2' 'UID' 'iso2' 'iso3' 'code3' 'FIPS'
 'Country_Region' 'Lat' 'Long_' 'Combined_Key' '1/22/20' '1/23/20'
 '1/24/20' '1/25/20' '1/26/20' '1/27/20' '1/28/20' '1/29/20' '1/30/20'
 '1/31/20' '2/1/20' '2/2/20' '2/3/20' '2/4/20' '2/5/20' '2/6/20' '2/7/20'
 '2/8/20' '2/9/20' '2/10/20' '2/11/20' '2/12/20' '2/13/20' '2/14/20'
 '2/15/20' '2/16/20' '2/17/20' '2/18/20' '2/19/20' '2/20/20' '2/21/20'
 '2/22/20' '2/23/20' '2/24/20' '2/25/20' '2/26/20' '2/27/20' '2/28/20'
 '2/29/20' '3/1/20' '3/2/20' '3/3/20' '3/4/20' '3/5/20' '3/6/20' '3/7/20'
 '3/8/20' '3/9/20' '3/10/20' '3/11/20' '3/12/20' '3/13/20' '3/14/20'
 '3/15/20' '3/16/20' '3/17/20' '3/18/20' '3/19/20' '3/20/20' '3/21/20'
 '3/22/20' '3/23/20' '3/24/20' '3/25/20' '3/26/20' '3/27/20' '3/28/20'
 '3/29/20' '3/30/20' '3/31/20' '4/1/20' '4/2/20' '4/3/20' '4/4/20'
 '4/5/20' '4/6/20' '4/7/20' '4/8/20' '4/9/20' '4/10/20' '4/11/20'
 '4/12/20' '4/13/20' '4/14/20' '4/15/20' '4/16/20' '4/17/20' '4/18/20'
 '4/19/20' '4/20/20' '4/21

In [4]:
# Drop the unnecessary columns
covid_df.drop(columns=["UID","iso2","iso3","FIPS","code3","Country_Region","Combined_Key"],inplace=True)
print(covid_df.shape)

(3340, 325)


In [5]:
# Rename Province_State to State and Admin2 to County
covid_df.rename(columns={"Province_State":"State","Admin2":"County"},inplace=True)
print(covid_df.shape)
covid_df.head()                        

(3340, 325)


Unnamed: 0,State,County,Lat,Long_,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20,12/4/20,12/5/20,12/6/20,12/7/20
0,Alabama,Autauga,32.539527,-86.644082,0,0,0,0,0,0,...,2735,2751,2780,2818,2873,2893,2945,2979,3005,3043
1,Alabama,Baldwin,30.72775,-87.722071,0,0,0,0,0,0,...,8733,8820,8890,9051,9163,9341,9501,9626,9728,9821
2,Alabama,Barbour,31.868263,-85.387129,0,0,0,0,0,0,...,1173,1175,1178,1189,1206,1214,1217,1219,1223,1224
3,Alabama,Bibb,32.996421,-87.125115,0,0,0,0,0,0,...,1179,1188,1196,1204,1239,1252,1270,1283,1293,1299
4,Alabama,Blount,33.982109,-86.567906,0,0,0,0,0,0,...,2922,2946,2997,3061,3100,3158,3231,3281,3299,3324


In [6]:
# Filter rows with state FL and CA only
state = ["Florida", "California"]
state_covid_data = covid_df[covid_df.State.isin(state)]
state_covid_data.dropna(axis=0,how="any",inplace=True)
print(state_covid_data.shape)
state_covid_data.head()

(129, 325)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_covid_data.dropna(axis=0,how="any",inplace=True)


Unnamed: 0,State,County,Lat,Long_,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20,12/4/20,12/5/20,12/6/20,12/7/20
195,California,Alameda,37.646294,-121.892927,0,0,0,0,0,0,...,29116,29476,29918,29972,30330,30980,31204,31871,32781,33477
196,California,Alpine,38.596786,-119.822359,0,0,0,0,0,0,...,50,50,50,51,51,52,57,57,57,57
197,California,Amador,38.445831,-120.65696,0,0,0,0,0,0,...,554,563,590,638,666,740,832,917,991,1020
198,California,Butte,39.667278,-121.600525,0,0,0,0,0,0,...,3894,3894,4008,4148,4195,4244,4305,4417,4455,4527
199,California,Calaveras,38.205371,-120.552913,0,0,0,0,0,0,...,478,478,478,478,544,553,572,585,606,606


In [7]:
## Dataframe for State and County. 
df_counties = pd.DataFrame()
df_counties["State"] = state_covid_data["State"]
df_counties["County"] = state_covid_data["County"]
print(df_counties.shape)
df_counties.head()

(129, 2)


Unnamed: 0,State,County
195,California,Alameda
196,California,Alpine
197,California,Amador
198,California,Butte
199,California,Calaveras


Looks like the covid numbers are cumulative numbers added each day. So in order to get the monthly numbers
we will just take the numbers from the last day of month

In [8]:
## Get last day of month numbers
eom_dates =  []
import calendar
year = 2020
for i in range(1,12):
    #print(f'{i}/{calendar.monthrange(year,i)[1]}/{year}')
    eom_dates.append(str(i)+"/"+str(calendar.monthrange(year,i)[1])+"/"+str(year)[-2:])

print(eom_dates)

['1/31/20', '2/29/20', '3/31/20', '4/30/20', '5/31/20', '6/30/20', '7/31/20', '8/31/20', '9/30/20', '10/31/20', '11/30/20']


In [9]:
## Extract the columns specific to end of month
df_dates = pd.DataFrame()
for key in eom_dates:
    df_dates[key] = state_covid_data[key]

df_dates

Unnamed: 0,1/31/20,2/29/20,3/31/20,4/30/20,5/31/20,6/30/20,7/31/20,8/31/20,9/30/20,10/31/20,11/30/20
195,0,0,313,1603,3390,5964,11139,18187,21383,23876,29918
196,0,0,0,2,2,2,2,2,2,8,50
197,0,0,2,8,19,22,115,280,296,344,590
198,0,0,8,16,44,168,941,2026,2848,3160,4008
199,0,0,3,13,15,32,125,236,322,354,478
...,...,...,...,...,...,...,...,...,...,...,...
404,0,0,1,4,32,62,215,594,969,1139,1229
405,0,0,80,452,736,2105,6834,9283,10945,12927,16506
406,0,0,1,23,34,60,572,888,1142,1327,1618
407,0,0,18,38,117,245,1205,1638,1977,2496,3690


To get covid numbers for each month (and not cumulative), we will subtract M1 from M2

In [10]:
df_dates_unique=df_dates.diff(axis=1)
df_dates_unique.iloc[:, 0]=df_dates.iloc[:, 0]
print(df_dates_unique.shape)
df_dates_unique.head()

(129, 11)


Unnamed: 0,1/31/20,2/29/20,3/31/20,4/30/20,5/31/20,6/30/20,7/31/20,8/31/20,9/30/20,10/31/20,11/30/20
195,0,0.0,313.0,1290.0,1787.0,2574.0,5175.0,7048.0,3196.0,2493.0,6042.0
196,0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,6.0,42.0
197,0,0.0,2.0,6.0,11.0,3.0,93.0,165.0,16.0,48.0,246.0
198,0,0.0,8.0,8.0,28.0,124.0,773.0,1085.0,822.0,312.0,848.0
199,0,0.0,3.0,10.0,2.0,17.0,93.0,111.0,86.0,32.0,124.0


In [11]:
# Merge the date columns with state and county columns from the trimmed dataframe
df_covid_final = pd.concat([df_counties, df_dates_unique], axis=1)
print(df_covid_final.shape)
df_covid_final.head()

(129, 13)


Unnamed: 0,State,County,1/31/20,2/29/20,3/31/20,4/30/20,5/31/20,6/30/20,7/31/20,8/31/20,9/30/20,10/31/20,11/30/20
195,California,Alameda,0,0.0,313.0,1290.0,1787.0,2574.0,5175.0,7048.0,3196.0,2493.0,6042.0
196,California,Alpine,0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,6.0,42.0
197,California,Amador,0,0.0,2.0,6.0,11.0,3.0,93.0,165.0,16.0,48.0,246.0
198,California,Butte,0,0.0,8.0,8.0,28.0,124.0,773.0,1085.0,822.0,312.0,848.0
199,California,Calaveras,0,0.0,3.0,10.0,2.0,17.0,93.0,111.0,86.0,32.0,124.0


Check columns types and if any null values

In [12]:
df_covid_final.dtypes

State        object
County       object
1/31/20       int64
2/29/20     float64
3/31/20     float64
4/30/20     float64
5/31/20     float64
6/30/20     float64
7/31/20     float64
8/31/20     float64
9/30/20     float64
10/31/20    float64
11/30/20    float64
dtype: object

In [13]:
# Change the first column to float64 to be consistent with other columns
df_covid_final['1/31/20'] = df_covid_final['1/31/20'].astype(float) 
df_covid_final.dtypes

State        object
County       object
1/31/20     float64
2/29/20     float64
3/31/20     float64
4/30/20     float64
5/31/20     float64
6/30/20     float64
7/31/20     float64
8/31/20     float64
9/30/20     float64
10/31/20    float64
11/30/20    float64
dtype: object

In [14]:
# Check if any null values present
df_covid_final.isnull().sum()

State       0
County      0
1/31/20     0
2/29/20     0
3/31/20     0
4/30/20     0
5/31/20     0
6/30/20     0
7/31/20     0
8/31/20     0
9/30/20     0
10/31/20    0
11/30/20    0
dtype: int64

Everything looks good, Tranpose the data now.

In [15]:
## Convert the Date columns to rows
FL_CA_covid_df = pd.melt(df_covid_final, id_vars=["State","County"], 
                  var_name="Date", value_name="Covid_Cummulative_Numbers")


In [16]:
print(FL_CA_covid_df.shape)

(1419, 4)


In [17]:
FL_CA_covid_df.sample(n=10)

Unnamed: 0,State,County,Date,Covid_Cummulative_Numbers
1339,California,Sonoma,11/30/20,2633.0
297,California,San Joaquin,3/31/20,136.0
73,Florida,Dixie,1/31/20,0.0
127,Florida,Walton,1/31/20,0.0
193,Florida,Brevard,2/29/20,0.0
836,Florida,Bay,7/31/20,3091.0
1204,California,Santa Clara,10/31/20,3607.0
276,California,Los Angeles,3/31/20,3018.0
67,Florida,Charlotte,1/31/20,0.0
1012,Florida,Out of FL,8/31/20,0.0


In [18]:
FL_CA_covid_df.sample(n=10)

Unnamed: 0,State,County,Date,Covid_Cummulative_Numbers
939,California,San Bernardino,8/31/20,15412.0
298,California,San Luis Obispo,3/31/20,80.0
788,California,Kern,7/31/20,14680.0
1072,California,San Luis Obispo,9/30/20,631.0
359,Florida,Martin,3/31/20,28.0
401,California,Kern,4/30/20,810.0
1374,Florida,Hendry,11/30/20,259.0
1075,California,Santa Clara,9/30/20,4030.0
427,California,San Luis Obispo,4/30/20,104.0
1310,California,Marin,11/30/20,646.0


In [19]:
# Drop the Rows where any field is null or blank
FL_CA_covid_df.dropna(axis=0)

Unnamed: 0,State,County,Date,Covid_Cummulative_Numbers
0,California,Alameda,1/31/20,0.0
1,California,Alpine,1/31/20,0.0
2,California,Amador,1/31/20,0.0
3,California,Butte,1/31/20,0.0
4,California,Calaveras,1/31/20,0.0
...,...,...,...,...
1414,Florida,Union,11/30/20,90.0
1415,Florida,Volusia,11/30/20,3579.0
1416,Florida,Wakulla,11/30/20,291.0
1417,Florida,Walton,11/30/20,1194.0


In [20]:
# Convert the date field to Datetime field
FL_CA_covid_df["Date"] = pd.to_datetime(FL_CA_covid_df["Date"])
FL_CA_covid_df.sample(n=10)

Unnamed: 0,State,County,Date,Covid_Cummulative_Numbers
661,California,Lake,2020-06-30,39.0
566,California,Stanislaus,2020-05-31,405.0
1199,California,San Francisco,2020-10-31,1124.0
23,California,Merced,2020-01-31,0.0
963,Florida,Alachua,2020-08-31,1670.0
1004,Florida,Martin,2020-08-31,906.0
634,Florida,St. Johns,2020-05-31,52.0
1261,Florida,Marion,2020-10-31,1152.0
436,California,Sonoma,2020-04-30,159.0
115,Florida,Santa Rosa,2020-01-31,0.0


In [21]:
# Ensure Avg_Price field is of type integer 
print(FL_CA_covid_df["Covid_Cummulative_Numbers"].dtype)
FL_CA_covid_df['Covid_Cummulative_Numbers'] = FL_CA_covid_df['Covid_Cummulative_Numbers'].astype('Int64')
FL_CA_covid_df.sample(n=10)

float64


Unnamed: 0,State,County,Date,Covid_Cummulative_Numbers
1249,Florida,Holmes,2020-10-31,178
1409,Florida,St. Lucie,2020-11-30,1666
220,Florida,Jefferson,2020-02-29,0
584,Florida,Citrus,2020-05-31,22
1095,Florida,Bradford,2020-09-30,391
948,California,Shasta,2020-08-31,201
1165,California,Calaveras,2020-10-31,32
644,Florida,Washington,2020-05-31,45
1232,Florida,Columbia,2020-10-31,448
850,Florida,Flagler,2020-07-31,591


## ETL - Load data 
write the processed data to CSV and load to database

In [22]:
#Export the data to CSV
file_path_export=Path("../../data/processed/covid_cleaned.csv")
FL_CA_covid_df.to_csv(file_path_export,index=False)

In [23]:
# Load the table to database
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/covid_property_pandemic"
engine = create_engine(db_string)
FL_CA_covid_df.to_sql(name='covid_cleaned', con=engine, if_exists='replace')