In [1]:
import pandas as pd
import datetime as dt

df = "Resources/us-counties.csv"
df1 = pd.read_csv(df)
file = "Resources/county_mask_mandate_data.xlsx"
df2 = pd.read_excel(file)

## Dataset 1 - US Counties

In [2]:
df1.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0


In [3]:
#Shows the data types for the df1
df1.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths    float64
dtype: object

In [4]:
# count the number of rows with empty columns
count = df1['fips'].isna().sum()
print(count)

19744


In [5]:
#count the number of rows with data in each column
df1.count()

date      2135147
county    2135147
state     2135147
fips      2115403
cases     2135147
deaths    2086356
dtype: int64

In [6]:
# Drop the rows with blank data creating a new dataframe
df1_clean = df1.dropna()

In [7]:
# count the number of rows with data in each column from the clean dataframe
df1_clean.count()

date      2066612
county    2066612
state     2066612
fips      2066612
cases     2066612
deaths    2066612
dtype: int64

In [8]:
#change fips from float64 to int64 to match datatype of df1_clean "county_fips"
df1_clean = df1_clean.astype({'fips': 'int64'})
df1_clean.dtypes

date       object
county     object
state      object
fips        int64
cases       int64
deaths    float64
dtype: object

In [9]:
# change deaths from float64 to int64 to match datatype of df1_clean "deaths"
df1_clean = df1_clean.astype({'deaths': 'int64'})
df1_clean.dtypes

date      object
county    object
state     object
fips       int64
cases      int64
deaths     int64
dtype: object

In [10]:
# Change the data type for cases to int64

df1_clean['cases'].astype('int64')

0             1
1             1
2             1
3             1
4             1
           ... 
2135142    9241
2135143    8741
2135144    4827
2135145    2022
2135146    1341
Name: cases, Length: 2066612, dtype: int64

In [11]:
# Check the data types for databases

df1_clean.dtypes

date      object
county    object
state     object
fips       int64
cases      int64
deaths     int64
dtype: object

In [12]:
# Group by county to get the total_cases for each county, read the last day of 2020 as the total number of cases and deaths
df1_total = df1_clean.loc[df1_clean['date'] == "2020-12-31"]
df1_total.head()

Unnamed: 0,date,county,state,fips,cases,deaths
881492,2020-12-31,Autauga,Alabama,1001,4190,48
881493,2020-12-31,Baldwin,Alabama,1003,13601,161
881494,2020-12-31,Barbour,Alabama,1005,1514,32
881495,2020-12-31,Bibb,Alabama,1007,1834,46
881496,2020-12-31,Blount,Alabama,1009,4641,63


In [13]:
# Drop duplicates so only county shows once
df1_total = df1_total.drop_duplicates(subset=["fips"])
df1_total.head()

Unnamed: 0,date,county,state,fips,cases,deaths
881492,2020-12-31,Autauga,Alabama,1001,4190,48
881493,2020-12-31,Baldwin,Alabama,1003,13601,161
881494,2020-12-31,Barbour,Alabama,1005,1514,32
881495,2020-12-31,Bibb,Alabama,1007,1834,46
881496,2020-12-31,Blount,Alabama,1009,4641,63


## Dataset 2 - Mask Mandate

In [14]:
# Display head for mask mandate dataframe

df2.head()

Unnamed: 0,state_fips,state_name,county_fips,county_name,county_start_date,county_end_date,county_conditions,county_source,escalation,defiance,county_start_edate,state_start_date,state_end_date,state_conditions,state_source,state_start_edate,earliest_start_edate,county_fips_string,earliest_start_date
0,1,ALABAMA,1067,Henry County,NaT,,,,,,,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22112.0,1067,7-16-2020
1,1,ALABAMA,1071,Jackson County,2020-07-01,,all public places,https://www.examiner.net/news/20200629/jackson...,,,22097.0,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22097.0,1071,7-1-2020
2,1,ALABAMA,1093,Marion County,2020-07-09,,retail only,https://fox59.com/news/coronavirus/starting-to...,,,22105.0,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22105.0,1093,7-9-2020
3,1,ALABAMA,1073,Jefferson County,2020-06-29,,all public places,https://www.wbrc.com/2020/06/29/jefferson-coun...,,,22095.0,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22095.0,1073,6-29-2020
4,1,ALABAMA,1099,Monroe County,NaT,,,,,,,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22112.0,1099,7-16-2020


In [15]:
# Display datatypes for mask mandate dataframe 

df2.dtypes

state_fips                       int64
state_name                      object
county_fips                      int64
county_name                     object
county_start_date       datetime64[ns]
county_end_date                 object
county_conditions               object
county_source                   object
escalation                      object
defiance                        object
county_start_edate             float64
state_start_date                object
state_end_date                  object
state_conditions                object
state_source                    object
state_start_edate              float64
earliest_start_edate           float64
county_fips_string               int64
earliest_start_date             object
dtype: object

In [16]:
# Count the number of data points for each column
df2.count()

state_fips              3150
state_name              3150
county_fips             3150
county_name             3149
county_start_date       1500
county_end_date          137
county_conditions       1377
county_source           1778
escalation                34
defiance                 203
county_start_edate      1500
state_start_date        1652
state_end_date           504
state_conditions        2045
state_source            3150
state_start_edate       1652
earliest_start_edate    2103
county_fips_string      3150
earliest_start_date     2103
dtype: int64

In [17]:
# Drop the unneeded columns 
df2_clean1 = df2.drop(['escalation', 'defiance', 'county_conditions', 'county_source', 'county_start_edate', 'state_start_edate', 'earliest_start_edate', 'county_fips_string', 'state_conditions', 'state_source'], axis=1)

df2_clean1.head()

Unnamed: 0,state_fips,state_name,county_fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date
0,1,ALABAMA,1067,Henry County,NaT,,07/16/2020,07/31/2020,7-16-2020
1,1,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020
2,1,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020
3,1,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020
4,1,ALABAMA,1099,Monroe County,NaT,,07/16/2020,07/31/2020,7-16-2020


In [18]:
# Create our mask_mandate column with binary values Yes or No
no_mandate_df = df2_clean1[df2_clean1.county_start_date.isnull()]
mandate_df = df2_clean1[df2_clean1.county_start_date.notnull()]

In [19]:
# Create mask_mandate column
no_mandate_df["mask_mandate"] = "0"
mandate_df["mask_mandate"] = "1"
merged_df = no_mandate_df.append(mandate_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_mandate_df["mask_mandate"] = "0"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mandate_df["mask_mandate"] = "1"


Show duration of mask mandate in 2020 only

In [20]:
# Create a null and not null dataset based on county start date
duration_df = merged_df[merged_df.county_start_date.notnull()]
duration_df1 = merged_df[merged_df.county_start_date.isnull()]
# Create a column that keeps up with month started 
duration_df['month_started'] = pd.DatetimeIndex(duration_df['county_start_date']).month
duration_df1['month_started'] = "0"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duration_df['month_started'] = pd.DatetimeIndex(duration_df['county_start_date']).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duration_df1['month_started'] = "0"


In [21]:
# Create a new column that is the difference between month started and total months in a year
duration_df["mandate_duration_months"] = (12 - duration_df.month_started)
duration_df1["mandate_duration_months"] = 0
final_df = duration_df.append(duration_df1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duration_df["mandate_duration_months"] = (12 - duration_df.month_started)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duration_df1["mandate_duration_months"] = 0


In [22]:
# print the dataframe with duration and mandate columns 
final_df

Unnamed: 0,state_fips,state_name,county_fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months
1,1,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5
2,1,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5
3,1,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6
10,1,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5
23,1,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...
3144,56,WYOMING,56011,Crook County,NaT,,,,,0,0,0
3145,56,WYOMING,56027,Niobrara County,NaT,,,,,0,0,0
3146,56,WYOMING,56015,Goshen County,NaT,,,,,0,0,0
3147,56,WYOMING,56029,Park County,NaT,,,,,0,0,0


In [23]:
# Rename county_fips to fips 
final_df_fips = final_df.rename(columns={'county_fips': 'fips'})

final_df_fips.dtypes

state_fips                          int64
state_name                         object
fips                                int64
county_name                        object
county_start_date          datetime64[ns]
county_end_date                    object
state_start_date                   object
state_end_date                     object
earliest_start_date                object
mask_mandate                       object
month_started                      object
mandate_duration_months             int64
dtype: object

In [24]:

# merge final_df_fips and df1_total
full_merged_df = pd.merge(final_df_fips, df1_total, on='fips', how='outer')
full_merged_df


Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Jackson,Alabama,5097.0,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5.0,2020-12-31,Marion,Alabama,2087.0,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6.0,2020-12-31,Jefferson,Alabama,52339.0,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Greene,Alabama,672.0,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5.0,2020-12-31,Madison,Alabama,22197.0,178.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3160,,,69120,,NaT,,,,,,,,2020-12-31,Tinian,Northern Mariana Islands,2.0,0.0
3161,,,46102,,NaT,,,,,,,,2020-12-31,Oglala Lakota,South Dakota,1927.0,35.0
3162,,,78010,,NaT,,,,,,,,2020-12-31,St. Croix,Virgin Islands,829.0,7.0
3163,,,78020,,NaT,,,,,,,,2020-12-31,St. John,Virgin Islands,174.0,1.0


In [25]:
# Drop NA in the cases column

full_merged_df.dropna(subset=['cases'], inplace=True)
full_merged_df

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Jackson,Alabama,5097.0,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5.0,2020-12-31,Marion,Alabama,2087.0,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6.0,2020-12-31,Jefferson,Alabama,52339.0,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Greene,Alabama,672.0,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5.0,2020-12-31,Madison,Alabama,22197.0,178.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3160,,,69120,,NaT,,,,,,,,2020-12-31,Tinian,Northern Mariana Islands,2.0,0.0
3161,,,46102,,NaT,,,,,,,,2020-12-31,Oglala Lakota,South Dakota,1927.0,35.0
3162,,,78010,,NaT,,,,,,,,2020-12-31,St. Croix,Virgin Islands,829.0,7.0
3163,,,78020,,NaT,,,,,,,,2020-12-31,St. John,Virgin Islands,174.0,1.0


In [26]:
# Drop NA in the deaths column

full_merged_df.dropna(subset=['deaths'], inplace=True)
full_merged_df

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Jackson,Alabama,5097.0,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5.0,2020-12-31,Marion,Alabama,2087.0,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6.0,2020-12-31,Jefferson,Alabama,52339.0,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Greene,Alabama,672.0,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5.0,2020-12-31,Madison,Alabama,22197.0,178.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3160,,,69120,,NaT,,,,,,,,2020-12-31,Tinian,Northern Mariana Islands,2.0,0.0
3161,,,46102,,NaT,,,,,,,,2020-12-31,Oglala Lakota,South Dakota,1927.0,35.0
3162,,,78010,,NaT,,,,,,,,2020-12-31,St. Croix,Virgin Islands,829.0,7.0
3163,,,78020,,NaT,,,,,,,,2020-12-31,St. John,Virgin Islands,174.0,1.0


In [27]:
full_merged_df.dropna(subset=['mandate_duration_months'], inplace=True)
full_merged_df

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Jackson,Alabama,5097.0,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5.0,2020-12-31,Marion,Alabama,2087.0,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6.0,2020-12-31,Jefferson,Alabama,52339.0,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5.0,2020-12-31,Greene,Alabama,672.0,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5.0,2020-12-31,Madison,Alabama,22197.0,178.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,56.0,WYOMING,56011,Crook County,NaT,,,,,0,0,0.0,2020-12-31,Crook,Wyoming,388.0,7.0
3146,56.0,WYOMING,56027,Niobrara County,NaT,,,,,0,0,0.0,2020-12-31,Niobrara,Wyoming,140.0,2.0
3147,56.0,WYOMING,56015,Goshen County,NaT,,,,,0,0,0.0,2020-12-31,Goshen,Wyoming,1010.0,17.0
3148,56.0,WYOMING,56029,Park County,NaT,,,,,0,0,0.0,2020-12-31,Park,Wyoming,2021.0,10.0


In [28]:
# checking the data types of the full_merged_df

full_merged_df.dtypes

state_fips                        float64
state_name                         object
fips                                int64
county_name                        object
county_start_date          datetime64[ns]
county_end_date                    object
state_start_date                   object
state_end_date                     object
earliest_start_date                object
mask_mandate                       object
month_started                      object
mandate_duration_months           float64
date                               object
county                             object
state                              object
cases                             float64
deaths                            float64
dtype: object

In [29]:
# count the number of NAs in cases column

full_merged_df['cases'].isna().sum()

0

In [30]:
# count the number of NAs in deaths column

full_merged_df['deaths'].isna().sum()

0

In [31]:
full_merged_df['mandate_duration_months'].isna().sum()

0

In [32]:
# change the data type of mandate_duration_months column to int64

full_merged_df = full_merged_df.astype({'mandate_duration_months': 'int64'})
full_merged_df.head()

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Jackson,Alabama,5097.0,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5,2020-12-31,Marion,Alabama,2087.0,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6,2020-12-31,Jefferson,Alabama,52339.0,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Greene,Alabama,672.0,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5,2020-12-31,Madison,Alabama,22197.0,178.0


In [33]:
# change the data type of months_started column to int64

full_merged_df = full_merged_df.astype({'month_started' : 'int64'})
full_merged_df.head()

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Jackson,Alabama,5097.0,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5,2020-12-31,Marion,Alabama,2087.0,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6,2020-12-31,Jefferson,Alabama,52339.0,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Greene,Alabama,672.0,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5,2020-12-31,Madison,Alabama,22197.0,178.0


In [34]:
# change the data type of months_started column to int64

full_merged_df = full_merged_df.astype({'mask_mandate' : 'int64'})
full_merged_df.head()

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Jackson,Alabama,5097.0,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5,2020-12-31,Marion,Alabama,2087.0,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6,2020-12-31,Jefferson,Alabama,52339.0,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Greene,Alabama,672.0,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5,2020-12-31,Madison,Alabama,22197.0,178.0


In [35]:
full_merged_df = full_merged_df.astype({'cases' : 'int64'})
full_merged_df.head()

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Jackson,Alabama,5097,34.0
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5,2020-12-31,Marion,Alabama,2087,43.0
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6,2020-12-31,Jefferson,Alabama,52339,697.0
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Greene,Alabama,672,20.0
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5,2020-12-31,Madison,Alabama,22197,178.0


In [36]:
full_merged_df = full_merged_df.astype({'deaths' : 'int64'})
full_merged_df.head()

Unnamed: 0,state_fips,state_name,fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months,date,county,state,cases,deaths
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Jackson,Alabama,5097,34
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5,2020-12-31,Marion,Alabama,2087,43
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6,2020-12-31,Jefferson,Alabama,52339,697
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5,2020-12-31,Greene,Alabama,672,20
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5,2020-12-31,Madison,Alabama,22197,178


In [37]:
# checking the full_merged_df data types

full_merged_df.dtypes

state_fips                        float64
state_name                         object
fips                                int64
county_name                        object
county_start_date          datetime64[ns]
county_end_date                    object
state_start_date                   object
state_end_date                     object
earliest_start_date                object
mask_mandate                        int64
month_started                       int64
mandate_duration_months             int64
date                               object
county                             object
state                              object
cases                               int64
deaths                              int64
dtype: object

In [38]:
# writing the data frame to a csv

full_merged_df.to_csv('full_merged_df.csv', index=False)

## Dataset 3 - US Counties - Lat Long

In [39]:
# Reading in the uscounties_lat_long.csv file

file = "Resources/uscounties_lat_long.csv"
dfe = pd.read_csv(file)

dfe.head()

Unnamed: 0,county_name,fips,state_name,lat,lng
0,Los Angeles,6037,California,34.3207,-118.2248
1,Cook,17031,Illinois,41.8401,-87.8168
2,Harris,48201,Texas,29.8577,-95.3936
3,Maricopa,4013,Arizona,33.349,-112.4915
4,San Diego,6073,California,33.0341,-116.7353


In [40]:
# Merging the full_merged_df with the uscounties_lat_long (dfe)

full_merged_latlong_df = pd.merge(full_merged_df, dfe, how="left", on=["fips", "fips"])
full_merged_latlong_df.head()

Unnamed: 0,state_fips,state_name_x,fips,county_name_x,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,...,mandate_duration_months,date,county,state,cases,deaths,county_name_y,state_name_y,lat,lng
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,...,5,2020-12-31,Jackson,Alabama,5097,34,Jackson,Alabama,34.7795,-85.9994
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,...,5,2020-12-31,Marion,Alabama,2087,43,Marion,Alabama,34.1366,-87.8871
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,...,6,2020-12-31,Jefferson,Alabama,52339,697,Jefferson,Alabama,33.5543,-86.8964
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,...,5,2020-12-31,Greene,Alabama,672,20,Greene,Alabama,32.8531,-87.9522
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,...,5,2020-12-31,Madison,Alabama,22197,178,Madison,Alabama,34.7631,-86.5502


In [41]:
full_merged_latlong_df.dtypes

state_fips                        float64
state_name_x                       object
fips                                int64
county_name_x                      object
county_start_date          datetime64[ns]
county_end_date                    object
state_start_date                   object
state_end_date                     object
earliest_start_date                object
mask_mandate                        int64
month_started                       int64
mandate_duration_months             int64
date                               object
county                             object
state                              object
cases                               int64
deaths                              int64
county_name_y                      object
state_name_y                       object
lat                               float64
lng                               float64
dtype: object

In [42]:
full_merged_latLong_df = full_merged_latlong_df.astype({'mask_mandate': 'int64'})
full_merged_latlong_df.head()

Unnamed: 0,state_fips,state_name_x,fips,county_name_x,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,...,mandate_duration_months,date,county,state,cases,deaths,county_name_y,state_name_y,lat,lng
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,...,5,2020-12-31,Jackson,Alabama,5097,34,Jackson,Alabama,34.7795,-85.9994
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,...,5,2020-12-31,Marion,Alabama,2087,43,Marion,Alabama,34.1366,-87.8871
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,...,6,2020-12-31,Jefferson,Alabama,52339,697,Jefferson,Alabama,33.5543,-86.8964
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,...,5,2020-12-31,Greene,Alabama,672,20,Greene,Alabama,32.8531,-87.9522
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,...,5,2020-12-31,Madison,Alabama,22197,178,Madison,Alabama,34.7631,-86.5502


In [43]:
full_merged_latLong_df = full_merged_latlong_df.astype({'month_started': 'int64'})
full_merged_latlong_df.head()

Unnamed: 0,state_fips,state_name_x,fips,county_name_x,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,...,mandate_duration_months,date,county,state,cases,deaths,county_name_y,state_name_y,lat,lng
0,1.0,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,...,5,2020-12-31,Jackson,Alabama,5097,34,Jackson,Alabama,34.7795,-85.9994
1,1.0,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,...,5,2020-12-31,Marion,Alabama,2087,43,Marion,Alabama,34.1366,-87.8871
2,1.0,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,...,6,2020-12-31,Jefferson,Alabama,52339,697,Jefferson,Alabama,33.5543,-86.8964
3,1.0,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,...,5,2020-12-31,Greene,Alabama,672,20,Greene,Alabama,32.8531,-87.9522
4,1.0,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,...,5,2020-12-31,Madison,Alabama,22197,178,Madison,Alabama,34.7631,-86.5502


In [44]:
full_merged_latlong_df.dtypes

state_fips                        float64
state_name_x                       object
fips                                int64
county_name_x                      object
county_start_date          datetime64[ns]
county_end_date                    object
state_start_date                   object
state_end_date                     object
earliest_start_date                object
mask_mandate                        int64
month_started                       int64
mandate_duration_months             int64
date                               object
county                             object
state                              object
cases                               int64
deaths                              int64
county_name_y                      object
state_name_y                       object
lat                               float64
lng                               float64
dtype: object

In [45]:
# write the full_merged_latlong_df to a csv

full_merged_latlong_df.to_csv('./Resources/full_merged_latlong.csv', index=False)

In [52]:
from sqlalchemy import create_engine
import psycopg2

#from config import db_password

# creat the connection to the PostgreSQL database, connection string
db_string = f"postgresql://postgres:covid19data@127.0.0.1:5432/COVID"

# create the database engine
engine = create_engine(db_string)

# save the movies_df dataframe to a SQL table
full_merged_latlong_df.to_sql(name='fmdll', con=engine)