In [1]:
import pandas as pd
df = "Resources/us-counties.csv"
df1 = pd.read_csv(df)
file = "Resources/county_mask_mandate_data.xlsx"
df2 = pd.read_excel(file)

## Read in Both Datasets

In [2]:
df1.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0


In [3]:
#Shows the data types for the df1
df1.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths    float64
dtype: object

In [4]:
# count the number of rows with empty columns
count = df1['fips'].isna().sum()
print(count)

19744


In [5]:
#count the number of rows with data in each column
df1.count()

date      2135147
county    2135147
state     2135147
fips      2115403
cases     2135147
deaths    2086356
dtype: int64

In [6]:
# Drop the rows with blank data creating a new dataframe
df1_clean = df1.dropna()

In [7]:
# count the number of rows with data in each column from the clean dataframe
df1_clean.count()

date      2066612
county    2066612
state     2066612
fips      2066612
cases     2066612
deaths    2066612
dtype: int64

In [8]:
#change fips from float64 to int64 to match datatype of df1_clean "county_fips"
df1_clean['fips'].astype('int64')

0          53061
1          53061
2          53061
3          17031
4          53061
           ...  
2135142    56037
2135143    56039
2135144    56041
2135145    56043
2135146    56045
Name: fips, Length: 2066612, dtype: int64

In [9]:
# change deaths from float64 to int64 to match datatype of df1_clean "deaths"
df1_clean['deaths'].astype('int64')

0            0
1            0
2            0
3            0
4            0
          ... 
2135142    114
2135143     14
2135144     34
2135145     41
2135146     16
Name: deaths, Length: 2066612, dtype: int64

In [10]:
df1_clean['cases'].astype('int64')

0             1
1             1
2             1
3             1
4             1
           ... 
2135142    9241
2135143    8741
2135144    4827
2135145    2022
2135146    1341
Name: cases, Length: 2066612, dtype: int64

In [11]:
# Group by county to get the total_cases for each county
df1_clean['total_cases'] = df1_clean['cases'].groupby(df1_clean['county']).transform('sum')
df1_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,date,county,state,fips,cases,deaths,total_cases
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0,20811122
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0,20811122
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0,20811122
3,2020-01-24,Cook,Illinois,17031.0,1,0.0,265186377
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0,20811122


In [12]:
# Group by county to get the total_deaths for each county
df1_clean['total_deaths'] = df1_clean['deaths'].groupby(df1_clean['county']).transform('sum')
df1_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,date,county,state,fips,cases,deaths,total_cases,total_deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0,20811122,301036.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0,20811122,301036.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0,20811122,301036.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0,265186377,5516261.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0,20811122,301036.0


In [13]:
# Drop case and death column
df1_clean = df1_clean.drop(columns=["cases", "deaths", "date"])
df1_clean.head()

Unnamed: 0,county,state,fips,total_cases,total_deaths
0,Snohomish,Washington,53061.0,20811122,301036.0
1,Snohomish,Washington,53061.0,20811122,301036.0
2,Snohomish,Washington,53061.0,20811122,301036.0
3,Cook,Illinois,17031.0,265186377,5516261.0
4,Snohomish,Washington,53061.0,20811122,301036.0


In [14]:
# Drop duplicates so only county shows once
df1_clean = df1_clean.drop_duplicates(subset=["fips"])
df1_clean.head()

Unnamed: 0,county,state,fips,total_cases,total_deaths
0,Snohomish,Washington,53061.0,20811122,301036.0
3,Cook,Illinois,17031.0,265186377,5516261.0
5,Orange,California,6059.0,232137717,3337299.0
8,Maricopa,Arizona,4013.0,269498690,4658788.0
9,Los Angeles,California,6037.0,578223385,10513090.0


## DataSet 2 Mask Mandate

In [15]:
df2.head()

Unnamed: 0,state_fips,state_name,county_fips,county_name,county_start_date,county_end_date,county_conditions,county_source,escalation,defiance,county_start_edate,state_start_date,state_end_date,state_conditions,state_source,state_start_edate,earliest_start_edate,county_fips_string,earliest_start_date
0,1,ALABAMA,1067,Henry County,NaT,,,,,,,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22112.0,1067,7-16-2020
1,1,ALABAMA,1071,Jackson County,2020-07-01,,all public places,https://www.examiner.net/news/20200629/jackson...,,,22097.0,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22097.0,1071,7-1-2020
2,1,ALABAMA,1093,Marion County,2020-07-09,,retail only,https://fox59.com/news/coronavirus/starting-to...,,,22105.0,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22105.0,1093,7-9-2020
3,1,ALABAMA,1073,Jefferson County,2020-06-29,,all public places,https://www.wbrc.com/2020/06/29/jefferson-coun...,,,22095.0,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22095.0,1073,6-29-2020
4,1,ALABAMA,1099,Monroe County,NaT,,,,,,,07/16/2020,07/31/2020,all public places,https://www.alabamapublichealth.gov/legal/asse...,22112.0,22112.0,1099,7-16-2020


In [16]:
df2.dtypes

state_fips                       int64
state_name                      object
county_fips                      int64
county_name                     object
county_start_date       datetime64[ns]
county_end_date                 object
county_conditions               object
county_source                   object
escalation                      object
defiance                        object
county_start_edate             float64
state_start_date                object
state_end_date                  object
state_conditions                object
state_source                    object
state_start_edate              float64
earliest_start_edate           float64
county_fips_string               int64
earliest_start_date             object
dtype: object

In [17]:
df2.count()

state_fips              3150
state_name              3150
county_fips             3150
county_name             3149
county_start_date       1500
county_end_date          137
county_conditions       1377
county_source           1778
escalation                34
defiance                 203
county_start_edate      1500
state_start_date        1652
state_end_date           504
state_conditions        2045
state_source            3150
state_start_edate       1652
earliest_start_edate    2103
county_fips_string      3150
earliest_start_date     2103
dtype: int64

In [18]:
# Drop the unneeded columns 
df2_clean1 = df2.drop(['escalation', 'defiance', 'county_conditions', 'county_source', 'county_start_edate', 'state_start_edate', 'earliest_start_edate', 'county_fips_string', 'state_conditions', 'state_source'], axis=1)

df2_clean1.head()

Unnamed: 0,state_fips,state_name,county_fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date
0,1,ALABAMA,1067,Henry County,NaT,,07/16/2020,07/31/2020,7-16-2020
1,1,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020
2,1,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020
3,1,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020
4,1,ALABAMA,1099,Monroe County,NaT,,07/16/2020,07/31/2020,7-16-2020


In [19]:
# Create our mask_mandate column with binary values Yes or No
no_mandate_df = df2_clean1[df2_clean1.county_start_date.isnull()]
mandate_df = df2_clean1[df2_clean1.county_start_date.notnull()]

In [27]:
# Create mask_mandate column
no_mandate_df["mask_mandate"] = "0"
mandate_df["mask_mandate"] = "1"
merged_df = no_mandate_df.append(mandate_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Show duration of mask mandate in 2020 only

In [37]:
# Create a null and not null dataset based on county start date
duration_df = merged_df[merged_df.county_start_date.notnull()]
duration_df1 = merged_df[merged_df.county_start_date.isnull()]
# Create a column that keeps up with month started 
duration_df['month_started'] = pd.DatetimeIndex(duration_df['county_start_date']).month
duration_df1['month_started'] = "0"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
# Create a new column that is the difference between month started and total months in a year
duration_df["mandate_duration_months"] = (12 - duration_df.month_started)
duration_df1["mandate_duration_months"] = 0
final_df = duration_df.append(duration_df1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [40]:
final_df

Unnamed: 0,state_fips,state_name,county_fips,county_name,county_start_date,county_end_date,state_start_date,state_end_date,earliest_start_date,mask_mandate,month_started,mandate_duration_months
1,1,ALABAMA,1071,Jackson County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5
2,1,ALABAMA,1093,Marion County,2020-07-09,,07/16/2020,07/31/2020,7-9-2020,1,7,5
3,1,ALABAMA,1073,Jefferson County,2020-06-29,,07/16/2020,07/31/2020,6-29-2020,1,6,6
10,1,ALABAMA,1063,Greene County,2020-07-01,,07/16/2020,07/31/2020,7-1-2020,1,7,5
23,1,ALABAMA,1089,Madison County,2020-07-07,,07/16/2020,07/31/2020,7-7-2020,1,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...
3144,56,WYOMING,56011,Crook County,NaT,,,,,0,0,0
3145,56,WYOMING,56027,Niobrara County,NaT,,,,,0,0,0
3146,56,WYOMING,56015,Goshen County,NaT,,,,,0,0,0
3147,56,WYOMING,56029,Park County,NaT,,,,,0,0,0


In [22]:
# Split the county name into two columns NEEDS SOME WORK BECAUSE THERE ARE ARE SOME WITH MORE THAN ONE WORD AND END IN OTHER THINGS THAN COUNTY (BOROUGH, PARISH, CITY, CENSUS AREA)
# may not be needed if using FIP as the identifier

#df2_clean1['county'] = df2_clean1['county_name'].str.split(' ', expand=True)[0]
#df2_clean1['name'] = df2_clean1['county_name'].str.split(' ', expand=True)[1]

#df2_clean1.head()

In [23]:
# Drop the county_name column and the name column only leaving the county column with county name
#df2_clean2 = df2_clean1.drop(['county_name', 'name'], axis=1)

#df2_clean2.head()

In [24]:
#df2_clean2.dtypes

In [25]:
#df2_clean2.count()