# US Data Wrangling

In [1]:
### reading data in
import pandas as pd
import csv

url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
df = pd.read_csv(url)
###this just gets rid of scientific notation in the pandas display and replaces it with a float to 3 decimals
pd.set_option('display.float_format', lambda x: '%.3f' % x)
#this just turns off an error I kept getting... hahaha
pd.options.mode.chained_assignment = None  # default='warn'
df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0


In [2]:
#just an option to print 15 rows
pd.set_option('display.max_rows', 15)

In [3]:
### create unique ID for to separate county/st with same name. not all counties have fips codes listed 

df['uniqueId'] = df["county"] +", "+ df["state"]
df.head()

Unnamed: 0,date,county,state,fips,cases,deaths,uniqueId
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0,"Snohomish, Washington"
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0,"Snohomish, Washington"
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0,"Snohomish, Washington"
3,2020-01-24,Cook,Illinois,17031.0,1,0.0,"Cook, Illinois"
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0,"Snohomish, Washington"


# Check percentages of 5/1 vs 6/1 #

In [4]:
### narrowing data down
#convert date column to datetime type
df['date'] = pd.to_datetime(df['date'])
#only dates after/on 5/30/2020 or before/on 01/01/2021
df = df.loc[(df['date'] >= '2020-05-30') & (df['date'] <= '2021-01-01')]
#group to see unique data points per county/state
groups = df.groupby('uniqueId')


In [14]:
### gets list of state/county combos (uniqueId) and iterates through them to ensure they all have the proper # of rows
cleanedData = pd.DataFrame()
listUnique = df.uniqueId.unique()


for x in listUnique:
    county = groups.get_group(x)
    
    ### need to figure out how to broadly apply over a county, not whole dataset 
    
    ### get daily case counts
    county['daily cases'] = county['cases'].diff()


    ### get daily deaths
    county['daily deaths'] = county['deaths'].diff()

    ###drop 4/30/2020

    dfList= [cleanedData, county]
    
    cleanedData = pd.concat(dfList)
    


In [9]:
print(cleanedData.head)

<bound method NDFrame.head of              date   county    state      fips  cases  deaths  \
193780 2020-05-31  Autauga  Alabama  1001.000    221   4.000   
196850 2020-06-01  Autauga  Alabama  1001.000    234   5.000   
199923 2020-06-02  Autauga  Alabama  1001.000    240   5.000   
202999 2020-06-03  Autauga  Alabama  1001.000    240   5.000   
206078 2020-06-04  Autauga  Alabama  1001.000    242   5.000   
...           ...      ...      ...       ...    ...     ...   
869065 2020-12-27  Kalawao   Hawaii 15005.000      1   0.000   
872310 2020-12-28  Kalawao   Hawaii 15005.000      1   0.000   
875555 2020-12-29  Kalawao   Hawaii 15005.000      1   0.000   
878800 2020-12-30  Kalawao   Hawaii 15005.000      1   0.000   
882045 2020-12-31  Kalawao   Hawaii 15005.000      1   0.000   

                uniqueId  daily cases  daily deaths  
193780  Autauga, Alabama        5.000         0.000  
196850  Autauga, Alabama       13.000         1.000  
199923  Autauga, Alabama        6.000  

In [10]:
cleanedData = cleanedData[cleanedData['date'] != '2020-05-30']
cleanedData = cleanedData[cleanedData['date'] != '2021-01-01']
cleanedData = cleanedData[cleanedData['county'] != 'Unknown']
listUnique = df.uniqueId.unique()

In [11]:
print(cleanedData)

             date   county    state      fips  cases  deaths  \
193780 2020-05-31  Autauga  Alabama  1001.000    221   4.000   
196850 2020-06-01  Autauga  Alabama  1001.000    234   5.000   
199923 2020-06-02  Autauga  Alabama  1001.000    240   5.000   
202999 2020-06-03  Autauga  Alabama  1001.000    240   5.000   
206078 2020-06-04  Autauga  Alabama  1001.000    242   5.000   
...           ...      ...      ...       ...    ...     ...   
869065 2020-12-27  Kalawao   Hawaii 15005.000      1   0.000   
872310 2020-12-28  Kalawao   Hawaii 15005.000      1   0.000   
875555 2020-12-29  Kalawao   Hawaii 15005.000      1   0.000   
878800 2020-12-30  Kalawao   Hawaii 15005.000      1   0.000   
882045 2020-12-31  Kalawao   Hawaii 15005.000      1   0.000   

                uniqueId  daily cases  daily deaths  
193780  Autauga, Alabama        5.000         0.000  
196850  Autauga, Alabama       13.000         1.000  
199923  Autauga, Alabama        6.000         0.000  
202999  Autauga

In [12]:
groupedCleanedData = cleanedData.groupby('uniqueId')

In [19]:
for x in listUnique:
    county = groupedCleanedData.get_group(x)
    print(x)
    #print(county.describe())
    print(county.isnull().sum())

Autauga, Alabama
date            0
county          0
state           0
fips            0
cases           0
deaths          0
uniqueId        0
daily cases     0
daily deaths    0
dtype: int64
Baldwin, Alabama
date            0
county          0
state           0
fips            0
cases           0
deaths          0
uniqueId        0
daily cases     0
daily deaths    0
dtype: int64
Barbour, Alabama
date            0
county          0
state           0
fips            0
cases           0
deaths          0
uniqueId        0
daily cases     0
daily deaths    0
dtype: int64
Bibb, Alabama
date            0
county          0
state           0
fips            0
cases           0
deaths          0
uniqueId        0
daily cases     0
daily deaths    0
dtype: int64
Blount, Alabama
date            0
county          0
state           0
fips            0
cases           0
deaths          0
uniqueId        0
daily cases     0
daily deaths    0
dtype: int64
Bullock, Alabama
date            0
county   

KeyError: 'Unknown, Alaska'

In [22]:
cleanedData.isnull().sum()

date                0
county              0
state               0
fips             6265
cases               0
deaths          16926
uniqueId            0
daily cases      3261
daily deaths    20109
dtype: int64

In [None]:
### initialize dict to store data
USADict = {}
countyData = {}

### pulled state list to get all state names 
states = pd.read_csv('https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv')
states = states['State'].to_list()

for state in states: 
    stateData = df.loc[df['state'] == state]
    countyList = stateData.county.unique()
    groupedStateData = stateData.groupby(stateData.county)
    
    ### build a dictionary for county to data 
    for county in countyList:
        countyData[county] = groupedStateData.get_group(county)
    print(countyData)
    ### Key = state value = dictionary{county:data}
    USADict[state] = countyData
print(USADict)




In [None]:
### initialize dict to store data
USADict = {}

### pulled state list to get all state names 
states = pd.read_csv('https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv')
states = states['State'].to_list()

###pulled county list to get all county names 


#put each state into a dict, grab unique counties, and make a state:county dict
for state in states:
    realCountyList = []
    USADict[state] = df.loc[df['state'] == state]
    countyList = USADict[state].county.unique()
    
    ### assign county to key state
    for county in countyList:
        USADict[state][county] = 0
        realCountyList.append(county)
        USADict
    
    USADict[state] = realCountyList

print(USADict)
"""
#go through each state to clean data generally
for state in USADict:
    clnState = USADict[state]
    
    clnState['date'] = pd.to_datetime(clnState['date'])
    
    clnState = clnState.loc[clnState['date'] >= '2020-04-30']
    
    
    ###have to break down to county level to get county data together 
    clnState = clnState.groupby(clnState.county)
    countyList = clnState.county.unique()
    for county in countyList:
        print(county)
"""

In [None]:
states = pd.read_csv('https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv')
states = states['State'].to_list()

USADict = {}

for state in states:
    USADict[state] = {}
    countyList = USADict[state].county.unique()
    for county in countyList:
        USADict[state][county] = df.loc[(df['state'] == state) & (df['county'] == county)]


In [None]:
print(USADict)

In [None]:
### narrowing data down
#gets only california
caliData = df.loc[df['state'] == 'California']
#resets index to be sequential
caliData = caliData.reset_index()
del caliData['index']
#convert date column to datetime type
caliData['date'] = pd.to_datetime(caliData['date'])
#only dates after may 1 2020
caliData = caliData.loc[caliData['date'] >= '2020-04-30']



#group data by county
groupedCaliData = caliData.groupby(caliData.county)


# can assign each county their own dataframe in a for loop
alamedaCty = groupedCaliData.get_group('Alameda')


### get daily case counts
alamedaCty['daily cases'] = alamedaCty['cases'].diff()


### get daily deaths
alamedaCty['daily deaths'] = alamedaCty['deaths'].diff()

###drop 4/30/2020
alamedaCty['daily deaths'] = alamedaCty['deaths'].diff()

print(alamedaCty.describe())

In [None]:
import matplotlib.pyplot as plt
xtick = range(10)
axis = ['2020-05-01','2020-06-01','2020-07-01','2020-08-01','2020-09-01','2020-10-01','2020-11-01','2020-12-01','2020-01-01',]


plt.plot(alamedaCty['date'], alamedaCty['daily cases'])
plt.plot(alamedaCty['date'], alamedaCty['daily deaths'])
plt.show()

plt.hist(alamedaCty['daily deaths'])
plt.show()


In [None]:
alamedaCty.describe()