In [12]:
import requests
import io 

import pandas as pd

# 1. Cases by county data
Found: https://github.com/nytimes/covid-19-data

In [26]:
# County-level case/death counts published by the New York Times.
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
r = requests.get(url, timeout=60)
# Stop on an HTTP error instead of handing an error page to the CSV parser.
r.raise_for_status()

In [27]:
df = pd.read_csv(io.BytesIO(r.content))

In [28]:
df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


#### Notes 1.1 
* https://covidtracking.com/api only had data at state level

In [110]:
r = requests.get('https://covidtracking.com/api/states/daily.csv', timeout=60)
# Stop on an HTTP error instead of handing an error page to the CSV parser.
r.raise_for_status()

In [111]:
df_states = pd.read_csv(io.BytesIO(r.content))

In [112]:
df_states.columns

Index(['date', 'state', 'positive', 'negative', 'pending',
       'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently',
       'inIcuCumulative', 'onVentilatorCurrently', 'onVentilatorCumulative',
       'recovered', 'hash', 'dateChecked', 'death', 'hospitalized', 'total',
       'totalTestResults', 'posNeg', 'fips', 'deathIncrease',
       'hospitalizedIncrease', 'negativeIncrease', 'positiveIncrease',
       'totalTestResultsIncrease'],
      dtype='object')

#### Notes 1.2 
* JHU data was only available at the global level. The main link is [here](https://console.cloud.google.com/marketplace/details/johnshopkins/covid19_jhu_global_cases?filter=solution-type:dataset&q=covid&id=430e16bb-bd19-42dd-bb7a-d38386a9edf5&_ga=2.240248515.-1701848518.1585755058&pli=1), but the data can also be found at https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases

In [113]:
r = requests.get('https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv', timeout=60)
# Stop on an HTTP error instead of handing an error page to the CSV parser.
r.raise_for_status()

In [114]:
df_jhu = pd.read_csv(io.BytesIO(r.content))
df_jhu.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,74,84,94,110,110,120,170,174,237,273
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,123,146,174,186,197,212,223,243,259,277
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,264,302,367,409,454,511,584,716,847,986
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,164,188,224,267,308,334,370,376,390,428
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,3,3,4,4,5,7,7,7,8,8


# 2. Unemployment data
Info about all of this data: https://download.bls.gov/pub/time.series/la/la.txt

In [124]:
%%time
url_county = 'https://download.bls.gov/pub/time.series/la/la.data.64.County'
pd.read_csv(url_county, delim_whitespace=True)

CPU times: user 4.15 s, sys: 1.6 s, total: 5.75 s
Wall time: 2min 19s


Unnamed: 0,series_id,year,period,value,footnote_codes
0,LAUCN010010000000003,1990,M01,6.4,
1,LAUCN010010000000003,1990,M02,6.6,
2,LAUCN010010000000003,1990,M03,5.8,
3,LAUCN010010000000003,1990,M04,6.6,
4,LAUCN010010000000003,1990,M05,6,
...,...,...,...,...,...
5019371,LAUCN721530000000006,2019,M09,9543,
5019372,LAUCN721530000000006,2019,M10,9626,
5019373,LAUCN721530000000006,2019,M11,9536,
5019374,LAUCN721530000000006,2019,M12,9619,P


In [123]:
%%time
url_county = 'https://download.bls.gov/pub/time.series/la/la.data.64.County'
r = requests.get(url_county)
df_county = pd.read_csv(io.BytesIO(r.content), delim_whitespace=True)

df_county = df_county.assign(
    county_id=df_county.series_id.str.slice(start=3, stop=-2)
)



CPU times: user 4.33 s, sys: 651 ms, total: 4.98 s
Wall time: 13.9 s


In [116]:
url_county_map = 'https://download.bls.gov/pub/time.series/la/la.areamaps'

r = requests.get(url_county_map, timeout=60)
# Stop on an HTTP error instead of handing an error page to the CSV parser.
r.raise_for_status()
# Tab-separated file; only the code -> name columns are needed for the join.
df_county_map = pd.read_csv(
    io.BytesIO(r.content),
    delimiter='\t', 
    usecols=['area_code', 'area_text']
)

In [119]:
# Attach the readable area name to each unemployment row. The inner join keeps
# only rows whose county_id has a matching area_code in the area map.
df_unemployment_by_county_month = df_county.merge(
    df_county_map,
    left_on='county_id',
    right_on='area_code',
    how='inner',
)

In [120]:
df_unemployment_by_county_month.head()

Unnamed: 0,series_id,year,period,value,footnote_codes,county_id,area_code,area_text
0,LAUCN010010000000003,1990,M01,6.4,,CN0100100000000,CN0100100000000,"Autauga County, AL"
1,LAUCN010010000000003,1990,M02,6.6,,CN0100100000000,CN0100100000000,"Autauga County, AL"
2,LAUCN010010000000003,1990,M03,5.8,,CN0100100000000,CN0100100000000,"Autauga County, AL"
3,LAUCN010010000000003,1990,M04,6.6,,CN0100100000000,CN0100100000000,"Autauga County, AL"
4,LAUCN010010000000003,1990,M05,6.0,,CN0100100000000,CN0100100000000,"Autauga County, AL"


#### Notes
Useful links:
* https://simplemaps.com/viz/unemployment
* https://fred.stlouisfed.org/series/TXPECO1URN

In [133]:
# Preview the rows for year 2020, period 'M02'.
df_unemployment_by_county_month.loc[
    lambda frame: (frame.year == 2020) & (frame.period == 'M02')
]

Unnamed: 0,series_id,year,period,value,footnote_codes,county_id,area_code,area_text
316343,LAUCN060370000000003,2020,M02,4.5,P,CN0603700000000,CN0603700000000,"Los Angeles County, CA"
316735,LAUCN060370000000004,2020,M02,237327.0,P,CN0603700000000,CN0603700000000,"Los Angeles County, CA"
317127,LAUCN060370000000005,2020,M02,4985250.0,P,CN0603700000000,CN0603700000000,"Los Angeles County, CA"
317519,LAUCN060370000000006,2020,M02,5222580.0,P,CN0603700000000,CN0603700000000,"Los Angeles County, CA"
495751,LAUCN110010000000003,2020,M02,5.2,P,CN1100100000000,CN1100100000000,District of Columbia
496143,LAUCN110010000000004,2020,M02,21696.0,P,CN1100100000000,CN1100100000000,District of Columbia
496535,LAUCN110010000000005,2020,M02,397566.0,P,CN1100100000000,CN1100100000000,District of Columbia
496927,LAUCN110010000000006,2020,M02,419262.0,P,CN1100100000000,CN1100100000000,District of Columbia
562839,LAUCN120860000000003,2020,M02,1.5,P,CN1208600000000,CN1208600000000,"Miami-Dade County, FL"
563231,LAUCN120860000000004,2020,M02,20378.0,P,CN1208600000000,CN1208600000000,"Miami-Dade County, FL"


# 3. Stay at home orders

In [9]:
import sys
# NOTE(review): machine-specific site-packages path; this only works on the
# original author's machine. Prefer installing selenium / webdriver_manager
# into the kernel's own environment.
sys.path.append('/Users/nathanieldake/.virtualenvs/road-runner-dropbox-download-venv/lib/python3.7/site-packages')

import time 

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

chromeOptions = webdriver.ChromeOptions()
# chromeOptions.headless = True
# NOTE(review): hard-coded absolute download directory; adjust per machine.
prefs = {"download.default_directory" : '/Users/nathanieldake/development/unsupervised/customer-projects/covid'}
chromeOptions.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chromeOptions)

url = 'https://www.kff.org/health-costs/issue-brief/state-data-and-policy-actions-to-address-coronavirus/'

# try/finally guarantees the browser is shut down even if the page fails to
# load or one of the buttons cannot be found.
try:
    driver.get(url)
    # Fixed wait for the page and its embedded report to finish rendering.
    time.sleep(10)

    # Dismiss the consent banner, then click the report's export button.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide.
    python_accept_terms_button = driver.find_element(
        By.XPATH,
        '//*[@id="hs-eu-confirmation-button"]'
    )
    python_accept_terms_button.click()

    python_export_button = driver.find_element(By.XPATH, '//*[@id="report-wrapper"]/div/div[2]/div[10]/div[10]/div/div/div/div/div/div/div[1]/div/button[2]')
    python_export_button.click()
finally:
    driver.quit()


Looking for [chromedriver 80.0.3987.106 mac64] driver in cache 
File found in cache by path [/Users/nathanieldake/.wdm/drivers/chromedriver/80.0.3987.106/mac64/chromedriver]


In [22]:
df_stay_at_home_orders = pd.read_csv('raw_data.csv', skiprows=2)
df_stay_at_home_orders.head()

Unnamed: 0,Location,Stay At Home Order,Mandatory Quarantine for Travelers,Non-Essential Business Closures,Large Gatherings Ban,State-Mandated School Closures,Bar/Restaurant Limits,Primary Election Postponement,Emergency Declaration,Footnotes
0,United States,Statewide (41); High-risk Groups (2); Other (1...,All Travelers (9); All Air Travelers (1); From...,All Non-Essential Businesses (35); All Non-Ess...,All Gatherings Prohibited (18); >10 Prohibited...,Yes (49); Effectively Closed (2),Closed except for takeout/delivery (47); Limit...,Yes (13); No (38),Yes (51); No (0),
1,Alabama,Statewide,-,All Non-Essential Businesses,>10 People Prohibited,Yes,Closed except for takeout/delivery,Yes,Yes,
2,Alaska,Statewide,All Travelers,All Non-Essential Businesses,All Gatherings Prohibited,Yes,Closed except for takeout/delivery,-,Yes,
3,Arizona,Statewide,-,All Non-Essential Businesses,>10 People Prohibited,Yes,Closed except for takeout/delivery,-,Yes,
4,Arkansas,-,-,-,>10 People Prohibited,Yes,Closed except for takeout/delivery,-,Yes,


# 4. Zip code map

In [24]:
df_zip_map = pd.read_csv('zip_code_database.csv')

In [25]:
df_zip_map

Unnamed: 0,zip,type,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,latitude,longitude,world_region,country,decommissioned,estimated_population,notes
0,501,UNIQUE,Holtsville,,I R S Service Center,NY,Suffolk County,America/New_York,631,40.81,-73.04,,US,0,384,
1,544,UNIQUE,Holtsville,,Irs Service Center,NY,Suffolk County,America/New_York,631,40.81,-73.04,,US,0,0,
2,601,STANDARD,Adjuntas,,"Colinas Del Gigante, Jard De Adjuntas, Urb San...",PR,Adjuntas,America/Puerto_Rico,787939,18.16,-66.72,,US,0,0,
3,602,STANDARD,Aguada,,"Alts De Aguada, Bo Guaniquilla, Comunidad Las ...",PR,,,787,18.38,-67.18,,US,0,0,
4,603,STANDARD,Aguadilla,Ramey,"Bda Caban, Bda Esteves, Bo Borinquen, Bo Ceiba...",PR,Aguadilla,America/Puerto_Rico,787,18.43,-67.15,,US,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42517,99926,PO BOX,Metlakatla,,,AK,Prince of Wales-Outer Ketchikan Borough,America/Juneau,907,55.14,-131.49,,US,0,1018,
42518,99927,PO BOX,Point Baker,,,AK,Prince of Wales-Outer Ketchikan Borough,America/Juneau,907,56.30,-133.57,,US,0,0,
42519,99928,PO BOX,Ward Cove,,,AK,Ketchikan Gateway Borough,America/Juneau,907,55.45,-131.79,,US,0,1281,
42520,99929,PO BOX,Wrangell,,,AK,Wrangell Borough,,907,55.95,-131.96,,US,0,1956,


In [112]:
df.county[df.county.str.contains('New Y')]

416      New York City
448      New York City
482      New York City
518      New York City
565      New York City
627      New York City
715      New York City
820      New York City
947      New York City
1098     New York City
1285     New York City
1521     New York City
1814     New York City
2163     New York City
2574     New York City
3033     New York City
3549     New York City
4150     New York City
4879     New York City
5728     New York City
6722     New York City
7817     New York City
9015     New York City
10326    New York City
11767    New York City
13380    New York City
15124    New York City
16964    New York City
18902    New York City
20952    New York City
23093    New York City
25297    New York City
27559    New York City
29882    New York City
32273    New York City
34709    New York City
37184         New York
37185    New York City
Name: county, dtype: object

In [32]:
df_zip_map.county.isin(df.county).sum()

395

In [43]:
# Zip-code rows whose county name contains 'Boulder'; na=False treats missing
# county names as non-matches.
df_zip_map[df_zip_map.county.str.contains('Boulder', na=False)]

Unnamed: 0,zip,type,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,latitude,longitude,world_region,country,decommissioned,estimated_population,notes
35205,80025,PO BOX,Eldorado Springs,Eldorado Sprg,,CO,Boulder County,America/Denver,303,39.93,-105.27,,US,0,0,
35206,80026,STANDARD,Lafayette,,,CO,Boulder County,America/Denver,720303,39.99,-105.09,,US,0,22364,
35207,80027,STANDARD,Louisville,Superior,,CO,Boulder County,America/Denver,303720,39.96,-105.14,,US,0,27059,
35208,80028,UNIQUE,Louisville,,Storage Technology Corp,CO,Boulder County,America/Denver,303,39.95,-105.14,,US,1,0,Converted Decommisioned Zipcodes
35338,80301,STANDARD,Boulder,,,CO,Boulder County,America/Denver,303720,40.02,-105.25,,US,0,19036,
35339,80302,STANDARD,Boulder,,,CO,Boulder County,America/Denver,303720,40.02,-105.25,,US,0,15073,
35340,80303,STANDARD,Boulder,,,CO,Boulder County,America/Denver,303720,40.02,-105.25,,US,0,16791,
35341,80304,STANDARD,Boulder,,,CO,Boulder County,America/Denver,720303,40.02,-105.25,,US,0,21108,
35342,80305,STANDARD,Boulder,,,CO,Boulder County,America/Denver,303720,40.02,-105.25,,US,0,13065,
35343,80306,PO BOX,Boulder,,,CO,Boulder County,America/Denver,303,40.02,-105.25,,US,0,1605,


In [68]:
county_clipped = df_zip_map.county.str.split(' ').str[0:-1]

In [98]:
county_clipped = df_zip_map.county.str.replace(' County', '')

In [99]:
county_clipped

0                                        Suffolk
1                                        Suffolk
2                                       Adjuntas
3                                            NaN
4                                      Aguadilla
                          ...                   
42517    Prince of Wales-Outer Ketchikan Borough
42518    Prince of Wales-Outer Ketchikan Borough
42519                  Ketchikan Gateway Borough
42520                           Wrangell Borough
42521    Prince of Wales-Outer Ketchikan Borough
Name: county, Length: 42522, dtype: object

In [100]:
df.county.isin(county_clipped).sum()

36227

In [101]:
df.county[~df.county.isin(county_clipped)]

416          New York City
418                Unknown
448          New York City
450                Unknown
482          New York City
               ...        
38041      Waynesboro city
38043    Williamsburg city
38044      Winchester city
38080              Unknown
38168            St. Croix
Name: county, Length: 1970, dtype: object

In [61]:
df.county.shape

(38197,)

In [107]:
county_clipped[county_clipped.str.contains('New York').fillna(False)]

Series([], Name: county, dtype: object)

In [108]:
df_zip_map[df_zip_map.county.str.contains('New York').fillna(False)]

Unnamed: 0,zip,type,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,latitude,longitude,world_region,country,decommissioned,estimated_population,notes
3669,10001,STANDARD,New York,,"Empire State, G P O, Greeley Square, Macys Fin...",NY,New York County,America/New_York,212718917518646516,40.71,-73.99,,US,0,16553,
3670,10002,STANDARD,New York,Knickerbocker,"Manhattan, New York City, Ny, Ny City, Nyc",NY,New York County,America/New_York,646917212516718,40.71,-73.99,,US,0,70604,
3671,10003,STANDARD,New York,,"Cooper, Manhattan",NY,New York County,America/New_York,212646917347718,40.71,-73.99,,US,0,36569,
3672,10004,STANDARD,New York,Bowling Green,,NY,New York County,America/New_York,212917646516347718,40.71,-73.99,,US,0,2909,
3673,10005,STANDARD,New York,Wall Street,"Manhattan, Nyc",NY,New York County,America/New_York,212646917347718,40.71,-73.99,,US,0,6293,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3829,10281,STANDARD,New York,,"Manhattan, Nyc",NY,New York County,America/New_York,212646,40.71,-73.99,,US,0,2180,
3830,10282,STANDARD,New York,,"Manhattan, Nyc",NY,New York County,America/New_York,212646,40.71,-73.99,,US,0,3575,
3831,10285,UNIQUE,New York,,Shearson American Express,NY,New York County,America/New_York,212,40.71,-73.99,,US,0,0,
3832,10286,UNIQUE,New York,,"Bank Of New York, Manhattan, Ny, Nyc",NY,New York County,America/New_York,212,40.71,-73.99,,US,0,0,


In [113]:
!pip install python-Levenshtein

Collecting python-Levenshtein
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
    100% |████████████████████████████████| 51kB 1.3MB/s ta 0:00:01
Building wheels for collected packages: python-Levenshtein
  Running setup.py bdist_wheel for python-Levenshtein ... [?25ldone
[?25h  Stored in directory: /Users/nathanieldake/Library/Caches/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342
Successfully built python-Levenshtein
tensorboard 2.0.1 has requirement setuptools>=41.0.0, but you'll have setuptools 39.0.1 which is incompatible.
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.0
You are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [116]:
import Levenshtein

In [120]:
Levenshtein.distance('New York', 'New York City')

5

In [125]:
df_zip_map.county.dropna().apply(lambda x: Levenshtein.distance(x, 'New York City')).sort_values()

3747      3
3775      3
3776      3
3777      3
3778      3
         ..
42512    34
42511    34
42510    34
42515    34
42521    34
Name: county, Length: 41672, dtype: int64

In [129]:
idx = df_zip_map.county.dropna().apply(lambda x: Levenshtein.distance(x, 'New York City')).sort_values().index

In [130]:
df_zip_map.county[idx]

3747                             New York County
3775                             New York County
3776                             New York County
3777                             New York County
3778                             New York County
                          ...                   
42512    Prince of Wales-Outer Ketchikan Borough
42511    Prince of Wales-Outer Ketchikan Borough
42510    Prince of Wales-Outer Ketchikan Borough
42515    Prince of Wales-Outer Ketchikan Borough
42521    Prince of Wales-Outer Ketchikan Borough
Name: county, Length: 41672, dtype: object

## Expansion dataset join class

In [193]:
df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [195]:
df_zip_map.head()

Unnamed: 0,zip,type,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,latitude,longitude,world_region,country,decommissioned,estimated_population,notes
0,501,UNIQUE,Holtsville,,I R S Service Center,NY,Suffolk County,America/New_York,631,40.81,-73.04,,US,0,384,
1,544,UNIQUE,Holtsville,,Irs Service Center,NY,Suffolk County,America/New_York,631,40.81,-73.04,,US,0,0,
2,601,STANDARD,Adjuntas,,"Colinas Del Gigante, Jard De Adjuntas, Urb San...",PR,Adjuntas,America/Puerto_Rico,787939,18.16,-66.72,,US,0,0,
3,602,STANDARD,Aguada,,"Alts De Aguada, Bo Guaniquilla, Comunidad Las ...",PR,,,787,18.38,-67.18,,US,0,0,
4,603,STANDARD,Aguadilla,Ramey,"Bda Caban, Bda Esteves, Bo Borinquen, Bo Ceiba...",PR,Aguadilla,America/Puerto_Rico,787,18.43,-67.15,,US,0,0,


In [None]:
# Abandoned draft removed: the unfinished `df = df.ass` raised AttributeError when
# run. The intended `df.assign(...)` call lives in the
# "Define a preprocess function for each join col" section below.

### Define a preprocess function for each join col 

In [220]:
def preprocess_county(county_col):
    """Normalize county names for joining.

    Lower-cases every name and removes a trailing ' county' suffix.

    Args:
        county_col: pandas Series of county names. Missing values stay missing.

    Returns:
        A Series with the same index holding the normalized names.
    """
    lowered = county_col.str.lower()
    # Anchor the pattern to the end of the string so that only a trailing
    # suffix is stripped; ' county' in the middle of a name is left alone.
    # regex=True is explicit because the default differs across pandas versions.
    return lowered.str.replace(r' county$', '', regex=True)

In [225]:
a = preprocess_county(df.county)
b = preprocess_county(df_zip_map.county)
a.isin(b).sum()

36834

In [230]:
# Add the normalized county name used as the fuzzy-join key.
df = df.assign(processed_county=preprocess_county(df.county))

In [231]:
# Add the normalized county name used as the fuzzy-join key.
df_zip_map = df_zip_map.assign(processed_county=preprocess_county(df_zip_map.county))

### Based on preprocessed join col, find closest match

In [274]:
def closest_match(row, col_map=None):
    """Return the entry of ``col_map`` closest to ``row`` by Levenshtein distance.

    Args:
        row: the string to look up.
        col_map: pandas Series of candidate strings (no missing values).

    Returns:
        The candidate with the smallest edit distance to ``row``. When several
        candidates tie, the one appearing first in ``col_map`` is returned.

    Raises:
        ValueError: if ``col_map`` is None or empty.
    """
    if col_map is None or len(col_map) == 0:
        raise ValueError('col_map must be a non-empty Series of candidate strings')
    distances = col_map.apply(lambda candidate: Levenshtein.distance(candidate, row))
    # Only the minimum is needed, so take a positional argmin instead of sorting
    # every distance. Positional lookup also works when the index has gaps.
    return col_map.iloc[distances.to_numpy().argmin()]

In [275]:
# 1000 takes 23 seconds
# 2000 takes 47 seconds
# 3000 takes 70 seconds

# After only passing in unique values, down to 6.25 seconds!
# Full set takes 1 min 18 seconds
# Full set takes 3 seconds once calling unique twice!

In [334]:
%%time
output = pd.Series(df.processed_county.dropna().unique()).apply(
    closest_match, col_map=pd.Series(df_zip_map.processed_county.unique()).dropna()
)

CPU times: user 3.2 s, sys: 7.99 ms, total: 3.21 s
Wall time: 3.21 s


In [284]:
output.shape

(38197,)

In [292]:
df = df.assign(join_county=output)

In [293]:
df.head()

Unnamed: 0,date,county,state,fips,cases,deaths,county_processed,processed_county,join_country,join_county
0,2020-01-21,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish
1,2020-01-22,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish
2,2020-01-23,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish
3,2020-01-24,Cook,Illinois,17031.0,1,0,cook,cook,cook,cook
4,2020-01-24,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish


In [289]:
df_zip_map.head()

Unnamed: 0,zip,type,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,latitude,longitude,world_region,country,decommissioned,estimated_population,notes,county_processed,processed_county
0,501,UNIQUE,Holtsville,,I R S Service Center,NY,Suffolk County,America/New_York,631,40.81,-73.04,,US,0,384,,suffolk,suffolk
1,544,UNIQUE,Holtsville,,Irs Service Center,NY,Suffolk County,America/New_York,631,40.81,-73.04,,US,0,0,,suffolk,suffolk
2,601,STANDARD,Adjuntas,,"Colinas Del Gigante, Jard De Adjuntas, Urb San...",PR,Adjuntas,America/Puerto_Rico,787939,18.16,-66.72,,US,0,0,,adjuntas,adjuntas
3,602,STANDARD,Aguada,,"Alts De Aguada, Bo Guaniquilla, Comunidad Las ...",PR,,,787,18.38,-67.18,,US,0,0,,,
4,603,STANDARD,Aguadilla,Ramey,"Bda Caban, Bda Esteves, Bo Borinquen, Bo Ceiba...",PR,Aguadilla,America/Puerto_Rico,787,18.43,-67.15,,US,0,0,,aguadilla,aguadilla


In [310]:
df_zip_map[['zip', 'processed_county']]

Unnamed: 0,zip,processed_county
0,501,suffolk
1,544,suffolk
2,601,adjuntas
3,602,
4,603,aguadilla
...,...,...
42517,99926,prince of wales-outer ketchikan borough
42518,99927,prince of wales-outer ketchikan borough
42519,99928,ketchikan gateway borough
42520,99929,wrangell borough


In [311]:
# Expand each case row to one row per zip code of its matched county.
# Rows of the zip map without a county are dropped first: pandas matches null
# join keys to each other, so they would attach to any case row whose
# join_county is missing.
# NOTE(review): the key is the county name only; same-named counties in
# different states would be matched to each other -- confirm whether state
# should be part of the key.
df_final = pd.merge(
    df,
    df_zip_map[['zip', 'processed_county']].dropna(subset=['processed_county']),
    how='left',
    left_on='join_county',
    right_on='processed_county'
)

In [312]:
df_final.shape

(2794472, 12)

In [313]:
df_final

Unnamed: 0,date,county,state,fips,cases,deaths,county_processed,processed_county_x,join_country,join_county,zip,processed_county_y
0,2020-01-21,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish,98012,snohomish
1,2020-01-21,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish,98020,snohomish
2,2020-01-21,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish,98021,snohomish
3,2020-01-21,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish,98026,snohomish
4,2020-01-21,Snohomish,Washington,53061.0,1,0,snohomish,snohomish,snohomish,snohomish,98036,snohomish
...,...,...,...,...,...,...,...,...,...,...,...,...
2794467,2020-04-06,Uinta,Wyoming,56041.0,3,0,uinta,uinta,uinta,uinta,82937,uinta
2794468,2020-04-06,Uinta,Wyoming,56041.0,3,0,uinta,uinta,uinta,uinta,82939,uinta
2794469,2020-04-06,Uinta,Wyoming,56041.0,3,0,uinta,uinta,uinta,uinta,82944,uinta
2794470,2020-04-06,Washakie,Wyoming,56043.0,4,0,washakie,washakie,washakie,washakie,82401,washakie


In [295]:
df_zip_map.shape

(42522, 18)

In [296]:
df.shape

(38197, 10)

In [None]:
# The dataset expanded because there were many zip codes that corresponded to a given county!

In [250]:
# Full thing will take 14 minutes...this could be REALLY bad performance depending on the use case
(df.shape[0] / 1000) * 23 / 60

14.642183333333334

In [180]:
# County names containing 'cook' (case-insensitive); missing names are excluded.
df_zip_map.county[df_zip_map.county.str.lower().str.contains('cook', na=False)]

13617     Cook County
13623     Cook County
13631     Cook County
13638     Cook County
24356     Cook County
             ...     
33320    Cooke County
33327    Cooke County
33329    Cooke County
33336    Cooke County
33344    Cooke County
Name: county, Length: 242, dtype: object

In [270]:
df_zip_map.processed_county.unique()

array(['suffolk', 'adjuntas', nan, ..., 'ketchikan gateway borough',
       'prince of wales-outer ketchikan borough', 'wrangell borough'],
      dtype=object)

### Based on preprocessed join col, find closest match (with multiprocessing)

In [329]:
counties = df.processed_county.dropna()
counties_map = df_zip_map.processed_county.dropna().unique()

In [None]:
# TODO:
# 1) Split out counties into a number of blocks equal to number of processes, via array_split
# 2) For each block, pass to process (with counties_map)
# 3) In each individual process perform a pandas apply, return lists
# 4) recombine 
# Note: should be 12 times faster than before (not including overhead, so maybe 10x faster)

# Note: will definitely want to have a way to calculate this mapping via parallelization every time, since 
# we will be applying more computation than usual

# References
# https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply
# https://stackoverflow.com/questions/25510482/python-pandas-multiprocessing-apply

In [332]:
counties.nunique()

1532

12

In [315]:
import multiprocessing
from multiprocessing import Pool
from functools import partial
import numpy as np

nproc = multiprocessing.cpu_count()

def parallelize(data, func, num_of_processes=8):
    """Apply ``func`` to chunks of ``data`` in worker processes and concatenate.

    Args:
        data: pandas Series/DataFrame (or array-like) to split into chunks.
        func: callable applied to each chunk. It is sent to the worker
            processes, so it must be picklable, and it must return a pandas
            object that ``pd.concat`` accepts.
        num_of_processes: number of chunks and of worker processes.

    Returns:
        The per-chunk results concatenated in chunk order.
    """
    if hasattr(data, "iloc"):
        # Split row positions rather than the pandas object itself:
        # np.array_split on a Series/DataFrame goes through the deprecated
        # `swapaxes` method.
        positions = np.array_split(np.arange(len(data)), num_of_processes)
        data_split = [data.iloc[chunk] for chunk in positions]
    else:
        data_split = np.array_split(data, num_of_processes)
    # The context manager releases the pool even if a worker raises.
    with Pool(num_of_processes) as pool:
        return pd.concat(pool.map(func, data_split))

def run_on_subset(func, data_subset):
    """Call ``func`` on every row of ``data_subset`` and return the results.

    Thin wrapper around ``data_subset.apply(func, axis=1)``.
    """
    row_results = data_subset.apply(func, axis=1)
    return row_results

def parallelize_on_rows(data, func, num_of_processes=8):
    """Apply ``func`` to every row of ``data`` using worker processes.

    Args:
        data: pandas DataFrame whose rows are processed.
        func: callable taking one row; must be picklable.
        num_of_processes: number of chunks / worker processes to use.

    Returns:
        Whatever ``parallelize`` returns for the row-wise results.
    """
    # Honor the caller's num_of_processes; it was previously ignored in favor
    # of the module-level `nproc`.
    return parallelize(data, partial(run_on_subset, func), num_of_processes)

In [None]:
%%time
output = df.processed_county.dropna().apply(
    closest_match, col_map=pd.Series(df_zip_map.processed_county.unique()).dropna()
)

## Via Dask

In [326]:
import pandas as pd
import dask.dataframe as dd
from dask.multiprocessing import get

ImportError: Dask dataframe requirements are not installed.

Please either conda or pip install as follows:

  conda install dask                     # either conda install
  python -m pip install dask[dataframe] --upgrade  # or python -m pip install

In [None]:
# Leaving off here: 
# https://stackoverflow.com/questions/45545110/how-do-you-parallelize-apply-on-pandas-dataframes-making-use-of-all-cores-on-o

In [325]:
!pip install dask[dataframe] --upgrade

Requirement already up-to-date: dask[dataframe] in /Users/nathanieldake/.pyenv/versions/3.7.1/lib/python3.7/site-packages (2.14.0)
Requirement not upgraded as not directly required: pandas>=0.23.0; extra == "dataframe" in /Users/nathanieldake/.pyenv/versions/3.7.1/lib/python3.7/site-packages (from dask[dataframe]) (0.25.2)
Requirement not upgraded as not directly required: toolz>=0.8.2; extra == "dataframe" in /Users/nathanieldake/.pyenv/versions/3.7.1/lib/python3.7/site-packages (from dask[dataframe]) (0.10.0)
Requirement not upgraded as not directly required: numpy>=1.13.0; extra == "dataframe" in /Users/nathanieldake/.pyenv/versions/3.7.1/lib/python3.7/site-packages (from dask[dataframe]) (1.17.3)
Requirement not upgraded as not directly required: fsspec>=0.6.0; extra == "dataframe" in /Users/nathanieldake/.pyenv/versions/3.7.1/lib/python3.7/site-packages (from dask[dataframe]) (0.7.2)
Requirement not upgraded as not directly required: partd>=0.3.10; extra == "dataframe" in /Users/n

### Ideas for combination of distance metrics
 * https://pypi.org/project/python-Levenshtein/0.12.0/
 * https://rawgit.com/ztane/python-Levenshtein/master/docs/Levenshtein.html#Levenshtein-distance
 * https://towardsdatascience.com/fuzzy-string-matching-in-python-68f240d910fe

In [186]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [264]:
dist1 = Levenshtein.distance('Cook', 'Cook County')
ratio_1 = fuzz.ratio('Cook', 'Cook County')
print('distance: ', dist1, ' ratio: ', ratio_1)
print('Similarity score: ', ratio_1 / dist1)

distance:  7  ratio:  53
Similarity score:  7.571428571428571


In [265]:
dist2 = Levenshtein.distance('Cook', 'Carolina')
ratio_2 = fuzz.ratio('Cook', 'Carolina')
print('distance: ', dist2, ' ratio: ', ratio_2)
print('Similarity score: ', ratio_2 / dist2)


distance:  6  ratio:  33
Similarity score:  5.5


In [266]:
dist2 = Levenshtein.distance('Cook', 'Cooke')
ratio_2 = fuzz.ratio('Cook', 'Cooke')
print('distance: ', dist2, ' ratio: ', ratio_2)
print('Similarity score: ', ratio_2 / dist2)


distance:  1  ratio:  89
Similarity score:  89.0
