In [1]:
import pandas as pd
import chardet
import re

#### Create functions to normalize zip codes if any are greater than or less than 5 digits

In [2]:
def first_five(s):
    """Return the leading five characters of ``s``.

    Used to cut a ZIP+4 string down to its 5-digit zip code. Strings shorter
    than five characters are returned unchanged.
    """
    return s[:5]

# def pad_tony(s):
#     zip_len = len(s)
#     if s == 3 or s == 7:
#         padded_s = "00" + s
#     elif s == 4 or s == 8:
#         padded_s = "0" + s
#     else:
#         padded_s = s
#     return padded_s

def pad_rjust(s):
    """Left-pad a zip code string with zeros to a standard width.

    Strings of up to five characters are padded to width 5; longer strings are
    padded to width 9 (ZIP+4). Strings already at or beyond the target width
    are returned unchanged.
    """
    target_width = 5 if len(s) <= 5 else 9
    return s.rjust(target_width, "0")

def normalize_zip(s):
    """Normalize a zip code string to exactly its 5-digit form.

    The string is first zero-padded on the left (see ``pad_rjust``) and then
    truncated to its first five characters (see ``first_five``).
    """
    return first_five(pad_rjust(s))

### County & Per Capita Income (2020)

In [3]:
# Link as reference
# https://apps.bea.gov/iTable/iTable.cfm?reqid=70&step=30&isuri=1&major_area=4&area=xx&year=2020&tableid=20&category=720&area_type=4&year_end=-1&classification=non-industry&state=xx&statistic=3&yearbegin=-1&unit_of_measure=levels

In [4]:
# NOTE: Skip rows needed to remove header info.  Error occurs otherwise.
# Create DataFrame for county and per capita personal income

bea_income_original_df = pd.read_csv("Resources/bea_income_2020.csv", skiprows=4)
bea_income_original_df

Unnamed: 0,GeoFips,GeoName,2020
0,01001,"Autauga, AL",46814
1,01003,"Baldwin, AL",50953
2,01005,"Barbour, AL",37850
3,01007,"Bibb, AL",34300
4,01009,"Blount, AL",38808
...,...,...,...
3154,"* Shawano, WI and Menominee, WI are combined a...",,
3155,Metropolitan Areas are defined (geographically...,,
3156,Note. All dollar estimates are in thousands of...,,
3157,(NA) Not available.,,


In [5]:
# Check footer info
bea_income_original_df.tail(20)

Unnamed: 0,GeoFips,GeoName,2020
3139,56045,"Weston, WY",47599.0
3140,Legend / Footnotes:,,
3141,2/ Per capita personal income was computed usi...,,
3142,* Estimates for 1979 forward reflect Alaska Ce...,,
3143,* Estimates from 1988 forward separate Aleutia...,,
3144,* Estimates for 1991 forward separate Denali B...,,
3145,* Estimates from 1993 forward separate Skagway...,,
3146,* Wade Hampton Census Area was renamed Kusilva...,,
3147,"* On January 2, 2019, two new county equivalen...",,
3148,"* La Paz County, AZ was separated from Yuma Co...",,


In [6]:
# Use NAN in last two columns to find where to remove footer rows
both_columns_null_idx = bea_income_original_df[["GeoName", "2020"]].isna().all(axis="columns")
both_columns_null_idx

0       False
1       False
2       False
3       False
4       False
        ...  
3154     True
3155     True
3156     True
3157     True
3158     True
Length: 3159, dtype: bool

In [7]:
# Show the rows flagged above (GeoName and 2020 both NaN) to confirm they are footer rows
bea_income_original_df.loc[both_columns_null_idx]

Unnamed: 0,GeoFips,GeoName,2020
3140,Legend / Footnotes:,,
3141,2/ Per capita personal income was computed usi...,,
3142,* Estimates for 1979 forward reflect Alaska Ce...,,
3143,* Estimates from 1988 forward separate Aleutia...,,
3144,* Estimates for 1991 forward separate Denali B...,,
3145,* Estimates from 1993 forward separate Skagway...,,
3146,* Wade Hampton Census Area was renamed Kusilva...,,
3147,"* On January 2, 2019, two new county equivalen...",,
3148,"* La Paz County, AZ was separated from Yuma Co...",,
3149,"* Broomfield County, CO, was created from part...",,


In [8]:
# Drop footer rows.
# Footer/legend lines carry no GeoName, so filter on that instead of dropping a
# hard-coded tail(19): re-running the old in-place drop silently removed another
# 19 real counties each time, and the count breaks if the number of footnotes changes.
is_footer_row = bea_income_original_df["GeoName"].isna()
bea_income_original_df = bea_income_original_df.loc[~is_footer_row]
bea_income_original_df

Unnamed: 0,GeoFips,GeoName,2020
0,01001,"Autauga, AL",46814
1,01003,"Baldwin, AL",50953
2,01005,"Barbour, AL",37850
3,01007,"Bibb, AL",34300
4,01009,"Blount, AL",38808
...,...,...,...
3135,56037,"Sweetwater, WY",54000
3136,56039,"Teton, WY",220645
3137,56041,"Uinta, WY",42854
3138,56043,"Washakie, WY",54361


In [9]:
# Rename columns
bea_income_original_df = bea_income_original_df.rename(columns={"2020": "per_capita_income"})
bea_income_original_df

Unnamed: 0,GeoFips,GeoName,per_capita_income
0,01001,"Autauga, AL",46814
1,01003,"Baldwin, AL",50953
2,01005,"Barbour, AL",37850
3,01007,"Bibb, AL",34300
4,01009,"Blount, AL",38808
...,...,...,...
3135,56037,"Sweetwater, WY",54000
3136,56039,"Teton, WY",220645
3137,56041,"Uinta, WY",42854
3138,56043,"Washakie, WY",54361


In [10]:
# Check dtypes
bea_income_original_df.dtypes

GeoFips              object
GeoName              object
per_capita_income    object
dtype: object

### County & Population (2020)

In [11]:
# Link as reference
# https://apps.bea.gov/iTable/iTable.cfm?reqid=70&step=30&isuri=1&major_area=4&area=xx&year=2020&tableid=20&category=720&area_type=4&year_end=-1&classification=non-industry&state=xx&statistic=2&yearbegin=-1&unit_of_measure=levels

In [12]:
# NOTE: Skip rows needed to remove header info.  Error occurs otherwise.
# Create DataFrame for county and population

bea_population_original_df = pd.read_csv("Resources/bea_population_2020.csv", skiprows=4)
bea_population_original_df

Unnamed: 0,GeoFips,GeoName,2020
0,01001,"Autauga, AL",56145
1,01003,"Baldwin, AL",229287
2,01005,"Barbour, AL",24589
3,01007,"Bibb, AL",22136
4,01009,"Blount, AL",57879
...,...,...,...
3154,"* Shawano, WI and Menominee, WI are combined a...",,
3155,Metropolitan Areas are defined (geographically...,,
3156,Note. All dollar estimates are in thousands of...,,
3157,(NA) Not available.,,


In [13]:
# Check footer info
bea_population_original_df.tail(20)

Unnamed: 0,GeoFips,GeoName,2020
3139,56045,"Weston, WY",6743.0
3140,Legend / Footnotes:,,
3141,1/ Census Bureau midyear population estimates....,,
3142,* Estimates for 1979 forward reflect Alaska Ce...,,
3143,* Estimates from 1988 forward separate Aleutia...,,
3144,* Estimates for 1991 forward separate Denali B...,,
3145,* Estimates from 1993 forward separate Skagway...,,
3146,* Wade Hampton Census Area was renamed Kusilva...,,
3147,"* On January 2, 2019, two new county equivalen...",,
3148,"* La Paz County, AZ was separated from Yuma Co...",,


In [14]:
# Use NAN in last two columns to find where to remove footer rows
both_columns_null_idx = bea_population_original_df[["GeoName", "2020"]].isna().all(axis="columns")
both_columns_null_idx

0       False
1       False
2       False
3       False
4       False
        ...  
3154     True
3155     True
3156     True
3157     True
3158     True
Length: 3159, dtype: bool

In [15]:
# Show the rows flagged above (GeoName and 2020 both NaN) to confirm they are footer rows
bea_population_original_df.loc[both_columns_null_idx]

Unnamed: 0,GeoFips,GeoName,2020
3140,Legend / Footnotes:,,
3141,1/ Census Bureau midyear population estimates....,,
3142,* Estimates for 1979 forward reflect Alaska Ce...,,
3143,* Estimates from 1988 forward separate Aleutia...,,
3144,* Estimates for 1991 forward separate Denali B...,,
3145,* Estimates from 1993 forward separate Skagway...,,
3146,* Wade Hampton Census Area was renamed Kusilva...,,
3147,"* On January 2, 2019, two new county equivalen...",,
3148,"* La Paz County, AZ was separated from Yuma Co...",,
3149,"* Broomfield County, CO, was created from part...",,


In [16]:
# Drop footer rows.
# Footer/legend lines carry no GeoName, so filter on that instead of dropping a
# hard-coded tail(19): re-running the old in-place drop silently removed another
# 19 real counties each time, and the count breaks if the number of footnotes changes.
is_footer_row = bea_population_original_df["GeoName"].isna()
bea_population_original_df = bea_population_original_df.loc[~is_footer_row]
bea_population_original_df

Unnamed: 0,GeoFips,GeoName,2020
0,01001,"Autauga, AL",56145
1,01003,"Baldwin, AL",229287
2,01005,"Barbour, AL",24589
3,01007,"Bibb, AL",22136
4,01009,"Blount, AL",57879
...,...,...,...
3135,56037,"Sweetwater, WY",42673
3136,56039,"Teton, WY",23497
3137,56041,"Uinta, WY",20215
3138,56043,"Washakie, WY",7760


In [17]:
# Rename columns
bea_population_original_df = bea_population_original_df.rename(columns={"2020": "population"})
bea_population_original_df

Unnamed: 0,GeoFips,GeoName,population
0,01001,"Autauga, AL",56145
1,01003,"Baldwin, AL",229287
2,01005,"Barbour, AL",24589
3,01007,"Bibb, AL",22136
4,01009,"Blount, AL",57879
...,...,...,...
3135,56037,"Sweetwater, WY",42673
3136,56039,"Teton, WY",23497
3137,56041,"Uinta, WY",20215
3138,56043,"Washakie, WY",7760


In [18]:
# Check dtypes
bea_population_original_df.dtypes

GeoFips       object
GeoName       object
population    object
dtype: object

### Merge income & population

In [19]:
# Merge bea dataframes (income and population)
bea_merge_df = bea_income_original_df.merge(bea_population_original_df, left_on="GeoFips", right_on="GeoFips")
bea_merge_df

Unnamed: 0,GeoFips,GeoName_x,per_capita_income,GeoName_y,population
0,01001,"Autauga, AL",46814,"Autauga, AL",56145
1,01003,"Baldwin, AL",50953,"Baldwin, AL",229287
2,01005,"Barbour, AL",37850,"Barbour, AL",24589
3,01007,"Bibb, AL",34300,"Bibb, AL",22136
4,01009,"Blount, AL",38808,"Blount, AL",57879
...,...,...,...,...,...
3135,56037,"Sweetwater, WY",54000,"Sweetwater, WY",42673
3136,56039,"Teton, WY",220645,"Teton, WY",23497
3137,56041,"Uinta, WY",42854,"Uinta, WY",20215
3138,56043,"Washakie, WY",54361,"Washakie, WY",7760


In [20]:
# Saving file as reference
bea_merge_df.to_csv("Resources/bea_merge_df.csv")

In [21]:
# Confirm that geonames match after merge
# geoname_mismatch_idx = bea_merge_df["GeoName_x"] != bea_merge_df["GeoName_y"]
# sum(geoname_mismatch_idx)

In [22]:
# Confirm that geonames match after merge
# bea_merge_df.loc[geoname_mismatch_idx]

#### Use split to get county and state

In [23]:
# Create DataFrame to split county and state
geo_name_df = bea_merge_df["GeoName_x"].str.rsplit(pat=", ", n= 1, expand=True)
geo_name_df[1] = geo_name_df[1].str.replace("*", "", regex=False)
geo_name_df[2] = bea_merge_df["GeoFips"]
geo_name_df

Unnamed: 0,0,1,2
0,Autauga,AL,01001
1,Baldwin,AL,01003
2,Barbour,AL,01005
3,Bibb,AL,01007
4,Blount,AL,01009
...,...,...,...
3135,Sweetwater,WY,56037
3136,Teton,WY,56039
3137,Uinta,WY,56041
3138,Washakie,WY,56043


In [24]:
# geo_name_df[1].unique()
# len(geo_name_df[1].value_counts())
geo_name_df[1].count()

3140

In [25]:
# Rename columns: the positional labels left by the split become descriptive names
geo_name_df = geo_name_df.rename(columns={0: "county", 1: "state", 2: "GeoFips"})
geo_name_df

Unnamed: 0,county,state,GeoFips
0,Autauga,AL,01001
1,Baldwin,AL,01003
2,Barbour,AL,01005
3,Bibb,AL,01007
4,Blount,AL,01009
...,...,...,...
3135,Sweetwater,WY,56037
3136,Teton,WY,56039
3137,Uinta,WY,56041
3138,Washakie,WY,56043


In [26]:
# Saving file as reference
geo_name_df.to_csv("Resources/geo_name_df.csv")

#### Merge county & state with income & population

In [27]:
# Merge the split county/state columns back onto the income & population data (joined on GeoFips)
bea_county_merge_df = bea_merge_df.merge(geo_name_df, left_on="GeoFips", right_on="GeoFips")
bea_county_merge_df

Unnamed: 0,GeoFips,GeoName_x,per_capita_income,GeoName_y,population,county,state
0,01001,"Autauga, AL",46814,"Autauga, AL",56145,Autauga,AL
1,01003,"Baldwin, AL",50953,"Baldwin, AL",229287,Baldwin,AL
2,01005,"Barbour, AL",37850,"Barbour, AL",24589,Barbour,AL
3,01007,"Bibb, AL",34300,"Bibb, AL",22136,Bibb,AL
4,01009,"Blount, AL",38808,"Blount, AL",57879,Blount,AL
...,...,...,...,...,...,...,...
3135,56037,"Sweetwater, WY",54000,"Sweetwater, WY",42673,Sweetwater,WY
3136,56039,"Teton, WY",220645,"Teton, WY",23497,Teton,WY
3137,56041,"Uinta, WY",42854,"Uinta, WY",20215,Uinta,WY
3138,56043,"Washakie, WY",54361,"Washakie, WY",7760,Washakie,WY


In [28]:
# Drop redundant columns
bea_county_merge_df.drop(columns=["GeoName_x", "GeoName_y"], inplace=True)
bea_county_merge_df

Unnamed: 0,GeoFips,per_capita_income,population,county,state
0,01001,46814,56145,Autauga,AL
1,01003,50953,229287,Baldwin,AL
2,01005,37850,24589,Barbour,AL
3,01007,34300,22136,Bibb,AL
4,01009,38808,57879,Blount,AL
...,...,...,...,...,...
3135,56037,54000,42673,Sweetwater,WY
3136,56039,220645,23497,Teton,WY
3137,56041,42854,20215,Uinta,WY
3138,56043,54361,7760,Washakie,WY


In [29]:
# Reorder columns: GeoFips first, then county and state, followed by population and per capita income.
bea_county_merge_df = bea_county_merge_df[["GeoFips", "county", "state", "population", "per_capita_income"]]
bea_county_merge_df

Unnamed: 0,GeoFips,county,state,population,per_capita_income
0,01001,Autauga,AL,56145,46814
1,01003,Baldwin,AL,229287,50953
2,01005,Barbour,AL,24589,37850
3,01007,Bibb,AL,22136,34300
4,01009,Blount,AL,57879,38808
...,...,...,...,...,...
3135,56037,Sweetwater,WY,42673,54000
3136,56039,Teton,WY,23497,220645
3137,56041,Uinta,WY,20215,42854
3138,56043,Washakie,WY,7760,54361


In [30]:
# Saving file as reference
bea_county_merge_df.to_csv("Resources/bea_county_merge_df.csv")

#### Create database for zip code / county

In [31]:
# Link as reference:
# https://www.unitedstateszipcodes.org/zip-code-database/

In [32]:
# Set dtypes prior to reading in csv file
zip_county_dtypes = {"zip": str}

In [33]:
# Create DataFrame
zip_county_df = pd.read_csv("Resources/zip_code_database.csv", dtype=zip_county_dtypes)
zip_county_df

Unnamed: 0,zip,type,decommissioned,primary_city,acceptable_cities,unacceptable_cities,state,county,timezone,area_codes,world_region,country,latitude,longitude,irs_estimated_population
0,00501,UNIQUE,0,Holtsville,,Internal Revenue Service,NY,Suffolk County,America/New_York,631,,US,40.81,-73.04,562
1,00544,UNIQUE,0,Holtsville,,Internal Revenue Service,NY,Suffolk County,America/New_York,631,,US,40.81,-73.04,0
2,00601,STANDARD,0,Adjuntas,,"Colinas Del Gigante, Jard De Adjuntas, Urb San...",PR,Adjuntas Municipio,America/Puerto_Rico,787939,,US,18.16,-66.72,0
3,00602,STANDARD,0,Aguada,,"Alts De Aguada, Bo Guaniquilla, Comunidad Las ...",PR,Aguada Municipio,America/Puerto_Rico,787939,,US,18.38,-67.18,0
4,00603,STANDARD,0,Aguadilla,Ramey,"Bda Caban, Bda Esteves, Bo Borinquen, Bo Ceiba...",PR,Aguadilla Municipio,America/Puerto_Rico,787,,US,18.43,-67.15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42719,99926,PO BOX,0,Metlakatla,,,AK,Prince of Wales-Outer Ketchikan Borough,America/Metlakatla,907,,US,55.14,-131.49,1140
42720,99927,PO BOX,0,Point Baker,,,AK,Prince of Wales-Hyder Census Area,America/Sitka,907,,US,56.30,-133.57,48
42721,99928,PO BOX,0,Ward Cove,,,AK,Ketchikan Gateway Borough,America/Sitka,907,,US,55.45,-131.79,1530
42722,99929,PO BOX,0,Wrangell,,,AK,Wrangell City and Borough,America/Sitka,907,,US,56.41,-131.61,2145


In [34]:
# Check count
zip_county_df.count()

zip                         42724
type                        42724
decommissioned              42724
primary_city                42724
acceptable_cities            9287
unacceptable_cities         11666
state                       42724
county                      41799
timezone                    41926
area_codes                  39698
world_region                  333
country                     42650
latitude                    42724
longitude                   42724
irs_estimated_population    42724
dtype: int64

In [35]:
# List columns
zip_county_df.columns.tolist()

['zip',
 'type',
 'decommissioned',
 'primary_city',
 'acceptable_cities',
 'unacceptable_cities',
 'state',
 'county',
 'timezone',
 'area_codes',
 'world_region',
 'country',
 'latitude',
 'longitude',
 'irs_estimated_population']

In [36]:
# Columns to drop
zip_county_df.drop(columns=["type", "decommissioned", "primary_city", "acceptable_cities", "unacceptable_cities", 
                            "state", "timezone", "area_codes", "world_region", "country", "irs_estimated_population"]
                   , inplace=True)
zip_county_df

Unnamed: 0,zip,county,latitude,longitude
0,00501,Suffolk County,40.81,-73.04
1,00544,Suffolk County,40.81,-73.04
2,00601,Adjuntas Municipio,18.16,-66.72
3,00602,Aguada Municipio,18.38,-67.18
4,00603,Aguadilla Municipio,18.43,-67.15
...,...,...,...,...
42719,99926,Prince of Wales-Outer Ketchikan Borough,55.14,-131.49
42720,99927,Prince of Wales-Hyder Census Area,56.30,-133.57
42721,99928,Ketchikan Gateway Borough,55.45,-131.79
42722,99929,Wrangell City and Borough,56.41,-131.61


In [37]:
# Check count
zip_county_df.count()

zip          42724
county       41799
latitude     42724
longitude    42724
dtype: int64

In [38]:
# Check dtypes
zip_county_df.dtypes

zip           object
county        object
latitude     float64
longitude    float64
dtype: object

In [39]:
# Create series of "zip"
county_s = zip_county_df["zip"]
county_s

0        00501
1        00544
2        00601
3        00602
4        00603
         ...  
42719    99926
42720    99927
42721    99928
42722    99929
42723    99950
Name: zip, Length: 42724, dtype: object

In [40]:
# Check number of digits for each zip
county_s_len = county_s.map(len)
county_s_len

0        5
1        5
2        5
3        5
4        5
        ..
42719    5
42720    5
42721    5
42722    5
42723    5
Name: zip, Length: 42724, dtype: int64

In [41]:
# Count of digit lengths
county_s_len.value_counts()

5    42724
Name: zip, dtype: int64

#### Merge county and zip code

In [42]:
# Check count before merge
bea_county_merge_df.count()

GeoFips              3140
county               3140
state                3140
population           3140
per_capita_income    3140
dtype: int64

In [43]:
# Check count before merge
zip_county_df.count()

zip          42724
county       41799
latitude     42724
longitude    42724
dtype: int64

In [44]:
# Merge county and zip code (inner join on the exact county-name string).
# NOTE(review): the BEA names look like "Autauga" while the zip database uses
# "Suffolk County"-style names, and state is not part of the key - confirm this
# key is what we want before relying on the result.
zip_county_merge_df = pd.merge(
    bea_county_merge_df,
    zip_county_df,
    on="county",
    how="inner",
)
zip_county_merge_df

Unnamed: 0,GeoFips,county,state,population,per_capita_income,zip,latitude,longitude
0,02013,Aleutians East Borough,AK,3401,59574,99553,54.12,-165.83
1,02013,Aleutians East Borough,AK,3401,59574,99571,55.22,-162.78
2,02013,Aleutians East Borough,AK,3401,59574,99583,54.89,-163.41
3,02013,Aleutians East Borough,AK,3401,59574,99612,55.08,-162.32
4,02013,Aleutians East Borough,AK,3401,59574,99661,55.26,-160.66
...,...,...,...,...,...,...,...,...
559,11001,District of Columbia,DC,712816,86567,56945,38.89,-77.03
560,11001,District of Columbia,DC,712816,86567,56950,38.89,-77.03
561,11001,District of Columbia,DC,712816,86567,56965,38.89,-77.03
562,11001,District of Columbia,DC,712816,86567,56972,38.86,-76.99


In [45]:
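# DEBUG - WHY IS NUMBER OF ROWS SO SMALL?
# Likely cause: the join key is the exact county-name string. The BEA names have
# no suffix ("Autauga"), while the zip code database appends one ("Suffolk County"),
# so only names that are identical in both sources (e.g. "Aleutians East Borough",
# "District of Columbia") survive the inner join.
# TODO: normalize the county names and also join on state (currently dropped from
# zip_county_df), since the same county name can occur in more than one state.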

In [46]:
# Saving file as reference
zip_county_merge_df.to_csv("Resources/zip_county_merge_df.csv")

### Create DataFrame for Physicians
#### Original version (source: cms.gov, updated Jan 2022)

In [47]:
# Link as reference:
# https://data.cms.gov/provider-data/dataset/mj5m-pzi6

In [48]:
# with open("Resources/physician_data.csv", "rb") as csv_file:
#     rawdata = csv_file.read()
#     chardet.detect(rawdata)

In [49]:
# import chardet
# rawdata = open("Resources/physician_data.csv", 'rb').read()
# result = chardet.detect(rawdata)
# charenc = result['encoding']

In [50]:
# print(result)

### Use original dataframe or Michigan only?

In [51]:
# Set dtypes prior to reading in csv file
physicians_dtypes = {"zip": str, "grd_yr": str}

In [52]:
# Original DataFrame: all states
# physicians_original_df = pd.read_csv("Resources/physician_data.csv", encoding="ISO-8859-1", dtype=physicians_dtypes)
# physicians_original_df

In [53]:
# Michigan DataFrame: Michigan only
physicians_original_df = pd.read_csv("Resources/michigan_physician_data.csv", encoding="ISO-8859-1", dtype=physicians_dtypes)
physicians_original_df

Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn
0,1215376108,2013,ALLERGY/IMMUNOLOGY,LAPEER,MI,484464421,Y,M
1,1215234745,2007,OPHTHALMOLOGY,PORTAGE,MI,490241232,Y,M
2,1215167812,1998,FAMILY MEDICINE,DETROIT,MI,482271749,Y,M
3,1215178306,2001,CLINICAL SOCIAL WORKER,JACKSON,MI,492033652,Y,M
4,1215172580,1977,"PSYCHOLOGIST, CLINICAL",WEST BLOOMFIELD,MI,483223784,Y,M
...,...,...,...,...,...,...,...,...
104570,1306272042,2011,"PSYCHOLOGIST, CLINICAL",ANN ARBOR,MI,481058800,Y,Y
104571,1487107249,2015,PODIATRY,ALMA,MI,488011026,Y,Y
104572,1679837504,2012,PODIATRY,ALMA,MI,488011026,Y,Y
104573,1033648944,2015,CHIROPRACTIC,BAY CITY,MI,487063318,Y,Y


In [54]:
# NOTE: NEED TO CHANGE TYPE FOR NPI?

physicians_original_df.dtypes

npi           int64
grd_yr       object
pri_spec     object
cty          object
st           object
zip          object
ind_assgn    object
grp_assgn    object
dtype: object

In [55]:
# Create series of "zip"
physicians_zip_s = physicians_original_df["zip"]
physicians_zip_s

0         484464421
1         490241232
2         482271749
3         492033652
4         483223784
            ...    
104570    481058800
104571    488011026
104572    488011026
104573    487063318
104574    487063318
Name: zip, Length: 104575, dtype: object

In [56]:
# Check number of digits for each zip
physicians_zip_len = physicians_zip_s.map(len)
physicians_zip_len

0         9
1         9
2         9
3         9
4         9
         ..
104570    9
104571    9
104572    9
104573    9
104574    9
Name: zip, Length: 104575, dtype: int64

In [57]:
# Count of digit lengths
physicians_zip_len.value_counts()

9    104450
5       125
Name: zip, dtype: int64

In [58]:
# Apply function to standardize number of zip digits
physicians_zip_five_s = physicians_zip_s.map(first_five)
physicians_zip_five_s

0         48446
1         49024
2         48227
3         49203
4         48322
          ...  
104570    48105
104571    48801
104572    48801
104573    48706
104574    48706
Name: zip, Length: 104575, dtype: object

In [59]:
# Apply function to standardize number of zip digits
physicians_original_df["zip"] = physicians_zip_five_s
physicians_original_df

Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn
0,1215376108,2013,ALLERGY/IMMUNOLOGY,LAPEER,MI,48446,Y,M
1,1215234745,2007,OPHTHALMOLOGY,PORTAGE,MI,49024,Y,M
2,1215167812,1998,FAMILY MEDICINE,DETROIT,MI,48227,Y,M
3,1215178306,2001,CLINICAL SOCIAL WORKER,JACKSON,MI,49203,Y,M
4,1215172580,1977,"PSYCHOLOGIST, CLINICAL",WEST BLOOMFIELD,MI,48322,Y,M
...,...,...,...,...,...,...,...,...
104570,1306272042,2011,"PSYCHOLOGIST, CLINICAL",ANN ARBOR,MI,48105,Y,Y
104571,1487107249,2015,PODIATRY,ALMA,MI,48801,Y,Y
104572,1679837504,2012,PODIATRY,ALMA,MI,48801,Y,Y
104573,1033648944,2015,CHIROPRACTIC,BAY CITY,MI,48706,Y,Y


In [60]:
# Check number of digits for each zip.
# Measure the DataFrame column itself rather than physicians_zip_s: that Series
# was extracted before the column was overwritten, and whether it reflects the
# new values depends on the pandas version (it does not under Copy-on-Write),
# so checking the alias may not verify the normalized data at all.
physicians_zip_len = physicians_original_df["zip"].map(len)
physicians_zip_len

0         5
1         5
2         5
3         5
4         5
         ..
104570    5
104571    5
104572    5
104573    5
104574    5
Name: zip, Length: 104575, dtype: int64

In [61]:
# Count of digit lengths
physicians_zip_len.value_counts()

5    104575
Name: zip, dtype: int64

In [62]:
# Find null values: count them once per column, then report each count
physicians_null_counts = physicians_original_df.isnull().sum()
for column, n_null in physicians_null_counts.items():
    print(f"Column {column} has {n_null} null values")

Column npi has 0 null values
Column grd_yr has 63 null values
Column pri_spec has 0 null values
Column cty has 0 null values
Column st has 0 null values
Column zip has 0 null values
Column ind_assgn has 0 null values
Column grp_assgn has 0 null values


### Additional steps for Rev 04 to filter by Michigan and only primary care physicians

In [63]:
# Filter by state of Michigan only
# NOT NEEDED IF WE START WITH MICHIGAN ONLY DATA

# physicians_original_df = physicians_original_df.loc[physicians_original_df["st"] == "MI"]
# physicians_original_df

In [64]:
# Find all unique names for Primary Specialty
physicians_original_df.pri_spec.unique()

array(['ALLERGY/IMMUNOLOGY', 'OPHTHALMOLOGY', 'FAMILY MEDICINE',
       'CLINICAL SOCIAL WORKER', 'PSYCHOLOGIST, CLINICAL', 'CHIROPRACTIC',
       'PHYSICAL THERAPY', 'NURSE PRACTITIONER', 'PODIATRY', 'OPTOMETRY',
       'INTERNAL MEDICINE', 'PSYCHIATRY', 'GENERAL PRACTICE', 'NEUROLOGY',
       'PHYSICAL MEDICINE AND REHABILITATION', 'DERMATOLOGY',
       'GENERAL SURGERY', 'OBSTETRICS/GYNECOLOGY',
       'PREVENTATIVE MEDICINE', 'ORTHOPEDIC SURGERY', 'RHEUMATOLOGY',
       'PULMONARY DISEASE', 'UNDERSEA AND HYPERBARIC MEDICINE',
       'PAIN MANAGEMENT', 'OTOLARYNGOLOGY',
       'QUALIFIED SPEECH LANGUAGE PATHOLOGIST', 'GASTROENTEROLOGY',
       'PEDIATRIC MEDICINE', 'CRITICAL CARE (INTENSIVISTS)',
       'EMERGENCY MEDICINE', 'QUALIFIED AUDIOLOGIST', 'ORAL SURGERY',
       'CARDIOVASCULAR DISEASE (CARDIOLOGY)', 'UROLOGY', 'SLEEP MEDICINE',
       'REGISTERED DIETITIAN OR NUTRITION PROFESSIONAL',
       'DIAGNOSTIC RADIOLOGY', 'NEUROSURGERY', 'GERIATRIC MEDICINE',
       'INTERVENTION

In [65]:
# Filter by general practice / primary care
# NOTE: DISCUSS WITH TEAM IF WE WANT TO MODIFY SELECTION

# 'FAMILY MEDICINE'
# 'NURSE PRACTITIONER'
# 'GENERAL PRACTICE'
# 'PREVENTATIVE MEDICINE'
# 'EMERGENCY MEDICINE'
# 'PHYSICIAN ASSISTANT'
# 'INTERNAL MEDICINE'
# 'PEDIATRIC MEDICINE'
# 'OBSTETRICS/GYNECOLOGY'

# df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]

# Specialties treated as general practice / primary care (see list above)
primary_care_specialties = [
    'FAMILY MEDICINE',
    'NURSE PRACTITIONER',
    'GENERAL PRACTICE',
    'PREVENTATIVE MEDICINE',
    'EMERGENCY MEDICINE',
    'PHYSICIAN ASSISTANT',
    'INTERNAL MEDICINE',
    'PEDIATRIC MEDICINE',
    'OBSTETRICS/GYNECOLOGY',
]

# Keep only rows whose primary specialty is one of the selected values
is_primary_care = physicians_original_df["pri_spec"].isin(primary_care_specialties)
physicians_original_df = physicians_original_df.loc[is_primary_care]
physicians_original_df

Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn
2,1215167812,1998,FAMILY MEDICINE,DETROIT,MI,48227,Y,M
9,1215385851,2015,NURSE PRACTITIONER,SOUTHFIELD,MI,48076,Y,M
18,1215021472,1991,INTERNAL MEDICINE,FLINT,MI,48532,Y,M
21,1215059241,1977,GENERAL PRACTICE,JENISON,MI,49428,Y,M
22,1215038906,1976,INTERNAL MEDICINE,ST JOHNS,MI,48879,Y,M
...,...,...,...,...,...,...,...,...
104558,1902288947,2015,EMERGENCY MEDICINE,OWOSSO,MI,48867,Y,Y
104560,1043621451,2014,INTERNAL MEDICINE,WARREN,MI,48088,Y,Y
104561,1366418667,1975,INTERNAL MEDICINE,WARREN,MI,48088,Y,Y
104565,1720035991,1978,INTERNAL MEDICINE,WARREN,MI,48088,Y,Y


In [66]:
# Check unique names for Primary Specialty after filtering
physicians_original_df.pri_spec.unique()

array(['FAMILY MEDICINE', 'NURSE PRACTITIONER', 'INTERNAL MEDICINE',
       'GENERAL PRACTICE', 'OBSTETRICS/GYNECOLOGY',
       'PREVENTATIVE MEDICINE', 'PEDIATRIC MEDICINE',
       'EMERGENCY MEDICINE', 'PHYSICIAN ASSISTANT'], dtype=object)

### Create DataFrame for Income versus Zip Code
#### Original version (source: kaggle, 2018)

In [67]:
# Link for reference:
# https://www.kaggle.com/hamishgunasekara/average-income-per-zip-code-usa-2018

In [68]:
# Set dtypes prior to reading in csv file
income_dtypes = {"zipcode": str}

In [69]:
# Read in csv and create DataFrame
income_original_df = pd.read_csv("Resources/postcode_level_averages.csv", dtype=income_dtypes)
income_original_df

Unnamed: 0,state,zipcode,total_pop,total_income,country,avg_income
0,AK,0,345220,25354974,USA,73445.843230
1,AK,99501,7690,612242,USA,79615.344603
2,AK,99502,11860,1000685,USA,84374.789207
3,AK,99503,7510,462394,USA,61570.439414
4,AK,99504,19440,1176109,USA,60499.434156
...,...,...,...,...,...,...
27653,WY,83126,200,17041,USA,85205.000000
27654,WY,83127,1620,135778,USA,83813.580247
27655,WY,83128,950,119435,USA,125721.052632
27656,WY,83414,230,60700,USA,263913.043478


In [70]:
# Check dtypes
income_original_df.dtypes

state            object
zipcode          object
total_pop         int64
total_income      int64
country          object
avg_income      float64
dtype: object

In [71]:
# Create series of "zip"
income_zip_s = income_original_df["zipcode"]
income_zip_s

0            0
1        99501
2        99502
3        99503
4        99504
         ...  
27653    83126
27654    83127
27655    83128
27656    83414
27657    99999
Name: zipcode, Length: 27658, dtype: object

In [72]:
# Check number of digits for each zip
income_zip_len = income_zip_s.map(len)
income_zip_len

0        1
1        5
2        5
3        5
4        5
        ..
27653    5
27654    5
27655    5
27656    5
27657    5
Name: zipcode, Length: 27658, dtype: int64

In [73]:
# Count of digit lengths
income_zip_len.value_counts()

5    25420
4     2187
1       51
Name: zipcode, dtype: int64

In [74]:
# Apply function to standardize number of zip digits
income_zip_five_s = income_zip_s.map(normalize_zip)
income_zip_five_s

0        00000
1        99501
2        99502
3        99503
4        99504
         ...  
27653    83126
27654    83127
27655    83128
27656    83414
27657    99999
Name: zipcode, Length: 27658, dtype: object

In [75]:
# Apply function to standardize number of zip digits
income_original_df["zipcode"] = income_zip_five_s
income_original_df

Unnamed: 0,state,zipcode,total_pop,total_income,country,avg_income
0,AK,00000,345220,25354974,USA,73445.843230
1,AK,99501,7690,612242,USA,79615.344603
2,AK,99502,11860,1000685,USA,84374.789207
3,AK,99503,7510,462394,USA,61570.439414
4,AK,99504,19440,1176109,USA,60499.434156
...,...,...,...,...,...,...
27653,WY,83126,200,17041,USA,85205.000000
27654,WY,83127,1620,135778,USA,83813.580247
27655,WY,83128,950,119435,USA,125721.052632
27656,WY,83414,230,60700,USA,263913.043478


In [76]:
# Check number of digits for each zip.
# Measure the DataFrame column itself rather than income_zip_s: that Series was
# extracted before the column was overwritten, and whether it reflects the new
# values depends on the pandas version (it does not under Copy-on-Write), so
# checking the alias may not verify the normalized data at all.
income_zip_len = income_original_df["zipcode"].map(len)
income_zip_len

0        5
1        5
2        5
3        5
4        5
        ..
27653    5
27654    5
27655    5
27656    5
27657    5
Name: zipcode, Length: 27658, dtype: int64

In [77]:
# Count of digit lengths
income_zip_len.value_counts()

5    27658
Name: zipcode, dtype: int64

In [78]:
# Remove irrelevant "country" column.
# Rebind instead of using inplace=True, and ignore a missing column, so the cell
# can be re-run without raising KeyError once "country" is already gone.
income_original_df = income_original_df.drop(columns="country", errors="ignore")
income_original_df

Unnamed: 0,state,zipcode,total_pop,total_income,avg_income
0,AK,00000,345220,25354974,73445.843230
1,AK,99501,7690,612242,79615.344603
2,AK,99502,11860,1000685,84374.789207
3,AK,99503,7510,462394,61570.439414
4,AK,99504,19440,1176109,60499.434156
...,...,...,...,...,...
27653,WY,83126,200,17041,85205.000000
27654,WY,83127,1620,135778,83813.580247
27655,WY,83128,950,119435,125721.052632
27656,WY,83414,230,60700,263913.043478


In [79]:
# Find null values: count them once per column, then report each count
income_null_counts = income_original_df.isnull().sum()
for column, n_null in income_null_counts.items():
    print(f"Column {column} has {n_null} null values")

Column state has 0 null values
Column zipcode has 0 null values
Column total_pop has 0 null values
Column total_income has 0 null values
Column avg_income has 0 null values


In [80]:
# Saving file as reference
income_original_df.to_csv("Resources/income_original_df.csv")

### Additional steps for Rev 04 to filter by Michigan

In [81]:
# Filter by state of Michigan only
# NOTE(review): the "00000" row kept by this filter looks like a statewide total
# rather than a real zip code - confirm whether it should be excluded downstream.
is_michigan = income_original_df["state"].eq("MI")
income_original_df = income_original_df.loc[is_michigan]
income_original_df

Unnamed: 0,state,zipcode,total_pop,total_income,avg_income
11153,MI,00000,4729780,320740706,67813.028513
11154,MI,48001,6270,385108,61420.733652
11155,MI,48002,1520,102460,67407.894737
11156,MI,48003,3220,210579,65397.204969
11157,MI,48005,2700,210225,77861.111111
...,...,...,...,...,...
12040,MI,49967,210,9768,46514.285714
12041,MI,49968,930,39279,42235.483871
12042,MI,49969,590,132776,225044.067797
12043,MI,49970,110,4691,42645.454545


### Create Dataframe for ZCTA info
#### Original version
#### https://simplemaps.com/data/us-zips
#### * Note: All code in this section is identical to Jenny's ZCTAMI.ipynb file unless noted in comments

In [82]:
# NOTE: MODIFIED FROM JENNY'S FILE.  ZIPCODE CHANGED TO A STRING.

zipcode_dtypes = {"zip": str}

In [83]:
# NOTE: MODIFIED FROM JENNY'S FILE.  ZIPCODE CHANGED TO A STRING.

zip_data_df = pd.read_csv("Resources/uszips.csv", dtype=zipcode_dtypes)
zip_data_df

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,00601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,00602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,00603,18.45440,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,00606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,00610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33115,99923,55.97796,-130.03671,Hyder,AK,Alaska,True,,14.0,0.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33116,99925,55.55796,-132.97482,Klawock,AK,Alaska,True,,908.0,6.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33117,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1654.0,4.8,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla
33118,99927,56.25100,-133.37572,Point Baker,AK,Alaska,True,,0.0,0.0,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka


In [84]:
# Column types -- "zip" should be object (string) because of the dtype
# override passed to read_csv
zip_data_df.dtypes

zip                  object
lat                 float64
lng                 float64
city                 object
state_id             object
state_name           object
zcta                   bool
parent_zcta         float64
population          float64
density             float64
county_fips           int64
county_name          object
county_weights       object
county_names_all     object
county_fips_all      object
imprecise              bool
military               bool
timezone             object
dtype: object

In [85]:
# Non-null count per column.  count() skips missing values, so a column that
# reports fewer entries than the others (e.g. population) contains NaNs.
zip_data_df.count()

zip                 33120
lat                 33120
lng                 33120
city                33120
state_id            33120
state_name          33120
zcta                33120
parent_zcta             0
population          33096
density             33096
county_fips         33120
county_name         33120
county_weights      33120
county_names_all    33120
county_fips_all     33120
imprecise           33120
military            33120
timezone            33120
dtype: int64

In [86]:
# Drop fully duplicated rows.
# drop_duplicates() returns a new frame by default; without inplace=True (or
# an assignment) the de-duplicated result is only displayed and then lost.
# inplace=True matches the style of the other cleaning cells in this section.
zip_data_df.drop_duplicates(inplace=True)
zip_data_df

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,00601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,00602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,00603,18.45440,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,00606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,00610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33115,99923,55.97796,-130.03671,Hyder,AK,Alaska,True,,14.0,0.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33116,99925,55.55796,-132.97482,Klawock,AK,Alaska,True,,908.0,6.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33117,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1654.0,4.8,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla
33118,99927,56.25100,-133.37572,Point Baker,AK,Alaska,True,,0.0,0.0,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka


In [87]:
# Check for populations with 0: sort ascending so zero-population rows come
# first; rows with a missing population are placed at the end.
zip_data_df.sort_values(by="population", ascending=True, na_position="last")

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
15892,48233,42.32370,-83.06166,Detroit,MI,Michigan,True,,0.0,0.0,26163,Wayne,"{""26163"": ""0""}",Wayne,26163,False,False,America/Detroit
2629,10173,40.75413,-73.97936,New York,NY,New York,True,,0.0,0.0,36061,New York,"{""36061"": ""100""}",New York,36061,False,False,America/New_York
19235,56658,48.09036,-93.82919,Margie,MN,Minnesota,True,,0.0,0.0,27071,Koochiching,"{""27071"": ""100""}",Koochiching,27071,False,False,America/Chicago
14420,44652,40.90044,-81.32618,Middlebranch,OH,Ohio,True,,0.0,0.0,39151,Stark,"{""39151"": ""100""}",Stark,39151,False,False,America/New_York
2628,10172,40.75527,-73.97431,New York,NY,New York,True,,0.0,0.0,36061,New York,"{""36061"": ""0""}",New York,36061,False,False,America/New_York
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31864,96929,13.56510,144.87620,Yigo,GU,Guam,True,,,,66010,Guam,"{""66010"": 100}",Guam,66010,False,False,Pacific/Guam
31865,96932,13.47524,144.74216,Hagatna,GU,Guam,True,,,,66010,Guam,"{""66010"": 100}",Guam,66010,False,False,Pacific/Guam
31866,96950,15.18887,145.75356,Saipan,MP,Northern Mariana Islands,True,,,,69110,Saipan,"{""69110"": 100}",Saipan,69110,False,False,Pacific/Saipan
31867,96951,14.15733,145.21450,Rota,MP,Northern Mariana Islands,True,,,,69100,Rota,"{""69100"": 100}",Rota,69100,False,False,Pacific/Saipan


In [88]:
# Remove every row whose population is exactly zero (modifies zip_data_df
# in place).  Rows with a missing population are not matched by this test.
zero_population_rows = zip_data_df.index[zip_data_df["population"].eq(0)]
zip_data_df.drop(index=zero_population_rows, inplace=True)
zip_data_df

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,00601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,00602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,00603,18.45440,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,00606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,00610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33114,99922,55.30211,-133.03248,Hydaburg,AK,Alaska,True,,342.0,1.1,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33115,99923,55.97796,-130.03671,Hyder,AK,Alaska,True,,14.0,0.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33116,99925,55.55796,-132.97482,Klawock,AK,Alaska,True,,908.0,6.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33117,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1654.0,4.8,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla


In [89]:
# Drop population if null (most are duplicates)
# Missing values are float NaN, which never compares equal to anything --
# including the string "NaN" -- so an equality test selects no rows.
# dropna(subset=...) is the supported way to remove rows with a missing value
# in a given column.
zip_data_df.dropna(subset=["population"], inplace=True)
zip_data_df

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,00601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,00602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,00603,18.45440,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,00606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,00610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33114,99922,55.30211,-133.03248,Hydaburg,AK,Alaska,True,,342.0,1.1,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33115,99923,55.97796,-130.03671,Hyder,AK,Alaska,True,,14.0,0.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33116,99925,55.55796,-132.97482,Klawock,AK,Alaska,True,,908.0,6.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33117,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1654.0,4.8,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla


In [90]:
# Rename "zip" to "zipcode" (in place) so the key column has the same name
# as the zipcode column used by the income data
column_renames = {"zip": "zipcode"}
zip_data_df.rename(columns=column_renames, inplace=True)
zip_data_df

Unnamed: 0,zipcode,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,00601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,00602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,00603,18.45440,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,00606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,00610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33114,99922,55.30211,-133.03248,Hydaburg,AK,Alaska,True,,342.0,1.1,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33115,99923,55.97796,-130.03671,Hyder,AK,Alaska,True,,14.0,0.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33116,99925,55.55796,-132.97482,Klawock,AK,Alaska,True,,908.0,6.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33117,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1654.0,4.8,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla


In [91]:
# Preview of the frame with every column except zipcode and zcta dropped.
# The result is only displayed, not assigned, so zip_data_df itself still
# holds all of its columns after this cell.
zip_data_df.drop(
    columns=[
        "lat",
        "lng",
        "state_id",
        "state_name",
        "city",
        "parent_zcta",
        "population",
        "county_name",
        "density",
        "county_fips",
        "county_weights",
        "county_names_all",
        "county_fips_all",
        "imprecise",
        "military",
        "timezone",
    ]
)

Unnamed: 0,zipcode,zcta
0,00601,True
1,00602,True
2,00603,True
3,00606,True
4,00610,True
...,...,...
33114,99922,True
33115,99923,True
33116,99925,True
33117,99926,True


In [92]:
# Display the frame again: the drop in the previous cell was not assigned,
# so every column is still present here
zip_data_df

Unnamed: 0,zipcode,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,00601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,00602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,00603,18.45440,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,00606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,00610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33114,99922,55.30211,-133.03248,Hydaburg,AK,Alaska,True,,342.0,1.1,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33115,99923,55.97796,-130.03671,Hyder,AK,Alaska,True,,14.0,0.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33116,99925,55.55796,-132.97482,Klawock,AK,Alaska,True,,908.0,6.3,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33117,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1654.0,4.8,2198,Prince of Wales-Hyder,"{""02198"": ""100""}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla


In [93]:
# NOTE: MODIFIED FROM JENNY'S FILE.  ORIGINAL CODE IS TWO CELLS ABOVE.

# Drop every column that is not needed for the zcta merge and rebind the
# name, so that this time the change sticks (only zipcode and zcta remain).
unneeded_zip_columns = [
    "lat",
    "lng",
    "state_id",
    "state_name",
    "city",
    "parent_zcta",
    "population",
    "county_name",
    "density",
    "county_fips",
    "county_weights",
    "county_names_all",
    "county_fips_all",
    "imprecise",
    "military",
    "timezone",
]
zip_data_df = zip_data_df.drop(columns=unneeded_zip_columns)

In [94]:
# NOTE: MODIFIED FROM JENNY'S FILE.  ADDITIONAL DISPLAY AFTER CHANGES MADE TO DF.
# Confirms that only the zipcode and zcta columns remain after the drop.

zip_data_df

Unnamed: 0,zipcode,zcta
0,00601,True
1,00602,True
2,00603,True
3,00606,True
4,00610,True
...,...,...
33114,99922,True
33115,99923,True
33116,99925,True
33117,99926,True


In [95]:
# NOTE: MODIFIED FROM JENNY'S FILE.  COMMENTED OUT.

# Output to CSV
# zip_data_df.to_csv("Resources/zipcodes.csv")

### Use "normalize_zip" function in "zip_data_df" to ensure that all zip codes have 5 digits
#### Left in as placeholder in case needed later

In [96]:
# zip_data_df.dtypes

In [97]:
# zipcode_s = zip_data_df["zipcode"]
# zipcode_s

In [98]:
# zipcode_len = zipcode_s.map(len)
# zipcode_len

In [99]:
# zipcode_len.value_counts()

In [100]:
# zipcode_s = zip_data_df["zipcode"]
# zipcode_s

In [101]:
# zip_five_s = zipcode_s.map(first_five)
# zip_five_s

In [102]:
# zip_data_df["zipcode"] = zip_five_s
# zip_data_df

In [103]:
# zipcode_len = zipcode_s.map(len)
# zipcode_len

In [104]:
# zipcode_len.value_counts()

### Additional steps for Rev 04 to filter by Michigan

In [105]:
# Filter by state of Michigan only
# NOTE: NEEDED FOR THIS DATAFRAME?  IF SO, NEEDS TO BE INPUT BEFORE DROPPING state_id COLUMN

### Merge Dataframes

In [106]:
# Merge 1: attach each physician record to the income figures for its zip code.
# This is an inner join (the pandas default), so only zip codes present in
# both frames appear in the result.
merged_df = pd.merge(
    income_original_df,
    physicians_original_df,
    how="inner",
    left_on="zipcode",
    right_on="zip",
)
merged_df

# NOTE: CHOOSE DIFFERENT MERGE OPTIONS?

Unnamed: 0,state,zipcode,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn
0,MI,48001,6270,385108,61420.733652,1457637506,2011,FAMILY MEDICINE,ALGONAC,MI,48001,Y,M
1,MI,48001,6270,385108,61420.733652,1043201767,1989,FAMILY MEDICINE,ALGONAC,MI,48001,Y,Y
2,MI,48001,6270,385108,61420.733652,1578008025,2016,NURSE PRACTITIONER,ALGONAC,MI,48001,Y,Y
3,MI,48001,6270,385108,61420.733652,1699766030,1995,FAMILY MEDICINE,ALGONAC,MI,48001,Y,Y
4,MI,48001,6270,385108,61420.733652,1902897689,1986,FAMILY MEDICINE,ALGONAC,MI,48001,Y,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40074,MI,49969,590,132776,225044.067797,1407851413,1994,INTERNAL MEDICINE,WATERSMEET,MI,49969,Y,Y
40075,MI,49969,590,132776,225044.067797,1376056135,2017,NURSE PRACTITIONER,WATERSMEET,MI,49969,Y,Y
40076,MI,49969,590,132776,225044.067797,1629115977,2004,FAMILY MEDICINE,WATERSMEET,MI,49969,Y,Y
40077,MI,49969,590,132776,225044.067797,1811363690,2015,PHYSICIAN ASSISTANT,WATERSMEET,MI,49969,Y,Y


In [107]:
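# NOTE: WHY HAS THE DATA BEEN FILTERED DOWN BY 101,695 ROWS (FROM 2,375,857 TO 2,274,162)?
# NOTE(review): these counts look like they predate the Michigan filter -- the merge above now yields roughly 40k rows.  Confirm or remove.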

In [108]:
# Check dtypes of the merged frame (both join keys, zipcode and zip, should
# be object, i.e. strings)
merged_df.dtypes

state            object
zipcode          object
total_pop         int64
total_income      int64
avg_income      float64
npi               int64
grd_yr           object
pri_spec         object
cty              object
st               object
zip              object
ind_assgn        object
grp_assgn        object
dtype: object

In [109]:
# Drop the physician-side columns that repeat information already present:
# "zip" equals "zipcode" because it was the join key, and "st" appears to
# mirror "state".
redundant_columns = ["st", "zip"]
merged_df.drop(columns=redundant_columns, inplace=True)
merged_df

Unnamed: 0,state,zipcode,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,cty,ind_assgn,grp_assgn
0,MI,48001,6270,385108,61420.733652,1457637506,2011,FAMILY MEDICINE,ALGONAC,Y,M
1,MI,48001,6270,385108,61420.733652,1043201767,1989,FAMILY MEDICINE,ALGONAC,Y,Y
2,MI,48001,6270,385108,61420.733652,1578008025,2016,NURSE PRACTITIONER,ALGONAC,Y,Y
3,MI,48001,6270,385108,61420.733652,1699766030,1995,FAMILY MEDICINE,ALGONAC,Y,Y
4,MI,48001,6270,385108,61420.733652,1902897689,1986,FAMILY MEDICINE,ALGONAC,Y,Y
...,...,...,...,...,...,...,...,...,...,...,...
40074,MI,49969,590,132776,225044.067797,1407851413,1994,INTERNAL MEDICINE,WATERSMEET,Y,Y
40075,MI,49969,590,132776,225044.067797,1376056135,2017,NURSE PRACTITIONER,WATERSMEET,Y,Y
40076,MI,49969,590,132776,225044.067797,1629115977,2004,FAMILY MEDICINE,WATERSMEET,Y,Y
40077,MI,49969,590,132776,225044.067797,1811363690,2015,PHYSICIAN ASSISTANT,WATERSMEET,Y,Y


In [110]:
# Move zip code column to 1st column (not index).  Move city and state immediately after.
# Selecting a list of columns returns a frame that pandas may treat as a
# slice of the original; .copy() makes it an independent object so the
# column assignments in later cells do not raise SettingWithCopyWarning.
column_order = [
    "zipcode",
    "cty",
    "state",
    "total_pop",
    "total_income",
    "avg_income",
    "npi",
    "grd_yr",
    "pri_spec",
    "ind_assgn",
    "grp_assgn",
]
merged_df = merged_df[column_order].copy()
merged_df

Unnamed: 0,zipcode,cty,state,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,ind_assgn,grp_assgn
0,48001,ALGONAC,MI,6270,385108,61420.733652,1457637506,2011,FAMILY MEDICINE,Y,M
1,48001,ALGONAC,MI,6270,385108,61420.733652,1043201767,1989,FAMILY MEDICINE,Y,Y
2,48001,ALGONAC,MI,6270,385108,61420.733652,1578008025,2016,NURSE PRACTITIONER,Y,Y
3,48001,ALGONAC,MI,6270,385108,61420.733652,1699766030,1995,FAMILY MEDICINE,Y,Y
4,48001,ALGONAC,MI,6270,385108,61420.733652,1902897689,1986,FAMILY MEDICINE,Y,Y
...,...,...,...,...,...,...,...,...,...,...,...
40074,49969,WATERSMEET,MI,590,132776,225044.067797,1407851413,1994,INTERNAL MEDICINE,Y,Y
40075,49969,WATERSMEET,MI,590,132776,225044.067797,1376056135,2017,NURSE PRACTITIONER,Y,Y
40076,49969,WATERSMEET,MI,590,132776,225044.067797,1629115977,2004,FAMILY MEDICINE,Y,Y
40077,49969,WATERSMEET,MI,590,132776,225044.067797,1811363690,2015,PHYSICIAN ASSISTANT,Y,Y


In [111]:
# merged_df["total_pop"] = pd.to_numeric(merged_df["total_pop"])

In [112]:
# Check dtypes
# merged_df.dtypes

In [113]:
# Report how many missing values each column of merged_df contains
null_counts = merged_df.isnull().sum()
for column, n_null in null_counts.items():
    print(f"Column {column} has {n_null} null values")

Column zipcode has 0 null values
Column cty has 0 null values
Column state has 0 null values
Column total_pop has 0 null values
Column total_income has 0 null values
Column avg_income has 0 null values
Column npi has 0 null values
Column grd_yr has 25 null values
Column pri_spec has 0 null values
Column ind_assgn has 0 null values
Column grp_assgn has 0 null values


In [114]:
# len(merged_df["zipcode"].unique())

In [115]:
# Add a doctor_count column: the number of rows (one per physician record)
# that share each zipcode.  transform("count") broadcasts the per-group count
# back onto every row of the group, so the frame keeps its shape.
rows_per_zipcode = merged_df.groupby("zipcode")["zipcode"].transform("count")
merged_df["doctor_count"] = rows_per_zipcode
merged_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,zipcode,cty,state,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,ind_assgn,grp_assgn,doctor_count
0,48001,ALGONAC,MI,6270,385108,61420.733652,1457637506,2011,FAMILY MEDICINE,Y,M,5
1,48001,ALGONAC,MI,6270,385108,61420.733652,1043201767,1989,FAMILY MEDICINE,Y,Y,5
2,48001,ALGONAC,MI,6270,385108,61420.733652,1578008025,2016,NURSE PRACTITIONER,Y,Y,5
3,48001,ALGONAC,MI,6270,385108,61420.733652,1699766030,1995,FAMILY MEDICINE,Y,Y,5
4,48001,ALGONAC,MI,6270,385108,61420.733652,1902897689,1986,FAMILY MEDICINE,Y,Y,5
...,...,...,...,...,...,...,...,...,...,...,...,...
40074,49969,WATERSMEET,MI,590,132776,225044.067797,1407851413,1994,INTERNAL MEDICINE,Y,Y,6
40075,49969,WATERSMEET,MI,590,132776,225044.067797,1376056135,2017,NURSE PRACTITIONER,Y,Y,6
40076,49969,WATERSMEET,MI,590,132776,225044.067797,1629115977,2004,FAMILY MEDICINE,Y,Y,6
40077,49969,WATERSMEET,MI,590,132776,225044.067797,1811363690,2015,PHYSICIAN ASSISTANT,Y,Y,6


In [116]:
# Add pcp_per_capita: number of doctors in the zipcode divided by the
# zipcode's total_pop (i.e. doctors per person, not an income figure)
merged_df["pcp_per_capita"] = merged_df["doctor_count"].div(merged_df["total_pop"])
merged_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,zipcode,cty,state,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,ind_assgn,grp_assgn,doctor_count,pcp_per_capita
0,48001,ALGONAC,MI,6270,385108,61420.733652,1457637506,2011,FAMILY MEDICINE,Y,M,5,0.000797
1,48001,ALGONAC,MI,6270,385108,61420.733652,1043201767,1989,FAMILY MEDICINE,Y,Y,5,0.000797
2,48001,ALGONAC,MI,6270,385108,61420.733652,1578008025,2016,NURSE PRACTITIONER,Y,Y,5,0.000797
3,48001,ALGONAC,MI,6270,385108,61420.733652,1699766030,1995,FAMILY MEDICINE,Y,Y,5,0.000797
4,48001,ALGONAC,MI,6270,385108,61420.733652,1902897689,1986,FAMILY MEDICINE,Y,Y,5,0.000797
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40074,49969,WATERSMEET,MI,590,132776,225044.067797,1407851413,1994,INTERNAL MEDICINE,Y,Y,6,0.010169
40075,49969,WATERSMEET,MI,590,132776,225044.067797,1376056135,2017,NURSE PRACTITIONER,Y,Y,6,0.010169
40076,49969,WATERSMEET,MI,590,132776,225044.067797,1629115977,2004,FAMILY MEDICINE,Y,Y,6,0.010169
40077,49969,WATERSMEET,MI,590,132776,225044.067797,1811363690,2015,PHYSICIAN ASSISTANT,Y,Y,6,0.010169


In [117]:
# Review calculated info in new pcp_per_capita column.
# merged_df has one row per physician record, so these statistics are
# weighted by doctor_count rather than being one observation per zipcode.
# NOTE(review): values above 1 mean more doctors than total_pop for that
# zipcode -- confirm what population total_pop actually measures.
merged_df.pcp_per_capita.describe()

count    40079.000000
mean         3.466516
std          8.396947
min          0.000149
25%          0.008753
50%          0.020898
75%          0.055357
max         23.975000
Name: pcp_per_capita, dtype: float64

In [118]:
# Check dtypes after adding the derived doctor_count and pcp_per_capita columns
merged_df.dtypes

zipcode            object
cty                object
state              object
total_pop           int64
total_income        int64
avg_income        float64
npi                 int64
grd_yr             object
pri_spec           object
ind_assgn          object
grp_assgn          object
doctor_count        int64
pcp_per_capita    float64
dtype: object

In [119]:
# Render the dollar columns as whole-dollar strings such as "$385,108".
# This replaces the numeric values with text, so any arithmetic on these
# columns has to happen before this cell, and the cell cannot be re-run on
# its own output.
# merged_df["total_pop"] = merged_df["total_pop"].map("{:,}".format)
for money_column in ("total_income", "avg_income"):
    merged_df[money_column] = merged_df[money_column].map("${:,.0f}".format)
merged_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,zipcode,cty,state,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,ind_assgn,grp_assgn,doctor_count,pcp_per_capita
0,48001,ALGONAC,MI,6270,"$385,108","$61,421",1457637506,2011,FAMILY MEDICINE,Y,M,5,0.000797
1,48001,ALGONAC,MI,6270,"$385,108","$61,421",1043201767,1989,FAMILY MEDICINE,Y,Y,5,0.000797
2,48001,ALGONAC,MI,6270,"$385,108","$61,421",1578008025,2016,NURSE PRACTITIONER,Y,Y,5,0.000797
3,48001,ALGONAC,MI,6270,"$385,108","$61,421",1699766030,1995,FAMILY MEDICINE,Y,Y,5,0.000797
4,48001,ALGONAC,MI,6270,"$385,108","$61,421",1902897689,1986,FAMILY MEDICINE,Y,Y,5,0.000797
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40074,49969,WATERSMEET,MI,590,"$132,776","$225,044",1407851413,1994,INTERNAL MEDICINE,Y,Y,6,0.010169
40075,49969,WATERSMEET,MI,590,"$132,776","$225,044",1376056135,2017,NURSE PRACTITIONER,Y,Y,6,0.010169
40076,49969,WATERSMEET,MI,590,"$132,776","$225,044",1629115977,2004,FAMILY MEDICINE,Y,Y,6,0.010169
40077,49969,WATERSMEET,MI,590,"$132,776","$225,044",1811363690,2015,PHYSICIAN ASSISTANT,Y,Y,6,0.010169


### Add additional merge for zcta

In [120]:
# Merge to zcta: inner join on the shared zipcode column, which adds the
# zcta flag from zip_data_df to every physician row
final_merge_df = pd.merge(merged_df, zip_data_df, how="inner", on="zipcode")
final_merge_df

Unnamed: 0,zipcode,cty,state,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,ind_assgn,grp_assgn,doctor_count,pcp_per_capita,zcta
0,48001,ALGONAC,MI,6270,"$385,108","$61,421",1457637506,2011,FAMILY MEDICINE,Y,M,5,0.000797,True
1,48001,ALGONAC,MI,6270,"$385,108","$61,421",1043201767,1989,FAMILY MEDICINE,Y,Y,5,0.000797,True
2,48001,ALGONAC,MI,6270,"$385,108","$61,421",1578008025,2016,NURSE PRACTITIONER,Y,Y,5,0.000797,True
3,48001,ALGONAC,MI,6270,"$385,108","$61,421",1699766030,1995,FAMILY MEDICINE,Y,Y,5,0.000797,True
4,48001,ALGONAC,MI,6270,"$385,108","$61,421",1902897689,1986,FAMILY MEDICINE,Y,Y,5,0.000797,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40074,49969,WATERSMEET,MI,590,"$132,776","$225,044",1407851413,1994,INTERNAL MEDICINE,Y,Y,6,0.010169,True
40075,49969,WATERSMEET,MI,590,"$132,776","$225,044",1376056135,2017,NURSE PRACTITIONER,Y,Y,6,0.010169,True
40076,49969,WATERSMEET,MI,590,"$132,776","$225,044",1629115977,2004,FAMILY MEDICINE,Y,Y,6,0.010169,True
40077,49969,WATERSMEET,MI,590,"$132,776","$225,044",1811363690,2015,PHYSICIAN ASSISTANT,Y,Y,6,0.010169,True


### Add additional merge for county

In [121]:
# Merge to county: inner join of the zcta-level frame with the county lookup,
# matching zipcode on the left to zip on the right.
# NOTE(review): the saved output of this cell shows column headers but no
# rows -- confirm that zip in zip_county_merge_df uses the same format as
# zipcode here.
final_merge_df2 = pd.merge(
    final_merge_df,
    zip_county_merge_df,
    how="inner",
    left_on="zipcode",
    right_on="zip",
)
final_merge_df2

Unnamed: 0,zipcode,cty,state_x,total_pop,total_income,avg_income,npi,grd_yr,pri_spec,ind_assgn,...,pcp_per_capita,zcta,GeoFips,county,state_y,population,per_capita_income,zip,latitude,longitude


In [122]:
# Write the merged frame to CSV to keep as a reference copy.
# NOTE(review): this saves final_merge_df (the zcta merge), not
# final_merge_df2 (the county merge) -- confirm that is intended.
final_merge_df.to_csv(path_or_buf="Resources/final_merge_df.csv")