In [1]:
# Create functions to normalize zip codes if any are greater than or less than 5 digits

def first_five(s):
    return s[0:5]

# def pad_tony(s):
#     zip_len = len(s)
#     if s == 3 or s == 7:
#         padded_s = "00" + s
#     elif s == 4 or s == 8:
#         padded_s = "0" + s
#     else:
#         padded_s = s
#     return padded_s

def pad_rjust(s):
    zip_len = len(s)
    if zip_len <= 5:
        final_len = 5
    else:
        final_len = 9
    padded_s = s.rjust(final_len, "0")
    return padded_s

def normalize_zip(s):
    padded_zip = pad_rjust(s)
    stripped_zip = first_five(padded_zip)
    return stripped_zip

In [2]:
import pandas as pd

In [3]:
# Link as reference:
# https://data.cms.gov/provider-data/dataset/mj5m-pzi6

In [4]:
# with open("Resources/physician_data.csv", "rb") as csv_file:
#     rawdata = csv_file.read()
#     chardet.detect(rawdata)

In [5]:
# import chardet
# rawdata = open("Resources/physician_data.csv", 'rb').read()
# result = chardet.detect(rawdata)
# charenc = result['encoding']

In [6]:
# print(result)

In [7]:
# Set dtypes prior to reading in csv file
physicians_dtypes = {"zip": str, "grd_yr": str}

In [8]:
# Read in csv and create DataFrame
physicians_original_df = pd.read_csv("Resources/physician_data.csv", encoding="ISO-8859-1", dtype=physicians_dtypes)
physicians_original_df

Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn
0,1215143243,1985,CHIROPRACTIC,RAYMOND,WA,985772535,Y,M
1,1215385364,2016,OPTOMETRY,PANAMA CITY,FL,324054508,Y,M
2,1215144167,1989,CHIROPRACTIC,SANTA ROSA,CA,954014642,M,M
3,1215239728,2010,CHIROPRACTIC,HASLET,TX,760524069,M,M
4,1215230917,2004,CLINICAL SOCIAL WORKER,BLOOMINGTON,IL,617046037,Y,M
...,...,...,...,...,...,...,...,...
2375852,1891957023,2008,PODIATRY,DENVER,CO,802204000,Y,Y
2375853,1316146939,1996,PHYSICIAN ASSISTANT,ANCHORAGE,AK,995086804,Y,Y
2375854,1710963681,1976,INTERVENTIONAL PAIN MANAGEMENT,ANCHORAGE,AK,995086804,Y,Y
2375855,1487956637,2010,PHYSICIAN ASSISTANT,CHILLICOTHE,OH,456012535,M,M


In [9]:
# Find all unique names for Primary Specialty
physicians_original_df.pri_spec.unique()

array(['CHIROPRACTIC', 'OPTOMETRY', 'CLINICAL SOCIAL WORKER',
       'FAMILY MEDICINE', 'HOSPITALIST', 'PHYSICAL THERAPY',
       'ORTHOPEDIC SURGERY', 'GENERAL PRACTICE', 'PODIATRY',
       'INTERNAL MEDICINE', 'ALLERGY/IMMUNOLOGY',
       'REGISTERED DIETITIAN OR NUTRITION PROFESSIONAL',
       'PSYCHOLOGIST, CLINICAL', 'QUALIFIED AUDIOLOGIST', 'OPHTHALMOLOGY',
       'NURSE PRACTITIONER', 'ORAL SURGERY', 'OCCUPATIONAL THERAPY',
       'INFECTIOUS DISEASE', 'GENERAL SURGERY', 'DERMATOLOGY',
       'OBSTETRICS/GYNECOLOGY', 'SLEEP MEDICINE',
       'PHYSICAL MEDICINE AND REHABILITATION',
       'CERTIFIED REGISTERED NURSE ANESTHETIST (CRNA)', 'OTOLARYNGOLOGY',
       'ENDOCRINOLOGY', 'PHYSICIAN ASSISTANT', 'PSYCHIATRY',
       'ANESTHESIOLOGY', 'NEPHROLOGY', 'PULMONARY DISEASE',
       'DIAGNOSTIC RADIOLOGY', 'PATHOLOGY', 'INTERVENTIONAL RADIOLOGY',
       'UROLOGY', 'SPORTS MEDICINE',
       'QUALIFIED SPEECH LANGUAGE PATHOLOGIST',
       'INTERVENTIONAL CARDIOLOGY', 'PAIN MANAGEMENT'

In [10]:
# Filter by general practice / primary care
physicians_filtered_df = physicians_original_df.loc[
    (physicians_original_df["pri_spec"] == 'FAMILY MEDICINE') |
    (physicians_original_df["pri_spec"] == 'NURSE PRACTITIONER') |
    (physicians_original_df["pri_spec"] == 'GENERAL PRACTICE') |
    (physicians_original_df["pri_spec"] == 'PREVENTATIVE MEDICINE') |
    (physicians_original_df["pri_spec"] == 'EMERGENCY MEDICINE') |
    (physicians_original_df["pri_spec"] == 'PHYSICIAN ASSISTANT') |
    (physicians_original_df["pri_spec"] == 'INTERNAL MEDICINE') |
    (physicians_original_df["pri_spec"] == 'PEDIATRIC MEDICINE') |
    (physicians_original_df["pri_spec"] == 'OBSTETRICS/GYNECOLOGY'
)]
physicians_filtered_df

Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn
7,1215283908,1998,FAMILY MEDICINE,MIAMI,FL,331673407,Y,M
19,1215257605,2007,GENERAL PRACTICE,CAMUY,PR,00627,Y,M
22,1215248273,2010,INTERNAL MEDICINE,BRAWLEY,CA,922277755,Y,M
31,1215258017,2010,INTERNAL MEDICINE,OKLAHOMA CITY,OK,731067216,Y,M
39,1215223664,2011,NURSE PRACTITIONER,MCCONNELSVILLE,OH,437562200,Y,M
...,...,...,...,...,...,...,...,...
2375838,1235603606,2018,NURSE PRACTITIONER,OKLAHOMA CITY,OK,731208175,Y,Y
2375839,1497033740,2011,NURSE PRACTITIONER,OKLAHOMA CITY,OK,731208175,Y,Y
2375853,1316146939,1996,PHYSICIAN ASSISTANT,ANCHORAGE,AK,995086804,Y,Y
2375855,1487956637,2010,PHYSICIAN ASSISTANT,CHILLICOTHE,OH,456012535,M,M


In [11]:
# Check unique names for Primary Specialty after filtering
physicians_filtered_df.pri_spec.unique()

array(['FAMILY MEDICINE', 'GENERAL PRACTICE', 'INTERNAL MEDICINE',
       'NURSE PRACTITIONER', 'OBSTETRICS/GYNECOLOGY',
       'PHYSICIAN ASSISTANT', 'EMERGENCY MEDICINE', 'PEDIATRIC MEDICINE',
       'PREVENTATIVE MEDICINE'], dtype=object)

In [12]:
# Create series of "zip"
physicians_zip_s = physicians_filtered_df["zip"]
physicians_zip_s

7          331673407
19             00627
22         922277755
31         731067216
39         437562200
             ...    
2375838    731208175
2375839    731208175
2375853    995086804
2375855    456012535
2375856    456012535
Name: zip, Length: 1006814, dtype: object

In [13]:
# Check number of digits for each zip
physicians_zip_len = physicians_zip_s.map(len)
physicians_zip_len

7          9
19         5
22         9
31         9
39         9
          ..
2375838    9
2375839    9
2375853    9
2375855    9
2375856    9
Name: zip, Length: 1006814, dtype: int64

In [14]:
# Count of digit lengths
physicians_zip_len.value_counts()

9    1000804
5       6010
Name: zip, dtype: int64

In [15]:
# Apply function to standardize number of zip digits
physicians_zip_five_s = physicians_zip_s.map(first_five)
physicians_zip_five_s

7          33167
19         00627
22         92227
31         73106
39         43756
           ...  
2375838    73120
2375839    73120
2375853    99508
2375855    45601
2375856    45601
Name: zip, Length: 1006814, dtype: object

In [16]:
# Apply function to standardize number of zip digits
physicians_filtered_df["zip"] = physicians_zip_five_s
physicians_filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn
7,1215283908,1998,FAMILY MEDICINE,MIAMI,FL,33167,Y,M
19,1215257605,2007,GENERAL PRACTICE,CAMUY,PR,00627,Y,M
22,1215248273,2010,INTERNAL MEDICINE,BRAWLEY,CA,92227,Y,M
31,1215258017,2010,INTERNAL MEDICINE,OKLAHOMA CITY,OK,73106,Y,M
39,1215223664,2011,NURSE PRACTITIONER,MCCONNELSVILLE,OH,43756,Y,M
...,...,...,...,...,...,...,...,...
2375838,1235603606,2018,NURSE PRACTITIONER,OKLAHOMA CITY,OK,73120,Y,Y
2375839,1497033740,2011,NURSE PRACTITIONER,OKLAHOMA CITY,OK,73120,Y,Y
2375853,1316146939,1996,PHYSICIAN ASSISTANT,ANCHORAGE,AK,99508,Y,Y
2375855,1487956637,2010,PHYSICIAN ASSISTANT,CHILLICOTHE,OH,45601,M,M


In [17]:
# Check number of digits for each zip
physicians_zip_len = physicians_zip_s.map(len)
physicians_zip_len

7          5
19         5
22         5
31         5
39         5
          ..
2375838    5
2375839    5
2375853    5
2375855    5
2375856    5
Name: zip, Length: 1006814, dtype: int64

In [18]:
# Count of digit lengths
physicians_zip_len.value_counts()

5    1006814
Name: zip, dtype: int64

In [19]:
# Find null values
for column in physicians_filtered_df.columns:
    print(f"Column {column} has {physicians_filtered_df[column].isnull().sum()} null values")

Column npi has 0 null values
Column grd_yr has 560 null values
Column pri_spec has 0 null values
Column cty has 0 null values
Column st has 0 null values
Column zip has 0 null values
Column ind_assgn has 0 null values
Column grp_assgn has 0 null values


### Import zip_data_df to add county columns to data

In [20]:
# Set dtypes before reading in csv
zipcode_dtypes = {"zip": str, "county_fips": str}

In [21]:
# Read in csv and create DataFrame
zip_data_df = pd.read_csv("Resources/zip_data_df.csv", dtype=zipcode_dtypes)
zip_data_df

Unnamed: 0.1,Unnamed: 0,zip,state_id,county_fips,county_name
0,0,00601,PR,72001,Adjuntas
1,1,00602,PR,72003,Aguada
2,2,00603,PR,72005,Aguadilla
3,3,00606,PR,72093,Maricao
4,4,00610,PR,72011,Añasco
...,...,...,...,...,...
32777,33114,99922,AK,02198,Prince of Wales-Hyder
32778,33115,99923,AK,02198,Prince of Wales-Hyder
32779,33116,99925,AK,02198,Prince of Wales-Hyder
32780,33117,99926,AK,02198,Prince of Wales-Hyder


In [22]:
# Check dtypes
zip_data_df.dtypes

Unnamed: 0      int64
zip            object
state_id       object
county_fips    object
county_name    object
dtype: object

In [23]:
# Drop Unnamed column
zip_data_df.drop(columns=["Unnamed: 0"], inplace=True)
zip_data_df

Unnamed: 0,zip,state_id,county_fips,county_name
0,00601,PR,72001,Adjuntas
1,00602,PR,72003,Aguada
2,00603,PR,72005,Aguadilla
3,00606,PR,72093,Maricao
4,00610,PR,72011,Añasco
...,...,...,...,...
32777,99922,AK,02198,Prince of Wales-Hyder
32778,99923,AK,02198,Prince of Wales-Hyder
32779,99925,AK,02198,Prince of Wales-Hyder
32780,99926,AK,02198,Prince of Wales-Hyder


### Merge physician data with zip data

In [24]:
# Merge county and zip code
physicians_merge_df = physicians_filtered_df.merge(zip_data_df, left_on="zip", right_on="zip")
physicians_merge_df

Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn,state_id,county_fips,county_name
0,1215283908,1998,FAMILY MEDICINE,MIAMI,FL,33167,Y,M,FL,12086,Miami-Dade
1,1053365692,1983,INTERNAL MEDICINE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade
2,1043627128,1984,GENERAL PRACTICE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade
3,1043627128,1984,GENERAL PRACTICE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade
4,1366581118,1979,GENERAL PRACTICE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade
...,...,...,...,...,...,...,...,...,...,...,...
978471,1073730669,2005,FAMILY MEDICINE,NEGAUNEE,MI,49866,Y,Y,MI,26103,Marquette
978472,1255322301,1973,FAMILY MEDICINE,NEGAUNEE,MI,49866,Y,Y,MI,26103,Marquette
978473,1598711368,2004,PHYSICIAN ASSISTANT,NEGAUNEE,MI,49866,Y,Y,MI,26103,Marquette
978474,1205327202,2017,PHYSICIAN ASSISTANT,WEST DEPTFORD,NJ,08051,Y,Y,NJ,34015,Gloucester


In [25]:
# Add column calculating total number of doctors per GeoFips
# Use transform with count?
physicians_merge_df["doctor_count"] = physicians_merge_df.groupby(["county_fips"])["county_fips"].transform("count")
physicians_merge_df

Unnamed: 0,npi,grd_yr,pri_spec,cty,st,zip,ind_assgn,grp_assgn,state_id,county_fips,county_name,doctor_count
0,1215283908,1998,FAMILY MEDICINE,MIAMI,FL,33167,Y,M,FL,12086,Miami-Dade,13619
1,1053365692,1983,INTERNAL MEDICINE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade,13619
2,1043627128,1984,GENERAL PRACTICE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade,13619
3,1043627128,1984,GENERAL PRACTICE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade,13619
4,1366581118,1979,GENERAL PRACTICE,MIAMI,FL,33167,Y,Y,FL,12086,Miami-Dade,13619
...,...,...,...,...,...,...,...,...,...,...,...,...
978471,1073730669,2005,FAMILY MEDICINE,NEGAUNEE,MI,49866,Y,Y,MI,26103,Marquette,313
978472,1255322301,1973,FAMILY MEDICINE,NEGAUNEE,MI,49866,Y,Y,MI,26103,Marquette,313
978473,1598711368,2004,PHYSICIAN ASSISTANT,NEGAUNEE,MI,49866,Y,Y,MI,26103,Marquette,313
978474,1205327202,2017,PHYSICIAN ASSISTANT,WEST DEPTFORD,NJ,08051,Y,Y,NJ,34015,Gloucester,542


In [26]:
# Drop irrelevant columns
physicians_merge_df.drop(columns=["npi", "grd_yr", "pri_spec", "cty", "st", 
                                  "zip", "ind_assgn", "grp_assgn", "state_id", "county_name"], inplace=True)
physicians_merge_df

Unnamed: 0,county_fips,doctor_count
0,12086,13619
1,12086,13619
2,12086,13619
3,12086,13619
4,12086,13619
...,...,...
978471,26103,313
978472,26103,313
978473,26103,313
978474,34015,542


In [27]:
# Drop duplicates
physicians_merge_df = physicians_merge_df.drop_duplicates()
physicians_merge_df

Unnamed: 0,county_fips,doctor_count
0,12086,13619
12,72027,20
32,06025,236
101,40109,2914
108,39115,19
...,...,...
978090,06051,30
978127,48169,2
978219,35021,1
978335,29133,14


In [28]:
# Saving as output
physicians_merge_df.to_csv("Resources/physicians_merge_df.csv")