# _Physician Compare National: Explore #9_

This notebook is a continuation from of my analysis on the following data gathered via [Data.Medicare.gov](https://data.medicare.gov/Physician-Compare/Physician-Compare-National-Downloadable-File/mj5m-pzi6). It contains general information about individual eligible professionals (EPs) such as demographic information and Medicare quality program participation. This dataset is updated twice a month with the most current demographic information available at that time.

# _Today's Goal(s)_

Continue exploring how to loop through all the cities and extract the correct zip code for each one.

In [1]:
from datetime import datetime

# current date and time
now = datetime.now()

# timestamp to signify the beginning of work
print("Work started: ", now)

Work started:  2019-10-05 15:42:14.445020


In [2]:
# first thing we need to do --> load in the data
# import pandas
import pandas as pd
pd.options.display.max_columns = None
%load_ext autoreload
%autoreload 2

# import data from yesterday
data = pd.read_csv('physician_compare_national-updates-2.csv', low_memory=False);

# inspect the first five rows
data.head()

Unnamed: 0,npi,ind_pac_id,ind_enrl_id,full_nm,gndr,cred,med_sch,grd_yr,pri_spec,sec_spec_1,sec_spec_2,sec_spec_3,sec_spec_4,sec_spec_all,org_lgl_nm,org_pac_id,num_org_mem,full_adr,ln_2_sprs,cty,st,zip,phn_numbr,hosp_afl_1,hosp_afl_lbn_1,hosp_afl_2,hosp_afl_lbn_2,hosp_afl_3,hosp_afl_lbn_3,hosp_afl_4,hosp_afl_lbn_4,hosp_afl_5,hosp_afl_lbn_5,assgn
0,1003000126,7517003643,I20130530000085,ARDALAN ENKESHAFI,M,Not Listed,OTHER,1994,INTERNAL MEDICINE,,,,,,EMERGENCY MEDICINE ASSOCIATES PA PC,8022915000.0,182,1850 TOWN CTR PKWY,N,RESTON,VA,201903219,7036899000.0,490112.0,CJW MEDICAL CENTER,210028.0,MEDSTAR SAINT MARY'S HOSPITAL,,,,,,,Y
1,1003000126,7517003643,I20130530000085,ARDALAN ENKESHAFI,M,Not Listed,OTHER,1994,INTERNAL MEDICINE,,,,,,EMERGENCY MEDICINE ASSOCIATES PA PC,8022915000.0,182,1701 N GEORGE MASON DR,N,ARLINGTON,VA,222053610,7035586000.0,490112.0,CJW MEDICAL CENTER,210028.0,MEDSTAR SAINT MARY'S HOSPITAL,,,,,,,Y
2,1003000126,7517003643,I20150824000105,ARDALAN ENKESHAFI,M,Not Listed,OTHER,1994,INTERNAL MEDICINE,,,,,,EMERGENCY MEDICINE ASSOCIATES PA PC,8022915000.0,182,24440 STONE SPRINGS BLVD,N,DULLES,VA,201662247,5713674000.0,490112.0,CJW MEDICAL CENTER,210028.0,MEDSTAR SAINT MARY'S HOSPITAL,,,,,,,Y
3,1003000126,7517003643,I20150824000105,ARDALAN ENKESHAFI,M,Not Listed,OTHER,1994,INTERNAL MEDICINE,,,,,,SOUTHEASTERN INTENSIVIST SERVICES PC,9335152000.0,133,1401 JOHNSTON WILLIS DR,N,NORTH CHESTERFIELD,VA,232354730,8044835000.0,490112.0,CJW MEDICAL CENTER,210028.0,MEDSTAR SAINT MARY'S HOSPITAL,,,,,,,Y
4,1003000126,7517003643,I20150824000105,ARDALAN ENKESHAFI,M,Not Listed,OTHER,1994,INTERNAL MEDICINE,,,,,,SOUTHEASTERN INTENSIVIST SERVICES PC,9335152000.0,133,411 W RANDOLPH RD,N,HOPEWELL,VA,238602938,8045412000.0,490112.0,CJW MEDICAL CENTER,210028.0,MEDSTAR SAINT MARY'S HOSPITAL,,,,,,,Y


In [3]:
# what are the unique lengths of the zip codes?
print('Length of zip codes take the following values: {}'.format(sorted(data['zip'].str.len().unique())))

Length of zip codes take the following values: [3, 4, 5, 7, 8, 9]


In [4]:
# what are the value counts for each length of the zip codes?
data['zip'].str.len().value_counts()

9    2059140
8     132917
5      13060
7       3039
3       1710
4        924
Name: zip, dtype: int64

In [5]:
# create subset of data where zip code is length 4
length_4 = data[data['zip'].str.len() == 4]; length_4.head(6)

Unnamed: 0,npi,ind_pac_id,ind_enrl_id,full_nm,gndr,cred,med_sch,grd_yr,pri_spec,sec_spec_1,sec_spec_2,sec_spec_3,sec_spec_4,sec_spec_all,org_lgl_nm,org_pac_id,num_org_mem,full_adr,ln_2_sprs,cty,st,zip,phn_numbr,hosp_afl_1,hosp_afl_lbn_1,hosp_afl_2,hosp_afl_lbn_2,hosp_afl_3,hosp_afl_lbn_3,hosp_afl_4,hosp_afl_lbn_4,hosp_afl_5,hosp_afl_lbn_5,assgn
102,1003003088,4981791837,I20071025000015,AVANI VORA,F,Not Listed,OTHER,2010,PHYSICAL THERAPY,,,,,,JERSEY PHYSICAL THERAPY ASSOCIATES LLC,547240300.0,6,3228 ROUTE 27,N,KENDALL PARK,NJ,8824,7322970000.0,,,,,,,,,,,Y
339,1003011065,4082888722,I20111115000827,LAURA M LEI,F,Not Listed,UMDNJ-NEW JERSEY MEDICAL SCHOOL,2007,ANESTHESIOLOGY,,,,,,ANESTHESIA CONSULTANTS OF NEW JERSEY LLC,3375449000.0,65,SOMERSET MED CTR 110 REHILL AVENUE,N,SOMERVILLE,NJ,8876,9086852000.0,310048.0,ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL - SOME...,,,,,,,,,Y
1183,1003030651,9739230780,I20161107000694,SALVA BILAL,F,Not Listed,OTHER,1996,FAMILY MEDICINE,,,,,,ELLIOT PROFESSIONAL SERVICES,6103728000.0,227,ONE ELLIOT WAY,N,MANCHESTER,NH,3103,6036634000.0,300012.0,ELLIOT HOSPITAL,300034.0,CATHOLIC MEDICAL CENTER,,,,,,,Y
1242,1003033390,1759474927,I20070904000383,BRIAN C CAMBI,M,Not Listed,STATE UNIVERSITY OF NEW YORK AT BUFFALO SCHOOL...,1999,INTERVENTIONAL CARDIOLOGY,CARDIOVASCULAR DISEASE (CARDIOLOGY),,,,CARDIOVASCULAR DISEASE (CARDIOLOGY),NORTHEAST MEDICAL GROUP INC,1254234000.0,1042,196 PKWY S SUITE 103,N,WATERFORD,CT,6385,8604434000.0,70007.0,LAWRENCE & MEMORIAL HOSPITAL,70022.0,YALE-NEW HAVEN HOSPITAL,410013.0,WESTERLY HOSPITAL,,,,,Y
1462,1003041427,6103953765,I20100430000016,LAURIE A. FORTY,F,Not Listed,OTHER,2007,NURSE PRACTITIONER,,,,,,"COMMUNITY HEALTH CARE, INC.",9335040000.0,42,251 BROAD ST,N,BRIDGETON,NJ,8302,8564531000.0,310032.0,INSPIRA MEDICAL CENTER VINELAND,,,,,,,,,Y
1578,1003043209,648424283,I20170517002241,MARJORIE E AFFEL,F,Not Listed,OTHER,2009,FAMILY MEDICINE,,,,,,"COMMUNITY HEALTH CARE, INC.",9335040000.0,42,251 BROAD ST,N,BRIDGETON,NJ,8302,8564531000.0,310081.0,INSPIRA MEDICAL CENTER WOODBURY,,,,,,,,,Y


In [20]:
# reset index so numbers aren't off
length4 = length_4.reset_index(drop=True)

In [61]:
# load zip code library
from uszipcode import SearchEngine

# create search object for zip code look-up
search = SearchEngine(simple_zipcode=True)

In [36]:
# list for tuples
tuple_list = []

for row in length4[['cty', 'st']].head(15).itertuples(index=True, name=None):
    tuple_list.append(row)

In [37]:
tuple_list

[(0, 'KENDALL PARK', 'NJ'),
 (1, 'SOMERVILLE', 'NJ'),
 (2, 'MANCHESTER', 'NH'),
 (3, 'WATERFORD', 'CT'),
 (4, 'BRIDGETON', 'NJ'),
 (5, 'BRIDGETON', 'NJ'),
 (6, 'MAHWAH', 'NJ'),
 (7, 'MAHWAH', 'NJ'),
 (8, 'MAHWAH', 'NJ'),
 (9, 'CONCORD', 'MA'),
 (10, 'OAK BLUFFS', 'MA'),
 (11, 'CAMBRIDGE', 'MA'),
 (12, 'HYANNIS', 'MA'),
 (13, 'MARSTON MILLS', 'MA'),
 (14, 'MASHPEE', 'MA')]

In [40]:
# how to extract city name
print(tuple_list[0][1])

# how to extract state
print(tuple_list[0][2])

KENDALL PARK
NJ


In [46]:
for tup in tuple_list:
    print(tup[1], tup[2])

KENDALL PARK NJ
SOMERVILLE NJ
MANCHESTER NH
WATERFORD CT
BRIDGETON NJ
BRIDGETON NJ
MAHWAH NJ
MAHWAH NJ
MAHWAH NJ
CONCORD MA
OAK BLUFFS MA
CAMBRIDGE MA
HYANNIS MA
MARSTON MILLS MA
MASHPEE MA


In [47]:
for tup in tuple_list:
    print(search.by_city_and_state(city=tup[1], state=tup[2])[0].to_dict()['zipcode'])

08824
08876
03101
06385
08302
08302
07430
07430
07430
01742
02557
02138
02601
02648
02649


In [49]:
# same as above but with all rows
all_rows = []

for row in length4[['cty', 'st']].itertuples(index=True, name=None):
    all_rows.append(row)

In [51]:
# what is the length of all_rows?
print(len(all_rows))

924


In [52]:
# make sure this worked, loop through last ~10 observations in list
for tup in all_rows[914:924]:
    print(tup[1], tup[2])

NEPTUNE NJ
CHERRY HILL NJ
WEST SPRINGFIELD MA
CRANSTON RI
SIMSBURY CT
CUMBERLAND RI
MORRISVILLE VT
WESTFIELD NJ
SCARBOROUGH ME
BUCKSPORT ME


In [53]:
for tup in all_rows[914:924]:
    print(search.by_city_and_state(city=tup[1], state=tup[2])[0].to_dict()['zipcode'])

07753
08002
01089
02910
06070
02864
05661
07090
04074
04416


In [67]:
for i in range(len(all_rows[900:])):
    print(search.by_city_and_state(city=tuple_list[i][1], state=tuple_list[i][2])[0].to_dict()['zipcode'])

08824
08876
03101
06385
08302
08302
07430
07430
07430
01742
02557
02138
02601
02648
02649


IndexError: list index out of range

In [79]:
import time
# create search object for zip code look-up
search = SearchEngine(simple_zipcode=True)

with SearchEngine(simple_zipcode=True) as search:
    for row in length4[['cty', 'st']].head(25).itertuples(index=True, name=None):
        print(search.by_city_and_state(city=row[1], state=row[2])[0].values()[0])
        time.sleep(1)

08824
08876
03101
06385
08302
08302
07430
07430
07430
01742
02557
02138
02601
02648
02649
07722
08002
08102
08807
06426


IndexError: list index out of range

In [81]:
import time
# create search object for zip code look-up
search = SearchEngine(simple_zipcode=True)

with SearchEngine(simple_zipcode=True) as search:
    for n in range(len(length4.head(25))):
        print(search.by_city_and_state(city=length4.loc[n, 'cty'], state=length4.loc[n, 'st'])[0].values()[0])
        time.sleep(1)

08824
08876
03101
06385
08302
08302
07430
07430
07430
01742
02557
02138
02601
02648
02649
07722
08002
08102
08807
06426


IndexError: list index out of range

In [57]:
# lets combine all these together and return the associated zip code for cty, st combo
all_zips = []

for tup in all_rows:
    z = search.by_city_and_state(city=tup[1], state=tup[2])[0].to_dict()['zipcode']
    all_zips.append(z)

IndexError: list index out of range

In [62]:
# define function that gathers city and state and returns zipcode info
def zipcode(row):
    zipcode = search.by_city_and_state(city=row['cty'], state=row['st'])[0].to_dict()['zipcode']
    return zipcode

In [64]:
# test to see if we can use apply with our recently defined function to get zipcode
length4.head().apply(zipcode, axis=1)

0    08824
1    08876
2    03101
3    06385
4    08302
dtype: object

In [65]:
# see if this will work on enture dataframe
length4.apply(zipcode, axis=1)

IndexError: ('list index out of range', 'occurred at index 20')

In [82]:
# current date and time
now = datetime.now()

# timestamp to signify the beginning of work
print("Work ended: ", now)

Work ended:  2019-10-05 17:09:23.716091
