In [1]:
# download physician NPI number from NPPES API
import string
import funcy
import numpy as np
import pandas as pd
import os
import requests
import json
from fuzzywuzzy import fuzz

from dev import APP_DATA_DIR, NAME_COLS
from data_cleaning_functions import remove_punc

NPI_API_URL = 'https://npiregistry.cms.hhs.gov/api/'

In [2]:
def search(fname, lname, state=None):
    search_params = {'first_name': str(fname),
                    'last_name': str(lname),
                    'country_code': 'US',
                    'pretty': True}
    if state:
        search_params = funcy.merge(search_params, {'state': str(state)})
    r = requests.get(NPI_API_URL, params=search_params)
    if r.status_code==200:
        return r.json()
    print r.status_code
    return 

In [3]:
def check_middle_name(dr_middle_name, json):
    json_middle_name = funcy.get_in(json, ['basic', 'middle_name'], None)
    if not json_middle_name:
        return True
    dr_middle_name = str(dr_middle_name).upper().strip()
    json_middle_name = str(json_middle_name).upper().strip()
    one_letter = (len(dr_middle_name)==1 or len(json_middle_name)==1)
    if one_letter:
        # check first letter of middle names
        return (fuzz.ratio(dr_middle_name[0], json_middle_name[0]) > 95)
    return (fuzz.ratio(dr_middle_name, json_middle_name) > 90)

def clean_cred(cred):
    c2 = remove_punc(cred)
    c3 = c2.split(' ')
    return ''.join(c3)
    
def check_credential(json):
    cred = funcy.get_in(json, ['basic', 'credential'], None)
    if cred:
        creds = cred.split(' ')
        creds = map(clean_cred, creds)
        print creds
        return any(map(lambda x: x in ['MD', 'DDS', 'PHD'], creds))
    return True


def parse_results(results_json, dr_middle_name=None):
    if results_json['result_count'] == 0:
        return (0, None)
    results = results_json['results']
    males = filter(lambda x: x['basic']['gender']=='M', results)
    print len(males)
    mds = filter(check_credential, males)
    print len(mds)
    if dr_middle_name:
        mds = filter(lambda x: check_middle_name(dr_middle_name, x), mds)
    return (len(mds), mds)

In [4]:
df = pd.read_csv(os.path.join(APP_DATA_DIR, 'fuzzy_all_apps_plus_NIH_info.csv'))


In [9]:
df.loc[df.clean_first_name=='HOWARD UNIVERSITY COLLEGE OF MEDICINE', 'clean_first_name'] = 'HOWARD'
df.loc[pd.isnull(df.clean_middle_name), 'clean_middle_name'] = 'None'

In [10]:
npi_res = []
tups = list(df[['person_uuid', 'clean_first_name', 'clean_middle_name', 'clean_last_name']].itertuples())

In [11]:
for t in tups:
    if pd.isnull(t.clean_first_name):
        continue
    res = search(t.clean_first_name, t.clean_last_name)
    parsed = parse_results(res, t.clean_middle_name)
    npi_res.append((t.person_uuid, ' '.join([t.clean_first_name, t.clean_middle_name, t.clean_last_name]), parsed))

1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
5
['PHD']
['MD']
['PHD']
['LCSW']
4
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
9
['MD']
['CRNA']
['MS']
['MD', 'FACS']
['MD']
['MD']
['CRNA']
['MD']
['MD']
6
1
['MD']
1
1
['MD']
1
10
['MD']
['DPM']
['PHD', '', 'MD']
['MS', 'CCCSLP']
['PHD', 'CCC', 'SLP']
['MD']
['MD']
['MD']
['PHARMACIST']
7
10
['DDS']
['MD']
['DDS']
['MD', 'FAAP']
['MD']
['MD']
['DC']
['DDS']
['DDS']
['MD']
9
3
['PHD', 'MD']
['MD']
['MD']
3
1
['MD']
1
3
['MD']
['MD']
['MD']
3
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
1
1
['MD']
1
2
['MD']
['COTAL']
1
2
['MD']
['MD', 'PHD']
2
3
['CRNA']
['MD']
['MD']
2
4
['MD']
['MD']
['MD']
['DC']
3
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
1
['MD']
1
10
['MD']
['MD']
['MD']
['MD']
['MD']
['MD']
['MD']
['MD']
['DMD']
['MD']
9
10
['MD']
['MD']
['MD']
['DO']
['MD']
['MD']
['MD']
['OD']
['MD']
['MD']
8
10
['CRNA']
['MD']
['M', 'D']
['DMD']
['DDS']
['CRNA']
[

In [12]:
npi_res[0]

(3497.0,
 'ROY KENNETH AARON',
 (1,
  [{u'addresses': [{u'address_1': u'100 BUTLER DR',
      u'address_2': u'',
      u'address_purpose': u'LOCATION',
      u'address_type': u'DOM',
      u'city': u'PROVIDENCE',
      u'country_code': u'US',
      u'country_name': u'United States',
      u'postal_code': u'029064862',
      u'state': u'RI',
      u'telephone_number': u'401-274-9660'},
     {u'address_1': u'PO BOX 1119',
      u'address_2': u'',
      u'address_purpose': u'MAILING',
      u'address_type': u'DOM',
      u'city': u'PROVIDENCE',
      u'country_code': u'US',
      u'country_name': u'United States',
      u'postal_code': u'029011119',
      u'state': u'RI',
      u'telephone_number': u'401-274-9660'}],
    u'basic': {u'credential': u'M.D.',
     u'enumeration_date': u'2006-06-07',
     u'first_name': u'ROY',
     u'gender': u'M',
     u'last_name': u'AARON',
     u'last_updated': u'2014-08-25',
     u'middle_name': u'KENNETH',
     u'name': u'AARON ROY',
     u'sole_proprie

In [13]:
no_npi = filter(lambda x: x[2][0]==0, npi_res) 
one_npi = filter(lambda x: x[2][0]==1, npi_res) 
multiple_npi = filter(lambda x: x[2][0]>1, npi_res) 

In [14]:
one_npi_numbers = {x[0]: x[2][1][0]['number'] for x in one_npi}
multiple_npi_numbers = {x[0]: map(lambda y: y['number'], x[2][1]) for x in multiple_npi}

In [15]:
one_npi_numbers[1]

1801850656

In [16]:
df['NPI'] = np.nan

In [17]:
' | '.join(['asd'])

'asd'

In [18]:
def get_npi_number(person_uuid):
    npi = one_npi_numbers.get(person_uuid, None)
    if not npi:
        npi = multiple_npi_numbers.get(person_uuid, None)
    if not npi:
        return np.nan
    if isinstance(npi, list):
        return ' | '.join(map(str, npi))
    return str(npi)

def get_num_npi_res(npi_str):
    if pd.isnull(npi_str):
        return 0
    return len(npi_str.split(' | '))

In [19]:
df['NPI'] = df.person_uuid.apply(get_npi_number)
df['number_NPI_results'] = df.NPI.apply(get_num_npi_res)

In [20]:
df.head()

Unnamed: 0,dno,person_uuid,application_year_min,application_year_max,eod_year,clean_first_name,clean_middle_name,clean_last_name,control_flag,time_period_flag,...,supervisor,teaching,to_drop,undergrad_year_grad,undergraduate_school,withdrawal,year_grad,zip_code,NPI,number_NPI_results
0,1.0,3497.0,1970.0,1970.0,1972.0,ROY,KENNETH,AARON,0,1,...,,1.0,False,,,-9.0,1969.0,11694.0,1336189661,1
1,2.0,3743.0,1966.0,1966.0,1967.0,STUART,ALAN,AARONSON,0,1,...,Stephenson,1.0,False,,,-9.0,1966.0,,1336513100,1
2,3.0,4127.0,1965.0,1965.0,1965.0,JON,MORTON,AASE,0,1,...,,,False,,,-9.0,,,1790744316,1
3,,2243.0,1971.0,1971.0,,LELAND,RUSSELL,ABBEY,1,1,...,,1.0,False,,,0.0,,,1801906383,1
4,5.0,1418.0,1962.0,1962.0,1964.0,HOWARD,R,ABEL,0,0,...,Bailar,0.0,False,,,-9.0,1962.0,,1750496568,1


In [22]:
df.to_csv(os.path.join(APP_DATA_DIR, 'NPI_df.csv'))

In [23]:
df2 = df[NAME_COLS + ['medical_school', 'medschool_year_grad', 'control_flag', 'time_period_flag', 'person_uuid', 
    'dno', 'is_female', 'is_foreign', 'eod_year', 'application_year_max']]

In [24]:
df2.to_csv(os.path.join(APP_DATA_DIR, 'NPI_info.csv'), index=False)

In [88]:
print len(no_npi)
print len(one_npi)
print len(multiple_npi)

1168
1949
500


In [50]:
res = search('Jacob', 'ackerman')

parse_results(res, 'lewis')

1
1
middle
{u'addresses': [{u'city': u'BROOKLYN', u'address_2': u'', u'telephone_number': u'718-968-8700', u'fax_number': u'718-968-8743', u'state': u'NY', u'postal_code': u'11234', u'address_1': u'1901 UTICA AVE', u'country_code': u'US', u'country_name': u'United States', u'address_type': u'DOM', u'address_purpose': u'LOCATION'}, {u'city': u'BROOKLYN', u'address_2': u'', u'telephone_number': u'718-968-8700', u'fax_number': u'718-968-8743', u'state': u'NY', u'postal_code': u'11234', u'address_1': u'1901 UTICA AVE', u'country_code': u'US', u'country_name': u'United States', u'address_type': u'DOM', u'address_purpose': u'MAILING'}], u'created_epoch': 1158278400, u'identifiers': [{u'issuer': u'NHIC', u'state': u'NY', u'code': u'01', u'identifier': u'0147830001', u'desc': u'Other'}, {u'issuer': u'', u'state': u'NY', u'code': u'04', u'identifier': u'5770100', u'desc': u'MEDICARE ID-Type Unspecified'}, {u'issuer': u'RAILROAD MEDICARE', u'state': u'NY', u'code': u'01', u'identifier': u'180029

(1,
 [{u'addresses': [{u'address_1': u'1901 UTICA AVE',
     u'address_2': u'',
     u'address_purpose': u'LOCATION',
     u'address_type': u'DOM',
     u'city': u'BROOKLYN',
     u'country_code': u'US',
     u'country_name': u'United States',
     u'fax_number': u'718-968-8743',
     u'postal_code': u'11234',
     u'state': u'NY',
     u'telephone_number': u'718-968-8700'},
    {u'address_1': u'1901 UTICA AVE',
     u'address_2': u'',
     u'address_purpose': u'MAILING',
     u'address_type': u'DOM',
     u'city': u'BROOKLYN',
     u'country_code': u'US',
     u'country_name': u'United States',
     u'fax_number': u'718-968-8743',
     u'postal_code': u'11234',
     u'state': u'NY',
     u'telephone_number': u'718-968-8700'}],
   u'basic': {u'credential': u'MD',
    u'enumeration_date': u'2006-09-15',
    u'first_name': u'JACOB',
    u'gender': u'M',
    u'last_name': u'ACKERMAN',
    u'last_updated': u'2012-04-03',
    u'middle_name': u'L',
    u'name': u'ACKERMAN JACOB',
    u'sole_

In [38]:
res

{u'result_count': 1,
 u'results': [{u'addresses': [{u'address_1': u'1901 UTICA AVE',
     u'address_2': u'',
     u'address_purpose': u'LOCATION',
     u'address_type': u'DOM',
     u'city': u'BROOKLYN',
     u'country_code': u'US',
     u'country_name': u'United States',
     u'fax_number': u'718-968-8743',
     u'postal_code': u'11234',
     u'state': u'NY',
     u'telephone_number': u'718-968-8700'},
    {u'address_1': u'1901 UTICA AVE',
     u'address_2': u'',
     u'address_purpose': u'MAILING',
     u'address_type': u'DOM',
     u'city': u'BROOKLYN',
     u'country_code': u'US',
     u'country_name': u'United States',
     u'fax_number': u'718-968-8743',
     u'postal_code': u'11234',
     u'state': u'NY',
     u'telephone_number': u'718-968-8700'}],
   u'basic': {u'credential': u'MD',
    u'enumeration_date': u'2006-09-15',
    u'first_name': u'JACOB',
    u'gender': u'M',
    u'last_name': u'ACKERMAN',
    u'last_updated': u'2012-04-03',
    u'middle_name': u'L',
    u'name': u

In [11]:
df.head(20)

Unnamed: 0,dno,person_uuid,application_year_min,application_year_max,eod_year,clean_first_name,clean_middle_name,clean_last_name,control_flag,time_period_flag,...,ssn,state,supervisor,teaching,to_drop,undergrad_year_grad,undergraduate_school,withdrawal,year_grad,zip_code
0,1.0,3497.0,1970.0,1970.0,1972.0,ROY,KENNETH,AARON,0,1,...,,New York,,1.0,False,,,-9.0,1969.0,11694.0
1,2.0,3743.0,1966.0,1966.0,1967.0,STUART,ALAN,AARONSON,0,1,...,,California,Stephenson,1.0,False,,,-9.0,1966.0,
2,3.0,4127.0,1965.0,1965.0,1965.0,JON,MORTON,AASE,0,1,...,,Washington,,,False,,,-9.0,,
3,,2243.0,1971.0,1971.0,,LELAND,RUSSELL,ABBEY,1,1,...,,New York,,1.0,False,,,0.0,,
4,5.0,1418.0,1962.0,1962.0,1964.0,HOWARD,R,ABEL,0,0,...,,New York,Bailar,0.0,False,,,-9.0,1962.0,
5,6.0,3466.0,1966.0,1966.0,1968.0,RONALD,M,ABEL,0,1,...,,New York,Morrow,1.0,False,,,-9.0,1966.0,
6,8.0,2423.0,1966.0,1966.0,1967.0,MARTIN,DAVID,ABELOFF,0,1,...,,Pennsylvania,Block,0.0,False,,,-9.0,1966.0,
7,9.0,1366.0,1966.0,1966.0,1967.0,HERBERT,T,ABELSON,0,1,...,,Missouri,Dalton,1.0,False,,,-9.0,1966.0,63132.0
8,,3181.0,1970.0,1970.0,,ROBERT,CLARK,ABER,0,1,...,,Pennsylvania,,1.0,False,,,-9.0,,18042.0
9,10.0,3715.0,1967.0,1967.0,1971.0,STEVEN,LURIA,ABLON,0,1,...,,Ohio,,1.0,False,1963.0,Amherst College,-9.0,1967.0,44120.0


In [3]:
search('stuart', 'aaronson')

{u'result_count': 1,
 u'results': [{u'addresses': [{u'address_1': u'40 E 94TH ST APT 23B',
     u'address_2': u'',
     u'address_purpose': u'LOCATION',
     u'address_type': u'DOM',
     u'city': u'NEW YORK',
     u'country_code': u'US',
     u'country_name': u'United States',
     u'fax_number': u'212-987-2240',
     u'postal_code': u'101280738',
     u'state': u'NY',
     u'telephone_number': u'212-659-5400'},
    {u'address_1': u'40 E 94TH ST APT 23B',
     u'address_2': u'',
     u'address_purpose': u'MAILING',
     u'address_type': u'DOM',
     u'city': u'NEW YORK',
     u'country_code': u'US',
     u'country_name': u'United States',
     u'fax_number': u'212-987-2240',
     u'postal_code': u'101280738',
     u'state': u'NY',
     u'telephone_number': u'212-659-5400'}],
   u'basic': {u'credential': u'M.D.',
    u'enumeration_date': u'2015-11-17',
    u'first_name': u'STUART',
    u'gender': u'M',
    u'last_name': u'AARONSON',
    u'last_updated': u'2015-11-17',
    u'name': u'AA