In [1]:
from collections import Counter
import funcy
from fuzzywuzzy import fuzz
import numpy as np 
import pandas as pd 
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, ATT_DATA_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES,
    PICKLE_DIR, STD_DIR)

from merging_functions import *

# NOTE(review): OUTPUT_CSV is not read by any cell visible in this notebook;
# the to_csv calls further down run unconditionally.
OUTPUT_CSV = False 


# load autoreload extension
# (mode 2 re-imports all modules before each cell runs, so edits to the local helper modules are picked up)
%load_ext autoreload
%autoreload 2

In [2]:
# Applicant table read from aamc_apps_nih.csv; later cells build on this frame.
apps = pd.read_csv(
    os.path.join(APP_DATA_DIR, 'aamc_apps_nih.csv')
)

In [3]:
# sorted(apps.columns)

In [4]:
# Institution reference list, with a cleaned copy of the name stored in 'inst'.
inst = pd.read_csv(
    os.path.join(STD_DIR, 'institutions.csv')
)
inst['inst'] = inst['Institution'].apply(clean_names)

# Run the same cleaner over the applicants' original medical school text.
apps['original_medical_school'] = apps['original_medical_school'].apply(clean_names)

# Spot check: rows whose medical_school is exactly 'ILLINOIS'.
apps.loc[
    apps['medical_school'].eq('ILLINOIS'),
    ['original_medical_school', 'med_school', 'medical_school', 'degree_inst_1_desc']
]

Unnamed: 0,original_medical_school,med_school,medical_school,degree_inst_1_desc


In [5]:
# apps.loc[apps.medical_school=='SUNY', ['med_school', 'medical_school', 'degree_inst_1_desc']]

In [6]:
# Institution table without the raw name column, AAMC id renamed for the later join.
inst2 = (
    inst
    .drop('Institution', axis=1)
    .rename(columns={'aamc_id': 'institution_aamc_id'})
)
# 'medical_school' is the key the next merge against apps uses.
inst2['medical_school'] = inst2['inst'].apply(clean_med_school)


CALIFORNIA INSTITUTE OF TECHNOLOGY
CITY OF HOPE BECKMAN RESEARCH INSTITUTE
CLEVELAND CLINIC
COLD SPRING HARBOR LABORATORY
FOX CHASE CANCER CENTER
HENRY FORD HEALTH SCIENCE CENTER
MAYO MEDICAL SCHOOL
MOREHOUSE SCHOOL OF MEDICINE
NIH
PRINCETON UNIVERSITY
ROCKEFELLER UNIVERSITY
SALK INSTITUTE FOR BIOLOGICAL STUDIES
SCRIPPS RESEARCH INSTITUTE
TEXAS A M UNIVERSITY HEALTH SCIENCE CENTER COLLEGE OF MEDICINE
TEXAS TECH UNIVERSITY HEALTH SCIENCES CENTER SCHOOL OF MEDICINE
UNIFORMED SERVICES UNIVERSITY OF THE HEALTH SCIENCES
UNIVERSITY OF SOUTH DAKOTA SCHOOL OF MEDICINE
nan


  result = lib.scalar_compare(x, y, op)


In [7]:
# apps.loc[apps['medical_school']!=apps['original_medical_school'], ['medical_school', 'original_medical_school']]

In [8]:
# One row per distinct original_medical_school, kept only when its
# medical_school key also appears in the institution table.
possible_matches = (
    apps[['original_medical_school', 'medical_school']]
    .dropna()
    .drop_duplicates('original_medical_school')
    .merge(right=inst2, on=['medical_school'], how='inner')
)

In [9]:
# Schools with no institution match so far, de-duplicated on both name columns.
others = (
    apps.loc[
        ~apps['medical_school'].isin(possible_matches['medical_school']),
        ['medical_school', 'original_medical_school']
    ]
    .dropna()
    .drop_duplicates()
)

# Matched rows first, unmatched rows appended underneath.
possible_matches2 = pd.concat([possible_matches, others], axis=0)

In [10]:
# Write the candidate list to the corrections directory.
possible_matches2.to_csv(
    os.path.join(CORRECTIONS_DIR, 'institution_manual_matches_raw.csv')
)

In [11]:
# Load institution_manual_matches.csv, drop exact duplicate rows,
# then clean the institution name it carries.
mm = pd.read_csv(
    os.path.join(CORRECTIONS_DIR, 'institution_manual_matches.csv')
).drop_duplicates()
mm['inst'] = mm['inst'].apply(clean_names)

In [12]:
# Attach the manually matched institution to each applicant, then keep one row per person.
# NOTE(review): a left merge cannot drop rows, yet the recorded output shows fewer rows
# than `apps`, so `apps` appears to repeat some PERSON_ID values -- confirm that is expected.
apps1 = pd.merge(left=apps, right=mm, on='medical_school', how='left').drop_duplicates(PERSON_ID)
apps1['inst'] = apps1.inst.apply(clean_names)
# Single-argument print in call form prints the same text on Python 2 and Python 3.
print(apps.shape)
print(apps1.shape)

(3615, 122)
(3612, 123)


In [13]:
# Bring in the AAMC institution id via the cleaned institution name, one row per person.
apps2 = pd.merge(left=apps1, right=inst2[['inst', 'institution_aamc_id']], on=['inst'], how='left').drop_duplicates(PERSON_ID)
# Shapes of each stage, to confirm the merge did not change the row count.
# Single-argument print in call form prints the same text on Python 2 and Python 3.
print(apps2.shape)
print(apps1.shape)
print(apps.shape)

(3612, 124)
(3612, 123)
(3615, 122)


In [14]:
# Order applicants by cleaned last name, then cleaned first name.
apps2 = apps2.sort_values(by=['clean_last_name', 'clean_first_name'])
# a2.loc[a2.duplicated(
#         ['clean_last_name', 'clean_first_name'], keep=False), [
#             'residency_hospital', 'original_medical_school','inst', 'year_grad', 'application_year', PERSON_ID]+NAME_COLS]

In [15]:
# Department reference table read from departments.csv.
deps = pd.read_csv(
    os.path.join(STD_DIR, 'departments.csv')
)

In [16]:
# Cleaned copies of the two department name columns (raw columns stay in place).
deps['department'] = deps.loc[:, 'Department'].apply(clean_names)
deps['sub_department'] = deps.loc[:, 'Sub-department'].apply(clean_names)

# Same cleaner over the applicants' residency text, overwriting the column.
apps2['residency'] = apps2.loc[:, 'residency'].apply(clean_names)

In [17]:
# Keep only the cleaned department columns.
deps1 = deps.drop(['Department', 'Sub-department'], axis=1)
# A missing sub-department takes the parent department name.
deps1['sub_department'] = deps1['sub_department'].fillna(deps1['department'])
# The residency text is the value matched against sub_department next.
apps2['sub_department'] = apps2['residency']

In [18]:
# Closest-match lookup between the department table and the distinct residency strings.
deps2 = df_get_closest_matches(
    deps1,
    apps2[['residency', 'sub_department']].dropna().drop_duplicates(),
    'sub_department'
)
# Move the index into a regular column named 'sub_department'.
deps3 = deps2.reset_index().rename(columns={'index': 'sub_department'})

In [19]:
# Score each candidate pair, then keep pairs scoring above 0.6.
deps3['sim'] = deps3.apply(
    lambda row: str_sim(row, 'residency', 'sub_department'),
    axis=1
)
deps4 = deps3.loc[deps3['sim'] > .6]

In [20]:
# Distinct residency strings seen in the applicant data.
res = apps2['residency'].dropna().drop_duplicates()
# Append the residencies that got no accepted match so they can be mapped by hand.
# to_frame() keeps them in the 'residency' column; concatenating a bare Series with a
# DataFrame along rows places it in a pandas-version-dependent column (0 on older releases).
unmatched_residencies = res[~res.isin(deps4['residency'])].to_frame()
deps5 = pd.concat([deps4, unmatched_residencies], axis=0)

In [21]:
# Write the residency/department candidate list to the corrections directory.
deps5.to_csv(
    os.path.join(CORRECTIONS_DIR, 'residency_departments_raw.csv')
)

In [22]:
# Residency-to-department mapping read from residency_departments.csv.
maps = pd.read_csv(
    os.path.join(CORRECTIONS_DIR, 'residency_departments.csv')
)

In [23]:
# Where the mapping has no sub-department, fall back to the department itself.
maps['sub_department'] = maps['sub_department'].fillna(maps['department'])

In [24]:
# Swap the temporary sub_department column for the mapped one, one row per person.
apps3 = (
    apps2
    .drop('sub_department', axis=1)
    .merge(right=maps, on='residency', how='left')
    .drop_duplicates(PERSON_ID)
)

In [25]:
# Earliest and latest application year per applicant across the four year columns.
# min/max(axis=1) skip NaN by default, like np.nanmin/np.nanmax, but run vectorized and
# do not emit a RuntimeWarning for rows where every year is missing (result stays NaN).
# Assumes the year columns are numeric -- TODO confirm.
APPLICATION_YEAR_COLS = ['application_year', 'application_year_1', 'application_year_2', 'application_year_3']
apps3['application_year_min'] = apps3[APPLICATION_YEAR_COLS].min(axis=1)
apps3['application_year_max'] = apps3[APPLICATION_YEAR_COLS].max(axis=1)



In [26]:
# Backfill a missing birth year as (earliest application year - age) when both are known.
mask = ~pd.isnull(apps3.age) & ~pd.isnull(apps3.application_year_min) & pd.isnull(apps3.yobb)
# Plain column arithmetic replaces the row-wise apply: same values, and it still
# yields a Series (not a DataFrame) when the mask selects no rows.
apps3.loc[mask, 'yobb'] = apps3.loc[mask, 'application_year_min'] - apps3.loc[mask, 'age']

In [27]:
# Fill gaps in medschool_year_grad, trying degree_year_1 first and year_grad second.
# Values already present are never overwritten.
for fallback_col in ('degree_year_1', 'year_grad'):
    mask = apps3['medschool_year_grad'].isnull() & apps3[fallback_col].notnull()
    apps3.loc[mask, 'medschool_year_grad'] = apps3.loc[mask, fallback_col]

In [28]:
# Use suffix_cd wherever clean_suffix is missing and suffix_cd is present.
mask = apps3['clean_suffix'].isnull() & apps3['suffix_cd'].notnull()
apps3.loc[mask, 'clean_suffix'] = apps3.loc[mask, 'suffix_cd']

In [29]:
# Collapse each pair of date columns into one value per row via consolidate_col.
apps3['residency_1'] = apps3.loc[
    :, ['residency_dates', 'residency_year(s)']
].apply(consolidate_col, axis=1)
apps3['internship_1'] = apps3.loc[
    :, ['internship_dates', 'internship_year(s)']
].apply(consolidate_col, axis=1)

def parse_dt(str_date, start=True):
    """Return the start or end piece of a hyphen-separated date range.

    Parameters
    ----------
    str_date : str or missing value
        A single date such as '1965' or a range such as '1965-1967'.
        Anything pd.isnull reports as missing is passed through as NaN.
    start : bool, default True
        True returns the piece before the first hyphen. False returns the
        piece after the first hyphen, or the only piece when there is no hyphen.

    Returns
    -------
    str or float
        The selected piece with surrounding whitespace removed, or np.nan
        for missing input.
    """
    if pd.isnull(str_date):
        return np.nan
    # Strip each piece so '1965 - 1967' gives '1965' / '1967' rather than
    # values with stray spaces attached.
    dts = [piece.strip() for piece in str_date.split('-')]
    if start or len(dts) == 1:
        return dts[0]
    return dts[1]
    

# Split each consolidated range into start and end columns.
# Column creation order is kept: both starts first, then both ends.
apps3['residency_start'] = apps3['residency_1'].apply(parse_dt)
apps3['internship_start'] = apps3['internship_1'].apply(parse_dt)
apps3['residency_end'] = apps3['residency_1'].apply(parse_dt, args=(False,))
apps3['internship_end'] = apps3['internship_1'].apply(parse_dt, args=(False,))


In [30]:
# Clean both hospital name columns in place.
for hospital_col in ('residency_hospital', 'internship_hospital'):
    apps3[hospital_col] = apps3[hospital_col].apply(clean_names)

In [31]:
# Inspect rows where the cleaned first and middle names are identical.
apps3.loc[
    apps3['clean_first_name'].eq(apps3['clean_middle_name']),
    NAME_COLS + ['medical_school', 'mname', 'middle_name', 'first_name', 'fname']
]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,medical_school,mname,middle_name,first_name,fname
294,KENNETH,KENNETH,BLAYLOCK,MEDICAL COLLEGE OF VIRGINIA,,Kenneth,Kenneth,
509,HOWARD,HOWARD,CEDAR,OHIO STATE UNIVERSITY COLLEGE OF MEDICINE AND ...,,Howard,Howard,
551,DOUGLAS,DOUGLAS,CLARK,UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL SC...,,Douglas,Douglas,
880,MYRON,MYRON,FALCHUK,HARVARD MEDICAL SCHOOL,,Myron,Myron,
916,FREDERICK,FREDERICK,FENSTER,HARVARD MEDICAL SCHOOL,,Frederick,Frederick,
1469,ALLAN,ALLAN,HOBSON,HARVARD MEDICAL SCHOOL,,Allan,Allan,
2027,HERBERT,HERBERT,LUBOWITZ,WASHINGTON UNIVERSITY SCHOOL OF MEDICINE,,,Herbert,
2330,FREDERIC,FREDERIC,MUSHINSKI,HARVARD MEDICAL SCHOOL,,Frederic,Jos,
2685,LAWRENCE,LAWRENCE,REINER,BAYLOR COLLEGE OF MEDICINE,,Lawrence,Lawrence,
2819,RICHARD,RICHARD,ROVNER,SUNY UPSTATE MEDICAL UNIVERSITY AT SYRACUSE,,,Richard,


In [32]:
# Where clean_last_name is missing, derive it from the raw last_name column.
missing_last_name = apps3['clean_last_name'].isnull()
apps3.loc[missing_last_name, 'clean_last_name'] = (
    apps3.loc[missing_last_name, 'last_name'].apply(clean_names)
)

In [33]:
# Drop the listed bookkeeping and source columns (including ones already folded
# into other columns above) and rename two columns.
apps4 = (
    apps3
    .drop(
        [
            'Unnamed: 0', 'birth_country_desc', 'data_source', 'is_match',
            'last_name_counts', 'reviewer', 'source', 'ssn', 'age',
            'lname', 'fname', 'mname', 'undergraduate_school', 'year_grad', 'degree_year_1',
            'suffix_cd', 'residency_dates', 'residency_year(s)',
            'internship_dates', 'internship_year(s)',
            'internship_1', 'residency_1'
        ],
        axis=1
    )
    .rename(columns={'yobb': 'birth_year', 'ident_cat_desc': 'race'})
)

In [34]:
# Hospital reference table with IPF codes, read from hospital_ipfcodes.xlsx.
hos = pd.read_excel(
    os.path.join(STD_DIR, 'hospital_ipfcodes.xlsx')
)

In [35]:
# One row per org_name (first after sorting), exposed under the column name 'hospital'.
hos2 = (
    hos
    .sort_values(by='org_name')
    .drop_duplicates(subset='org_name')
    .rename(columns={'org_name': 'hospital'})
)

In [36]:
# Every distinct hospital name found in the residency or internship columns.
orgs = pd.DataFrame(
    np.concatenate(
        [apps4.residency_hospital.dropna().unique(), apps4.internship_hospital.dropna().unique()], axis=0), columns=['hospital'])
# A name can occur in both columns, so de-duplicate. The explicit copy() lets the column
# assignment below write to its own frame, avoiding the SettingWithCopyWarning that
# assigning into the drop_duplicates() result raised.
orgs2 = orgs.drop_duplicates().copy()
# Keep the original spelling alongside 'hospital' so it survives the matching step.
orgs2['index_card_hospital'] = orgs2['hospital']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [37]:
# Closest-match lookup between index-card hospital names and the IPF hospital table.
matches = df_get_closest_matches(
    orgs2,
    hos2[['hospital', 'ipfcode', 'org_index_name']],
    'hospital'
)

In [38]:
# Similarity score between the IPF index name and the index-card hospital name.
matches['sim'] = matches.loc[
    :, ['org_index_name', 'index_card_hospital']
].apply(get_name_str_sim, axis=1)

In [39]:
# Move the index into a 'hospital' column, sort by hospital and similarity (both
# descending), keep the first row per (hospital, ipfcode), drop rows with any missing value.
matches2 = (
    matches
    .reset_index()
    .rename(columns={'index': 'hospital'})
    .sort_values(by=['hospital', 'sim'], ascending=False)
    .drop_duplicates(subset=['hospital', 'ipfcode'])
    .dropna()
)

In [40]:
# Accept only matches whose similarity score is above 95.
matches3 = matches2.loc[matches2['sim'] > 95]

In [41]:
# Single-argument print in call form prints the same text on Python 2 and Python 3.
print(matches3.shape)
# matches3.head()

(273, 5)


In [42]:
# Hospital names that did not get an accepted match above.
nm = orgs2.loc[~orgs2['hospital'].isin(matches3['index_card_hospital'])]
nm.shape

(820, 2)

In [43]:
# do a cartesian merge between remaining nonmatched hospitals, calculate string sim

In [44]:
# Constant join key on both sides so the next merge produces every (nm, hos2) pairing.
# assign() returns a new frame, so nothing is written into the filtered slice that
# `nm` was created as (that write raised SettingWithCopyWarning).
nm = nm.assign(key=0)
hos2['key'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [45]:
# Pair every unmatched hospital with every IPF hospital through the constant key.
cart_product = nm.merge(
    right=hos2[['hospital', 'ipfcode', 'org_index_name', 'key']],
    how='left',
    on='key'
)

In [46]:
# Single-argument print in call form prints the same text on Python 2 and Python 3.
print(cart_product.shape)

(2919200, 6)


In [47]:
# Similarity between the index-card name (hospital_x) and the IPF name (hospital_y)
# for every pairing in the cartesian product.
cart_product['sim'] = cart_product.loc[
    :, ['hospital_x', 'hospital_y']
].apply(get_name_str_sim, axis=1)

In [48]:
# Highest-similarity IPF candidate for each unmatched hospital.
cp2 = (
    cart_product
    .sort_values(by=['hospital_x', 'sim'], ascending=False)
    .drop_duplicates(subset=['hospital_x'], keep='first')
)

In [49]:
# Write the best-candidate table to the corrections directory.
cp2.to_csv(
    os.path.join(CORRECTIONS_DIR, 'hospitals_corrections_raw.csv')
)

In [50]:
# Preview the first five best-candidate rows.
cp2.head(5)

Unnamed: 0,hospital_x,index_card_hospital,key,hospital_y,ipfcode,org_index_name,sim
2881948,YORK HOSPITAL YORK PENNSYLVANIA,YORK HOSPITAL YORK PENNSYLVANIA,0,NEW YORK HOSPITAL,7554403,NEW YORK-PRESBYTERIAN HEALTHCARE--NEW YORK-PRE...,76
1945668,YORK HOSPITAL,YORK HOSPITAL,0,NEW YORK HOSPITAL,7554403,NEW YORK-PRESBYTERIAN HEALTHCARE--NEW YORK-PRE...,100
922029,YALE UNIVERSITY SCHOOL OF MEDICINE,YALE UNIVERSITY SCHOOL OF MEDICINE,0,YALE UNIVERSITY,9420201,YALE UNIVERSITY,100
1516549,YALE UNIVERSITY GRACE NEW HAVEN COMMUNITY,YALE UNIVERSITY GRACE NEW HAVEN COMMUNITY,0,YALE UNIVERSITY,9420201,YALE UNIVERSITY,100
170015,YALE NEW HAVEN MEDICAL CENTER TEMPLE UNIVERSIT...,YALE NEW HAVEN MEDICAL CENTER TEMPLE UNIVERSIT...,0,TEMPLE UNIVERSITY,8240301,TEMPLE UNIVERSITY,100


In [51]:
# Hospital corrections read from hospital_corrections.csv.
residency_info = pd.read_csv(
    os.path.join(CORRECTIONS_DIR, 'hospital_corrections.csv')
)


In [52]:
# Stack the accepted automatic matches on top of the corrections file.
hospitals = pd.concat([matches3, residency_info], axis=0)
# Duplicate 'hospital' under the name used as the right-hand join key below.
hospitals['ipf_hospital'] = hospitals['hospital']
# Lookup table: hospital name -> IPF code.
hospitals2 = hospitals.loc[:, ['ipf_hospital', 'ipfcode']]

In [53]:
# Attach the IPF hospital name and code for each applicant's residency hospital.
# NOTE(review): the recorded shapes show this merge adding rows, so some ipf_hospital
# values occur more than once in hospitals2; a later cell reduces the result back
# to one row per PERSON_ID.
apps5 = pd.merge(left=apps4, right=hospitals2, left_on='residency_hospital', right_on='ipf_hospital', how='left')
# Single-argument print in call form prints the same text on Python 2 and Python 3.
print(apps5.shape)
print(apps4.shape)

(3793, 112)
(3612, 110)


In [54]:
# Similarity between the applicant's residency hospital and the joined IPF hospital name.
apps5['sim'] = apps5.loc[
    :, ['residency_hospital', 'ipf_hospital']
].apply(get_name_str_sim, axis=1)

In [55]:
# apps5.loc[apps5.duplicated([PERSON_ID], keep=False), NAME_COLS+['residency_hospital', 'ipf_hospital']]

In [56]:
# One row per person: sort by PERSON_ID and similarity (both descending), keep the first.
# NOTE(review): the rows were joined on exact name equality, so 'sim' may not separate
# several IPF codes for the same hospital name -- confirm the tie-break is intended.
apps5 = (
    apps5
    .sort_values(by=[PERSON_ID, 'sim'], ascending=False)
    .drop_duplicates(subset=PERSON_ID)
)
# Tag the joined columns as belonging to the residency hospital.
apps6 = apps5.rename(
    columns={
        'ipf_hospital': 'residency_hospital_std',
        'ipfcode': 'residency_hospital_ipfcode'
    }
)

In [57]:
# Same lookup as for residency, now keyed on the internship hospital.
apps7 = apps6.merge(
    right=hospitals2,
    left_on='internship_hospital',
    right_on='ipf_hospital',
    how='left'
)

In [58]:
# Recompute 'sim' against the internship hospital, then keep one row per person
# (first row after sorting by PERSON_ID and similarity, both descending).
apps7['sim'] = apps7.loc[
    :, ['internship_hospital', 'ipf_hospital']
].apply(get_name_str_sim, axis=1)
apps7 = (
    apps7
    .sort_values(by=[PERSON_ID, 'sim'], ascending=False)
    .drop_duplicates(subset=PERSON_ID)
)

In [59]:
# Tag the joined columns as belonging to the internship hospital.
apps8 = apps7.rename(
    columns={
        'ipf_hospital': 'internship_hospital_std',
        'ipfcode': 'internship_hospital_ipfcode'
    }
)

In [60]:
# Row counts before and after the internship lookup should agree.
# Single-argument print in call form prints the same text on Python 2 and Python 3.
print(apps5.shape)
print(apps8.shape)

(3612, 113)
(3612, 115)


In [61]:
# Write the standardized applicant table to the application data directory.
apps8.to_csv(
    os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_standardized.csv')
)