In [122]:
from collections import Counter
import funcy
from fuzzywuzzy import fuzz
import numpy as np 
import pandas as pd 
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, GRANT_DATA_DIR,  NAME_COLS, RAW_NAME_COLS, FEMALE_FIRST_NAMES, FEMALE_MIDDLE_NAMES,
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, PICKLE_DIR, STD_DIR, ATT_DATA_DIR)

from merging_functions import *

# NOTE(review): presumably meant to toggle CSV export, but no visible cell
# reads this flag (the final to_csv call runs unconditionally).
OUTPUT_CSV = False 

# Name, birth-date and schooling column names. Not referenced by any visible
# cell of this notebook.
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']


# load autoreload extension
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
# Read the standardized index-card applications and discard the
# 'Unnamed: 0' column that the CSV carries.
apps_csv_path = os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_standardized.csv')
apps = pd.read_csv(apps_csv_path).drop('Unnamed: 0', axis=1)

In [51]:
# Load grant_data.dta from the grant data directory.
grant_data_path = os.path.join(GRANT_DATA_DIR, 'grant_data.dta')
grant_data = pd.read_stata(grant_data_path)

In [52]:
# Load stars_data.dta from the grant data directory.
stars_data_path = os.path.join(GRANT_DATA_DIR, 'stars_data.dta')
stars_data = pd.read_stata(stars_data_path)

In [53]:
# Lower-case every column name of the grant table.
grant_data.columns = [col.lower() for col in grant_data.columns]

In [54]:
# Lower-case every column name of the stars table.
stars_data.columns = [col.lower() for col in stars_data.columns]

In [55]:
# Tag every row that came from stars_data.dta with is_star = 1.
stars_data['is_star'] = 1

In [57]:
# One row per newsetnb: after sorting by year the first row kept for each id is
# its smallest-year row, and that year is stored as first_grant_year.
grant_keep_cols = ['first', 'middle', 'last', 'suffix', 'newsetnb', 'mdyear', 'phdyear', 'year']
unique_grants = (
    grant_data
    .sort_values('year')
    .drop_duplicates(['newsetnb'], keep='first')[grant_keep_cols]
    .rename(columns={'year': 'first_grant_year'}))

In [58]:
# Stack the stars rows (minus 'fullname') on top of the first-grant rows.
# Columns that exist in only one of the two frames are NaN for the other's rows.
unique_grants2 = pd.concat([stars_data.drop(['fullname'], axis=1), unique_grants], axis=0)

In [59]:
# Run clean_names over the raw first/middle/last columns and store each result
# under the matching clean_* column.
for raw_col, clean_col in (('first', 'clean_first_name'),
                           ('middle', 'clean_middle_name'),
                           ('last', 'clean_last_name')):
    unique_grants2[clean_col] = unique_grants2[raw_col].apply(clean_names)

In [60]:
# A row is flagged female when its cleaned first name is in FEMALE_FIRST_NAMES
# or its cleaned middle name is in FEMALE_MIDDLE_NAMES.
first_name_is_female = unique_grants2['clean_first_name'].isin(FEMALE_FIRST_NAMES)
middle_name_is_female = unique_grants2['clean_middle_name'].isin(FEMALE_MIDDLE_NAMES)
female_mask = first_name_is_female | middle_name_is_female

# Default everyone to 0, then switch the flagged rows to 1.
unique_grants2['is_female'] = 0
unique_grants2.loc[female_mask, 'is_female'] = 1

In [61]:
def get_initial(raw_str):
    """Return the first character of a name, or NaN when there is none.

    Parameters
    ----------
    raw_str : str or null
        Name string to take the initial from.

    Returns
    -------
    str or float
        The first character of ``raw_str``; ``np.nan`` when ``raw_str`` is
        null or an empty string.
    """
    # Null and empty names have no initial ('' used to raise IndexError).
    if pd.isnull(raw_str) or len(raw_str) == 0:
        return np.nan
    # Indexing a one-character string returns that character, so the former
    # len == 1 special case is not needed.
    return raw_str[0]

# Store the get_initial result of the cleaned middle and first names.
unique_grants2['clean_middle_initial'] = unique_grants2.clean_middle_name.apply(get_initial)
unique_grants2['clean_first_initial'] = unique_grants2.clean_first_name.apply(get_initial)

In [62]:
# Keep rows not flagged female, then one row per newsetnb. Both sort keys are
# descending, so within an id a row with is_star == 1 sorts ahead of rows whose
# is_star is missing and is the one that survives.
unique_grants3 = (
    unique_grants2[unique_grants2.is_female == 0]
    .sort_values(['newsetnb', 'is_star'], ascending=False)
    .drop_duplicates(['newsetnb'], keep='first'))

In [63]:
# Count how often each cleaned last name occurs among the applications and
# attach that count to every grant row.
apps_counter = Counter(apps.clean_last_name.values)
unique_grants3['last_name_counts'] = unique_grants3.clean_last_name.apply(
    lambda last_name: apps_counter[last_name])

# NOTE(review): `> 1` drops grantees whose last name occurs exactly once among
# the applications; confirm that `> 0` was not intended.
last_name_seen_repeatedly = unique_grants3.last_name_counts > 1
md_year_missing = pd.isnull(unique_grants3.mdyear)
md_year_in_window = (unique_grants3.mdyear < 1975) & (unique_grants3.mdyear > 1960)

# Keep grantees with an unknown MD year or one strictly between 1960 and 1975.
unique_grants4 = unique_grants3[
    last_name_seen_repeatedly & (md_year_missing | md_year_in_window)]

In [64]:
# Give the grant columns the names used on the application side. After the
# merges below, medschool_year_grad therefore appears twice, as
# medschool_year_grad_x (applications) and medschool_year_grad_y (grants).
unique_grants4 = unique_grants4.rename(columns={'mdyear': 'medschool_year_grad', 'phdyear': 'phd_year_grad', 
                                               'star_yob':'birth_year', 'star_yod': 'death_year'})

In [65]:
# Show (rows, columns) of the filtered grant candidates.
unique_grants4.shape

(24590, 21)

In [66]:
# Drop the names of two intermediate frames that are not used again below.
del unique_grants2, unique_grants

In [67]:
# First pass: inner-join applications (left, suffix '_x') with grant candidates
# (right, suffix '_y') on identical cleaned first and last names.
exact = pd.merge(
    left=apps, right=unique_grants4, on=['clean_first_name', 'clean_last_name'], how='inner', suffixes=['_x', '_y'])
# Show (rows, columns) of the merged candidates.
exact.shape

(2680, 132)

In [68]:
# Show (rows, columns) of the applications table. The parenthesised form prints
# the same text under the Python 2 print statement.
print(apps.shape)

(3729, 113)


In [69]:
# grad_sim: absolute gap between the application-side (_x) and grant-side (_y)
# med-school graduation years.
# activity_year_sim: years from the application-side graduation year to the
# first grant year (negative means the grant came first).
# Column arithmetic replaces the row-wise apply(axis=1): same values, without a
# Python call per row, and it also works when the merge result is empty.
exact['grad_sim'] = (exact['medschool_year_grad_x'] - exact['medschool_year_grad_y']).abs()
exact['activity_year_sim'] = exact['first_grant_year'] - exact['medschool_year_grad_x']

def check_match(row):
    """Decide whether a row of the exact first/last-name merge is a match.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'grad_sim', 'activity_year_sim' and
        'clean_middle_initial_sim'.

    Returns
    -------
    int
        0 when any rule below rejects the pair, otherwise 1. Comparisons
        against NaN are False, so a missing value never rejects a pair.
    """
    # graduation years may differ by at most 5
    if row['grad_sim'] > 5:
        return 0
    # first grant shouldn't be given prior to med school graduation
    if row['activity_year_sim'] < 0:
        return 0
    # a known middle-initial similarity below 40 rejects; an unknown one passes
    middle_initial_sim = row['clean_middle_initial_sim']
    if not pd.isnull(middle_initial_sim) and middle_initial_sim < 40:
        return 0
    # The former `last_name_counts == 1` branch returned the same value as this
    # fall-through, so it was removed.
    return 1

# Both middle-name fields are scored with get_name_str_sim.
feature_dict = {
    sim_field: get_name_str_sim
    for sim_field in ('clean_middle_name', 'clean_middle_initial')}

# NOTE(review): add_similarity_features is defined elsewhere; the cells below
# read '<field>_sim' columns and an 'is_match' column from its result.
exact2 = add_similarity_features(exact, feature_dict, check_match, suffixes=['_x', '_y'])

# is_star totals: all grant candidates, accepted matches, all merged rows.
print(unique_grants4.is_star.sum())
print(exact2[exact2['is_match'] == 1].is_star.sum())
print(exact2.is_star.sum())

404.0
166.0
192.0


In [70]:
# Replace each raw similarity score s by 100 - s so that, like the year gaps,
# smaller means better and all columns can be sorted ascending.
# NOTE(review): running this cell twice restores the original scores.
for sim_col in ('clean_middle_name_sim', 'clean_middle_initial_sim'):
    exact2.loc[:, sim_col] = 100 - exact2[sim_col]

In [71]:
# do sort in ascending, because grad year sim and activity sim should be as small as possible
# NOTE(review): filter_one_match_per_group_simple is defined elsewhere; it
# presumably keeps one row per group ranked by sim_cols, with the trailing True
# meaning ascending order - confirm in merging_functions.
sim_cols =  ['grad_sim', 'activity_year_sim', 'clean_middle_name_sim', 'clean_middle_initial_sim']
# Reduce the accepted matches by applicant (PERSON_ID) first ...
exact3 = filter_one_match_per_group_simple(exact2[exact2['is_match']==1], PERSON_ID, sim_cols, True)
# ... and then by grantee (newsetnb).
exact4 = filter_one_match_per_group_simple(exact3, 'newsetnb', sim_cols, True)


In [72]:
# NOTE(review): consolidate_merge_cols is defined elsewhere; presumably it
# collapses each '_x'/'_y' column pair left by the merge into one column (the
# recorded output lists the affected base names) - confirm its precedence rule.
exact5 = consolidate_merge_cols(exact4, ['_x', '_y'], [])

['clean_first_initial', 'clean_middle_initial', 'clean_middle_name', 'is_female', 'medschool_year_grad', 'birth_year']


In [73]:
# Leftovers after the exact-name pass, for applications and for grant
# candidates. NOTE(review): get_nonmatched is defined elsewhere; presumably it
# returns the rows whose id column value is not among the given values.
nm_apps = get_nonmatched(apps, exact4[PERSON_ID].values, PERSON_ID)
nm_grant = get_nonmatched(unique_grants4, exact4['newsetnb'].values, 'newsetnb')

(3139, 113)
(24000, 21)


In [74]:
# Recount cleaned last names within each leftover set ...
nm_apps_c = Counter(nm_apps.clean_last_name.values)
nm_grants_c = Counter(nm_grant.clean_last_name.values)
# ... and store each row's own-set count as last_name_counts (for nm_grant this
# overwrites the count taken against the applications earlier).
nm_apps.loc[:, 'last_name_counts'] = nm_apps.clean_last_name.apply(lambda x: nm_apps_c[x])

nm_grant.loc[:, 'last_name_counts'] = nm_grant.clean_last_name.apply(lambda x: nm_grants_c[x])

In [75]:
# Second pass: inner-join the leftovers on cleaned last name only, so every
# applicant is paired with every leftover grantee sharing that last name.
last_merge = pd.merge(left=nm_apps, right=nm_grant, how='inner', on=['clean_last_name'], suffixes=['_x', '_y'])
# Show (rows, columns) of the candidate pairs.
last_merge.shape

(66842, 134)

In [76]:
# grad_sim: absolute gap between the application-side (_x) and grant-side (_y)
# med-school graduation years.
# activity_year_sim: years from the application-side graduation year to the
# first grant year (negative means the grant came first).
# Column arithmetic replaces the row-wise apply(axis=1): same values, without a
# Python call per row, and it also works when the merge result is empty.
last_merge['grad_sim'] = (
    last_merge['medschool_year_grad_x'] - last_merge['medschool_year_grad_y']).abs()
last_merge['activity_year_sim'] = last_merge['first_grant_year'] - last_merge['medschool_year_grad_x']

def check_match(row):
    """Decide whether a row of the last-name-only merge is a match.

    This definition replaces the `check_match` from the exact-name pass and is
    stricter: besides the year rules it requires high first- and middle-name
    similarity scores.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'grad_sim', 'activity_year_sim' and the four '*_sim'
        name scores checked below.

    Returns
    -------
    int
        0 when any rule rejects the pair, otherwise 1. Comparisons against NaN
        are False, so a missing value never rejects a pair.
    """
    # graduation years may differ by at most 5
    if row['grad_sim'] > 5:
        return 0
    # the first grant must come at least 3 years after med school graduation
    # (the earlier `< 0 or < 3` test is the same condition as `< 3`)
    if row['activity_year_sim'] < 3:
        return 0
    # minimum score per name field, checked in this order
    minimum_scores = (
        ('clean_first_initial_sim', 100),
        ('clean_middle_initial_sim', 100),
        ('clean_first_name_sim', 80),
        ('clean_middle_name_sim', 80),
    )
    for sim_key, minimum in minimum_scores:
        if row[sim_key] < minimum:
            return 0
    return 1

# All four name fields are scored with get_name_str_sim.
feature_dict = {
    sim_field: get_name_str_sim
    for sim_field in ('clean_middle_name', 'clean_middle_initial',
                      'clean_first_name', 'clean_first_initial')}

# NOTE(review): add_similarity_features is defined elsewhere; the cells below
# read '<field>_sim' columns and an 'is_match' column from its result.
last_merge2 = add_similarity_features(last_merge, feature_dict, check_match, suffixes=['_x', '_y'])

# is_star totals: all grant candidates, accepted matches, all merged rows.
print(unique_grants4.is_star.sum())
print(last_merge2[last_merge2['is_match'] == 1].is_star.sum())
print(last_merge2.is_star.sum())

404.0
7.0
710.0


In [77]:
# Preparation for filtering to one match per id: replace each raw similarity
# score s by 100 - s so that smaller means better for every sort column.
# NOTE(review): running this cell twice restores the original scores.
for sim_col in ('clean_first_name_sim', 'clean_middle_name_sim',
                'clean_middle_initial_sim', 'clean_first_initial_sim'):
    last_merge2.loc[:, sim_col] = 100 - last_merge2[sim_col]

In [78]:
# Ranking columns for the last-name pass (year gaps first, then name scores).
# NOTE(review): filter_one_match_per_group_simple is defined elsewhere;
# presumably it keeps one row per group ranked by sim_cols, ascending.
sim_cols =  ['grad_sim', 'activity_year_sim', 'clean_first_name_sim', 'clean_first_initial_sim', 'clean_middle_name_sim', 'clean_middle_initial_sim']
# Reduce the accepted matches by applicant (PERSON_ID), then by grantee (newsetnb).
last_merge3 = filter_one_match_per_group_simple(last_merge2[last_merge2['is_match']==1], PERSON_ID, sim_cols, True)
last_merge4 = filter_one_match_per_group_simple(last_merge3, 'newsetnb', sim_cols, True)

In [79]:
# Show (rows, columns) of the matches kept from the last-name pass.
last_merge4.shape

(49, 143)

In [80]:
# NOTE(review): consolidate_merge_cols is defined elsewhere; presumably it
# collapses each '_x'/'_y' column pair left by the merge into one column.
last_merge5 = consolidate_merge_cols(last_merge4, ['_x', '_y'], [])

['clean_first_initial', 'clean_first_name', 'clean_middle_initial', 'clean_middle_name', 'is_female', 'medschool_year_grad', 'birth_year', 'last_name_counts']


In [81]:
# Stack the matches from the last-name pass and the exact-name pass.
all_matches = pd.concat([last_merge5, exact5], axis=0)

In [82]:
# Rebind nm_apps: it is now computed against the ids matched in either pass,
# no longer only against the exact-name pass.
nm_apps = get_nonmatched(apps, all_matches[PERSON_ID], PERSON_ID)

(3090, 113)


In [83]:
# Matched applications (with grant columns) followed by the unmatched ones;
# grant-only columns are NaN for the unmatched rows.
apps2 = pd.concat([all_matches, nm_apps], axis=0)

In [84]:
# Helper columns to discard: names ending in '_missing', '_duplicate' or '_sim',
# plus any name containing '_counts'.
helper_suffixes = ('_missing', '_duplicate', '_sim')
to_drop_cols = [
    col for col in apps2.columns
    if col.endswith(helper_suffixes) or '_counts' in col]
print(to_drop_cols)

['activity_year_sim', 'clean_first_initial_sim', 'clean_first_name_sim', 'clean_middle_initial_sim', 'clean_middle_name_sim', 'grad_sim', 'last_name_counts', 'newsetnb_duplicate', 'person_uuid_duplicate']


In [85]:
# Remove the helper columns together with the match flag and the raw grant-side
# name, sex and suffix columns.
apps3 = apps2.drop(to_drop_cols+['is_match', 'first', 'last', 'middle', 'is_female', 'suffix'], axis=1)

In [86]:
# Inspect rows that share all NAME_COLS values with another row (every member of
# each duplicate group is shown because keep=False).
apps3.loc[apps3.duplicated(NAME_COLS, keep=False), NAME_COLS+['grant_id', PERSON_ID, 'application_year', 'medical_school']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,grant_id,person_uuid,application_year,medical_school
1954,JAMES,EDWARD,BROWN,,1433.0,1966.0,YALE SCHOOL OF MEDICINE
1963,JAMES,EDWARD,BROWN,,1409.0,1967.0,ROCHESTER
1387,DONALD,MARTIN,FALCHUK,,2497.0,1967.0,HARVARD
1878,DONALD,MARTIN,FALCHUK,,1974.0,1966.0,HARVARD
3198,DANIEL,MARTIN,BERKOWITZ,,564.0,1969.0,YALE
3230,DANIEL,MARTIN,BERKOWITZ,,529.0,1967.0,NEW YORK UNIVERSITY SCHOOL OF MEDICINE


In [92]:
# Rows that have a recorded date of birth.
has_birth_dt = ~pd.isnull(apps3.date_of_birth)
# Copy the date into birth_dt for those rows (other rows get NaN there).
apps3.loc[has_birth_dt, 'birth_dt'] =  apps3.loc[has_birth_dt, 'date_of_birth']
# Set birth_year from the parsed date for those rows only; rows without a date
# keep whatever birth_year they already carry.
apps3.loc[has_birth_dt, 'birth_year'] =  apps3.loc[has_birth_dt, 'date_of_birth'].apply(lambda x: pd.to_datetime(x).year)
# Drop the original column and expose the copy under the name birth_date.
apps4 = apps3.drop('date_of_birth', axis=1).rename(columns={'birth_dt': 'birth_date'})


In [93]:
def improper_formatting(raw_last_name):
    """Tell whether a name consists of more than one space-separated part.

    Used as a boolean filter for names that may carry extra content such as a
    suffix.

    Parameters
    ----------
    raw_last_name : str or null
        Name to inspect.

    Returns
    -------
    bool
        False for null values and for names without a space, True otherwise.
    """
    if pd.isnull(raw_last_name):
        return False
    # a name without any space splits into exactly one piece
    return len(raw_last_name.split(' ')) != 1


In [94]:
# Correct known badly formatted last names.

bad_lastnames = ['E ROSS HARVARD','MCCLURE MCCHURE', 'S COHEN NYU']
to_fix_lastnames = ['ROSS', 'MCCLURE', 'COHEN']
# Pair each bad value with its own correction and replace by value. Assigning
# the list through a boolean mask would hand out corrections in row order,
# which is only right if the rows happen to appear in the same order as the
# list and each bad value occurs exactly once.
last_name_fixes = dict(zip(bad_lastnames, to_fix_lastnames))
apps4['clean_last_name'] = apps4['clean_last_name'].replace(last_name_fixes)


In [95]:
# Correct known badly formatted first names.
# apps4.loc[apps4.clean_first_name.apply(improper_formatting), NAME_COLS]
bad_firstnames = ['ANNE FRANCES',  
                  'PHILIP R', 'J HAROLD', 'HENRY N', 'H BENFER', 'G JAMES', 'G DAVID', 'A LELAND', 'W STEVES']
# NOTE(review): 'ANNE FRANCES' is corrected to its second word while every
# other entry keeps its first word - confirm this is intended.
to_fix_firstnames = ['FRANCES',  'PHILIP', 'J', 'HENRY', 'H', 'G', 'G', 'A', 'W']
# Pair each bad value with its own correction and replace by value. Assigning
# the list through a boolean mask would hand out corrections in row order,
# which is only right if the rows happen to appear in the same order as the
# list and each bad value occurs exactly once.
first_name_fixes = dict(zip(bad_firstnames, to_fix_firstnames))
apps4['clean_first_name'] = apps4['clean_first_name'].replace(first_name_fixes)

In [96]:
# check for improper middle names
# NOTE(review): the two lists below repeat the first-name lists and are not
# used by any visible cell - confirm whether a middle-name fix was intended.
bad_middlenames = ['ANNE FRANCES',  'PHILIP R', 'J HAROLD', 'HENRY N', 'H BENFER', 'G JAMES', 'G DAVID', 'A LELAND', 'W STEVES']
to_fix_middlenames = ['FRANCES',  'PHILIP', 'J', 'HENRY', 'H', 'G', 'G', 'A', 'W']

# For middle names that has_suffix flags: store get_suffix's result in
# clean_suffix, then replace the middle name with the output of
# remove_suffix_from_last_name (all three helpers are imported from
# data_cleaning_functions).
has_suff = apps4.clean_middle_name.apply(has_suffix)
apps4.loc[has_suff, 'clean_suffix'] = apps4.loc[has_suff, 'clean_middle_name'].apply(get_suffix)
apps4.loc[has_suff, 'clean_middle_name'] = apps4.loc[has_suff, 'clean_middle_name'].apply(remove_suffix_from_last_name)

In [117]:
# Same suffix handling for last names: for rows that has_suffix flags, store
# get_suffix's result in clean_suffix (overwriting any value set from the middle
# name) and replace the last name with remove_suffix_from_last_name's output.
has_suff = apps4.clean_last_name.apply(has_suffix)
apps4.loc[has_suff, 'clean_suffix'] = apps4.loc[has_suff, 'clean_last_name'].apply(get_suffix)
apps4.loc[has_suff, 'clean_last_name'] = apps4.loc[has_suff, 'clean_last_name'].apply(remove_suffix_from_last_name)

In [97]:
def remove_dr_from_first_name(raw_str):
    """Cut a name at the first standalone 'DR' word that follows another word.

    Parameters
    ----------
    raw_str : str or null
        Name to clean.

    Returns
    -------
    str or float
        ``np.nan`` for null input. Otherwise the text before the first
        space-separated 'DR' token that is not the first word; the name is
        returned unchanged when no such token exists.
    """
    if pd.isnull(raw_str):
        return np.nan
    words = raw_str.split(' ')
    # Only a whole-word 'DR' counts. Splitting on the substring ' DR' also cut
    # names that merely start a word with DR (e.g. 'VAN DREW' became 'VAN').
    if 'DR' in words[1:]:
        return ' '.join(words[:words.index('DR', 1)])
    return raw_str

In [98]:
# Middle names that contain a space are passed through
# remove_dr_from_first_name; all other middle names are left untouched.
mask = apps4.clean_middle_name.apply(improper_formatting)
apps4.loc[mask, 'clean_middle_name'] = apps4.loc[mask, 'clean_middle_name'].apply(remove_dr_from_first_name)

In [101]:
# sorted(apps4.columns)

In [102]:
def same_name(row):
    """Tell whether the first two values of a row are equal, printing matches.

    Parameters
    ----------
    row : pandas.Series or sequence
        Row holding the two name fields to compare in its first two positions.

    Returns
    -------
    bool
        True when the two values compare equal (the row is printed as a side
        effect), False otherwise. Two NaN values are not equal.
    """
    # Read the two values by position. Integer keys on a Series with string
    # labels (row[0]) are deprecated in recent pandas.
    values = list(row)
    first, second = values[0], values[1]
    if first == second:
        # single-argument print(...) behaves the same on Python 2 and 3
        print(row)
        return True
    return False

def is_weird_name(raw_str):
    """Tell whether a name consists of more than one space-separated part.

    Parameters
    ----------
    raw_str : str or null
        Name to inspect.

    Returns
    -------
    bool
        False for null values and for names without a space, True otherwise.
    """
    # Null names are treated like improper_formatting treats them, instead of
    # failing on the missing .split attribute.
    if pd.isnull(raw_str):
        return False
    return len(raw_str.split(' ')) > 1

In [114]:
# Rows whose cleaned first and last names are identical get the first name
# 'JOHN'. NOTE(review): the replacement value is hard-coded; the recorded run
# matched a single row (NICHOLAS NICHOLAS), but any other matching row would be
# renamed 'JOHN' as well.
apps4.loc[
    apps4[['clean_first_name', 'clean_last_name']].apply(same_name, axis=1), 'clean_first_name'] = 'JOHN'
#     NAME_COLS+['medical_school', 'first_name', 'middle_name', 'last_name']]

clean_first_name    NICHOLAS
clean_last_name     NICHOLAS
Name: 1348, dtype: object


In [119]:
# Blank the middle name (NaN) wherever it equals the first name.
apps4.loc[
    apps4[['clean_first_name', 'clean_middle_name']].apply(same_name, axis=1), 'clean_middle_name'] = np.nan

clean_first_name     LAWRENCE
clean_middle_name    LAWRENCE
Name: 1782, dtype: object
clean_first_name     FREDERIC
clean_middle_name    FREDERIC
Name: 1950, dtype: object
clean_first_name     EDMOND
clean_middle_name    EDMOND
Name: 2952, dtype: object


In [120]:
# Display the name columns of rows whose middle and last names are identical;
# nothing is modified here.
apps4.loc[
    apps4[['clean_middle_name', 'clean_last_name']].apply(same_name, axis=1), NAME_COLS]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name


In [123]:
# ADD LAST NAME COUNTS
# Load NIH_attendee_deduped_raw.csv from ATT_DATA_DIR; its clean_last_name
# column is the reference for the counts computed below.
nih = pd.read_csv(os.path.join(ATT_DATA_DIR, 'NIH_attendee_deduped_raw.csv'))

In [124]:
# Occurrences of each cleaned last name in the attendee file.
last_name_counts = Counter(nih.clean_last_name.values)

In [125]:
# Per applicant: how often their cleaned last name occurs in the attendee file
# (0 when it does not occur, since a Counter returns 0 for unknown keys).
apps4['nih_last_name_counts'] = apps4.clean_last_name.apply(lambda x: last_name_counts[x])

In [129]:
def total_number_applications(app_years):
    """Count the distinct non-null application years in a Series.

    Parameters
    ----------
    app_years : pandas.Series
        Application-year values of one applicant; may contain NaN.

    Returns
    -------
    int
        Number of different non-null values in ``app_years``.
    """
    # nunique ignores missing values when dropna=True
    return app_years.nunique(dropna=True)

In [131]:
# Per row: number of distinct non-null years across the four application-year
# columns.
apps4['number_applications'] = apps4[
    ['application_year', 'application_year_1', 'application_year_2', 'application_year_3']].apply(
        total_number_applications, axis=1)

In [141]:
# Columns removed from the final table in the next cell.
to_drop = ['clean_first_initial', 'clean_middle_initial', 'degree_country_1', 'degree_inst_1_desc', 'exit', 
          'first_name', 'middle_name', 'last_name', 'level_0', 'dob_app_diff', 'duplicate_dno', 'index', 'inst',
          'main_dept', 'sim']

In [157]:

# Sort by last then first name and remove the columns listed in to_drop.
apps5 = apps4.sort_values(['clean_last_name', 'clean_first_name']).drop(to_drop, axis=1)

# Columns placed first in the exported table (ids, names, application and
# training fields, address).
IMP_COLS = [
    PERSON_ID, 'aamc_id', 'newsetnb', 'dno', 'clean_first_name', 'clean_middle_name', 'clean_last_name',
    'clean_suffix','control_flag', 'application_year_min', 'application_year_max', 'eod_year',
    'medical_school', 'birth_year', 
    'nih_last_name_counts', 'number_applications', 'internship_start', 'internship_end',
    'internship_hospital', 'residency_start', 'residency_end', 'residency_hospital', 
    'undergrad_year_grad', 'year_accepted', 'rejected', 'rejection_date',
    'address', 'city', 'state', 'zip_code']

# Columns placed last in the exported table.
RESEARCH_COLS = ['bob', 'ca', 'cc', 'clinical', 'cord', 'dbs', 'fifth', 'generation', 'honor_societies_first',
                 'honor_societies_fourth', 'honor_societies_second', 'honor_societies_third', 'ic', 'institute',
                  'is_star', 'nci', 'nei', 'nhi', 'nhli', 'niaid', 'niamd', 'niamdd', 'nichd', 'nichhd',
                 'nidr', 'niehs', 'nigms', 'nimh', 'nindb', 'ninds', 'oir', 
                  'pharm_ra', 'phd_year_grad', 'pi', 'program', 'ra',  'research', 'sa', 'sixth',
                 'supervisor', 'teaching', 'withdrawal']

In [163]:
# Columns of apps5 that belong to neither IMP_COLS nor RESEARCH_COLS, kept in
# their current order. A list comprehension always yields a list, which the
# later `IMP_COLS + col_ordered + RESEARCH_COLS` concatenation requires;
# funcy.remove returns a lazy iterator on Python 3.
col_ordered = [
    col for col in apps5.columns
    if col not in IMP_COLS and col not in RESEARCH_COLS]
print(col_ordered)

['aamc_id_2', 'application_date', 'application_year', 'application_year_1', 'application_year_2', 'application_year_3', 'associate_program_entered', 'birth_country_cd', 'birth_date', 'birth_state_cd', 'citizenship', 'clean_college', 'death_year', 'deg', 'degree_type', 'department', 'first_grant_year', 'institution_aamc_id', 'internship_hospital_ipfcode', 'internship_hospital_std', 'med_school', 'medschool_year_grad', 'original_medical_school', 'race', 'raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3', 'raw_uuid_4', 'raw_uuid_5', 'residency', 'residency_hospital_ipfcode', 'residency_hospital_std', 'sub_department']


In [164]:
# Reorder the columns: IMP_COLS first, then the remaining columns, then
# RESEARCH_COLS.
apps6 = apps5[IMP_COLS + col_ordered + RESEARCH_COLS]

In [168]:
# Keep only rows that have a cleaned last name.
apps7 = apps6[~pd.isnull(apps6.clean_last_name)]

In [None]:
# TODO: remove LICHTER (unfinished note; the visible cells do not perform this removal)

In [169]:
# Write the final table to APP_DATA_DIR without the index column.
# NOTE(review): this runs unconditionally; the OUTPUT_CSV flag is not consulted.
apps7.to_csv(os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_grant_standardized.csv'), index=False)

In [None]:
# NOTE(review): this never-executed scratch cell held the incomplete expression
# `apps7.apps7.cl`. The exported table has no 'apps7' column, so it would raise
# AttributeError and stop a Restart-and-Run-All; it was removed.