In [1]:
# import and merge NIH applicants with AAMC data to get AAMC unique id and use info to fill in name information
from collections import Counter
import funcy
from fuzzywuzzy import fuzz
import numpy as np 
import pandas as pd 
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, ATT_DATA_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES,
    PICKLE_DIR, AAMC_DATA_DIR)

from merging_functions import *

# NOTE(review): presumably meant to toggle CSV export, but no cell shown in this
# notebook reads it (the final save cell writes the CSV unconditionally) -- confirm.
OUTPUT_CSV = False 

# Name, birth-date and schooling columns.
# NOTE(review): not referenced by any cell shown in this notebook -- confirm it is still needed.
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']


# load autoreload extension
%load_ext autoreload
%autoreload 2

In [2]:
# Columns that lead the final output frame, in exactly this order; the remaining
# columns are appended after them, sorted by name, when the output is assembled.
IMPORTANT_COLS = [
    # ids
    NIH_ID, PERSON_ID, 'aamc_id',
    # application / entry years
    'application_year_min', 'application_year_max', 'eod_year',
    # cleaned name parts
    'clean_first_name', 'clean_middle_name', 'clean_last_name',
    # flags and application outcome
    'control_flag', 'time_period_flag', 'year_accepted',
    'rejected', 'rejection_date',
    # schooling and training
    'clean_college', 'medical_school',
    'residency_dates', 'residency_dates_NIH',
    'internship_dates', 'internship_dates_NIH',
    'medschool_year_grad', 'yobb',
    # applicant attributes
    'is_female', 'is_foreign', 'number_applications',
    # NIH-prefixed originals
    'NIH_first_name', 'NIH_middle_name', 'NIH_last_name', 'NIH_medical_school',
]

In [3]:
# Show which directory the AAMC files are read from.
print(AAMC_DATA_DIR)

~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/aamc_data


In [4]:
# import aamc data file
# Reads 'aamc_deduped_raw.csv' from AAMC_DATA_DIR into the `aamc` frame used below.
aamc = pd.read_csv(os.path.join(AAMC_DATA_DIR, 'aamc_deduped_raw.csv'))

# Ad-hoc spot check: `ln` is only used on the next line to pick one surname.
ln = 'LAWLEY'

# Display every AAMC row whose cleaned last name equals `ln`.
aamc.loc[aamc.clean_last_name==ln]

In [5]:
# Preview the first rows of the AAMC frame.
aamc.head()

Unnamed: 0.1,Unnamed: 0,aamc_id,aamc_id_2,birth_country_cd,birth_country_desc,birth_dt,birth_state_cd,clean_first_name,clean_last_name,clean_middle_initial,...,degree_country_1,degree_inst_1_desc,degree_type,degree_year_1,fname,ident_cat_desc,lname,mname,suffix_cd,yobb
0,0,13758974,13707332.0,,,,,KEITH,BALDWIN,,...,,,MD,,Keith,WHITE,Baldwin,,,
1,1,12416381,10967027.0,,,10/19/1946,,MARK,BROWN,S,...,,,MD,,Mark,,Brown,S,,1946.0
2,2,12222187,12154436.0,,,,,JAMES,COLLINS,,...,,,MD,1965.0,James,BLACK,Collins,,,
3,3,13217221,12603243.0,,,,,EDWARD,COOPER,C,...,,,MD,,Edward,WHITE,Cooper,C,,
4,4,12881504,10967675.0,,,,,RUDOLPH,CUMBERBATCH,,...,USA,HOWARD UNIVERSITY COLLEGE OF MEDICINE,MD,1959.0,Rudolph,BLACK,Cumberbatch,,,


In [6]:
# read in applicant/nih merged data set
# Reads 'fuzzy_all_apps_plus_NIH_info.csv' from APP_DATA_DIR into the `apps` frame used below.
apps = pd.read_csv(os.path.join(APP_DATA_DIR, 'fuzzy_all_apps_plus_NIH_info.csv' ))

In [7]:
# Hand-applied overrides: replace the cleaned first name with a single letter for two
# specific applicants (MYRON FALCHUK -> 'Z', KENNETH BLAYLOCK -> 'W').
# NOTE(review): the reason is not recorded here -- presumably so these applicants' name
# keys line up with their AAMC records in the merge below; confirm.
apps.loc[(apps.clean_last_name=='FALCHUK')&(apps.clean_first_name=='MYRON'), 'clean_first_name'] = 'Z'
apps.loc[(apps.clean_last_name=='BLAYLOCK')&(apps.clean_first_name=='KENNETH'), 'clean_first_name'] = 'W'

In [8]:
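# add middle_initial column -- NOTE: this cell contains no code; the next cell sorts both `apps` and `aamc` on `clean_middle_initial`, so that column must already be present in both input files.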

In [9]:
# try to merge on first, middle initial and last name
# sort_values returns new frames, so `apps` and `aamc` themselves are left in their original order.
apps2 = apps.sort_values(['clean_first_name', 'clean_middle_initial', 'clean_last_name', 'medschool_year_grad'])

aamc2 = aamc.sort_values(['clean_first_name', 'clean_middle_initial', 'clean_last_name', 'degree_year_1'])
# Add a `medical_school` column to the AAMC side by running clean_med_school over the raw
# institution description, giving both frames a column of the same name.
aamc2.loc[:, 'medical_school'] = aamc2.degree_inst_1_desc.apply(clean_med_school)

  result = lib.scalar_compare(x, y, op)


ISTANBUL ÜNIVERSITESI ISTANBUL TIP FAKüLTESI
TEHRAN UNIVERSITY OF MEDICAL SCIENCES SCHOOL OF MEDICINE
ISTANBUL ÜNIVERSITESI ISTANBUL TIP FAKüLTESI
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE


  return key in self._engine


UNIVERSITà DEGLI STUDI DI BOLOGNA SCUOLA DI MEDICINA E CHIRURGIA
UNIVERSIDAD NACIONAL AUTóNOMA DE MéXICO FACULTAD DE MEDICINA
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
INDIA FREE STANDING INST
NORTHEAST OHIO MEDICAL UNIVERSITY
PUSAN NATIONAL UNIVERSITY COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
SIRIRAJ HOSPITAL FACULTY OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
UNIVERSITY OF MEDICINE AND PHARMACY OF HO CHI MINH CITY
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
KING EDWARD MEDICAL UNIVERSITY
UNIVERSITà DI PISA FACOLTà DI MEDICINA E CHIRURGIA
THE UNIVERSITY OF TOLEDO COLLEGE OF MEDICINE
TEHRAN UNIVERSITY OF MEDICAL SCIENCES SCHOOL OF MEDICINE
UNIVERSIDADE FEDERAL DE MINAS GERAIS UFMG 

In [10]:
# Frequency of each cleaned last name on the AAMC side and on the applicant side.
# NOTE(review): aamc_counter is built but not used by any cell shown in this notebook.
aamc_counter = Counter(aamc2.clean_last_name.values)
apps_counter = Counter(apps2.clean_last_name.values)
apps2['last_name_counts'] = apps2.clean_last_name.apply(lambda x: apps_counter[x])
# The AAMC rows are counted against the *applicant* counter: the value is how many
# applicants share the row's last name (0 when none do). The next cell keeps only rows
# with a count above 0. NOTE(review): this looks intentional rather than a typo for
# aamc_counter -- confirm.
aamc2['last_name_counts'] = aamc2.clean_last_name.apply(lambda x: apps_counter[x])

In [11]:
# Keep only AAMC rows whose last name occurs among the applicants.
# .copy() makes aamc3 an independent frame: later cells add columns to it, and assigning
# into a boolean-filtered slice is what raises pandas' SettingWithCopyWarning.
aamc3 = aamc2[aamc2.last_name_counts > 0].copy()

In [12]:
# Build the merge key on both sides from first name, middle initial and last name.
# create_str_merge is applied once per row (axis=1) to those three columns; it is not
# defined in this notebook (presumably it comes from the merging_functions star import).
aamc3['fuzzy_merge_col'] = aamc3[
    ['clean_first_name', 'clean_middle_initial', 'clean_last_name']].apply(create_str_merge, axis=1)
apps2['fuzzy_merge_col'] = apps2[
    ['clean_first_name', 'clean_middle_initial', 'clean_last_name']].apply(create_str_merge, axis=1)
# Disabled alternative (closest-match lookup on a 500-row sample), kept for reference:
# match1 = df_get_closest_matches(apps2, aamc2.iloc[:500,:], 'fuzzy_merge_col', suffixes=['_x', '_y']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [13]:
# First character of each AAMC first name; rows without a first name stay NaN.
aamc3['clean_first_initial'] = aamc3['clean_first_name'].apply(
    lambda name: name[0] if pd.notnull(name) else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
# Exact join of applicants (left, suffix _x) to AAMC rows (right, suffix _y) on the
# first/middle-initial/last name key; only keys present on both sides survive.
match1 = apps2.merge(aamc3, how='inner', on='fuzzy_merge_col')

In [15]:
def check_match(row):
    """Decide whether one applicant/AAMC candidate pair is a match.

    `row` is indexed by column name and must provide the keys read below
    (`dob_app_diff`, the two graduation years, the `*_sim` scores, the `_x`/`_y`
    first initials, middle names and medical schools).

    Returns 1 for a match and 0 otherwise. The rules are applied in order and the
    first one that fires decides:
      1. application year minus birth year outside 20..30      -> 0
      2. both graduation years known and different              -> 0
      3. medical-school similarity below 90                     -> 0
      4. same first initial, both middle names truthy and
         middle-name similarity above 70                        -> 1
      5. first-name similarity below 70                         -> 0
      6. either medical school missing                          -> 1 only when the
         middle-name similarity is above 70 and the first initials agree, else 0
      7. otherwise                                              -> 1
    """
    age_at_application = row['dob_app_diff']
    if age_at_application < 20 or age_at_application > 30:
        return 0

    # A graduation-year mismatch only disqualifies when both years are present.
    if not (pd.isnull(row['medschool_year_grad']) or pd.isnull(row['degree_year_1'])):
        if row['medschool_year_grad'] != row['degree_year_1']:
            return 0

    if row['medical_school_sim'] < 90:
        return 0

    same_first_initial = row['clean_first_initial_x'] == row['clean_first_initial_y']

    # Shortcut: matching initial plus similar middle names is accepted before the
    # first-name similarity is even looked at.
    # NOTE(review): NaN is truthy, so a missing middle name does not fail the truthiness
    # test here; whether the pair is then accepted depends on what the similarity helper
    # returns for missing values -- confirm.
    if (same_first_initial
            and row['clean_middle_name_x']
            and row['clean_middle_name_y']
            and row['clean_middle_name_sim'] > 70):
        return 1

    if row['clean_first_name_sim'] < 70:
        return 0

    school_missing = pd.isnull(row['medical_school_x']) or pd.isnull(row['medical_school_y'])
    if school_missing:
        # Without a school on both sides, require middle-name and first-initial agreement.
        return 1 if (row['clean_middle_name_sim'] > 70 and same_first_initial) else 0

    return 1

In [16]:
def get_dob_app_diff(row):
    """Return `application_year` minus `yobb` for one row (any mapping with those keys)."""
    application_year = row['application_year']
    birth_year = row['yobb']
    return application_year - birth_year

In [17]:
# Application year minus birth year for every candidate pair, computed row by row.
match1['dob_app_diff'] = match1.apply(get_dob_app_diff, axis=1)

# Every listed column is paired with the same scorer, get_name_str_sim.
feature_dict = dict.fromkeys(
    ['clean_first_name', 'clean_middle_name', 'clean_last_name', 'medical_school'],
    get_name_str_sim)

# add_similarity_features is not defined in this notebook; later cells rely on it adding
# the `<column>_sim` scores and an `is_match` column (presumably produced by check_match).
match2 = add_similarity_features(
    match1, feature_dict, check_match, suffixes=['_x', '_y'])

def filter_one_match_per_group(df, dedupe_col, sim_cols):
    """Keep at most one accepted match per value of `dedupe_col`.

    Parameters
    ----------
    df : DataFrame
        Candidate pairs. Must contain `dedupe_col`, every column in `sim_cols` and an
        `is_match` column. The frame passed in is not modified.
    dedupe_col : str
        Column holding the id that may appear only once in the result.
    sim_cols : list of str
        Similarity columns used to rank competing rows, most important first.

    Returns
    -------
    DataFrame
        The rows with `is_match == 1`, sorted descending by `dedupe_col` and then by
        `sim_cols`, reduced to the first (best-ranked) row per id. An extra column
        `<dedupe_col>_duplicate` holds 1.0 where the same non-null id had already
        appeared on an earlier row of `df` and 0.0 otherwise.
    """
    dup_flag = '{}_duplicate'.format(dedupe_col)

    # Work on a private copy so the flag column never leaks into the caller's frame;
    # callers pass filtered slices, and writing into those triggers SettingWithCopyWarning.
    df = df.copy()

    # "Seen on an earlier row" is what duplicated(keep='first') computes. Missing ids
    # are never flagged, because a missing value does not compare equal to anything.
    ids = df[dedupe_col]
    df[dup_flag] = (ids.duplicated(keep='first') & ids.notnull()).astype(float)

    # Rank accepted rows best-first within each id, then keep the top row per id.
    df_matches = df[df['is_match'] == 1].sort_values([dedupe_col] + sim_cols, ascending=False)
    return df_matches.drop_duplicates([dedupe_col], keep='first')

In [18]:
# Two de-duplication passes over the accepted name-key matches: first at most one row
# per applicant (PERSON_ID), then at most one row per AAMC record ('aamc_id').
# Competing rows are ranked by first-name, last-name and medical-school similarity.
match3 = filter_one_match_per_group(match2, PERSON_ID, sim_cols=['clean_first_name_sim', 'clean_last_name_sim', 'medical_school_sim'])
match4 = filter_one_match_per_group(match3, 'aamc_id', sim_cols=['clean_first_name_sim', 'clean_last_name_sim', 'medical_school_sim'])
match4.shape

(1747, 142)

In [19]:
# Rows on each side whose id is not among the ids matched so far.
# get_nonmatched is not defined in this notebook; the saved output suggests it also
# prints the shape of what it returns -- confirm.
nm_aamc = get_nonmatched(df=aamc3, id_colname='aamc_id', matched_ids=match4.aamc_id.values)

nm_apps = get_nonmatched(df=apps2, id_colname=PERSON_ID, matched_ids=match4[PERSON_ID].values)

(12911, 25)
(2359, 110)


In [20]:
# Second pass for everyone still unmatched: pair applicants (suffix _x) with AAMC rows
# (suffix _y) that share a last name and whose graduation year equals the AAMC degree year.
# NOTE(review): pandas matches null keys against each other, so pairs where both years
# are missing are kept as candidates too -- confirm that is wanted.
first_last_matches = nm_apps.merge(
    nm_aamc,
    how='inner',
    left_on=['clean_last_name', 'medschool_year_grad'],
    right_on=['clean_last_name', 'degree_year_1'],
)
print(first_last_matches.shape)

(2365, 134)


In [21]:
# Application year minus birth year for every candidate pair, computed row by row.
first_last_matches['dob_app_diff'] = first_last_matches.apply(get_dob_app_diff, axis=1)

# clean_last_name was an exact merge key for these candidates, so no last-name
# similarity is requested here.
feature_dict = {
    'clean_first_name': get_name_str_sim,
    'clean_middle_name': get_name_str_sim,
    'medical_school': get_name_str_sim,
}

first_last_matches2 = add_similarity_features(first_last_matches, feature_dict, check_match, suffixes=['_x', '_y'])
# Keep the accepted pairs. .copy() makes fm2 an independent frame: later cells write to it
# (manual is_match overrides, de-duplication flags), and writing into a boolean-filtered
# slice is what raises pandas' SettingWithCopyWarning.
fm2 = first_last_matches2[first_last_matches2.is_match==1].copy()

In [41]:
# mark GREGORY WALSH and WILLIAM WILLIAMS as NOT matches
# Each person is tested as an exact (first name, last name) pair. Testing the first names
# and the last names with two independent isin() calls would also un-match the unintended
# combinations GREGORY WILLIAMS and WILLIAM WALSH.
fm2.loc[
    ((fm2.clean_first_name_x == 'GREGORY') & (fm2.clean_last_name == 'WALSH')) |
    ((fm2.clean_first_name_x == 'WILLIAM') & (fm2.clean_last_name == 'WILLIAMS')),
    'is_match'] = 0

In [42]:
# Same two de-duplication passes as for the name-key matches (one row per applicant,
# then one row per AAMC record). There is no last-name similarity column here, so rows
# are ranked by first-name and medical-school similarity only.
fm3 = filter_one_match_per_group(fm2, PERSON_ID, sim_cols=['clean_first_name_sim', 'medical_school_sim'])
fm4 = filter_one_match_per_group(fm3, 'aamc_id', sim_cols=['clean_first_name_sim', 'medical_school_sim'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [43]:
# Size of the last-name / graduation-year match set after both de-duplication passes.
fm4.shape

(331, 141)

In [44]:
# Columns found in only one of the two match frames: first those unique to fm4, then
# those unique to match4.
print(sorted(set(fm4.columns).difference(match4.columns)))
print(sorted(set(match4.columns).difference(fm4.columns)))

['fuzzy_merge_col_x', 'fuzzy_merge_col_y']
['clean_last_name_sim', 'clean_last_name_x', 'clean_last_name_y', 'fuzzy_merge_col']


In [45]:
def consolidate_cols(row):
    """Collapse several versions of one field into a single value.

    `row` is a pandas Series holding the competing values (this is what
    `DataFrame.apply(..., axis=1)` passes in). The Series itself is not modified.

    Returns
    -------
    - NaN when nothing usable remains after dropping nulls and the placeholder
      strings 'UNKNOWN' and 'OTHER';
    - the longest string when every remaining value is a string (the first one wins
      among equally long strings);
    - otherwise the first remaining value.
    """
    NULL_STRINGS = ['UNKNOWN', 'OTHER']
    # mask() returns a new Series, so the placeholders are blanked out without writing
    # into the caller's data.
    cleaned = row.mask(row.isin(NULL_STRINGS))
    non_nulls = cleaned[~pd.isnull(cleaned)].values
    if len(non_nulls) == 0:
        return np.nan
    # Only compare lengths when every candidate is a string; a stray number next to a
    # string would otherwise make len() raise.
    if all(isinstance(value, str) for value in non_nulls):
        # max() keeps the first of several equally long strings.
        return max(non_nulls, key=len)
    # if not all strings, just return first value
    return non_nulls[0]

In [46]:
# The name-key matches carry the last name twice (_x and _y); collapse the pair
# into a single `clean_last_name` column using consolidate_cols, applied per row.
match4.loc[:, 'clean_last_name'] = match4[[
        'clean_last_name_x', 'clean_last_name_y']].apply(consolidate_cols, axis=1)

In [47]:
# Drop the columns that exist in only one of the two match frames so that both
# have the same column set before they are stacked.
match5 = match4.drop(['fuzzy_merge_col', 'clean_last_name_sim', 'clean_last_name_x', 'clean_last_name_y'], axis=1)
fm5 = fm4.drop(['fuzzy_merge_col_x', 'fuzzy_merge_col_y'], axis=1)

In [48]:
# Stack the name-key matches on top of the last-name / graduation-year matches.
# NOTE(review): the index is not reset, so index labels from the two frames may repeat
# in the result -- confirm nothing downstream relies on unique labels.
all_matches = pd.concat([match5, fm5], axis=0)

In [49]:
# Single medical-school column: start from the applicant-side value (_x) ...
all_matches['medical_school'] = all_matches['medical_school_x']
# ... and, where that is missing, fall back to the raw AAMC institution description.
mask = pd.isnull(all_matches['medical_school'])
all_matches.loc[mask, 'medical_school'] = all_matches.loc[mask, 'degree_inst_1_desc']
# Disabled earlier variant of the fallback, kept for reference:
# mask = pd.isnull(all_matches['degree_inst_1_desc']) & pd.isnull(all_matches['medical_school'])
# all_matches.loc[mask, 'medical_school'] = all_matches.loc[mask, 'medical_school_x']
# Rows still without a medical school. NOTE(review): this mask is not used by any
# later cell shown in this notebook.
mask = pd.isnull(all_matches['medical_school'])

In [50]:
# The per-side school columns and their similarity score are no longer needed now that
# the combined `medical_school` column exists.
all_matches2 = all_matches.drop(['medical_school_x', 'medical_school_y', 'medical_school_sim'], axis=1)

In [51]:
# consolidate_merge_cols is not defined in this notebook; presumably it collapses each
# remaining `_x` / `_y` column pair into one column. The saved output lists the base
# names it handled -- confirm against merging_functions.
all_matches3 = consolidate_merge_cols(all_matches2, ['_x', '_y'], [])

['clean_first_initial', 'clean_first_name', 'clean_middle_initial', 'clean_middle_name', 'last_name_counts']


In [52]:
# Wherever the AAMC record supplies a degree year, let it overwrite the applicant-side
# medical-school graduation year; rows without an AAMC degree year keep their own value.
missing_degree_year = all_matches3['degree_year_1'].isnull()
all_matches3.loc[~missing_degree_year, 'medschool_year_grad'] = (
    all_matches3.loc[~missing_degree_year, 'degree_year_1'])



In [53]:
# Applicants that ended up without any AAMC match, using the same keyword call form as
# the earlier get_nonmatched calls.
unmatched_apps3 = get_nonmatched(
    df=apps2, matched_ids=all_matches3[PERSON_ID], id_colname=PERSON_ID)

(2028, 110)


In [54]:
# Put matched and unmatched applicants back into one frame, then remove helper columns
# left over from matching (duplicate flags, similarity scores, the merge key) together
# with a few columns that are not carried forward.
# NOTE(review): 'person_uuid_duplicate' is spelled out literally, which assumes
# PERSON_ID == 'person_uuid' -- confirm.
all_matches4 = pd.concat([all_matches3, unmatched_apps3], axis=0).drop(
    [
        'aamc_id_duplicate',
        'Unnamed: 0',
        'clean_first_name_sim',
        'clean_middle_name_sim',
        'fuzzy_merge_col',
        'lab_brch',
        'person_uuid_duplicate',
        'other',
    ],
    axis=1)


In [55]:
# Sanity check: list every row whose PERSON_ID occurs more than once (keep=False marks
# all occurrences). The saved output below is empty, i.e. no applicant is duplicated.
# NOTE(review): the displayed id column is the literal 'person_uuid', not PERSON_ID.
all_matches4.loc[all_matches4.duplicated(PERSON_ID, keep=False), NAME_COLS+['person_uuid']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,person_uuid


In [56]:
# Give the raw AAMC columns explicit aamc_* names.
all_matches4 = all_matches4.rename(columns={
    'fname': 'aamc_first_name',
    'mname': 'aamc_middle_name',
    'lname': 'aamc_last_name',
    'degree_inst_1_desc': 'aamc_medical_school',
    'degree_year_1': 'aamc_medschool_year_grad',
})

# Every column that is not in IMPORTANT_COLS, in alphabetical order.
other_cols = sorted(col for col in all_matches4.columns if col not in IMPORTANT_COLS)

# Order columns so the important ones come first, sort the rows by name and latest
# application year, and drop the remaining bookkeeping columns.
all_matches5 = (
    all_matches4[IMPORTANT_COLS + other_cols]
    .sort_values(['clean_last_name', 'clean_first_name', 'application_year_max'])
    .drop(['is_match', 'last_name_counts', 'counts', 'to_drop'], axis=1)
)


In [60]:
# Persist the final frame twice: as CSV under APP_DATA_DIR and as a pickle under PICKLE_DIR.
# NOTE(review): the CSV is written unconditionally; the OUTPUT_CSV flag from the first
# cell is not consulted here.
# NOTE(review): the pickle file name is spelled 'aaamc_apps_nih.p' (three a's) while the
# CSV is 'aamc_apps_nih.csv' -- check what downstream readers expect before renaming.
# NOTE(review): this cell's execution count (60) is higher than the next cell's (59),
# so the cells were last run out of order.
all_matches5.to_csv(os.path.join(APP_DATA_DIR, 'aamc_apps_nih.csv'), index=False)
all_matches5.to_pickle(os.path.join(PICKLE_DIR, 'aaamc_apps_nih.p'))

In [59]:
# Spot check of the two surnames touched by the manual "NOT matches" override:
# show their name columns and whether an aamc_id was assigned.
all_matches5.loc[all_matches5.clean_last_name.isin(['WALSH', 'WILLIAMS']), NAME_COLS+['aamc_id']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,aamc_id
3825,GREGORY,O,WALSH,
1356,JOHN,VINCENT,WALSH,10823147.0
494,GARY,MURRAY,WILLIAMS,10854755.0
858,GEORGE,WM,WILLIAMS,12194106.0
3964,HIBBARD,DR,WILLIAMS,
3965,JONATHAN,R,WILLIAMS,
1076,LYSLE,WESTLEY,WILLIAMS,10864236.0
1375,REDFORD,BROWN,WILLIAMS,10870680.0
3968,ROGER,RICHARDS,WILLIAMS,
3969,TEMPLE,W,WILLIAMS,
