In [392]:
# read in known applicant files, dedupe and try to merge with applicants file
from collections import Counter
import difflib
import uuid
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os

APP_DATA_DIR = os.path.abspath('Data/applicant_data')
ATT_DATA_DIR = os.path.abspath('Data/attendees_data')
CARD_DATA_DIR = os.path.abspath('Data/applicant_data/raw_card_data')

# from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
#                                      clean_names, has_award, has_suffix, get_suffix, replace_last_name,  
#                                      is_year_range, str_sim, clean_med_school, clean_std_college_name)
from data_cleaning_functions import correct_mispellings, long_form_date, clean_names, clean_med_school

In [393]:
# id column that links back to raw applicant data file
RAW_CARD_ID = 'raw_uuid'

# column where the raw id information is stored
RAW_INDEX_IDS = 'raw_card_ids'

# try to get one id per unique applicant in the dataset
PERSON_ID = 'person_uuid'
# id per deduped application-person - if someone applied multiple times, they will have multiple ids
PERSON_APPLICATION_ID = 'person_app_uuid' 
NIH_ID = 'dno'

APPLICANT_SUFFIX = '_ap'
ATTENDEE_SUFFIX = '_at'

%load_ext autoreload
%autoreload 2

%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [394]:
NAME_COLS = ['clean_middle_name', 'clean_last_name', 'clean_first_name']

MED_TRAINING_COLS = ['res_dates', 'intern_dates', 'residency_hospital', 'internship_hospital', 'medical_school', 'residency']


In [568]:
# import NIH raw data set
NIH_raw = pd.read_csv(os.path.join(ATT_DATA_DIR, 'NIH_attendee_deduped_raw.csv'))

In [576]:
# drop from the data set all people with eod years > 1980
NIH = NIH_raw.loc[NIH_raw.eod_year<1980, :] 

In [577]:
# import cleaned, deduped applicant data set in wide form (multiple app dates as columns)
apps = pd.read_pickle(os.path.join(APP_DATA_DIR, 'person_application_date_wide.p'))

In [578]:
apps.loc[:, 'res_dates'] = apps['residency_year(s)'].apply(long_form_date)

apps.loc[:, 'intern_dates'] = apps['internship_year(s)'].apply(long_form_date)

Str. Medicine
Medicine


In [579]:
apps2 = apps.rename(
    columns={'residency_type': 'residency', 'internship_hospital_1': 'internship_hospital'})

In [580]:
string_med_cols = ['medical_school', 'residency_hospital', 'internship_hospital', 'residency']

# apply string cleaning function to each of the string medical info columns
apps2.loc[:, string_med_cols] = apps2[string_med_cols].applymap(clean_names)

In [581]:
to_remove = ['TERRECE', 'FRED', 'LAURENCE',
             'CUONO', 'DEFRENZE', 'JEFFERY', 'FINKLEMAN', 'SHERRAD', 'ANSCHNETZ', 'MARC', 'JENSON', 'KASTI', 
            'ADELBERT', 'RITCHARD', 'MANSFORD', 'DEFRENZO', 'DROBIN', 'HAMES', 'KREUZ', 'JERROLD', 'MANEUSI',
            'UNGARO']
to_replace = ['TERRENCE', 'FREDERICK', 'LAWRENCE',
              'CUOMO', 'DEFRONZO', 'JEFFREY', 'FINKELMAN', 'SHERRARD', 'ANSCHUETZ', 'MARCUS', 'JENSEN', 'KASTL',
              'ALBERT', 'RITCHARD', 'MANIFORD', 'DEFRONZO', 'DROBIS', 'JAMES', 'KRUEZ', 'JERROD', 'MANCUSI',
              'UNGARO']

correct_name_mispellings_fnc = funcy.rpartial(correct_mispellings, to_remove, to_replace)

apps2.loc[:, 'clean_last_name'] = apps2.clean_last_name.apply(correct_name_mispellings_fnc)
apps2.loc[:, 'clean_first_name'] = apps2.clean_first_name.apply(correct_name_mispellings_fnc)
apps2.loc[:, 'clean_medical_school'] = apps2['medical_school']

In [582]:
# there are a bunch of duplicates in apps, where application year is the same, but first name is missing
name_dups = apps2.loc[
    apps2.duplicated(
        ['clean_last_name', 'clean_medical_school', 'application_year'], keep=False), NAME_COLS+MED_TRAINING_COLS]

In [583]:
apps3 = apps2.sort_values(['clean_last_name', 'clean_first_name']).drop_duplicates(
    ['clean_last_name', 'clean_medical_school', 'application_year'])

In [584]:
# there are also some duplicates on middle and last name, but first is missing on one of the dups, 
# so we need to drop these
dups_mask = apps3.duplicated(['clean_last_name', 'clean_middle_name', 'application_year'], keep=False)

In [585]:
apps4 = apps3.loc[((~dups_mask) | ((dups_mask) & (~pd.isnull(apps3.clean_first_name)))), :]

In [586]:
print apps4.shape
print apps3.shape

(4218, 76)
(4238, 76)


In [587]:
apps4.loc[apps4.clean_last_name=='HAAKENSTAD', NAME_COLS]

Unnamed: 0,clean_middle_name,clean_last_name,clean_first_name
352,OTTO,HAAKENSTAD,ALAN


In [588]:
apps3.loc[apps2.clean_last_name=='HAAKENSTAD', NAME_COLS+['clean_medical_school', 'address', 'city']]

Unnamed: 0,clean_middle_name,clean_last_name,clean_first_name,clean_medical_school,address,city
352,OTTO,HAAKENSTAD,ALAN,NORTH,1718 7th Street,South Fargo
1617,OTTO,HAAKENSTAD,,PENNSYLVANIA,1718 7th Street South,Fargo


In [589]:
# function to go in and correct some of the name mispellings in both data sets
# MUTATING FUNCTION
def change_names(df, selection_type, selection_value, to_change_type, to_change_values):
    for t, v in zip(to_change_type, to_change_values):
        print t, v
        df.loc[df[selection_type]==selection_value, t] = v

In [590]:
%%capture
change_names(
    apps2, 'clean_last_name', 'CHESEBRO', ['clean_first_name', 'clean_middle_name'], ['BRUCE', 'WILCOX'])
change_names(
    apps2, 'clean_last_name', 'GALANTER', ['clean_first_name', 'clean_middle_name'], ['MARC', 'I'])
change_names(
    apps2, 'clean_last_name', 'BEAN', ['clean_first_name', 'clean_middle_name', 'clean_medical_school'], ['SIDNEY', 'CHARLES', 'WAKE_FOREST'])
change_names(
    apps2, 'clean_last_name', 'EILER', ['clean_first_name', 'clean_middle_name'], ['DONALD', 'MARTIN'])
change_names(
    apps2, 'clean_last_name', 'FALCHUK', ['clean_first_name', 'clean_middle_name'], ['DONALD', 'MARTIN'])
change_names(
    apps2, 'clean_last_name', 'BOYD', ['clean_first_name', 'clean_middle_name'], ['MICHAEL', 'RAY'])
change_names(
    apps2, 'clean_last_name', 'CHAPMAN', 
    ['clean_first_name', 'clean_middle_name', 'clean_medical_school'], ['STANLEY', 'WILLETS', 'ROCHESTER'])
change_names(apps2, 'clean_last_name', 'DANFORTH', ['clean_first_name'], ['DAVID'])
change_names(apps2, 'clean_last_name', 'HUNT', ['clean_first_name', 'clean_middle_name'], ['ROBERT', 'D'])
change_names(apps2, 'clean_last_name', 'KARK', ['clean_first_name', 'clean_middle_name'], ['ROBERT', 'ADRIAN'])
change_names(apps2, 'clean_last_name', 'KEBABIAN', ['clean_first_name', 'clean_middle_name'], ['JOHN', 'WILLIS'])
change_names(apps2, 'clean_last_name', 'KNOPF', ['clean_first_name', 'clean_middle_name'], ['HARRY', 'LOUIS'])
change_names(apps2, 'clean_last_name', 'KROLIKOWSKI', ['clean_first_name', 'clean_middle_name'], ['FRANCIS', 'JOHN'])
change_names(apps2, 'clean_last_name', 'KASTL', ['clean_first_name', 'clean_middle_name'], ['DAVID', 'GENE'])
change_names(apps2, 'clean_first_name', 'JAN', ['clean_last_name'], ['KNOWLER'])
change_names(apps2, 'clean_last_name', 'KLAVEMAN', ['clean_last_name'], ['KLAEVEMAN'])
change_names(apps2, 'clean_last_name', 'MATHEW', ['clean_last_name'], ['MATTHEW'])

apps2.loc[apps2.clean_last_name=='CHESEBRO', ['clean_first_name']] = 'BRUCE'
apps2.loc[apps2.clean_last_name=='CHESEBRO', ['clean_middle_name']] = 'WILCOX'

In [591]:
print NIH.shape
print  apps4.shape

(3025, 29)
(4218, 76)


In [592]:
# resave apps 4 data set as deduped apps
apps4.sort_values('clean_last_name').to_csv(
    os.path.join(APP_DATA_DIR, 'person_application_date_wide_deduped.csv'), index=False)

In [685]:
apps4.application_year.value_counts().to_csv(os.path.join(APP_DATA_DIR, 'unique_apps_application_year_counts.csv'))
NIH.eod_year.value_counts().to_csv(os.path.join(ATT_DATA_DIR, 'unique_att_eod_year_counts.csv'))

In [594]:
# After cleaning apps2 to match cleaning in Clean NIH Applicant notebook, we try to start merging

In [595]:
exact_name_matches = pd.merge(left=NIH, right=apps4, left_on=['clean_first_name', 'clean_middle_name', 'clean_last_name'], right_on=[
        'clean_first_name', 'clean_middle_name', 'clean_last_name'], how='inner', suffixes=[ATTENDEE_SUFFIX, APPLICANT_SUFFIX])

In [596]:
not_matched_apps = apps4.loc[~apps4[PERSON_ID].isin(exact_name_matches[PERSON_ID]), :]
not_matched_attendees = NIH.loc[~NIH[NIH_ID].isin(exact_name_matches[NIH_ID]), :]

first_last_matches = pd.merge(left=not_matched_attendees, right=not_matched_apps, left_on=['clean_first_name', 'clean_last_name'], right_on=[
        'clean_first_name', 'clean_last_name'], how='inner', suffixes=[ATTENDEE_SUFFIX, APPLICANT_SUFFIX])

In [597]:
print first_last_matches.shape
print not_matched_apps.shape
print not_matched_attendees.shape
print exact_name_matches.shape

(106, 103)
(2392, 76)
(1247, 29)
(1794, 102)


In [598]:
# create counter objects for each data set that count the number of times the last name occurs in either data set
attendees_counter = Counter(NIH.clean_last_name)
apps_counter = Counter(apps4.clean_last_name)

In [599]:
# for the first and last matches, where the last name only occurs 1x in each data set, set confidence flag to 1
first_last_matches.loc[:, 'last_name_counts'] = first_last_matches.clean_last_name.apply(
    lambda x: apps_counter[x] + attendees_counter[x])

In [600]:
def define_med_school_junk(seq_elem):
    # difflib sequence matcher first element can take a fnc that inputs a
    # sequence element and returns True if it should be considered Junk
    return seq_elem in ['SCHOOL OF MEDICINE', 'UNIVERSITY OF', 'MEDICAL COLLEGE OF',
                        'MEDICAL', 'SCHOOL', 'UNIVERSITY', 'COLLEGE', 'OF', 'THE', 'MEDICINE', 'CENTER', 'DENTISTRY']

In [601]:
def str_sim_fnc(row, index1, index2, junk_fnc=None):
    s1 = row[index1]
    s2 = row[index2]
    has_null = any(map(lambda x: pd.isnull(x), [s1, s2]))
    if has_null:
        return np.nan
    if s1 in s2 or s2 in s1:
        return 1
    return difflib.SequenceMatcher(
        junk_fnc, s1, s2).ratio()

In [602]:
get_str_sim = funcy.rpartial(str_sim_fnc, 'clean_medical_school_ap', 'clean_medical_school_at', define_med_school_junk)

In [603]:
first_last_matches.loc[:, 'med_school_sim'] = first_last_matches.apply(get_str_sim, axis=1)

In [604]:
first_last_matches.med_school_sim.describe()

count    91.000000
mean      0.859064
std       0.310438
min       0.000000
25%            NaN
50%            NaN
75%            NaN
max       1.000000
Name: med_school_sim, dtype: float64

In [605]:
exact_name_matches.loc[:, 'med_school_sim'] = exact_name_matches.apply(get_str_sim, axis=1)

In [606]:
exact_name_matches.med_school_sim.describe()

count    1634.000000
mean        0.944874
std         0.193802
min         0.000000
25%              NaN
50%              NaN
75%              NaN
max         1.000000
Name: med_school_sim, dtype: float64

In [607]:
# exact_name_matches[['clean_medical_school{}'.format(APPLICANT_SUFFIX), 'clean_medical_school{}'.format(ATTENDEE_SUFFIX), 
#                     'medical_school{}'.format(APPLICANT_SUFFIX), 'med_school_sim', 'medical_school_at']]

In [608]:
def get_years(dt_str):
    # return int date strings
    try:
        single_year = (dt_str.find('-') == -1)
        if single_year:
            return [dt_str]
        return dt_str.split('-')
    except ValueError as e:
        print dt_str
        return np.nan

def get_dts_sim(row, index1, index2):
    dt1 = row[index1]
    dt2 = row[index2]
    has_null = any(map(lambda x: pd.isnull(x), [dt1, dt2]))
    if has_null:
        return np.nan
    dates_tup1 = get_years(dt1)
    dates_tup2 = get_years(dt2)
    # if dates match exactly, return 1
    unique_dts = set(funcy.concat(dates_tup1, dates_tup2))
    return (len(unique_dts) < (len(dates_tup1) + len(dates_tup2)))

In [609]:
get_intern_dts_sim = funcy.rpartial(get_dts_sim, 
                                    'intern_dates{}'.format(APPLICANT_SUFFIX), 
                                    'intern_dates{}'.format(ATTENDEE_SUFFIX))

In [610]:
first_last_matches.loc[:, 'internship_sim'] = first_last_matches.apply(get_intern_dts_sim, axis=1)

In [611]:
# drop all first and last name matches where the med school sim < .6 
# for people where only 1 instance of last name in each data set, say they match
first_last_matches.loc[(first_last_matches['last_name_counts'] < 3), 'match_score'] = 1

# first_last_matches.loc[((first_last_matches['last_name_counts'] < 3) & (
#             pd.isnull(first_last_matches['med_school_sim'])) & (first_last_matches['internship_sim']==True)), 'match_score'] = 1

first_last_matches.loc[((first_last_matches['last_name_counts'] > 2) & (
            first_last_matches['med_school_sim'] > .5) & (first_last_matches['internship_sim']==True)), 'match_score'] = 1

first_last_matches.loc[((first_last_matches['last_name_counts'] > 2) & (
            first_last_matches['med_school_sim'] > .5) & (pd.isnull(first_last_matches['internship_sim']))), 'match_score'] = 1

In [612]:
first_last_matches.loc[((first_last_matches['last_name_counts'] > 1) & (
            first_last_matches['med_school_sim'] < .5) & (first_last_matches['internship_sim']!=True)), 'match_score'] = 0


In [613]:
first_last_matches.loc[pd.isnull(first_last_matches.match_score), 'match_score'] = 0

In [614]:
exact_name_matches.loc[:, 'match_score'] = 1

In [615]:
def reconcile_app_attendee_col(row, col_name, suffix_priority=None):
    # given two columns, reconcile between the app and attendee data set
    # if both values are present, take values in order of the suffix priority list
    if not suffix_priority:
        suffix_priority = [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]
    s1 = row['{}{}'.format(col_name, suffix_priority[0])]
    s2 = row['{}{}'.format(col_name, suffix_priority[1])]
    non_nulls = funcy.remove(lambda x: pd.isnull(x), [s1, s2])
    if not non_nulls:
        return np.nan
    return non_nulls[0] 
    

In [616]:
first_last_matches.loc[:, 'clean_middle_name'] = first_last_matches.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_middle_name', [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]), axis=1)

first_last_matches2 = first_last_matches.drop(
    ['clean_middle_name{}'.format(
            x) for x in [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]]+['internship_sim', 'last_name_counts'], axis=1)

In [617]:
len(sorted(first_last_matches2.columns))==len(sorted(exact_name_matches.columns))

True

In [618]:
# bunch of matches on last name only
# Note that the case where count occurs 2x in one data set and not the other is ok, because nothing to merge on

apps4.loc[:, 'last_name_counts'] = apps4.clean_last_name.apply(
    lambda x: apps_counter[x] + attendees_counter[x])

NIH.loc[:, 'last_name_counts'] = NIH.clean_last_name.apply(
    lambda x: apps_counter[x] + attendees_counter[x])

In [619]:
# drop all people without a good match score
# for non matches, do visual check, look at years 67-75, create a score of reliability they are control
full_matches = pd.concat([first_last_matches2.loc[first_last_matches.match_score==1, :], exact_name_matches], axis=0)
# if application date year after eod year, drop
# if double match and application date shows up 1x, drop

In [620]:
full_matches.loc[:, 'match_score'] = 1

In [621]:
# non matches
not_matched_apps = apps4.loc[~apps2[PERSON_ID].isin(full_matches[PERSON_ID]), :]
not_matched_attendees = NIH.loc[~NIH[NIH_ID].isin(full_matches[NIH_ID]), :]

NIH.shape

not_matched_attendees.shape

(1167, 30)

In [622]:
print not_matched_attendees.loc[not_matched_attendees.last_name_counts< 3, :].shape

# there are a bunch of attendees who were not able to match on first and last name
# for last name, if one instance in data set, try match
last_matches = pd.merge(left=not_matched_apps.loc[not_matched_apps.last_name_counts<3, :],
                    right=not_matched_attendees.loc[not_matched_attendees.last_name_counts < 3, :],
                        left_on='clean_last_name', right_on='clean_last_name', how='inner', suffixes=[
        APPLICANT_SUFFIX, ATTENDEE_SUFFIX])

last_matches.loc[:, 'med_school_sim'] = last_matches.apply(get_str_sim, axis=1)
last_matches.loc['match_score', :] = 1

(1000, 30)


In [623]:
# need to reconcile the first and middle names on the last name matches data set
last_matches.loc[:, 'clean_middle_name'] = last_matches.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_middle_name', [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]), axis=1)

last_matches.loc[:, 'clean_first_name'] = last_matches.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_first_name', [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]), axis=1)

last_matches2 = last_matches.drop(
    ['clean_{}_name{}'.format(
            a, b) for b in [ATTENDEE_SUFFIX, APPLICANT_SUFFIX] for a in ['first', 'middle']], axis=1)


In [624]:
# read in manual match crosswalk
m_matches = pd.read_excel(os.path.join(ATT_DATA_DIR, 'not_matched_attendees_manual_matches.xlsx')).dropna(subset=['merge_{}'.format(PERSON_ID)], axis=0)

In [625]:
# in this file, the clean_first_name, clean_middle name-- clean_medical_school are from the NIH data set(also dno)
# there is 'merge_clean_first_name--clean_last_name' which is from the applicant data set
# use these as merge values, because although merge person uuid exists, better to merge on names
# if data set is recreated merge person uuid will change

In [626]:
# after merging into the applicant info, I want to overwrite the clean_name column from the atttendee data set
# it was wrong most of the time
# also drop the _merge columns except for is_female

In [627]:
manual_app_df = pd.merge(left=m_matches, right=apps4, left_on=[
        'merge_clean_first_name', 'merge_clean_middle_name', 'merge_clean_last_name', 'merge_application_year'], 
                        right_on=[
        'clean_first_name', 'clean_middle_name', 'clean_last_name', 'application_year'],
                        how='inner', suffixes=[ATTENDEE_SUFFIX, APPLICANT_SUFFIX])

In [628]:
# replace clean med school values with that from applicant data set - it is more standardized
manual_app_df.loc[:, 'clean_medical_school'] = manual_app_df.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_medical_school', [APPLICANT_SUFFIX, ATTENDEE_SUFFIX]), axis=1)

In [629]:
manual_app3 = pd.merge(
    left=manual_app_df, right=NIH, left_on=['clean_first_name_at', 'clean_middle_name_at', 'clean_last_name_at'],
    right_on=['clean_first_name', 'clean_middle_name', 'clean_last_name'], how='inner')
# manual_app3 = pd.merge(
#     left=manual_app2, right=NIH, on=['dno'], how='inner')

In [630]:
# replace name with info from attendees data set
manual_app3.loc[:, 'clean_first_name'] = manual_app3.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_first_name', [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]), axis=1)
manual_app3.loc[:, 'clean_middle_name'] = manual_app3.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_middle_name', [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]), axis=1)
manual_app3.loc[:, 'clean_last_name'] = manual_app3.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_last_name', [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]), axis=1)

In [631]:
# manual_app3[['clean_last_name', 'clean_last_name_ap', 'clean_last_name_at']]

# sorted(manual_app3.columns)

# sorted(manual_app3.columns)

In [632]:
# drop _ap and _at and merge_ columns
to_drop_cols = [c for c in manual_app3.columns if 
                    c.endswith(APPLICANT_SUFFIX) or c.endswith(ATTENDEE_SUFFIX) or c.startswith('merge_')]
print to_drop_cols

[u'clean_first_name_at', u'clean_middle_name_at', u'clean_last_name_at', u'clean_medical_school_at', u'merge_person_uuid', u'merge_application_year', u'merge_clean_first_name', u'merge_clean_middle_name', u'merge_clean_last_name', 'clean_first_name_ap', 'clean_middle_name_ap', 'clean_last_name_ap', 'clean_medical_school_ap']


In [633]:
manual_app4 = manual_app3.drop(to_drop_cols+['last_name_counts_x', 'last_name_counts_y'], axis=1)

In [634]:
# take medical school from the applicant data set
manual_app4.loc[:, 'clean_medical_school'] = manual_app4.apply(
    funcy.rpartial(reconcile_app_attendee_col, 'clean_medical_school', ['_x', '_y']), axis=1)

In [635]:
manual_app4a = manual_app4.drop(['clean_medical_school_x', 'clean_medical_school_y'], axis=1)

In [636]:
# reconcile the x and y columns, giving priority to info in the NIH data set _y
cols_to_reconcile = [c.split('_x')[0] for c in manual_app4a.columns if c.endswith('_x')]
print cols_to_reconcile

[u'dno', u'eod_year', 'citizenship', 'medical_school', 'residency_hospital', 'residency', 'internship_hospital', 'res_dates', 'intern_dates']


In [637]:
for c in cols_to_reconcile:
    manual_app4a.loc[:, c] = manual_app4a.apply(
        funcy.rpartial(reconcile_app_attendee_col, c, ['_y', '_x']), axis=1)

In [638]:
cols_to_drop = funcy.flatten(('{}_x'.format(c), '{}_y'.format(c)) for c in cols_to_reconcile)
print cols_to_drop

['dno_x', 'dno_y', 'eod_year_x', 'eod_year_y', 'citizenship_x', 'citizenship_y', 'medical_school_x', 'medical_school_y', 'residency_hospital_x', 'residency_hospital_y', 'residency_x', 'residency_y', 'internship_hospital_x', 'internship_hospital_y', 'res_dates_x', 'res_dates_y', 'intern_dates_x', 'intern_dates_y']


In [639]:
manual_app5 = manual_app4a.drop(cols_to_drop, axis=1)
# sorted(manual_app5.columns)

In [640]:
# check columns of all matches match
# set(full_matches.columns).difference(set(last_matches2.columns))
set(full_matches.columns).difference(set(manual_app5.columns))

{'citizenship_ap',
 'citizenship_at',
 'clean_medical_school_ap',
 'clean_medical_school_at',
 'intern_dates_ap',
 'intern_dates_at',
 'internship_hospital_ap',
 'internship_hospital_at',
 'match_score',
 'med_school_sim',
 'medical_school_ap',
 'medical_school_at',
 'res_dates_ap',
 'res_dates_at',
 'residency_ap',
 'residency_at',
 'residency_hospital_ap',
 'residency_hospital_at'}

In [641]:
# before adding manual matches, must reconcile columns
full_matches2 = pd.concat([full_matches, last_matches2], axis=0)
print full_matches2.shape

(2652, 106)


In [642]:
cols_to_reconcile = [c.split(APPLICANT_SUFFIX)[0] for c in full_matches2.columns if c.endswith(APPLICANT_SUFFIX)]
print cols_to_reconcile

for c in cols_to_reconcile:
    full_matches2.loc[:, c] = full_matches2.apply(
        funcy.rpartial(reconcile_app_attendee_col, c, [ATTENDEE_SUFFIX, APPLICANT_SUFFIX]), axis=1)

cols_to_drop = funcy.flatten(('{}{}'.format(c, APPLICANT_SUFFIX), '{}{}'.format(c, ATTENDEE_SUFFIX)) for c in cols_to_reconcile)
print cols_to_drop

['citizenship', 'clean_medical_school', 'intern_dates', 'internship_hospital', 'last_name_counts', 'medical_school', 'res_dates', 'residency', 'residency_hospital']
['citizenship_ap', 'citizenship_at', 'clean_medical_school_ap', 'clean_medical_school_at', 'intern_dates_ap', 'intern_dates_at', 'internship_hospital_ap', 'internship_hospital_at', 'last_name_counts_ap', 'last_name_counts_at', 'medical_school_ap', 'medical_school_at', 'res_dates_ap', 'res_dates_at', 'residency_ap', 'residency_at', 'residency_hospital_ap', 'residency_hospital_at']


In [643]:
full_matches3 = full_matches2.drop(cols_to_drop, axis=1)

In [644]:
# now append the manual and programmatic matches
full_matches4 = pd.concat([full_matches3, manual_app5], axis=0).sort_values(['clean_last_name', 'application_year'])
print full_matches3.shape
print manual_app5.shape
print full_matches4.shape

(2652, 97)
(140, 96)
(2792, 99)


In [645]:
print full_matches4.clean_last_name.dropna().shape
print full_matches4.clean_first_name.dropna().shape

(2792,)
(2792,)


In [646]:
full_matches4.loc[pd.isnull(full_matches4.clean_last_name), [PERSON_ID, NIH_ID]]

Unnamed: 0,person_uuid,dno


In [647]:
# full_matches4.loc[full_matches4.duplicated(['clean_last_name', 'clean_first_name'], keep=False),NAME_COLS] 

# sorted(full_matches4.columns)

In [648]:
# there are about 127 entries that have duplicates in data set because they applied 2x
# drop them
full_matches5 = full_matches4.drop_duplicates(['clean_last_name', 'dno'])
print full_matches5.shape

(2746, 99)


In [649]:
# there are about 50 entries that have duplicates in data set because they appear 2x in NIH data set
# drop these
full_matches6 = full_matches5.drop_duplicates(['clean_last_name', 'clean_first_name', 'clean_middle_name'])
print full_matches6.shape

(2742, 99)


In [650]:
# there are about 20 entries that have duplicates in data set because they appear 2x in NIH data set with diff dno
# there also don't have a middle name so don't show up in data set
full_matches7 = full_matches6.drop_duplicates(['clean_last_name', 'clean_first_name', 'eod_year'])
print full_matches7.shape

(2738, 99)


In [651]:
# there are about 127 rows where applicant date and person are the same, drop these
full_matches_dups = full_matches7.loc[
    full_matches7.duplicated(['clean_last_name', 'clean_medical_school', 'application_year'], keep=False), ['dno', 'application_year', 'application_date_2', 'eod_year', 'rejection_date', 'clean_first_name', 'clean_last_name']]
print full_matches_dups.shape

(0, 7)


In [652]:
full_matches7.shape

(2738, 99)

In [653]:
full_matches8 = full_matches7.set_index(PERSON_ID, drop=False).sort_values(['clean_last_name', 'application_year'])

In [654]:
full_matches8.to_pickle(os.path.join(ATT_DATA_DIR, 'full_matches.p'))
full_matches8.to_csv(os.path.join(ATT_DATA_DIR, 'full_matches.csv'))

In [655]:
apps5 = apps4.set_index(PERSON_ID, drop=False)

In [656]:
# now, drop instances where person uuid is missing
missing_uuid = apps5.loc[pd.isnull(apps5[PERSON_ID]), :]

In [657]:
missing_uuid.sort_values('clean_last_name').to_csv(os.path.join(APP_DATA_DIR, 'missing_uuid_for_sanity_check.csv'))

In [658]:
apps6 = apps5.loc[~pd.isnull(apps5[PERSON_ID]), :]
print apps6.shape
print apps5.shape

(4218, 77)
(4218, 77)


In [659]:
multiple_apps = apps6.loc[~pd.isnull(apps6.application_date_2), :].sort_values([PERSON_ID, 'application_date_2'])
print multiple_apps.shape

(221, 77)


In [660]:
multiple_apps.loc[:, 'application_year_2'] = multiple_apps.application_date_2.apply(lambda x: x.year)

In [661]:
# then, for the people who are in the data set twice, drop rows where application year == application date
apps7 = pd.concat([apps6.loc[pd.isnull(apps6.application_date_2), :], multiple_apps.loc[
            multiple_apps.application_year != multiple_apps.application_year_2, :]])

In [662]:
# there are 2 people in here who applied 3x so still duplicated
# drop them (doesn't matter which row to drop)
apps8 = apps7.sort_values(['clean_last_name', 'application_year']).drop_duplicates(PERSON_ID)
print apps7.shape
print apps8.shape

(4108, 78)
(4106, 78)


In [663]:
apps7.loc[apps7.duplicated(PERSON_ID, keep=False), ['clean_last_name', 'clean_medical_school', 'application_date', 'application_year', 'application_date_2', 'application_date_3']]

Unnamed: 0_level_0,clean_last_name,clean_medical_school,application_date,application_year,application_date_2,application_date_3
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
61fc6b4c-1364-46b9-97ce-3d37c5751bf0,BORNSTEIN,NYU,1961-05-08,1961,1960-06-29,1959-06-18
61fc6b4c-1364-46b9-97ce-3d37c5751bf0,BORNSTEIN,NYU,1959-06-18,1959,1960-06-29,1959-06-18
da65a59f-f3e0-43da-8ee3-0d2bd177bc12,DEPPERMAN,PENNSYLVANIA,1969-03-20,1969,1970-02-10,1969-03-20
da65a59f-f3e0-43da-8ee3-0d2bd177bc12,DEPPERMAN,PENNSYLVANIA,1971-03-26,1971,1970-02-10,1969-03-20


In [664]:
apps8.loc[apps8.clean_last_name=='BORNSTEIN', :]

Unnamed: 0_level_0,address,age,application_date,application_date_2,application_date_3,application_year,application_year_2,associate_program_entered,bob,ca,...,residency_year(s),reviewer,sa,state,teaching,undergrad_year_grad,undergraduate_school,withdrawal,year_accepted,zip_code
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
61fc6b4c-1364-46b9-97ce-3d37c5751bf0,74 Asylum Street,,1959-06-18,1960-06-29,1959-06-18,1959,1960.0,,,,...,,1,,Connecticut,,,,0.0,,


In [665]:
# in full matches, there are 2 people who show, drop them
# full_matches8.loc[full_matches8.duplicated(PERSON_ID, keep=False), :]
full_matches9 = full_matches8.sort_values([PERSON_ID] + NAME_COLS).drop_duplicates(PERSON_ID)

In [666]:
full_matches8.loc[full_matches8[PERSON_ID]=='40b80987-33e2-4278-b660-4315a4a851f7', ['dno', 'application_date']]

Unnamed: 0_level_0,dno,application_date
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
40b80987-33e2-4278-b660-4315a4a851f7,4068.0,1965-05-11
40b80987-33e2-4278-b660-4315a4a851f7,4076.0,1965-05-11


In [667]:
wide_apps = full_matches9.combine_first(apps8)
print wide_apps.shape
print full_matches9.shape
print apps8.shape

(4107, 100)
(2736, 99)
(4106, 78)


In [668]:
wide_apps.loc[~wide_apps[PERSON_ID].isin(apps8[PERSON_ID]), NAME_COLS]

Unnamed: 0_level_0,clean_middle_name,clean_last_name,clean_first_name
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,1,1


In [536]:
wide_apps.loc[wide_apps.duplicated(PERSON_ID, keep=False), :]

Unnamed: 0_level_0,Unnamed: 0,address,age,application_date,application_date_2,application_date_3,application_year,application_year_2,associate_program_entered,bob,...,suffix,supervisor,teaching,undergrad_year_grad,undergraduate_school,unknown,withdrawal,year_accepted,year_grad,zip_code
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [669]:
# sorted(wide_apps.columns)

In [670]:
wide_apps2 = wide_apps.drop(['Unnamed: 0', 'firstname', 'lastname', 'middlename',  'med_school_sim', 'match_score', 
                'unknown', 'internship_year(s)', 'residency_year(s)', 'first_name', 'last_name', 'middle_name', 
                        'undergraduate_school', 'year_grad' ], axis=1)

In [671]:
wide_apps2.rename(columns={'clean_college_trans': 'clean_college'}, inplace=True)

In [672]:
# consolidate dob and date_of_birth, undergraduate_school and clean_college, intern_dates and internship_year(s)
# res_dates and residency_year(s)
# year_grad

In [673]:
# replace date of birth with dob whenever date of birth missing and dob is not
wide_apps2.loc[
    (pd.isnull(wide_apps2['date_of_birth'])) & (~pd.isnull(wide_apps2['dob'])), 'date_of_birth'] = wide_apps2.loc[
        (pd.isnull(wide_apps2['date_of_birth'])) & (~pd.isnull(wide_apps2['dob'])), 'dob']


In [674]:
c1 = 'date_of_birth'
c2 = 'dob'
wide_apps2.loc[(pd.isnull(wide_apps2[c1])) & (~pd.isnull(wide_apps2[c2])), [c1, c2]]

Unnamed: 0_level_0,date_of_birth,dob
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1


In [675]:
# delete them
wide_apps3 = wide_apps2.drop(['dob'], axis=1)

In [676]:
wide_apps3.rename(columns={'res_dates': 'residency_dates', 'intern_dates': 'internship_dates'}, inplace=True)

In [677]:
IMPORTANT_COLS = [NIH_ID, PERSON_ID, 'application_year', 'eod_year', 'application_date', 'clean_first_name', 'clean_middle_name', 
                 'clean_last_name', 'year_accepted', 'rejected', 'rejection_date', 'clean_college', 'clean_medical_school',
                'residency_dates', 'internship_dates', 'application_date_2', 'application_year_2', 'application_date_3']

In [678]:
other_cols = sorted([i for i in wide_apps3.columns if i not in IMPORTANT_COLS])

In [679]:
# order columns so important ones are 
wide_apps4 = wide_apps3[IMPORTANT_COLS+other_cols].sort_values(['clean_last_name', 'application_year'])

In [680]:
wide_apps5 = wide_apps4.dropna(subset=[PERSON_ID], axis=0).sort_values(['clean_last_name', 'clean_first_name'])

In [681]:
wide_apps5.to_pickle(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info.p'))
wide_apps5.to_csv(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info.csv'), index=False)

In [682]:
wide_apps_v = wide_apps5.loc[(wide_apps5.application_year>1960) & (wide_apps5.application_year<1976), :].sort_values(
    ['clean_last_name', 'application_date'])

In [683]:
wide_apps_v.to_pickle(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info_vietnam.p'))

wide_apps_v.to_csv(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info_vietnam.csv'))

In [684]:
wide_apps_v.loc[wide_apps_v.eod_year> 1980, NAME_COLS+['application_year', 'eod_year', 'clean_medical_school', 'residency_dates']]

Unnamed: 0_level_0,clean_middle_name,clean_last_name,clean_first_name,application_year,eod_year,clean_medical_school,residency_dates
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [564]:
NIH.loc[NIH.clean_last_name=='WILLIAMSON', NAME_COLS+['clean_medical_school', 'res_dates']]

Unnamed: 0,clean_middle_name,clean_last_name,clean_first_name,clean_medical_school,res_dates
3929,RICHARD,WILLIAMSON,PETER,BOSTON,1988-1989


In [561]:
apps6.loc[apps6.clean_last_name=='WILLIAMSON', :]

Unnamed: 0_level_0,person_app_uuid,address,age,application_date,associate_program_entered,bob,ca,cc,cord,citizenship,...,application_year,person_uuid,not_matched,flag_rejected,application_date_2,application_date_3,res_dates,intern_dates,clean_medical_school,last_name_counts
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f9292a55-0ce4-44a0-a8f4-68731d499b3a,926,3503 Herb Lane,,1970-03-08,,0.0,1.0,1.0,,,...,1970,f9292a55-0ce4-44a0-a8f4-68731d499b3a,,0,NaT,NaT,1970-1971,1969-1970,VANDERBILT,2


In [686]:
wide_apps_v.eod_year.value_counts()

1973.0    217
1972.0    211
1970.0    204
1968.0    199
1971.0    198
1969.0    190
1967.0    188
1974.0    184
1966.0    174
1965.0    149
1975.0    133
1964.0    123
1963.0    118
1976.0     97
1977.0     57
1962.0     40
1961.0     17
1978.0     13
1979.0      4
1960.0      2
Name: eod_year, dtype: int64

In [691]:
# note that this really overestimates people who are not matches
# some of people are not matched it is because their dno is a duplicate of someone already matches
not_matched_apps = apps6.loc[~apps6[PERSON_ID].isin(wide_apps5[PERSON_ID]), :]
not_matched_attendees = NIH.loc[~NIH[NIH_ID].isin(wide_apps5[NIH_ID]), :]


not_matched_attendees.shape

(290, 30)

In [697]:
# check for people with a big discrepancy between eod date and application_year
eod_year_diff = wide_apps_v.loc[
    (~pd.isnull(wide_apps_v.dno)&(wide_apps_v.eod_year-wide_apps.application_year>3)), IMPORTANT_COLS]
eod_year_diff.to_csv(os.path.join(APP_DATA_DIR, 'matches_eod_year_diff_gt3.csv'), index=False)

In [700]:
year_counts_df = pd.concat([wide_apps_v.application_year.value_counts(), 
           wide_apps_v.loc[~pd.isnull(wide_apps.dno), 'application_year'].value_counts(), 
           wide_apps_v.loc[pd.isnull(wide_apps.dno), 'application_year'].value_counts(), 
          ], axis=1)

In [701]:
year_counts_df.columns = ['total', 'matched', 'not_matched']

In [702]:
year_counts_df.to_csv(os.path.join(ATT_DATA_DIR, 'data_counts.csv'))

In [692]:
not_matched_attendees_vietnam = not_matched_attendees.loc[(not_matched_attendees.eod_year<1976) & (not_matched_attendees.eod_year>1960), :].sort_values('clean_last_name')

In [693]:
not_matched_attendees_vietnam.shape

(51, 30)

In [695]:
not_matched_attendees_vietnam.sort_values(['clean_last_name', 'clean_first_name']).to_csv(
    os.path.join(ATT_DATA_DIR, 'not_matched_attendees.csv'), index=False)

In [694]:
not_matched_attendees_vietnam.loc[:, NAME_COLS+MED_TRAINING_COLS]

Unnamed: 0,clean_middle_name,clean_last_name,clean_first_name,res_dates,intern_dates,residency_hospital,internship_hospital,medical_school,residency
28,PHILLIP,ADLER,STUART,,1971-1972,,JOHNS HOPKINS HOSPITAL,JOHNS HOPKINS UNIVERSITY SCHOOL OF MEDICINE,
88,DOUGLAS,ANDREWS,ALAN,,,,,UNIVERSITY OF VIRGINIA,DERMATOLOGY
239,CLAUDE,BENNETT,J,1959-1960,1958-1959,UNIVERSITY HOSPITAL BIRMINGHAM ALABAMA,UNIVERSITY HOSPITAL BIRMINGHAM ALABAMA,HARVARD,MEDICINE
248,J,BENTXEL,CARL,1959-1961,1958-1959,PRESBYTERIAN N Y,PRESBYTERIAN N Y,MEDICAL COLLEGE OF ALABAMA,
333,KENNETH,BLAYLOCK,W,1960-1961,1958-1959,DUKE HOSPITAL,DUKE HOSPITAL,MEDICAL COLLEGE OF VA,MEDICINE
456,W,BROWN,JOHN,,,,UNIVERSITY OF MICHIGAN AFFILIATED,INDIANA UNIVERSITY,
459,C,BROWN,NEIL,,1961-1962,,DUKE,ALBANY,
642,JOHN,CLARK,ROBERT,1968-1969,1967-1968,BOSTON CITY HARVARD IV,BOSTON CITY HARVARD IV,WASHINGTON UNIVERSITY,MEDICINE
643,DOUGLAS,CLARK,W,1959-1961,,UNIVERSITY OF VIRGINIA,,,SURGERY
663,LEROY,COHEN,KENNETH,1972-1973,1971-1972,YALE NEW HAVEN MEDICAL CENTER,YALE NEW HAVEN MEDICAL CENTER,NEW YORK UNIVERSITY SCHOOL OF MEDICINE,ST MEDICINE


In [None]:
not_matched_attendees.eod_year.value_counts().to_csv(os.path.join(ATT_DATA_DIR, 'not_matched_attendees.csv'))

In [None]:
first_last_matches.loc[first_last_matches['clean_last_name']=='LARSON', ['medical_school_x', 'medical_school_y']]