In [None]:
# Install, then load, the `autotime` IPython extension from the cpcloud/ipython-autotime repo.
# NOTE(review): `%install_ext` was deprecated in IPython 4.0 and removed in later releases --
# confirm this cell still runs on the kernel in use.
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime

In [None]:
# read in known applicant files, dedupe and try to merge with applicants file
from collections import Counter
import difflib
import uuid
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings, is_foreign_med_school, clean_med_school)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, ATT_DATA_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES, PICKLE_DIR)
from merging_functions import *

# Toggle for writing the review CSV to disk; it guards the `to_csv` call further down.
OUTPUT_CSV = False 

# Personal-information column names.
# NOTE(review): not referenced in the visible part of this notebook -- confirm it is still needed.
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']


In [None]:
# NOTE: RAW_CARD_ID, RAW_INDEX_IDS, PERSON_ID, PERSON_APPLICATION_ID and NIH_ID are also imported
# from `dev` in the import cell; the assignments below rebind them for this notebook.

# id column that links back to raw applicant data file
RAW_CARD_ID = 'raw_uuid'

# column where the raw id information is stored
RAW_INDEX_IDS = 'raw_card_ids'

# try to get one id per unique applicant in the dataset
PERSON_ID = 'person_uuid'
# id per deduped application-person - if someone applied multiple times, they will have multiple ids
PERSON_APPLICATION_ID = 'person_app_uuid' 
# id column of the NIH attendee file
NIH_ID = 'dno'

# NOTE(review): these two suffixes are not referenced in the visible part of this notebook.
APPLICANT_SUFFIX = '_ap'
ATTENDEE_SUFFIX = '_at'

# Re-import edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# NOTE(review): reloading the extension right after loading it looks redundant -- confirm.
%reload_ext autoreload

In [None]:
# Input file names; the commented-out lines are alternative inputs.
# Applicant file, read from APP_DATA_DIR.
apps_filename = 'index_cards_deduped_fuzzy.csv'
# apps_filename = 'person_application_date_wide.csv'

# NIH attendee file, read from ATT_DATA_DIR.
NIH_filename = 'unique_attendees.csv'
# NIH_filename = 'NIH_attendee_deduped_raw.csv'

In [None]:
# Name columns used for selecting, sorting and de-duplicating throughout the notebook.
# NOTE: this rebinds the NAME_COLS imported from `dev` in the import cell.
NAME_COLS = ['clean_first_name', 'clean_middle_name', 'clean_last_name'] 

# Medical-training column names.
# NOTE(review): not referenced in the visible part of this notebook.
MED_TRAINING_COLS = ['res_dates', 'intern_dates', 'residency_hospital', 'internship_hospital', 'medical_school', 'residency']


In [None]:
# import NIH raw data set
# Only the first row for each `dno` value is kept.
NIH_raw = pd.read_csv(os.path.join(ATT_DATA_DIR, NIH_filename)).drop_duplicates('dno')

In [None]:
# drop from the data set all people with eod years > 1980
# NIH = NIH_raw.loc[NIH_raw.eod_year<1980, :] 
# Build NIH as a new frame without the raw `medical_school` column. Deleting the column
# through an alias (`NIH = NIH_raw; del NIH[...]`) also changed NIH_raw and raised a
# KeyError whenever this cell was run a second time.
NIH = NIH_raw.drop('medical_school', axis=1)

In [None]:
# Upper-case and strip the raw `med_school` text, then pass it through clean_med_school.
NIH['clean_medical_school'] = NIH.med_school.str.upper().str.strip().apply(clean_med_school)

In [None]:
# import cleaned, deduped applicant data set in wide form (multiple app dates as columns)
apps = pd.read_csv(os.path.join(APP_DATA_DIR, apps_filename))
# Same normalisation as on the NIH side: upper-case, strip, then clean_med_school.
apps['medical_school'] = apps.original_medical_school.str.upper().str.strip().apply(clean_med_school)

In [None]:
# Some name fields hold a school name; replace it with the intended given name.
apps.loc[apps.clean_first_name=='HOWARD UNIVERSITY COLLEGE OF MEDICINE', 'clean_first_name'] = 'HOWARD'
apps.loc[apps.clean_middle_name=='HOWARD UNIVERSITY COLLEGE OF MEDICINE', 'clean_middle_name'] = 'HOWARD'
apps.loc[apps.clean_first_name=='STANFORD UNIVERSITY', 'clean_first_name'] = 'STANFORD'

# Same fix on the NIH side. The HOWARD line previously had no assignment, so it only
# selected the rows and changed nothing.
NIH.loc[NIH.clean_first_name=='HOWARD UNIVERSITY COLLEGE OF MEDICINE', 'clean_first_name'] = 'HOWARD'
NIH.loc[NIH.clean_first_name=='STANFORD UNIVERSITY', 'clean_first_name'] = 'STANFORD'

# Collapse the MCCLURE spelling variants, then pin the application year for that last name.
apps.loc[apps.clean_last_name=='MCCLURE MCCHURE', ['clean_last_name', 'last_name']] = ['MCCLURE', 'MCCLURE']
apps.loc[apps.clean_last_name=='MCCHURE', ['clean_last_name', 'last_name']] = ['MCCLURE', 'MCCLURE']
apps.loc[apps.clean_last_name=='MCCLURE', 'application_year'] = 1972

In [None]:
# Hand corrections keyed on exact values; they edit NIH and apps in place, so statement
# order matters.
NIH.loc[NIH.clean_last_name=='YARNELL', ['clean_first_name', 'clean_middle_name']] = ['PHILIP', 'R']
NIH.loc[NIH.clean_last_name=='YARNELL', ['NIH_first_name', 'NIH_middle_name']] = ['PHILIP', 'R']

# need to correct some misspelled first names
apps.loc[apps.clean_first_name=='WILEY', ['clean_first_name', 'first_name']] = ['WYLIE', 'WYLIE']
apps.loc[apps.clean_first_name=='ANCELO', ['clean_first_name', 'first_name']]= ['ANGELO', 'ANGELO']
apps.loc[apps.clean_first_name=='DOHN', ['clean_first_name', 'first_name']] = ['JOHN', 'JOHN']
apps.loc[apps.clean_first_name=='MERION', ['clean_first_name', 'first_name']] = ['MERTON', 'MERTON']
apps.loc[apps.clean_first_name=='NAHVM', ['clean_first_name', 'first_name']] = ['NAHUM', 'NAHUM']
# This one is keyed on the last name, so it applies to every PERPICH row.
apps.loc[apps.clean_last_name=='PERPICH', ['clean_first_name', 'first_name']] = ['JOSEPH', 'JOSEPH']

# replace MORTON's first name which is mistakenly John
# The clean value is rebuilt from the raw `first_name` (upper-cased and stripped).
# NOTE(review): the lambda fails if a MORTON row has a missing `first_name`.
apps.loc[
    apps.clean_last_name=='MORTON', 'clean_first_name'] = apps[
        apps.clean_last_name=='MORTON']['first_name'].apply(lambda x: x.upper().strip())

In [None]:
# Display the MORTON rows (inspection only).
apps.loc[apps.clean_last_name=='MORTON', NAME_COLS+['first_name']]

In [None]:
# Hard-coded application-year overrides, keyed on last name.
apps.loc[apps.clean_last_name=='COLLER', 'application_year'] = 1972

apps.loc[apps.clean_last_name=='PERPICH', 'application_year'] = 1967

In [None]:
# Derive `res_dates` / `intern_dates` by applying long_form_date to the raw year columns.
apps.loc[:, 'res_dates'] = apps['residency_year(s)'].apply(long_form_date)

apps.loc[:, 'intern_dates'] = apps['internship_year(s)'].apply(long_form_date)

In [None]:
# Rename the NIH training columns; `clean_medical_school` becomes `medical_school`, the
# column name the applicant frame uses.
NIH = NIH.rename(columns={'res_dtes':'res_dates', 'intern_dte':'intern_dates', 'intern_hos': 'internship_hospital', 
                         'res_hosp':'residency_hospital', 'clean_medical_school': 'medical_school'})
# sorted(NIH.columns)

In [None]:
# Rename two applicant columns; the result is a new frame (`apps2`), `apps` keeps its names.
apps2 = apps.rename(
    columns={'residency_type': 'residency', 'internship_hospital_1': 'internship_hospital'})

# Free-text medical columns that go through clean_names below.
string_med_cols = ['medical_school', 'residency_hospital', 'internship_hospital', 'residency']

# apply string cleaning function to each of the string medical info columns
apps2.loc[:, string_med_cols] = apps2[string_med_cols].applymap(clean_names)

In [None]:
# Two lists of equal length handed to correct_mispellings.
# Presumably element i of `to_remove` is replaced by element i of `to_replace` -- confirm in
# data_cleaning_functions.
# NOTE(review): 'RITCHARD' and 'UNGARO' appear at the same position in both lists (no change).
to_remove = ['TERRECE', 'FRED', 'LAURENCE',
             'CUONO', 'DEFRENZE', 'JEFFERY', 'FINKLEMAN', 'SHERRAD', 'ANSCHNETZ', 'MARC', 'JENSON', 'KASTI', 
            'ADELBERT', 'RITCHARD', 'MANSFORD', 'DEFRENZO', 'DROBIN', 'HAMES', 'KREUZ', 'JERROLD', 'MANEUSI',
            'UNGARO']
to_replace = ['TERRENCE', 'FREDERICK', 'LAWRENCE',
              'CUOMO', 'DEFRONZO', 'JEFFREY', 'FINKELMAN', 'SHERRARD', 'ANSCHUETZ', 'MARCUS', 'JENSEN', 'KASTL',
              'ALBERT', 'RITCHARD', 'MANIFORD', 'DEFRONZO', 'DROBIS', 'JAMES', 'KRUEZ', 'JERROD', 'MANCUSI',
              'UNGARO']

# Bind the two lists as the trailing arguments so the function can be applied value by value.
correct_name_mispellings_fnc = funcy.rpartial(correct_mispellings, to_remove, to_replace)

# The same corrections are applied to last names and to first names.
apps2.loc[:, 'clean_last_name'] = apps2.clean_last_name.apply(correct_name_mispellings_fnc)
apps2.loc[:, 'clean_first_name'] = apps2.clean_first_name.apply(correct_name_mispellings_fnc)

In [None]:
# function to go in and correct some of the name misspellings in both data sets
# MUTATING FUNCTION
def change_names(df, selection_type, selection_value, to_change_type, to_change_values):
    """Overwrite columns on every row where ``df[selection_type] == selection_value``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : DataFrame to edit.
    selection_type : name of the column used to pick rows.
    selection_value : value ``selection_type`` must equal for a row to be edited.
    to_change_type : sequence of column names to overwrite.
    to_change_values : sequence of new values, paired by position with ``to_change_type``
        (pairs are formed with ``zip``, so extra items on either side are ignored).

    For each column the function prints the column/value pair and the number of
    selected rows, followed by "WARNING" when more than one row is selected.
    """
    # Fix the row selection before writing anything. Re-evaluating the condition for
    # every column lost the rows as soon as the selection column itself was rewritten,
    # so the remaining columns of that call were silently left unchanged.
    row_mask = df[selection_type] == selection_value
    n_selected = int(row_mask.sum())
    for column, new_value in zip(to_change_type, to_change_values):
        # Single-argument print(...) gives the same output on Python 2 and Python 3.
        print('{} {}'.format(column, new_value))
        print(n_selected)
        if n_selected > 1:
            print("WARNING")
        df.loc[row_mask, column] = new_value

In [None]:

# Hand corrections on apps2, keyed on last name (in-place edits).
apps2.loc[(apps2.clean_last_name=='LIBOW') & (apps2.clean_middle_name=='S'), 'clean_first_name'] = 'LESLIE'

# Each call edits every row with the given last name; change_names prints "WARNING" when
# more than one row is selected.
change_names(
    apps2, 'clean_last_name', 'CHESEBRO', ['clean_first_name', 'clean_middle_name'], ['BRUCE', 'WILCOX'])
change_names(
    apps2, 'clean_last_name', 'GALANTER', ['clean_first_name', 'clean_middle_name'], ['MARC', 'I'])
change_names(
    apps2, 'clean_last_name', 'BEAN', ['clean_first_name', 'clean_middle_name', 'medical_school'], ['SIDNEY', 'CHARLES', 'WAKE_FOREST'])
change_names(
    apps2, 'clean_last_name', 'BOYD', ['clean_first_name', 'clean_middle_name'], ['MICHAEL', 'RAY'])
change_names(
    apps2, 'clean_last_name', 'CHAPMAN', 
    ['clean_first_name', 'clean_middle_name', 'medical_school'], ['STANLEY', 'WILLETS', 'ROCHESTER'])

In [None]:
# More per-person corrections; here the raw `first_name` / `middle_name` / `last_name`
# columns are updated together with their `clean_` counterparts.
change_names(apps2, 'clean_last_name', 'DANFORTH', ['clean_first_name', 'first_name'], ['DAVID', 'DAVID'])
change_names(
    apps2, 'clean_last_name', 'HUNT', [
        'clean_first_name', 'clean_middle_name', 'first_name', 'middle_name'], ['ROBERT', 'D', 'ROBERT', 'D'])
change_names(
    apps2, 'clean_last_name', 'KARK', [
        'clean_first_name', 'clean_middle_name', 'first_name', 'middle_name'], ['ROBERT', 'ADRIAN', 'ROBERT', 'ADRIAN'])
change_names(
    apps2, 'clean_last_name', 'KEBABIAN', [
        'clean_first_name', 'clean_middle_name','first_name', 'middle_name'], ['JOHN', 'WILLIS', 'JOHN', 'WILLIS'])
change_names(
    apps2, 'clean_last_name', 'KNOPF', [
        'clean_first_name', 'clean_middle_name', 'first_name', 'middle_name'], ['HARRY', 'LOUIS', 'HARRY', 'LOUIS'])
change_names(
    apps2, 'clean_last_name', 'KROLIKOWSKI', [
        'clean_first_name', 'clean_middle_name', 'first_name', 'middle_name'], ['FRANCIS', 'JOHN', 'FRANCIS', 'JOHN'])
change_names(
    apps2, 'clean_last_name', 'KASTL', [
        'clean_first_name', 'clean_middle_name', 'first_name', 'middle_name'], ['DAVID', 'GENE', 'DAVID', 'GENE'])
# NOTE(review): the next two calls rewrite the selection column itself first. Verify that
# `last_name` is updated as well -- it depends on whether change_names re-evaluates the
# selection for every column.
change_names(
    apps2, 'clean_last_name', 'KLAVEMAN', ['clean_last_name', 'last_name'], ['KLAEVEMAN', 'KLAEVEMAN'])
change_names(
    apps2, 'clean_last_name', 'MATHEW', ['clean_last_name', 'last_name'], ['MATTHEW', 'MATTHEW'])

# Direct `.loc` corrections; each writes the clean and the raw column with the same value.
apps2.loc[apps2.clean_last_name=='CHESEBRO', ['clean_first_name', 'first_name']] = ['BRUCE', 'BRUCE']
apps2.loc[apps2.clean_last_name=='CHESEBRO', ['clean_middle_name', 'middle_name']] = ['WILCOX', 'WILCOX']
# HEALY is edited only on rows with this exact medical school value.
apps2.loc[
    (apps2.clean_last_name=='HEALY') & (
        apps2.medical_school=='USC KECK SCHOOL OF MEDICINE'), ['clean_first_name', 'first_name']] = ['MARK', 'MARK']
apps2.loc[
    (apps2.clean_last_name=='HEALY') & (
        apps2.medical_school=='USC KECK SCHOOL OF MEDICINE'), ['clean_middle_name', 'middle_name']] = ['H', 'H']

apps2.loc[apps2.clean_last_name=='LENN', ['clean_first_name', 'first_name']] = ['NICHOLAS', 'NICHOLAS']
# BRESLOW: set first and middle name. The raw `middle_name` value was previously written
# as 'LESLE'; it now matches `clean_middle_name`, like every other paired correction here.
apps2.loc[
        apps2.clean_last_name=='BRESLOW', [
            'clean_first_name', 'first_name', 'clean_middle_name', 'middle_name']] = ['JAN', 'JAN', 'LESLIE', 'LESLIE']

# NADLER and ROSEN are filled in only on rows whose clean first name is missing.
apps2.loc[
        (apps2.clean_last_name=='NADLER') & (
            pd.isnull(apps2.clean_first_name)), [
                'clean_first_name', 'first_name', 'clean_middle_name', 'middle_name']] = ['LEE', 'LEE', 'MARSHALL', 'MARSHALL']
apps2.loc[
        (pd.isnull(apps2.clean_first_name)) & (
            apps2.clean_last_name=='ROSEN'), ['clean_first_name', 'first_name']] = ['HENRY', 'HENRY']

# NEELON and NICHOLAS are overwritten on every row with that last name.
apps2.loc[
        (apps2.clean_last_name=='NEELON'), [
            'first_name', 'clean_first_name', 'clean_middle_name', 'middle_name']] = ['FRANCIS', 'FRANCIS', 'ALBERT', 'ALBERT']

apps2.loc[(apps2.clean_last_name=='NICHOLAS') , ['first_name', 'clean_first_name']] = ['JOHN', 'JOHN']

In [None]:
# Order matters in the first two pairs: the second statement of each pair selects on the
# value the first statement has just written.
apps2.loc[(apps2.clean_last_name=='KEBIAN') , 'clean_last_name'] = 'KEBABIAN'
apps2.loc[(apps2.clean_last_name=='KEBABIAN') , 'last_name'] = 'KEBABIAN'
apps2.loc[((apps2.clean_last_name=='FENSTER')&(apps2.clean_first_name=='FREDERICK')) , 'clean_first_name'] = 'L'
apps2.loc[((apps2.clean_last_name=='FENSTER')&(apps2.clean_first_name=='L')) , 'first_name'] = 'L'
apps2.loc[
    ((apps2.clean_last_name=='ALFORD')&(
            apps2.clean_first_name=='ROBERT')) , ['middle_name', 'clean_middle_name']] = ['H', 'H']
apps2.loc[((apps2.clean_last_name=='KASHIMA')&(apps2.clean_first_name=='HASKINS')) , 'clean_middle_name'] = 'K'
apps2.loc[((apps2.clean_last_name=='KASHIMA')&(apps2.clean_first_name=='HASKINS')) , 'middle_name'] = 'K'
# Applies to every row with this middle-name spelling, whatever the last name.
apps2.loc[((apps2.clean_middle_name=='SAMMUEL')) , 'clean_middle_name'] = 'SAMUEL'
apps2.loc[
    ((apps2.clean_last_name=='MANEUSI UNQARO')) , ['last_name', 'clean_last_name']] = ['MANEUSI UNGARO', 'MANEUSI UNGARO']

In [None]:
# Hand corrections on the NIH side.
NIH.loc[
    (NIH.clean_last_name=='ALEXANDER') & (
        NIH.clean_first_name=='JOHN'), ['NIH_middle_name', 'clean_middle_name']] = ['CHARLES', 'CHARLES']

# Split a combined first name ('EDWARD ARTHUR') into first and middle name ...
NIH.loc[
    (NIH.clean_last_name=='LEBOWITZ') & (
        NIH.clean_first_name=='EDWARD ARTHUR'), [
            'clean_first_name', 'NIH_first_name', 'clean_middle_name', 'NIH_middle_name']] = ['EDWARD', 'EDWARD', 'ARTHUR', 'ARTHUR']
# ... then set the medical school on the rows that now read EDWARD LEBOWITZ.
NIH.loc[(
        NIH.clean_last_name=='LEBOWITZ') & (
            NIH.clean_first_name=='EDWARD'), 'medical_school'] = 'ALBERT EINSTEIN COLLEGE OF MEDICINE OF YESHIVA'

# Remove every NIH row whose clean first name is GERALDINE.
NIH = NIH.loc[NIH.clean_first_name!='GERALDINE']

In [None]:
# there are a bunch of duplicates in apps, where application year is the same, but first name is missing
# `keep=False` flags every member of a duplicate group, not only the repeats.
name_dups = apps2.loc[
    apps2.duplicated(
        ['clean_last_name', 'medical_school', 'application_year'], keep=False), NAME_COLS+[PERSON_ID, 'medical_school', 'application_year']]

In [None]:
# Ids of duplicate-group rows missing first name, middle name and medical school.
to_delete_ids = name_dups.loc[(pd.isnull(name_dups.clean_middle_name)) & (pd.isnull(name_dups.clean_first_name))
                             & pd.isnull(name_dups.medical_school), PERSON_ID]

In [None]:
# Remove those ids, then keep one row per (name columns, medical_school) combination; the
# descending sort decides which row of each group comes first and is kept.
apps4 = apps2.loc[~apps2[PERSON_ID].isin(to_delete_ids), :].sort_values(
    NAME_COLS+['medical_school'], ascending=False).drop_duplicates(NAME_COLS+['medical_school'])

In [None]:
# Shapes after (apps4) and before (apps2) de-duplication. Written as single-argument
# print(...) calls, which give the same output on Python 2 and Python 3.
print(apps4.shape)
print(apps2.shape)

In [None]:
# read in manual matches
# `index=False` is a writer option (to_csv / to_excel) and not a read_excel parameter, so it
# is no longer passed. The sheet's own `medical_school` column is kept under the name
# `dno_medical_school`.
man = pd.read_excel(os.path.join(CORRECTIONS_DIR, 'manual_dno_matches.xlsx')).rename(columns={'medical_school': 'dno_medical_school'})
# Standardised school name, for comparison against the applicant `medical_school`.
man['medical_school'] = man.dno_medical_school.apply(clean_med_school)

In [None]:
# Join applicants to the manual matches on last and middle name; one applicant can pair
# with several manual rows here.
t = pd.merge(left=apps4, right=man, on=['clean_last_name', 'clean_middle_name'], how='inner', suffixes=['_x', '_y'])
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(t.shape)

# Score each pair by comparing the two medical-school strings.
t['sim'] = t[['medical_school_x', 'medical_school_y']].apply(get_name_str_sim, axis=1)

# Take the first name from the manual-match side.
t['clean_first_name'] = t['clean_first_name_y']

# Keep, for each dno, the pair with the highest school similarity.
t_1 = t.sort_values(['dno', 'sim'], ascending=False).drop_duplicates(['dno'], keep='first')

# t[NAME_COLS+['dno', 'medical_school_x', 'medical_school_y', 'sim']]
# Sanity check: expected to be empty, because dno was de-duplicated on the line above.
t_1.loc[t_1.duplicated('dno', keep=False), NAME_COLS+['dno', 'medical_school_x', 'medical_school_y', 'sim']]

In [None]:
# Spot check (display only): HARVEY COHEN rows on the applicant side.
apps4.loc[(apps4.clean_last_name=='COHEN') & (apps4.clean_first_name=='HARVEY'), NAME_COLS+['medical_school']]

In [None]:
# Spot check (display only): COHEN pairs in the manual-match join.
t.loc[t.clean_last_name=='COHEN', NAME_COLS+['first_name', 'sim']]

In [None]:
# Manual matches whose dno found no applicant row in the join above.
um = man[~man.dno.isin(t_1.dno)]

um.shape

In [None]:
# Displays the whole unmatched frame.
um

In [None]:
# t2 = pd.merge(left=apps4, right=um, on=['clean_last_name', 'clean_first_name'], how='inner', suffixes=['_x', '_y'])
# print t2.shape
# print um.shape

# t2['sim'] = t2[['medical_school_x', 'medical_school_y']].apply(get_name_str_sim, axis=1)

# t2['clean_middle_name'] = t2['clean_middle_name_y']

# t2_1 = t2.sort_values(['dno', 'sim'], ascending=False).drop_duplicates(['dno'], keep='first')
# t2_1.shape

In [None]:
# Keep only the name columns plus the two ids from the manual-match pairs.
t3 = t_1[NAME_COLS+['dno', PERSON_ID]]
# t3 = pd.concat([t2_1[NAME_COLS+['dno', PERSON_ID]], t_1[NAME_COLS+['dno', PERSON_ID]]], axis=0)

In [None]:
# Attach the manually matched dno (and manual name columns, suffixed `_y`) to every
# applicant row; applicants without a manual match get nulls.
man_dno = pd.merge(left=apps4, right=t3, on=PERSON_ID, how='left', suffixes=['_x', '_y'])
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(man_dno.shape)
print(t3.shape)
print(man.shape)

In [None]:
# Give the applicant-side name columns their plain names back, then let the manual-match
# value (`<column>_y`) win wherever one is present.
man_dno = man_dno.rename(columns={
    'clean_last_name_x': 'clean_last_name',
    'clean_first_name_x': 'clean_first_name',
    'clean_middle_name_x': 'clean_middle_name'})
for name_col in ['clean_last_name', 'clean_first_name', 'clean_middle_name']:
    manual_col = '{}_y'.format(name_col)
    mask = ~pd.isnull(man_dno[manual_col])
    man_dno.loc[mask, name_col] = man_dno.loc[mask, manual_col]

In [None]:
# apps5: applicants without a manual dno; the manual-match helper columns are dropped.
apps5 = man_dno.loc[pd.isnull(man_dno['dno']), :].drop(['dno', 'clean_middle_name_y', 'clean_first_name_y', 'clean_last_name_y'], axis=1) 
# has_dno: names and ids of the applicants that do have a manual dno.
has_dno = man_dno.loc[~pd.isnull(man_dno['dno']), NAME_COLS+['dno', 'person_uuid']] 
# apps5 = man_dno.drop(['dno', 'clean_middle_name_y', 'clean_first_name_y', 'clean_last_name_y'], axis=1) 

In [None]:
# mark females
# is_female is 1 when the clean first name is in FEMALE_FIRST_NAMES and 0 otherwise.
NIH['is_female'] = 0
female_mask = (NIH.clean_first_name.isin(FEMALE_FIRST_NAMES))  
NIH.loc[female_mask, 'is_female'] = 1


# Same flag on the applicant side (`female_mask` is reused for the second frame).
apps5['is_female'] = 0
female_mask = (apps5.clean_first_name.isin(FEMALE_FIRST_NAMES))  
apps5.loc[female_mask, 'is_female'] = 1


In [None]:
# Spot checks. Only the last expression of a cell is displayed, so the ASHLEY lookup shows
# nothing as written.
NIH.loc[NIH.clean_first_name=='ASHLEY', NAME_COLS]
apps5.loc[apps5.clean_first_name=='JULES', NAME_COLS]

In [None]:
def get_first_letter(str_var):
    """Return the first character of ``str_var``.

    Null values and the empty string give ``np.nan`` instead.
    """
    is_blank = pd.isnull(str_var) or str_var == ''
    return np.nan if is_blank else str_var[0]

In [None]:
# First letter of the first and middle name (NaN where the name is missing or empty).
NIH['clean_first_initial'] = NIH.clean_first_name.apply(get_first_letter)
NIH['clean_middle_initial'] = NIH.clean_middle_name.apply(get_first_letter)

In [None]:
# Sizes of the two frames going into the fuzzy merge. Single-argument print(...) gives
# the same output on Python 2 and Python 3.
print(NIH.shape)
print(apps5.shape)

In [None]:
# After cleaning apps2 to match cleaning in Clean NIH Applicant notebook, we try to start merging
# Similarity columns used, in this order, as sort keys when choosing between candidate matches.
sims_cols = ['medical_school_sim', 'clean_middle_name_sim', 'clean_first_name_sim']

In [None]:
# Build one merge key per row from the three name parts (create_str_merge), then pair
# applicants (`_x`) with NIH rows (`_y`) through df_get_closest_matches on that key.
NIH['fuzzy_merge_col'] = NIH[
    ['clean_first_name', 'clean_middle_name', 'clean_last_name']].apply(create_str_merge, axis=1)
apps5['fuzzy_merge_col'] = apps5[
    ['clean_first_name', 'clean_middle_name', 'clean_last_name']].apply(create_str_merge, axis=1)
match1 = df_get_closest_matches(apps5, NIH, 'fuzzy_merge_col', suffixes=['_x', '_y']) 
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(match1.shape)

In [None]:
# add last name counter to each
# NOTE(review): the applicant counts come from apps4, while match1 was built from apps5 --
# confirm that is intended.
app_counter = Counter(apps4.clean_last_name.values)
NIH_counter = Counter(NIH.clean_last_name.values)
# A Counter returns 0 for a last name it has never seen.
match1['last_name_counts_x'] = match1.clean_last_name_x.apply(lambda x: app_counter[x])
match1['last_name_counts_y'] = match1.clean_last_name_y.apply(lambda x: NIH_counter[x])


In [None]:
def check_match(row):
    """Decide whether one candidate pair is a match (first pass, full-name candidates).

    ``row`` is one row of the candidate table: ``_x`` columns belong to the applicant
    side and ``_y`` columns to the NIH side, alongside the ``*_sim`` scores and the
    ``last_name_counts_*`` columns. Returns 1 for a match and 0 otherwise.
    The thresholds suggest similarity scores on a 0-100 scale -- TODO confirm.

    Rejections are checked first (gender flag, last name, year gap, school, first
    name); after that the first acceptance rule that applies wins.
    """
    # gap between application year and NIH eod year
    app_eod_year_diff = abs(row['application_year'] - row['eod_year'])
    if row['is_female_x'] != row['is_female_y']:
        return 0
    if row['clean_last_name_sim'] < 90 or app_eod_year_diff > 8:
        return 0
    if not pd.isnull(row['medical_school_sim']) and row['medical_school_sim'] < 90:
        # drop people with a known but low medical school similarity
        return 0
    if not pd.isnull(row['clean_first_name_sim']) and row['clean_first_name_sim'] < 80:
        return 0
    # the first and middle name seem to be mixed up in index card data set
    mixed_sim1 = get_name_str_sim(row[['clean_middle_name_x', 'clean_first_name_y']])
    mixed_sim2 = get_name_str_sim(row[['clean_middle_name_y', 'clean_first_name_x']])
    # Take the larger of the scores that are actually available. Built-in max() over a
    # NaN is order-dependent (max(nan, 95) is nan), which used to throw away a valid
    # second score and fall back to 0.
    known_mixed = [s for s in (mixed_sim1, mixed_sim2) if not pd.isnull(s)]
    mix_sim = max(known_mixed) if known_mixed else 0
    if (mix_sim > 90) and row['medical_school_sim'] > 90:
        return 1
    # last name occurs fewer than twice on both sides
    if row['last_name_counts_x'] < 2 and row['last_name_counts_y'] < 2:
        return 1
    if (mix_sim > 90) and pd.isnull(row['medical_school_sim']) and (app_eod_year_diff < 5):
        return 1
    # if matching application year and med schools match
    if (app_eod_year_diff < 5) and row['medical_school_sim'] > 80:
        return 1
    # first and middle names match or first
    if (app_eod_year_diff < 5) and row['clean_first_name_sim'] > 80:
        return 1
    return 0


# Column name -> function used to compare its `_x` and `_y` versions. check_match reads
# the results as `<column>_sim`.
feature_dict = {
    'clean_first_name': get_name_str_sim,
    'clean_middle_name': get_name_str_sim,
    'clean_last_name': get_name_str_sim,
    'medical_school': get_name_str_sim,
}

# Later cells read an `is_match` column from the result; presumably that is check_match
# applied to each row -- confirm in merging_functions.
match2 = add_similarity_features(match1, feature_dict, check_match, suffixes=['_x', '_y'])

In [None]:
# select out people who match and make sure each person id and dno only 1x in data set
# Rows without a raw card id are dropped as well.
match3 = match2[match2.is_match==1].sort_values(['clean_last_name_x']+sims_cols, ascending=False).dropna(
    axis=0, subset=[RAW_CARD_ID])
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(match3.shape)

In [None]:
def filter_one_match_per_group(df, dedupe_col, sim_cols):
    """Keep one matched row for each value of ``dedupe_col``.

    Parameters
    ----------
    df : candidate-match table; it must contain ``dedupe_col``, ``is_match`` and every
        column named in ``sim_cols``.
    dedupe_col : name of the id column that may occur only once in the result.
    sim_cols : similarity column names, most important first; they break ties between
        rows that share an id (the highest values win).

    Returns
    -------
    A new DataFrame holding the rows with ``is_match == 1``, sorted in descending order
    by ``[dedupe_col] + sim_cols`` and reduced to the first row of each id.

    Side effect: ``df`` gains (or has overwritten) a ``'<dedupe_col>_duplicate'`` column
    that is 1.0 where the row's id already appeared on an earlier row and 0.0 otherwise.
    Missing ids are never flagged. The returned frame carries the column too.
    """
    dup_flag = '{}_duplicate'.format(dedupe_col)
    ids = df[dedupe_col]
    # Vectorised version of "has this id been seen on an earlier row?". It replaces an
    # expanding-window apply that rescanned every earlier row for each row and could
    # only handle numeric ids. `notnull` keeps missing ids unflagged.
    df[dup_flag] = (ids.duplicated(keep='first') & ids.notnull()).astype(float)

    # for each id, choose the best match based on sim cols
    df_matches = df[df['is_match'] == 1].sort_values([dedupe_col] + sim_cols, ascending=False)
    return df_matches.drop_duplicates([dedupe_col], keep='first')


In [None]:
# One match per applicant card id.
match4 = filter_one_match_per_group(match3, 'raw_uuid', sims_cols)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(match4.shape)

In [None]:
# One match per NIH id.
match5 = filter_one_match_per_group(match4, NIH_ID, sims_cols)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(match5.shape)

In [None]:
# Spot check (display only): medical schools on both sides for BRADEN.
match5.loc[match5.clean_last_name_x=='BRADEN', ['medical_school_y', 'medical_school_x']]

In [None]:
# get nonmatched NIH people and not matched applicants 
# The ids already present in match5 are passed as `matched_ids`; presumably get_nonmatched
# returns the rows whose id is not among them -- confirm in merging_functions.
nm_apps = get_nonmatched(apps5, id_colname=RAW_CARD_ID, matched_ids=match5[RAW_CARD_ID].dropna().values)
nm_NIH = get_nonmatched(NIH, id_colname=NIH_ID, matched_ids=match5[NIH_ID].dropna().values)

In [None]:
# do another round of matching just on last name
nm_match1 = df_get_closest_matches(nm_apps, nm_NIH, 'clean_last_name', suffixes=['_x', '_y']) 
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(nm_match1.shape)


In [None]:
def check_match(row):
    """Decide whether one candidate pair is a match (second pass, last-name candidates).

    NOTE: this rebinds the name ``check_match`` defined for the first pass; after this
    cell has run, the earlier rules are only available by re-running the earlier cell.

    ``row`` is one row of the candidate table: ``_x`` columns belong to the applicant
    side and ``_y`` columns to the NIH side, alongside the ``*_sim`` scores. Returns 1
    for a match and 0 otherwise. The thresholds suggest similarity scores on a 0-100
    scale -- TODO confirm.
    """
    # gap between application year and NIH eod year
    app_eod_year_diff = abs(row['application_year'] - row['eod_year'])
    if app_eod_year_diff > 8:
        return 0
    if row['is_female_x'] != row['is_female_y']:
        return 0
    if not pd.isnull(row['medical_school_sim']) and row['medical_school_sim'] < 80:
        return 0
    # the first and middle name seem to be mixed up in index card data set
    mixed_sim1 = get_name_str_sim(row[['clean_middle_name_x', 'clean_first_name_y']])
    mixed_sim2 = get_name_str_sim(row[['clean_middle_name_y', 'clean_first_name_x']])

    # Require at least one strong signal among the scores that are available. Built-in
    # max() over a NaN is order-dependent (max(nan, 50) is nan), so missing scores are
    # removed before taking the maximum; with no score at all the check is skipped.
    known_sims = [
        s for s in (row['clean_first_name_sim'], row['medical_school_sim'], row['clean_middle_name_sim'])
        if not pd.isnull(s)]
    if known_sims and max(known_sims) < 80:
        return 0
    # Same NaN-safe maximum for the swapped first/middle comparison; 0 when neither exists.
    known_mixed = [s for s in (mixed_sim1, mixed_sim2) if not pd.isnull(s)]
    mix_sim = max(known_mixed) if known_mixed else 0
    if (mix_sim > 90) and row['medical_school_sim'] > 90:
        return 1
    if (mix_sim > 90) and pd.isnull(row['medical_school_sim']) and (app_eod_year_diff < 6):
        return 1
    if not pd.isnull(row['clean_first_name_sim']) and row['clean_first_name_sim'] < 70:
        return 0
    # if matching application year and med schools match
    if (app_eod_year_diff < 6) and row['medical_school_sim'] > 90:
        return 1
    # first names match
    if (app_eod_year_diff < 6) and row['clean_first_name_sim'] > 90:
        return 1
    # middle names match
    if (app_eod_year_diff < 6) and row['clean_middle_name_sim'] > 90:
        return 1
    return 0

In [None]:
# Similarity features for the second round; the last name is not included because the
# candidates were paired on it.
nm_feature_dict = {
    'clean_first_name': get_name_str_sim,
    'clean_middle_name': get_name_str_sim,
    'medical_school': get_name_str_sim,
}

# Uses the check_match defined most recently in the kernel (the second-round rules when
# the notebook is run top to bottom).
nm_match2 = add_similarity_features(nm_match1, nm_feature_dict, check_match, suffixes=['_x', '_y'])


In [None]:
# Keep the second-round matches, leaving out the rows whose index label is 'MORTON'; the
# index is then turned into the `clean_last_name` column.
nm_match3 = nm_match2.loc[(nm_match2.is_match==1) & (nm_match2.index!='MORTON'), :].reset_index(
    drop=False).rename(columns={'index': 'clean_last_name'})

In [None]:
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(nm_match3.shape)

In [None]:
# merge the manual people with dno with the dno data set and append to the matches list
# man_dno2: applicants that have a manual dno, without the manual-match helper columns.
man_dno2 = man_dno.loc[~pd.isnull(man_dno['dno']), :].drop(['clean_first_name_y', 'clean_middle_name_y',
                                                            'clean_last_name_y'], axis=1)
# Cast dno to int on both frames before they are joined on it.
man_dno2.loc[:, 'dno'] = man_dno2.dno.astype(int)
NIH.loc[:, 'dno'] = NIH.dno.astype(int)

In [None]:
# Spot check (display only): COHEN rows among the manual matches.
man_dno2.loc[man_dno2.clean_last_name=='COHEN', NAME_COLS+['first_name']]

In [None]:
# Join the manual matches to the NIH records on dno; the shapes before and after show how
# many manual rows found an NIH record. Single-argument print(...) gives the same output
# on Python 2 and Python 3.
print(man_dno2.shape)
man_dno_merge = pd.merge(left=man_dno2, right=NIH, on='dno', how='inner')
print(man_dno_merge.shape)

In [None]:
# stewart and sherwin have eod years outside range
# The mask is built from man_dno2 itself rather than from the larger man_dno frame, so the
# mask and the frame it filters share the same rows.
man_dno2.loc[~man_dno2.dno.isin(man_dno_merge.dno), NAME_COLS+['dno']]

In [None]:
# Park the applicant-side (`_x`) name columns under a `2` suffix; a later cell copies them
# over the consolidated name columns.
man_dno_merge2 = man_dno_merge.rename(columns={'clean_first_name_x': 'clean_first_name2', 
                                               'clean_middle_name_x': 'clean_middle_name2',
                             'clean_last_name_x': 'clean_last_name2'})


In [None]:
# Spot check (display only).
# NOTE(review): after the merge and rename above the name columns are `<name>2` and
# `<name>_y`, so the plain NAME_COLS requested here may not exist in this frame -- confirm.
man_dno_merge2.loc[man_dno_merge2.clean_last_name_y=='COHEN', NAME_COLS+['first_name']]

In [None]:
# Manual matches whose dno found no NIH record in the join.
fails = man_dno2.loc[~man_dno2.dno.isin(man_dno_merge.dno), :]
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(fails.shape)

In [None]:
# append matches together
# Stacks the second-round matches, the first-round matches and the manual matches.
match6 = pd.concat([nm_match3, match5, man_dno_merge2], axis=0)

In [None]:
# get nonmatched NIH people and not matched applicants 
# Same as after the first round, now with the ids from all three match sources.
nm_apps2 = get_nonmatched(apps5, id_colname=RAW_CARD_ID, matched_ids=match6[RAW_CARD_ID].dropna().values)
nm_NIH2 = get_nonmatched(NIH, id_colname=NIH_ID, matched_ids=match6[NIH_ID].dropna().values)

In [None]:
# Still-unmatched NIH people with an eod year from 1964 to 1975 (both bounds exclusive in
# the comparison), sorted by last name.
nm_NIH3 = nm_NIH2.loc[
    (nm_NIH2.eod_year< 1976) & (
        nm_NIH2.eod_year>1963), NAME_COLS+['medical_school', 'eod_year']].sort_values('clean_last_name')
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(nm_NIH3.shape)

In [None]:
# Pair the still-unmatched NIH people with unmatched applicants that share their last name.
apps_match = nm_apps2.loc[nm_apps2.clean_last_name.isin(nm_NIH3.clean_last_name.values)]
test_merge = pd.merge(left=nm_NIH3, right=apps_match, on='clean_last_name', how='inner').sort_values('clean_last_name')
# Put the columns in alphabetical order.
test_merge = test_merge[sorted(test_merge.columns)]
# The file is written only when OUTPUT_CSV is switched on.
if OUTPUT_CSV:
    test_merge.to_csv(os.path.join(CORRECTIONS_DIR, 'test_merge_missing_NIH.csv'), index=False)

In [None]:
# Fill a missing `clean_last_name` from the applicant-side `clean_last_name_x`.
match6.loc[pd.isnull(match6.clean_last_name), 'clean_last_name'] =  match6.loc[
    pd.isnull(match6.clean_last_name), 'clean_last_name_x']
# Drop the similarity/count features and other helper columns from the combined matches.
# NOTE(review): `drop` is called without `errors='ignore'`, so every label listed here must
# exist in match6.
to_drop = [c for c in match6.columns if c.endswith('_sim') or '_counts' in c]
match7= match6.drop(to_drop+[
        'dup_flag', 'eod_year_diff', 'fuzzy_merge_col_x', 'fuzzy_merge_col_y', 'unknown',
        'Unnamed: 0', 'raw_uuid_duplicate', 'dno_duplicate', 'count_missing',
                'clean_last_name_x', 'clean_last_name_y', 'is_match'], axis=1)

In [None]:
# consolidate columns in match7
# Presumably merges each `_x` / `_y` column pair into one column -- confirm in merging_functions.
match7a = consolidate_merge_cols(match7, ['_x', '_y'], [])

In [None]:
# Display only: the column names after consolidation.
sorted(match7a.columns)

In [None]:
# Display only: rows where the upper-cased NIH `med_school` differs from `medical_school`.
match7a.loc[match7a.med_school.str.upper()!=match7a.medical_school, NAME_COLS+['med_school', 'medical_school', 'original_medical_school']]

In [None]:
# Spot check (display only): MORTON rows in the original applicant frame.
apps.loc[apps.clean_last_name=='MORTON', NAME_COLS]

In [None]:
# Wherever a parked `<name>2` value exists (rows that came from the manual matches), copy
# it over the consolidated name column.
for name_col in ['clean_first_name', 'clean_last_name', 'clean_middle_name']:
    override_col = name_col + '2'
    mask = ~pd.isnull(match7a[override_col])
    match7a.loc[mask, name_col] = match7a.loc[mask, override_col]

In [None]:
# Stack three groups: applicants with a manual dno, all consolidated matches, and every
# apps4 row whose person id is not among the consolidated matches. The parked `<name>2`
# columns are dropped afterwards.
match8 = pd.concat([
        man_dno[~pd.isnull(man_dno['dno'])], match7a, apps4.loc[
            ~apps4[PERSON_ID].isin(match7a[PERSON_ID].values),:]], axis=0).drop(['clean_first_name2', 
                                                                                'clean_middle_name2', 
                                                                                'clean_last_name2'], axis=1)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(match8.shape)

In [None]:
# replace date of birth with dob whenever date of birth missing and dob is not
match8.loc[
    (pd.isnull(match8['date_of_birth'])) & (~pd.isnull(match8['dob'])), 'date_of_birth'] = match8.loc[
        (pd.isnull(match8['date_of_birth'])) & (~pd.isnull(match8['dob'])), 'dob']


# Check (display only): the same condition re-evaluated after the fill; it should select
# no rows.
c1 = 'date_of_birth'
c2 = 'dob'
match8.loc[(pd.isnull(match8[c1])) & (~pd.isnull(match8[c2])), [c1, c2]]

In [None]:
# Hard-coded fixes on the combined table: a dno for JAN KNOWLER and a mangled last name.
match8.loc[(match8.clean_last_name=='KNOWLER') & (match8.clean_first_name=='JAN'), 'dno'] = 1922
match8.loc[
    (match8.clean_last_name== 'E ROSS HARVARD') & (match8.clean_first_name=='MICHAEL'), 'clean_last_name'] = 'ROSS'


In [None]:
# Drop the now-redundant `dob`, then de-duplicate twice: first on names + dno, then on
# names + person id.
match9 = match8.drop(['dob'], axis=1).reset_index(drop=True).sort_values(
    NAME_COLS+['dno']).drop_duplicates(NAME_COLS+['dno']).drop_duplicates(NAME_COLS+[PERSON_ID])
# print sorted(match9.columns)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(match8.shape)
print(match9.shape)

In [None]:
# check for dno duplicates 
# Rows whose non-null dno is shared with at least one other row.
dups_dno = match9.loc[(~pd.isnull(match9.dno)) & (
        match9.duplicated('dno', keep=False)), NAME_COLS+['dno', PERSON_ID, 'medical_school']]

# Attach the NIH record for each of those dno values; applicant columns get `_x`, NIH
# columns `_y` (the pandas default suffixes).
dups_merge = pd.merge(
    left=dups_dno, right=NIH.loc[NIH.dno.isin(dups_dno.dno), NAME_COLS+['dno', 'medical_school']], on=['dno'], how='left')


In [None]:
def get_sim_score(row):
    """Average the ``get_name_str_sim`` scores over the name and medical-school columns.

    ``row`` must carry an ``_x`` and a ``_y`` version of every column in ``NAME_COLS``
    and of ``medical_school``. The result is ``np.mean`` over the per-column scores.
    """
    compared_cols = NAME_COLS + ['medical_school']
    scores = [
        get_name_str_sim(row[['{}_x'.format(field), '{}_y'.format(field)]])
        for field in compared_cols]
    return np.mean(scores)

In [None]:
# Overall similarity between each duplicated applicant row and its NIH record.
dups_merge['sim'] = dups_merge.apply(get_sim_score, axis=1)

In [None]:
# Best pair first within each dno. The original dno moves to `old_dno`, and `dno` restarts
# at 0, which serves as the "not kept" marker in the cells below.
dups_merge = dups_merge.sort_values(['dno', 'sim'], ascending=False).rename(columns={'dno':'old_dno'})
dups_merge['dno'] = 0 

In [None]:
# Person id of the first row of each old_dno group (the rows were sorted by similarity,
# highest first, before grouping).
best_match = dups_merge.groupby('old_dno').first().person_uuid.values
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(len(best_match))

In [None]:
# Give the dno back only to rows that score above 94 and whose person id is among the
# group winners; every other row keeps dno == 0.
mask = (dups_merge.sim > 94) & (dups_merge.person_uuid.isin(best_match))
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(sum(mask))
dups_merge.loc[mask, 'dno'] = dups_merge[mask]['old_dno']
# match9.loc[(~pd.isnull(match9.clean_first_name_y)), ['clean_first_name', 'clean_first_name_y']]

In [None]:
# dups_merge.loc[dups_merge.dno==dups_merge.old_dno]

In [None]:
# reset duplicates to null
# Clears the dno on match9 for every person whose row in dups_merge was left at dno == 0.
match9.loc[match9.person_uuid.isin(dups_merge.loc[dups_merge['dno']==0, PERSON_ID]), 'dno'] = np.nan

# NOTE(review): hard-coded person id; this looks like a leftover spot check, and only the
# cell's last expression is displayed.
match9.loc[match9.person_uuid==35, NAME_COLS+[PERSON_ID, 'dno']]

# Check (display only): rows that still share a non-null dno.
match9.loc[(match9.duplicated('dno', keep=False) & (~pd.isnull(match9.dno))), NAME_COLS+['dno', PERSON_ID]]

In [None]:
# need to check we haven't created any frankenstein matches
# i.e. people who shouldn't really match together
# Display the rows whose cleaned first name differs from the upper-cased raw first name,
# next to the raw applicant and NIH name columns.
match9.loc[(~pd.isnull(match9.first_name) & (
    match9.clean_first_name!=match9.first_name.str.upper())), NAME_COLS+['first_name', 'middle_name', 'last_name', 'med_school', 'original_medical_school',
                                                                        'NIH_first_name', 'NIH_middle_name', 'NIH_last_name']]

In [None]:
# Display only: summary statistics of the female flag.
match9.is_female.describe()

In [None]:
# add an international applicant flag
# NOTE(review): an earlier comment here also announced dropping people missing first name,
# middle name and med school; no such drop happens in this cell.
match9['is_foreign'] = 0
match9.loc[:, 'is_foreign'] = match9.medical_school.apply(is_foreign_med_school)

In [None]:
# Display only: summary statistics of the foreign flag.
match9.is_foreign.describe()

In [None]:
# control_flag = 1 for people with neither an NIH id (dno) nor an acceptance year.
match9['control_flag'] = 0
match9.loc[pd.isnull(match9.dno) & pd.isnull(match9.year_accepted), 'control_flag'] = 1

In [None]:
# Fill a missing application_year from application_year_1.
match9.loc[pd.isnull(match9.application_year), 'application_year'] = match9.loc[
    pd.isnull(match9.application_year), 'application_year_1']

In [None]:
def total_number_applications(app_years):
    """Return how many distinct non-null values ``app_years`` (a Series) contains."""
    return len(app_years.dropna().unique())

# Number of distinct application years per person, counted row by row across the four
# application-year columns.
match9['number_applications'] = match9[
    ['application_year', 'application_year_1', 'application_year_2', 'application_year_3']].apply(
        total_number_applications, axis=1)

In [None]:
def applications_max_min(app_years, fnc):
    """Reduce the distinct non-null application years with ``fnc``.

    app_years: pandas Series of application years (one row of the year
        columns when called through ``DataFrame.apply(..., axis=1)``).
    fnc: reducer applied to the array of unique non-null years, e.g. the
        builtin ``max`` or ``min``.
    Returns ``fnc(unique_years)``, or ``np.nan`` when every value is null.
    """
    unique_apps_years = app_years.dropna().unique()
    if unique_apps_years.shape[0] == 0:
        # Show the offending row so records without any application year can be
        # inspected. print(...) with a single argument produces the same output
        # on Python 2 and Python 3, unlike the print statement.
        print(app_years)
        return np.nan
    return fnc(unique_apps_years)

In [None]:
# latest application year per row (NaN when all four year columns are null)
app_year_cols = ['application_year', 'application_year_1', 'application_year_2', 'application_year_3']
match9['application_year_max'] = match9[app_year_cols].apply(
    lambda row_years: applications_max_min(row_years, max), axis=1)


In [None]:
# earliest application year per row (NaN when all four year columns are null)
app_year_cols = ['application_year', 'application_year_1', 'application_year_2', 'application_year_3']
match9['application_year_min'] = match9[app_year_cols].apply(
    lambda row_years: applications_max_min(row_years, min), axis=1)


In [None]:
# time_period_flag = 1 when the latest application year lies strictly between 1964 and 1976
in_time_window = (match9.application_year_max>1964) & (match9.application_year_max<1976)
match9['time_period_flag'] = 0
match9.loc[in_time_window, 'time_period_flag'] = 1

In [None]:
# match9A.loc[(match9A.clean_last_name=='MCCHURE') & (pd.isnull(match9A.clean_first_name))]]
# collapse the doubled last name into a single spelling
match9.loc[match9.clean_last_name=='MCCLURE MCCHURE', 'clean_last_name'] = 'MCCLURE'

# hard-code 1967 as the application year (and its min/max) for PERPICH
perpich_rows = match9.clean_last_name=='PERPICH'
for year_col in ['application_year', 'application_year_min', 'application_year_max']:
    match9.loc[perpich_rows, year_col] = 1967

# Remove hand-picked records one filter at a time.
# BUG FIX: the 'BRADEN R' filter used to be stored under the misspelled name
# `mathch9` and was never used, so those rows leaked into match9_1..match9_4.
# It is now the first link of the chain.
match9_0 = match9[match9.clean_last_name!='BRADEN R']
match9_1 = match9_0[~((match9_0.clean_last_name=='BULKEY') & (match9_0.clean_first_name=='GREGORY'))]
match9_2 = match9_1[~((match9_1.clean_last_name=='KNOWLER') & (match9_1.clean_first_name=='JAN'))]
match9_3 = match9_2[~((match9_2.clean_last_name=='COLLIN') & (match9_2.clean_first_name=='ROBERT'))]
# .copy() so that later `.loc[...] = ...` corrections on match9_4 write to an
# independent frame instead of raising SettingWithCopyWarning
match9_4 = match9_3[~((match9_3.clean_last_name=='BULLARD') & (match9_3.clean_first_name=='BRIAN'))].copy()
# CHESEBRE, COLDBERG, Robert Collin, DIEZMAN, GLASSROBTH, HUGH HAYWOOD, Bart Kentover, jan knowler, robert jeffery kramer
# SAIRAI, william sullivan

In [None]:
# fix some individual name fields
# WILLIAM WILLIAM SULLIVAN: blank out the middle name that repeats the first name
repeated_william = (
    (match9_4.clean_first_name=='WILLIAM')
    & (match9_4.clean_middle_name=='WILLIAM')
    & (match9_4.clean_last_name=='SULLIVAN'))
match9_4.loc[repeated_william, 'clean_middle_name'] = np.nan

# spelling corrections
jeffery_rows = match9_4.clean_middle_name=='JEFFERY'
match9_4.loc[jeffery_rows, 'clean_middle_name'] = 'JEFFREY'
glassrobth_rows = match9_4.clean_last_name=='GLASSROBTH'
match9_4.loc[glassrobth_rows, 'clean_last_name'] = 'GLASSROTH'

In [None]:
# name corrections
# The first eight rules used to appear twice in this cell (copy-paste). No rule's
# output matches another rule's condition, so the second pass changed nothing
# and has been removed.

# first names that were over-normalised: restore the spelling found in first_name
match9_4.loc[((match9_4.clean_first_name=='ALBERT') & (match9_4.first_name=='Adelbert')), 'clean_first_name'] = 'ADELBERT'
match9_4.loc[((match9_4.clean_first_name=='LAWRENCE') & (match9_4.first_name=='Laurence')), 'clean_first_name'] = 'LAURENCE'
match9_4.loc[((match9_4.clean_first_name=='PHILLIP') & (match9_4.first_name=='Philip')), 'clean_first_name'] = 'PHILIP'

# unconditional spelling fixes
match9_4.loc[((match9_4.clean_first_name=='GEORGES')), 'clean_first_name'] = 'GEORGE'
match9_4.loc[((match9_4.clean_first_name=='JACKS')), 'clean_first_name'] = 'JACK'
match9_4.loc[((match9_4.clean_first_name=='HAVERY')), 'clean_first_name'] = 'HARVEY'

# person-specific overrides
match9_4.loc[((match9_4.clean_first_name=='BENJAMIN') & (match9_4.clean_last_name=='CAHAN')), 'clean_first_name'] = 'LESLIE'
match9_4.loc[((match9_4.clean_first_name=='FREDERIC') & (match9_4.clean_last_name=='MUSHINSKI')), 'clean_middle_name'] = 'COSTEP'
match9_4.loc[((match9_4.clean_first_name=='STANLEY') & (match9_4.clean_last_name=='SHERWIN')), 'clean_first_name'] = 'ROBERT'


In [None]:
# Last names (original and corrected spellings) to spot-check after the manual fixes.
# NOTE: 'SARAL' is listed twice; this has no effect on isin().
test = ['PERPICH',  'BRADEN', 'BRADEN R', 'BULKEY', 'BULKLEY', 
       'CHESEBRE', 'COLDBERG', 'COLLIN', 'DIEZMAN', 'DIETZMAN', 'GLASSROBTH', 
        'GLASSROTH', 'HAYWARD', 'HAYWOOD', 'MCCLURE MCCHURE', 'MCCLURE', 'MCCHURE',
       'KETOVER', 'KENTOVER', 'KNOWLER', 'KRAMER', 'SARAI', 'SARAL', 'SARAL', 'SULLIVAN', 'COLLINS', 
       'KOEHLER']
# show the name columns and medical school for those last names, sorted by last name
match9_4.loc[match9_4.clean_last_name.isin(test), NAME_COLS+['medical_school']].sort_values('clean_last_name')

In [None]:
# delete people missing both first and middle names
print match9_4.shape
match9A = match9_4.loc[~((pd.isnull(match9_4.clean_first_name) & pd.isnull(match9_4.clean_middle_name))), :]
# drop the PERPICH record listed under the Minnesota Duluth medical school
match9A2 = match9A.loc[~(
        (match9A.medical_school=='UNIVERSITY OF MINNESOTA MEDICAL SCHOOL DULUTH') & (
            match9A.clean_last_name=='PERPICH')), :]
# drop MCCHURE rows that have no first name
match9A3 = match9A2.loc[~((pd.isnull(match9A2.clean_first_name) & (match9A2.clean_last_name=='MCCHURE'))), :]
# drop BROWER rows whose first and middle names are identical
match9A4 = match9A3.loc[~(
        (match9A3.clean_first_name==match9A3.clean_middle_name) & (match9A3.clean_last_name=='BROWER')), :]
# keep only rows that have an application_year_min
match9B = match9A4.loc[~pd.isnull(match9A4.application_year_min), :]
# shapes after the first filter and after the whole chain
print match9A.shape
print match9B.shape

In [None]:
# rows of match9A2 without an application_year_min
# NOTE: not the last expression of the cell, so this result is not displayed
match9A2.loc[pd.isnull(match9A2.application_year_min), NAME_COLS+['application_year_min', 'medical_school', 'application_year']]
# PERPICH rows that remain in match9B
match9B.loc[match9B.clean_last_name=='PERPICH', NAME_COLS+['application_year_min', 'medical_school', 'application_year']]

In [None]:
# rows for which has_suffix() is truthy on clean_first_name
mask = match9B.clean_first_name.apply(has_suffix)
# store get_suffix() of the first name in clean_suffix for those rows
match9B.loc[mask, 'clean_suffix'] = match9B.loc[mask, 'clean_first_name'].apply(get_suffix)

# NOTE(review): every masked row gets the first name 'SPENCER'; this is only correct
# if the mask selects that one person -- verify before re-running on new data.
match9B.loc[mask, 'clean_first_name'] = 'SPENCER'

In [None]:
# check for person id duplicates: keep=False marks every row of a duplicated id
dups_mask = match9B.duplicated(PERSON_ID, keep=False)
match9B.loc[dups_mask, NAME_COLS+['medical_school', PERSON_ID, 'address']]

In [None]:
import hashlib
from random import randint
# largest person_uuid currently in match9B (display only)
max(match9B.person_uuid.values)

In [None]:
# Largest existing person id. Series.max() skips NaN, whereas the builtin max()
# over .values gives an order-dependent result when NaN ids are present.
max_id = match9B.person_uuid.max()
print(max_id)

# every row whose person id occurs more than once (keep=False marks all copies)
dups_mask = match9B.duplicated(PERSON_ID, keep=False)
# .loc replaces the deprecated .ix; .copy() so that adding a column below
# writes to an independent frame
dups = match9B.loc[dups_mask, NAME_COLS+[PERSON_ID, 'medical_school', 'address']].copy()

# Give each duplicated row its own new id. Consecutive ids above max_id are
# guaranteed unique and reproducible; the previous max_id + randint(10, 1000)
# could hand the same id to two rows and changed on every run.
dups['new_id'] = max_id + 1 + np.arange(len(dups))

In [None]:
# inspect the NIH record(s) with dno 3482
NIH.loc[NIH.dno==3482, NAME_COLS+['NIH_first_name']]

In [None]:
# rows sharing a person id, shown with their dno and the applicant/NIH name fields
match9B.loc[match9B.duplicated(PERSON_ID, keep=False), NAME_COLS+[PERSON_ID, 'dno', 'first_name', 'NIH_first_name', 'NIH_last_name']]


In [None]:
# rows that have a dno but no NIH_first_name
missing_NIH_names_mask = (pd.isnull(match9B.NIH_first_name) & (~pd.isnull(match9B.dno)))
# Fill NIH_first_name from the first NIH row with the same dno.
# NOTE(review): index[0] raises IndexError when a dno has no row in NIH, and the
# lookup rescans NIH once per masked row -- confirm every dno is present in NIH.
match9B.loc[missing_NIH_names_mask, 'NIH_first_name'] = match9B[missing_NIH_names_mask]['dno'].apply(
    lambda x: NIH.get_value(NIH.loc[NIH.dno==x].index[0], 'NIH_first_name'))

In [None]:
# left-join dups (which carries the new_id column) onto match9B;
# rows without a match in dups get a null new_id
match9C = pd.merge(left=match9B, right=dups, on=NAME_COLS+['medical_school', 'address', PERSON_ID], how='left')

In [None]:
# need to correct Dale Dietzman's medical school
dale_dietzman = (match9C.clean_last_name=='DIETZMAN') & (match9C.clean_first_name=='DALE')
match9C.loc[dale_dietzman, 'medical_school'] = 'BAYLOR COLLEGE OF MEDICINE'
# display the corrected value(s) as a check
match9C.loc[dale_dietzman, 'medical_school']

In [None]:
# rows that received a replacement id from the merge with dups
mask = ~pd.isnull(match9C.new_id)

# NOTE: not the last expression of the cell, so this result is not displayed
match9C.loc[match9C.person_uuid==3800,NAME_COLS+[PERSON_ID, 'new_id']]

# overwrite the person id with the replacement id where one exists
match9C.loc[mask, PERSON_ID] = match9C[mask]['new_id'] 

# the helper column is no longer needed
del match9C['new_id']

# re-check: any person ids still duplicated after the replacement
dups_mask = match9C.duplicated(PERSON_ID, keep=False)
match9C.loc[dups_mask, NAME_COLS+['medical_school', PERSON_ID, 'address']]

In [None]:
# rename the residency, internship, college and medical-school columns (in place)
match9C.rename(columns={'res_dates': 'residency_dates_NIH', 'intern_dates': 'internship_dates_NIH', 
                        'residency_year(s)': 'residency_dates', 'internship_year(s)': 'internship_dates',
                      'clean_college_trans': 'clean_college', 'med_school': 'NIH_medical_school'}, inplace=True)

# columns placed at the front of the output
IMPORTANT_COLS = [NIH_ID, PERSON_ID, 'application_year_min', 'application_year_max', 'eod_year',
                  'clean_first_name', 'clean_middle_name', 
                 'clean_last_name', 'control_flag', 'time_period_flag', 'year_accepted',
                  'rejected', 'rejection_date', 'clean_college', 'medical_school',
                'residency_dates', 'residency_dates_NIH', 'internship_dates', 'internship_dates_NIH',
                  'is_female', 'is_foreign', 'number_applications', 
                 'NIH_first_name', 'NIH_middle_name', 'NIH_last_name', 'NIH_medical_school']

# every remaining column, in alphabetical order
other_cols = sorted([i for i in match9C.columns if i not in IMPORTANT_COLS])

# order columns so the important ones come first, sort the rows by name and latest
# application year, and drop the leftover *_y name columns
match10 = match9C[IMPORTANT_COLS+other_cols].sort_values(['clean_last_name', 'clean_first_name', 'application_year_max']).drop(
    ['clean_first_name_y', 'clean_middle_name_y', 'clean_last_name_y'], axis=1)

# drop rows without a person id
match11 = match10.dropna(subset=[PERSON_ID], axis=0).sort_values(['clean_last_name', 'clean_first_name'])

# rows that have a dno but no eod_year; used for the NIH lookup below
mask = (pd.isnull(match11.eod_year) & ~pd.isnull(match11.dno))

# recompute is_female from scratch: 1 when the clean first name is in FEMALE_FIRST_NAMES
match11['is_female'] = 0

match11.loc[match11.clean_first_name.isin(FEMALE_FIRST_NAMES), 'is_female'] = 1

# fill eod_year from the first NIH row with the same dno
# NOTE(review): .values[0] raises IndexError when a dno has no row in NIH
match11.loc[mask, 'eod_year'] = match11[mask].dno.apply(lambda x: NIH.loc[NIH.dno==x, 'eod_year'].values[0])
# wide_apps5.to_pickle(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info.p'))
# NOTE(review): the CSV is written unconditionally; the OUTPUT_CSV flag defined at the
# top of the notebook is not consulted here -- confirm this is intended.
match11.to_csv(os.path.join(APP_DATA_DIR, 'fuzzy_all_apps_plus_NIH_info.csv'), index=False)

In [None]:
# inspect the HARRIN rows of match9A (an earlier stage than match11)
match9A.loc[match9A.clean_last_name.isin(['HARRIN']), NAME_COLS+[PERSON_ID, 'medical_school', 'application_year', 'dno', 'residency','residency_hospital',
                                                            'internship_hospital', 'residency_dates']]

In [None]:
# rows of match11 that share a non-null dno with at least one other row
dups_dno = match11.loc[(~pd.isnull(match11.dno)) & (match11.duplicated('dno', keep=False)), NAME_COLS+['dno', PERSON_ID, 'medical_school']]
print dups_dno.shape
# attach the NIH name and medical-school columns for the same dno, for side-by-side comparison
dups_merge = pd.merge(
    left=dups_dno, right=NIH.loc[NIH.dno.isin(dups_dno.dno), NAME_COLS+['dno', 'medical_school']], on=['dno'], how='left')

dups_merge

In [None]:
# check for uuid dups: name columns of any rows in match11 that still share a person id
match11.loc[match11.duplicated(PERSON_ID, keep=False), NAME_COLS]