In [5]:
# read in known applicant files, dedupe and try to merge with applicants file
from collections import Counter
import difflib
import uuid
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os

APP_DATA_DIR = os.path.abspath('Data/applicant_data')
ATT_DATA_DIR = os.path.abspath('Data/attendees_data')
CARD_DATA_DIR = os.path.abspath('Data/applicant_data/raw_card_data')

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name,  
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name)
from data_cleaning_functions import correct_mispellings


# id column that links back to raw applicant data file
RAW_CARD_ID = 'raw_uuid'

# column where the raw id information is stored
RAW_INDEX_IDS = 'raw_card_ids'

# try to get one id per unique applicant in the dataset
PERSON_ID = 'person_uuid'
# id per deduped application-person - if someone applied multiple times, they will have multiple ids
PERSON_APPLICATION_ID = 'person_app_uuid' 
NIH_ID = 'dno'

In [6]:
# read _csv
df3_unique = pd.read_csv(os.path.join(ATT_DATA_DIR, 'unique_attendees.csv'))

# import applicants file and try to merge with attendees
# interested to see how many applicants were NOT accepted
apps = pd.read_pickle(os.path.join(APP_DATA_DIR, 'person_application_date_wide.p'))

In [7]:
# rename columns in df3 to match
# change residency and internship dates to be YYYY-YYYY instead of YYYY-YY
def long_form_date(dt_str):
    if pd.isnull(dt_str):
        return dt_str
    m = re.match(r'(\d{4})-(\d{2})', dt_str)
    if m:
        g = m.groups()
        return '{0}-19{1}'.format(g[0], g[1])
    m = re.match(r'(\d{4})', dt_str)
    if m:
        return dt_str
    print dt_str
    return np.nan

In [8]:
apps['res_dates'] = apps['residency_year(s)'].apply(long_form_date)

In [9]:
apps['intern_dates'] = apps['internship_year(s)'].apply(long_form_date)

Str. Medicine
Medicine


In [10]:
df3_unique.rename(columns={'res_dtes': 'res_dates', 'intern_dte': 'intern_dates', 'res_hosp': 'residency_hospital', 
                          'intern_hos': 'internship_hospital', 'clean_middlename': 'clean_middle_name', 
                          'clean_firstname': 'clean_first_name', 'clean_lastname': 'clean_last_name', 
                          'med_school': 'medical_school'}, inplace=True)

In [11]:
apps.rename(columns={'residency_type': 'residency', 'internship_hospital_1': 'internship_hospital'}, inplace=True)

In [14]:
NAME_COLS = ['clean_middle_name', 'clean_last_name', 'clean_first_name']

In [15]:
MED_TRAINING_COLS = ['res_dates', 'intern_dates', 'residency_hospital', 'internship_hospital', 'medical_school', 'residency']

In [16]:
# apply string cleaning to each of the medical training info cols
for c in ['medical_school', 'residency_hospital', 'internship_hospital', 'residency', 'institute']:
    df3_unique.loc[:, c] = df3_unique[c].apply(clean_names)

In [17]:
# apply string cleaning to each of the medical training info cols
for c in ['medical_school', 'residency_hospital', 'internship_hospital', 'residency']:
    apps.loc[:, c] = apps[c].apply(clean_names)

In [18]:
df3_unique.loc[:, 'clean_medical_school'] = df3_unique.medical_school.apply(clean_med_school)

In [20]:
sorted(apps.columns)

['address',
 'age',
 'application_date',
 'application_date_2',
 'application_date_3',
 'application_year',
 'associate_program_entered',
 'bob',
 'ca',
 'cc',
 'citizenship',
 'city',
 'clean_college_trans',
 'clean_first_name',
 'clean_last_name',
 'clean_middle_name',
 'clean_suffix',
 'clinical',
 'cord',
 "daniel's_comments",
 'date_of_birth',
 'dbs',
 'fifth',
 'first_name',
 'flag_missing_app_date',
 'flag_rejected',
 'honor_societies_first',
 'honor_societies_fourth',
 'honor_societies_second',
 'honor_societies_third',
 'ic',
 'intern_dates',
 'internship_hospital',
 'internship_year(s)',
 'last_name',
 'medical_school',
 'medschool_year_grad',
 'middle_name',
 'nci',
 'nei',
 'nhi',
 'nhli',
 'niaid',
 'niamd',
 'niamdd',
 'nichd',
 'nichhd',
 'nidr',
 'niehs',
 'nigms',
 'nimh',
 'nindb',
 'ninds',
 'not_matched',
 'oir',
 'other',
 'person_app_uuid',
 'person_uuid',
 'pharm_ra',
 'pi',
 'ra',
 'raw_uuid',
 'rejected',
 'rejection_date',
 'res_dates',
 'research',
 'residenc

In [19]:
i
apps['clean_medical_school'] = apps.clean_medical_school.apply(clean_med_school) 

AttributeError: 'DataFrame' object has no attribute 'clean_medical_school'

In [None]:
to_remove = ['TERRECE', 'FRED', 'LAURENCE',
             'CUONO', 'DEFRENZE', 'JEFFERY', 'FINKLEMAN', 'SHERRAD', 'ANSCHNETZ', 'MARC', 'JENSON', 'KASTI', 
            'ADELBERT', 'RITCHARD', 'MANSFORD', 'DEFRENZO', 'DROBIN', 'HAMES', 'KREUZ', 'JERROLD', 'MANEUSI',
            'UNGARO']
to_replace = ['TERRENCE', 'FREDERICK', 'LAWRENCE',
              'CUOMO', 'DEFRONZO', 'JEFFREY', 'FINKELMAN', 'SHERRARD', 'ANSCHUETZ', 'MARCUS', 'JENSEN', 'KASTL',
              'ALBERT', 'RITCHARD', 'MANIFORD', 'DEFRONZO', 'DROBIS', 'JAMES', 'KRUEZ', 'JERROD', 'MANCUSI',
              'UNGARO']

correct_name_mispellings_fnc = funcy.rpartial(correct_mispellings, to_remove, to_replace)


In [None]:
df3_unique.loc[:, 'clean_last_name'] = df3_unique.clean_last_name.apply(correct_name_mispellings_fnc)
df3_unique.loc[:, 'clean_first_name'] = df3_unique.clean_first_name.apply(correct_name_mispellings_fnc)
apps.loc[:, 'clean_last_name'] = apps.clean_last_name.apply(correct_name_mispellings_fnc)
apps.loc[:, 'clean_first_name'] = apps.clean_first_name.apply(correct_name_mispellings_fnc)

In [None]:
def change_names(dfs, selection_type, selection_value, to_change_type, to_change_values):
    for t, v in zip(to_change_type, to_change_values):
        print t, v
        for df in dfs:
            df.loc[df[selection_type]==selection_value, t] = v

In [None]:
ALL_DFS = [apps, df3_unique]

In [None]:
change_names(
    ALL_DFS, 'clean_last_name', 'CHESEBRO', ['clean_first_name', 'clean_middle_name'], ['BRUCE', 'WILCOX'])
change_names(ALL_DFS, 'clean_last_name', 'GALANTER', ['clean_first_name', 'clean_middle_name'], ['MARC', 'I'])
change_names(
    ALL_DFS, 'clean_last_name', 'BEAN', ['clean_first_name', 'clean_middle_name', 'clean_medical_school'], ['SIDNEY', 'CHARLES', 'WAKE_FOREST'])
change_names(
    ALL_DFS, 'clean_last_name', 'EILER', ['clean_first_name', 'clean_middle_name'], ['DONALD', 'MARTIN'])
change_names(
    ALL_DFS, 'clean_last_name', 'FALCHUK', ['clean_first_name', 'clean_middle_name'], ['DONALD', 'MARTIN'])
change_names(
    [apps], 'clean_last_name', 'BOYD', ['clean_first_name', 'clean_middle_name'], ['MICHAEL', 'RAY'])
change_names(
    [apps], 'clean_last_name', 'CHAPMAN', 
    ['clean_first_name', 'clean_middle_name', 'clean_medical_school'], ['STANLEY', 'WILLETS', 'ROCHESTER'])
change_names([apps], 'clean_last_name', 'DANFORTH', ['clean_first_name'], ['DAVID'])
change_names([apps], 'clean_last_name', 'HUNT', ['clean_first_name', 'clean_middle_name'], ['ROBERT', 'D'])
change_names([apps], 'clean_last_name', 'KARK', ['clean_first_name', 'clean_middle_name'], ['ROBERT', 'ADRIAN'])
change_names([apps], 'clean_last_name', 'KEBABIAN', ['clean_first_name', 'clean_middle_name'], ['JOHN', 'WILLIS'])
change_names([apps], 'clean_last_name', 'KNOPF', ['clean_first_name', 'clean_middle_name'], ['HARRY', 'LOUIS'])
change_names([apps], 'clean_last_name', 'KROLIKOWSKI', ['clean_first_name', 'clean_middle_name'], ['FRANCIS', 'JOHN'])
change_names([apps], 'clean_last_name', 'KASTL', ['clean_first_name', 'clean_middle_name'], ['DAVID', 'GENE'])
change_names([apps], 'clean_first_name', 'JAN', ['clean_last_name'], ['KNOWLER'])
change_names([apps], 'clean_last_name', 'KLAVEMAN', ['clean_last_name'], ['KLAEVEMAN'])
change_names([apps], 'clean_last_name', 'MATHEW', ['clean_last_name'], ['MATTHEW'])

apps.loc[apps.clean_last_name=='CHESEBRO', ['clean_first_name']] = 'BRUCE'
apps.loc[apps.clean_last_name=='CHESEBRO', ['clean_middle_name']] = 'WILCOX'

In [None]:
exact_name_matches = pd.merge(left=df3_unique, right=apps, left_on=['clean_first_name', 'clean_middle_name', 'clean_last_name'], right_on=[
        'clean_first_name', 'clean_middle_name', 'clean_last_name'], how='inner')

In [None]:
not_matched_apps = apps.loc[~apps[PERSON_ID].isin(exact_name_matches[PERSON_ID]), :]
not_matched_attendees = df3_unique.loc[~df3_unique[NIH_ID].isin(exact_name_matches[NIH_ID]), :]

first_last_matches = pd.merge(left=not_matched_attendees, right=not_matched_apps, left_on=['clean_first_name', 'clean_last_name'], right_on=[
        'clean_first_name', 'clean_last_name'], how='inner')

In [None]:
# create counter objects for each data set that count the number of times the last name occurs in either data set
attendees_counter = Counter(df3_unique.clean_last_name)

apps_counter = Counter(apps.clean_last_name)

In [None]:
# for the first and last matches, where the last name only occurs 1x in each data set, set confidence flag to 1
first_last_matches.loc[:, 'last_name_counts'] = first_last_matches.clean_last_name.apply(
    lambda x: apps_counter[x] + attendees_counter[x])

In [None]:
def define_med_school_junk(seq_elem):
    # difflib sequence matcher first element can take a fnc that inputs a
    # sequence element and returns True if it should be considered Junk
    return seq_elem in ['MEDICAL', 'SCHOOL', 'UNIVERSITY', 'COLLEGE', 'OF', 'THE', 'MEDICINE', 'CENTER', 'DENTISTRY']

In [None]:
def str_sim_fnc(row, index1, index2, junk_fnc=None):
    has_null = any(map(lambda x: pd.isnull(x), row.values))
    if has_null:
        return np.nan
    return difflib.SequenceMatcher(
        junk_fnc, row[index1], row[index2]).ratio()

In [None]:
get_str_sim = funcy.rpartial(str_sim_fnc, 'clean_medical_school_x', 'clean_medical_school_y', define_med_school_junk)

In [None]:
first_last_matches.loc[:, 'med_school_sim'] = first_last_matches[[
        'clean_medical_school_x', 'clean_medical_school_y']].apply(get_str_sim, axis=1)

In [None]:
def get_years(dt_str):
    # return int date strings
    try:
        single_year = (dt_str.find('-') == -1)
        if single_year:
            return [dt_str]
        return dt_str.split('-')
    except ValueError as e:
        print dt_str
        return np.nan

def get_dts_sim(row, name_str):
    has_null = any(map(lambda x: pd.isnull(x), row.values))
    if has_null:
        return np.nan
    dt1 = row['{}_x'.format(name_str)]
    dt2 = row['{}_y'.format(name_str)]
    dates_tup1 = get_years(dt1)
    dates_tup2 = get_years(dt2)
    # if dates match exactly, return 1
    unique_dts = set(funcy.concat(dates_tup1, dates_tup2))
    return (len(unique_dts) < (len(dates_tup1) + len(dates_tup2)))

In [None]:
get_intern_dts_sim = funcy.rpartial(get_dts_sim, 'intern_dates')

In [None]:
first_last_matches.loc[:, 'internship_sim'] = first_last_matches[[
        'intern_dates_x', 'intern_dates_y']].apply(get_intern_dts_sim, axis=1)

In [None]:
# drop all first and last name matches where the med school sim < .6 
first_last_matches.loc[(first_last_matches['last_name_counts'] < 2), 'match_score'] = 1
first_last_matches.loc[((first_last_matches['last_name_counts'] > 1) & (
            first_last_matches['med_school_sim'] > .6) & (first_last_matches['internship_sim']==True)), 'match_score'] = 1
first_last_matches.loc[((first_last_matches['last_name_counts'] > 1) & (
            first_last_matches['med_school_sim'] > .6) & (pd.isnull(first_last_matches['internship_sim']))), 'match_score'] = 1

In [None]:
first_last_matches.loc[pd.isnull(first_last_matches.match_score), 'match_score'] = 0

In [None]:
exact_name_matches.loc[:, 'match_score'] = 1

In [None]:
# bunch of matches on last name only
attendees_counter = Counter(df3_unique.clean_last_name)
apps_counter = Counter(apps.clean_last_name)

# Note that the case where count occurs 2x in one data set and not the other is ok, because nothing to merge on

apps.loc[:, 'last_name_counts'] = apps.clean_last_name.apply(
    lambda x: apps_counter[x] + attendees_counter[x])

df3_unique.loc[:, 'last_name_counts'] = df3_unique.clean_last_name.apply(
    lambda x: apps_counter[x] + attendees_counter[x])

In [None]:
# drop all people without a good match score
# for non matches, do visual check, look at years 67-75, create a score of reliability they are control
full_matches = pd.concat([first_last_matches.loc[first_last_matches.match_score==1, :], exact_name_matches], axis=0)
# if application date year after eod year, drop
# if double match and application date shows up 1x, drop

In [None]:
# non matches
not_matched_apps = apps.loc[~apps[PERSON_ID].isin(full_matches[PERSON_ID]), :]
not_matched_attendees = df3_unique.loc[~df3_unique[NIH_ID].isin(full_matches[NIH_ID]), :]

df3_unique.shape

not_matched_attendees.shape

In [None]:
not_matched_attendees.loc[not_matched_attendees.last_name_counts< 3, :].shape

# there are a bunch of attendees who were not able to match on first and last name
# for last name, if one instance in data set, try match
last_matches = pd.merge(left=not_matched_apps.loc[not_matched_apps.last_name_counts<3, :],
                    right=not_matched_attendees.loc[not_matched_attendees.last_name_counts < 3, :],
                        left_on='clean_last_name', right_on='clean_last_name', how='inner')

last_matches.loc[:, 'med_school_sim'] = last_matches[[
        'clean_medical_school_x', 'clean_medical_school_y']].apply(get_str_sim, axis=1)
last_matches.loc['match_score', :] = 1

In [None]:
# read in manual match crosswalk
m_matches = pd.read_excel(os.path.join(ATT_DATA_DIR, 'manual_attendees_match.xlsx'))

In [None]:
# pull from attendee data set because the information doesn't match
# assuming NIH more reliable
mm_df = df3_unique.loc[df3_unique[NIH_ID].isin(m_matches[NIH_ID]), :]

In [None]:
mm2 = m_matches.dropna(subset=[PERSON_ID], axis=0)

In [None]:
mmdf2 = pd.merge(left=mm_df, right=mm2, how='inner')

In [None]:
mm_df3 = pd.merge(left=mmdf2, right=apps, left_on=PERSON_ID, right_on=PERSON_ID, how='inner')

In [1]:
col_pairs = map(lambda x: x.split('_x')[0], (filter(lambda x: x.endswith('_x'), 2.columns)))
print col_pairs

SyntaxError: invalid syntax (<ipython-input-1-43be5ce2e846>, line 1)

In [2]:
print col_pairs[-1]

NameError: name 'col_pairs' is not defined

In [3]:
mm_df3.loc[:, 'manual_match_flag'] = 1

NameError: name 'mm_df3' is not defined

In [4]:
# for each of paired columns, overwrite _y with _x
mm_df3.loc[:, ['{}_y'.format(c_name) for c_name in col_pairs]] = mm_df3[[
        '{}_x'.format(c_name) for c_name in col_pairs]]

NameError: name 'mm_df3' is not defined

In [None]:
mm_df3.head()

In [None]:
full_matches2 = pd.concat([full_matches, last_matches, mm_df3], axis=0)

In [None]:
full_matches2.shape

In [None]:
full_matches.shape

In [None]:
# for the attendees matched, we want to consolidate duplicate information for all the _x, _y columns
col_pairs = map(lambda x: x.split('_x')[0], (filter(lambda x: x.endswith('_x'), full_matches2.columns)))
print col_pairs

In [None]:
full_matches2.loc[full_matches2.manual_match_flag==1, ['clean_last_name', 'clean_first_name', 'clean_middle_name']]

In [None]:
def select_nonmissing(a, b):
    is_null = pd.isnull(a) and pd.isnull(b)
    if is_null:
        return np.nan
    non_nulls = funcy.remove(pd.isnull, [a, b])
    if isinstance(non_nulls[0], long) or isinstance(non_nulls[0], int) or isinstance(non_nulls[0], float):
        return sorted(non_nulls, reverse=True)[0]
    return sorted(non_nulls, key=len, reverse=True)[0]

In [None]:
cols_to_mush = [('{}_x'.format(c), '{}_y'.format(c)) for c in col_pairs]
for c1, c2 in cols_to_mush:
    full_matches2.loc[:, c1.split('_x')[0]] = full_matches2.loc[:, [c1, c2]].apply(
        lambda x: select_nonmissing(x[c1], x[c2]), axis=1)

In [None]:
# drop consolidated columns
full_matches2.drop(funcy.flatten(cols_to_mush), axis=1, inplace=True)

In [None]:
# there are about 127 rows where applicant date and person are the same, drop these
full_matches_dups = full_matches2.loc[
    full_matches2.duplicated(['clean_first_name', 'clean_last_name'], keep=False), ['application_date', 'eod_year', 'rejection_date', 'clean_first_name', 'clean_last_name', 'sanity_check']]

In [None]:
full_matches_deduped = full_matches2.drop_duplicates(['clean_last_name', 'application_year'])

In [None]:
full_matches_deduped.loc[full_matches_deduped.manual_match_flag==1, ['clean_last_name', 'clean_first_name', 'clean_middle_name']]

In [None]:
full_matches2.loc[~pd.isnull(full_matches2.rejection_date), ['application_date', 'eod_year', 'rejection_date', 'clean_first_name', 'clean_last_name', 'sanity_check']]

In [None]:
full_matches_deduped.to_pickle(os.path.join(ATT_DATA_DIR, 'full_matches.p'))
full_matches_deduped.to_csv(os.path.join(ATT_DATA_DIR, 'full_matches.csv'))

In [None]:
full_matches_deduped2 = full_matches_deduped.set_index(PERSON_ID, drop=False)

In [None]:
apps2 = apps.set_index(PERSON_ID, drop=False)

In [None]:
wide_apps = full_matches_deduped2.combine_first(apps2)

In [None]:
wide_apps2 = wide_apps.drop(['Unnamed: 0', "daniel's_comments", 'firstname', 'lastname', 'middlename',  'med_school_sim', 'match_score', 
                'unknown'], axis=1)

In [None]:
wide_apps2.rename(columns={'clean_college_trans': 'clean_college'}, inplace=True)

In [None]:
# consolidate dob and date_of_birth, undergraduate_school and clean_college, intern_dates and internship_year(s)
# res_dates and residency_year(s)
# year_grad

In [None]:
# replace date of birth with dob whenever date of birth missing and dob is not
wide_apps2.loc[
    (pd.isnull(wide_apps2['date_of_birth'])) & (~pd.isnull(wide_apps2['dob'])), 'date_of_birth'] = wide_apps2.loc[
        (pd.isnull(wide_apps2['date_of_birth'])) & (~pd.isnull(wide_apps2['dob'])), 'dob']


In [None]:
c1 = 'date_of_birth'
c2 = 'dob'
wide_apps2.loc[(pd.isnull(wide_apps2[c1])) & (~pd.isnull(wide_apps2[c2])), [c1, c2]]

In [None]:
# check if columns fifth and sixth are empty
wide_apps2.loc[~pd.isnull(wide_apps2.fifth), :]
wide_apps2.loc[~pd.isnull(wide_apps2.sixth), :]

In [None]:
# delete them
wide_apps3 = wide_apps2.drop(['dob', 'undergraduate_school', 'internship_year(s)', 'residency_year(s)', 
                              'year_grad', 'fifth', 'sixth', 'other', 'internship_sim'], axis=1)

In [None]:
wide_apps3.rename(columns={'res_dates': 'residency_dates', 'intern_dates': 'internship_dates'}, inplace=True)

In [None]:
IMPORTANT_COLS = [NIH_ID, PERSON_ID, 'application_year', 'eod_year', 'application_date', 'clean_first_name', 'clean_middle_name', 
                 'clean_last_name', 'year_accepted', 'rejected', 'rejection_date', 'clean_college', 'clean_medical_school',
                'residency_dates', 'internship_dates']

In [None]:
other_cols = sorted([i for i in wide_apps3.columns if i not in IMPORTANT_COLS])

In [None]:
# order columns so important ones are 
wide_apps4 = wide_apps3[IMPORTANT_COLS+other_cols]

In [None]:
wide_apps5 = wide_apps4.dropna(subset=[PERSON_ID], axis=0)

In [None]:
wide_apps5.to_pickle(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info.p'))
wide_apps5.to_csv(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info.csv'))

In [None]:
wide_apps5.loc[(wide_apps5.application_year>1960) & (wide_apps5.application_year<1976), :].to_pickle(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info_vietnam.p'))

wide_apps5.loc[(wide_apps5.application_year>1960) & (wide_apps5.application_year<1976), :].to_csv(os.path.join(APP_DATA_DIR, 'all_apps_plus_NIH_info_vietnam.csv'))

In [None]:
not_matched_apps = apps.loc[~apps[PERSON_ID].isin(full_matches2[PERSON_ID]), :]
not_matched_attendees = df3_unique.loc[~df3_unique[NIH_ID].isin(full_matches2[NIH_ID]), :]

df3_unique.shape

not_matched_attendees.shape

In [None]:
not_matched_attendees_vietnam = not_matched_attendees.loc[(not_matched_attendees.eod_year<1976) & (not_matched_attendees.eod_year>1960), :].sort_values('clean_last_name')

In [None]:
not_matched_attendees_vietnam.shape

In [None]:
not_matched_attendees_vietnam.to_csv(os.path.join(ATT_DATA_DIR, 'not_matched_attendees.csv'))

In [None]:
not_matched_attendees.eod_year.value_counts().to_csv(os.path.join(ATT_DATA_DIR, 'not_matched_attendees.csv'))

In [None]:
first_last_matches.loc[first_last_matches['clean_last_name']=='LARSON', ['medical_school_x', 'medical_school_y']]