In [19]:
# read in known attendee files, clean and try to dedupe
from collections import Counter
import difflib
import uuid
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os

# Absolute locations of the data folders, resolved against the current
# working directory at the time this cell runs.
APP_DATA_DIR, ATT_DATA_DIR, CARD_DATA_DIR = map(os.path.abspath, (
    'Data/applicant_data',
    'Data/attendees_data',
    'Data/applicant_data/raw_card_data',
))

from data_cleaning_functions import correct_mispellings, long_form_date, clean_names, clean_med_school

In [2]:
# --- identifier column names ------------------------------------------------
# column that links a row back to the raw applicant data file
RAW_CARD_ID = 'raw_uuid'
# column in which the raw id information is stored
RAW_INDEX_IDS = 'raw_card_ids'
# intended to carry one id per unique applicant in the dataset
PERSON_ID = 'person_uuid'
# one id per deduped application-person: somebody who applied several times
# ends up with several of these ids
PERSON_APPLICATION_ID = 'person_app_uuid'
NIH_ID = 'dno'

# --- suffix strings for the attendee / applicant variants -------------------
ATTENDEE_SUFFIX = '_at'
APPLICANT_SUFFIX = '_ap'

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): the extension was already loaded two lines above, so this
# reload looks redundant -- confirm before removing.
%reload_ext autoreload

In [11]:
# Cleaned name columns, in the order they are written to the output files.
NAME_COLS = [
    'clean_middle_name',
    'clean_last_name',
    'clean_first_name',
]

# Medical-training columns carried along next to the name columns.
MED_TRAINING_COLS = [
    'res_dates',
    'intern_dates',
    'residency_hospital',
    'internship_hospital',
    'medical_school',
    'residency',
]

In [3]:
# Load the unique-attendees table from the attendee data folder.
attendees_csv = os.path.join(ATT_DATA_DIR, 'unique_attendees.csv')
df3_unique = pd.read_csv(attendees_csv)

In [49]:
# Run long_form_date over the raw residency / internship date columns and
# store each result under its new column name (residency first, then
# internship, so any output printed by the helper keeps its order).
for raw_date_col, long_date_col in (('res_dtes', 'res_dates'),
                                    ('intern_dte', 'intern_dates')):
    df3_unique.loc[:, long_date_col] = df3_unique[raw_date_col].apply(long_form_date)

present
not known
Starting July 1,1962
present
1 Year
present


In [50]:
# Drop the raw date columns (their long-form versions are kept) and rename
# the remaining columns to the names used by NAME_COLS / MED_TRAINING_COLS.
df4_unique = (
    df3_unique
    .drop(['res_dtes', 'intern_dte'], axis=1)
    .rename(columns={
        'res_hosp': 'residency_hospital',
        'intern_hos': 'internship_hospital',
        'clean_middlename': 'clean_middle_name',
        'clean_firstname': 'clean_first_name',
        'clean_lastname': 'clean_last_name',
        'med_school': 'medical_school',
    })
)

In [51]:
# Free-text medical information columns that get string cleaning.
string_med_cols = [
    'medical_school',
    'residency_hospital',
    'internship_hospital',
    'residency',
    'institute',
]

# Run clean_names over every cell of those columns and write the result back
# into the same columns.
df4_unique.loc[:, string_med_cols] = df4_unique.loc[:, string_med_cols].applymap(clean_names)

In [52]:
%%capture
# Apply clean_med_school to every medical_school value and store the result in
# a new 'clean_medical_school' column. The %%capture cell magic above must stay
# on the first line of the cell; it captures the cell's output so nothing is
# displayed (presumably clean_med_school prints while it runs -- TODO confirm).
df4_unique.loc[:, 'clean_medical_school'] = df4_unique.medical_school.apply(clean_med_school);

In [53]:
# Known name misspellings as (spelling found in the data, spelling to use).
# Keeping each pair on one line makes the pairing between the two lists that
# correct_mispellings receives explicit.
# NOTE(review): 'RITCHARD' and 'UNGARO' map to themselves, so those two
# entries look like no-ops -- confirm the intended replacements.
name_corrections = [
    ('TERRECE', 'TERRENCE'),
    ('FRED', 'FREDERICK'),
    ('LAURENCE', 'LAWRENCE'),
    ('CUONO', 'CUOMO'),
    ('DEFRENZE', 'DEFRONZO'),
    ('JEFFERY', 'JEFFREY'),
    ('FINKLEMAN', 'FINKELMAN'),
    ('SHERRAD', 'SHERRARD'),
    ('ANSCHNETZ', 'ANSCHUETZ'),
    ('MARC', 'MARCUS'),
    ('JENSON', 'JENSEN'),
    ('KASTI', 'KASTL'),
    ('ADELBERT', 'ALBERT'),
    ('RITCHARD', 'RITCHARD'),
    ('MANSFORD', 'MANIFORD'),
    ('DEFRENZO', 'DEFRONZO'),
    ('DROBIN', 'DROBIS'),
    ('HAMES', 'JAMES'),
    ('KREUZ', 'KRUEZ'),
    ('JERROLD', 'JERROD'),
    ('MANEUSI', 'MANCUSI'),
    ('UNGARO', 'UNGARO'),
]

# Unzip the pairs back into the two parallel lists.
to_remove, to_replace = map(list, zip(*name_corrections))

# Bind both lists as the trailing arguments of correct_mispellings so the
# resulting callable takes just the name value.
correct_name_mispellings_fnc = funcy.rpartial(correct_mispellings, to_remove, to_replace)

# The same correction table is applied to last names first, then first names.
df4_unique.loc[:, 'clean_last_name'] = df4_unique['clean_last_name'].apply(
    correct_name_mispellings_fnc)
df4_unique.loc[:, 'clean_first_name'] = df4_unique['clean_first_name'].apply(
    correct_name_mispellings_fnc)

In [54]:
# function to go in and correct some of the name misspellings in both data sets
# MUTATING FUNCTION
def change_names(df, selection_type, selection_value, to_change_type, to_change_values):
    """Overwrite column values, in place, on every row matching a selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    selection_type : column label
        Column used to pick the rows to change.
    selection_value : object
        Rows where ``df[selection_type] == selection_value`` are changed.
    to_change_type : iterable of column labels
        Columns to overwrite on the selected rows.
    to_change_values : iterable
        New values, paired positionally with ``to_change_type``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``to_change_type`` and ``to_change_values`` differ in length. The
        check happens before anything is written, so ``df`` is left untouched.
    """
    # Materialise both iterables so their lengths can be compared; a plain
    # zip() would silently drop the unmatched tail of the longer one.
    to_change_type = list(to_change_type)
    to_change_values = list(to_change_values)
    if len(to_change_type) != len(to_change_values):
        raise ValueError(
            'to_change_type and to_change_values must have the same length '
            '(got %d and %d)' % (len(to_change_type), len(to_change_values)))

    # Select the rows once, up front: if the selection column is itself one of
    # the columns being changed, the remaining assignments must still land on
    # the rows that were originally selected.
    row_mask = df[selection_type] == selection_value

    for t, v in zip(to_change_type, to_change_values):
        # Prints the same "<column> <value>" text on Python 2 and Python 3.
        print('%s %s' % (t, v))
        df.loc[row_mask, t] = v

In [55]:
# Hand-curated corrections: (last name to match, columns to overwrite, new values).
# change_names selects on clean_last_name only, so every row carrying the
# given last name receives the new values.
# NOTE(review): the FALCHUK entry carries exactly the same values as the EILER
# entry ('DONALD', 'MARTIN') -- possibly a copy/paste slip; verify against the
# source records.
manual_name_fixes = [
    ('CHESEBRO', ['clean_first_name', 'clean_middle_name'], ['BRUCE', 'WILCOX']),
    ('GALANTER', ['clean_first_name', 'clean_middle_name'], ['MARC', 'I']),
    ('BEAN', ['clean_first_name', 'clean_middle_name', 'clean_medical_school'],
     ['SIDNEY', 'CHARLES', 'WAKE_FOREST']),
    ('EILER', ['clean_first_name', 'clean_middle_name'], ['DONALD', 'MARTIN']),
    ('FALCHUK', ['clean_first_name', 'clean_middle_name'], ['DONALD', 'MARTIN']),
]

for fix_last_name, fix_columns, fix_values in manual_name_fixes:
    change_names(df4_unique, 'clean_last_name', fix_last_name, fix_columns, fix_values)


clean_first_name BRUCE
clean_middle_name WILCOX
clean_first_name MARC
clean_middle_name I
clean_first_name SIDNEY
clean_middle_name CHARLES
clean_medical_school WAKE_FOREST
clean_first_name DONALD
clean_middle_name MARTIN
clean_first_name DONALD
clean_middle_name MARTIN


In [56]:
# Order the rows by last name, then dno, first name and medical school, so
# that people sharing a name sit next to each other for the duplicate checks
# that follow (same name but different dno numbers get written to a separate
# dataset).
df5 = df4_unique.sort_values(
    by=['clean_last_name', 'dno', 'clean_first_name', 'medical_school'])

In [61]:
# Flag every row whose (last name, first name, medical school) combination
# occurs more than once; keep=False marks all members of a duplicate group,
# not just the repeats.
duplicate_key_mask = df5.duplicated(
    subset=['clean_last_name', 'clean_first_name', 'medical_school'], keep=False)

# Columns kept for reviewing the flagged rows.
nih_dup_columns = NAME_COLS + MED_TRAINING_COLS + ['eod_year', 'clean_medical_school', 'dno']

NIH_dups = df5.loc[duplicate_key_mask, nih_dup_columns]

In [69]:
# Rows with no eod_year, ordered by last name.
missing_eod_year = df5.loc[df5['eod_year'].isnull()].sort_values('clean_last_name')

# NOTE(review): this file is written to APP_DATA_DIR while the other outputs
# of this notebook go to ATT_DATA_DIR -- confirm that is intended.
missing_eod_year[NAME_COLS + MED_TRAINING_COLS + ['dno']].to_csv(
    os.path.join(APP_DATA_DIR, 'missing_eod_year.csv'), index=False)

In [68]:
# Write the flagged duplicate rows (name, training and dno columns only),
# ordered by last name, to the attendee data folder.
duplicates_out = NIH_dups[NAME_COLS + MED_TRAINING_COLS + ['dno']].sort_values('clean_last_name')
duplicates_out.to_csv(os.path.join(ATT_DATA_DIR, 'duplicate_attendees.csv'), index=False)

In [64]:
# Keep only the first row of each (last name, first name, medical school)
# group, in df5's sorted order.
df6 = df5.drop_duplicates(
    subset=['clean_last_name', 'clean_first_name', 'medical_school'],
    keep='first')

In [65]:
# Row/column counts before and after dropping duplicates.
print(df5.shape)
print(df6.shape)

(4112, 29)
(4075, 29)


In [67]:
# Save the deduplicated attendee table to the attendee data folder.
deduped_attendees_csv = os.path.join(ATT_DATA_DIR, 'NIH_attendee_deduped_raw.csv')
df6.to_csv(deduped_attendees_csv, index=False)