In [1]:
# read in known applicant files, dedupe and try to merge with applicants file
import difflib
import uuid
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os

In [2]:
ASSOC_DATA_DIR = os.path.abspath('Data/applicant_data')
CARD_DATA_DIR = os.path.abspath('Data/raw_card_data')

print ASSOC_DATA_DIR
r1_file = '1964-1973 associates.XLS'
r2_file = 'Associates alpha by institute.XLS'
r3_file = 'Associates data.XLS'
r4_file = 'NIMH Associates Complete.XLS'
r5_file = 'NINDB Associates alpha by year.xls'
filenames = [r1_file, r2_file, r3_file, r4_file, r5_file]

/home/lraymond/MIT/Azoulay_2016/yellow_berets/yellow_beret/Data/applicant_data


In [3]:
file_df = map(lambda x: pd.read_excel(os.path.join(ASSOC_DATA_DIR, x)), filenames)



In [4]:
file_4_columns = [
    'dno', 'source', 'unknown', 'lastname', 'first_middle', 'institute', 'lab_brch', 
    'program', 'supervisor', 'eod_year', 'med_school', 'year_grad', 'intern_hos', 'intern_dte',
       'res_hosp', 'residency', 'res_dtes'
]

In [5]:
file_df[2].rename(columns={'lname':'lastname', 'fname': 'first_middle'}, inplace=True)
file_df[1].rename(columns={'lname':'lastname', 'fname': 'first_middle'}, inplace=True)

In [6]:
file_df[4].columns = file_4_columns
# for each files in the list, add a column to track source
for name, f in zip(filenames, file_df):
    f.loc[:, 'data_source'] = name
concat_df = pd.concat(file_df)

In [7]:
print sum(map(lambda x: x.shape[0], file_df)) == concat_df.shape[0]

True


In [8]:
# apply a basic string cleaning function to the names- removing all punctuation, changing to all uppercase

def trans_remov_punc(to_change, change_to):
    # removes specified punctuation using string maketrans (very fast, C lookups)
    #returns partially evaluated fnc
    trantab = string.maketrans(to_change, change_to)
    return funcy.func_partial(lambda x: x.translate(trantab))


def standardize_whitespace(pub_str):
    return ' '.join(filter(None, pub_str.split(' ')))


def remove_punc(pub_str):
    # function to remove punctuation
    nonelst = ' '*len(string.punctuation)
    fn = trans_remov_punc(string.punctuation, nonelst)
    pub_str2 = str(pub_str)
    new_str = fn(pub_str2)
    # standardize spaces
    return standardize_whitespace(new_str)


def clean_names(name):
    # if name is missing, return null
    if pd.isnull(name):
        return np.nan
    # uppercase 
    upp = name.upper()
    return remove_punc(upp)

    # pull off suffix in some last names into seperate column
def has_suffix(raw_last_name):
    # a boolean fnc to identify which rows may have a suffix
    last_lst = raw_last_name.split(' ')
    if len(last_lst) == 1:
        # if no white spaces in last name, only 1 word, so no suffix
        return False
    suffixes = ['JR', 'SR', 'I', 'II', 'III', 'IV', 'V', 'VI']
    # want to differentiate between suffix (JR, SR, I, II, III, IV, V) between last names with multiple parts (ex. st john)
    # check if last word in list
    return (last_lst[-1] in suffixes)

In [9]:
def strip_first_middle(raw_str):
    # looks like first middle coded as first middle or first, middle or event first, middle initial suffix
    if pd.isnull(raw_str):
        return pd.Series({'firstname2': np.nan, 'middlename2': np.nan, 'suffix': np.nan})
    split_space = raw_str.split(' ') 
    find_per = raw_str.find('.')
    find_comma = raw_str.find(',')
    
    if len(split_space) == 1 and find_per == -1 and find_comma == -1:
        # just a one word sequence with first name
        return pd.Series({'firstname2': raw_str, 'middlename2': np.nan, 'suffix': np.nan})
    clean_str = remove_punc(raw_str)
    lst_clean_str = clean_str.split(' ')
    middle = lst_clean_str[1] if len(lst_clean_str) > 1 else np.nan
    suffix = lst_clean_str[2] if len(lst_clean_str) > 2 else np.nan
    return pd.Series({'firstname2': lst_clean_str[0], 'middlename2': middle, 'suffix': suffix})

In [10]:
# now we have all the associates, sep first middle into first and middle name, then sort and check 
# to see if we have any duplicates
df2 = pd.concat([concat_df, concat_df.loc[:, 'first_middle'].apply(strip_first_middle)], axis=1)
# consolidate firstname columns
df2.loc[~pd.isnull(df2.first_middle), 'firstname'] = df2.loc[~pd.isnull(df2.first_middle), 'firstname2']
df2.loc[pd.isnull(df2.middlename), 'middlename'] = df2.loc[pd.isnull(df2.middlename), 'middlename2']

In [11]:
df3 = df2.drop(['first_middle', 'firstname2', 'middlename2'], axis=1)

In [12]:
# dropnow where both first and last name are missing
df3 = df3.dropna(subset=['firstname', 'lastname'], how='all')

In [13]:
# df3.dropna(subset=['firstname', 'lastname'], how='all').loc[:, ['firstname', 'lastname', 'dno', 'data_source']]
df3.dropna(subset=['firstname', 'lastname'], how='all').loc[:, 'data_source'].unique()


array(['1964-1973 associates.XLS', 'Associates alpha by institute.XLS',
       'Associates data.XLS', 'NIMH Associates Complete.XLS',
       'NINDB Associates alpha by year.xls'], dtype=object)

In [14]:
df3_sorted = df3.sort_values(by=['dno'])

In [15]:
df3_unique = df3.drop_duplicates('dno')

In [16]:
df3_unique.loc[:, 'clean_firstname'] = df3_unique['firstname'].apply(clean_names)
df3_unique.loc[:, 'clean_middlename'] = df3_unique['middlename'].apply(clean_names)
df3_unique.loc[:, 'clean_lastname'] = df3_unique['lastname'].apply(clean_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
# it seems that dno does refer to unique person, so drop dups based on that 
# save this unique to pick
df3_unique.to_pickle(os.path.join(ASSOC_DATA_DIR, 'unique_attendees.p'))

In [18]:
# to csv
df3_unique.to_csv(os.path.join(ASSOC_DATA_DIR, 'unique_attendees.csv'))

In [49]:
# import applicants file and try to merge with attendees
# interested to see how many applicants were NOT accepted
apps = pd.read_pickle(os.path.join(CARD_DATA_DIR, 'unique_applicants.p'))

In [50]:
app_to_att_mapping = {'clean_middle_name': 'clean_middlename', 'clean_last_name':  'clean_lastname',
                     'clean_first_name': 'clean_firstname', 'medical_school': 'med_school', 
                    'internship_hospital_1': 'intern_hos', 'internship_year(s)': 'intern_dte', 
                     'residency_year(s)': 'res_dtes', 'residency_hospital': 'res_hosp'}

In [51]:
apps.loc[:, list(app_to_att_mapping.keys())].head()

Unnamed: 0,internship_year(s),clean_first_name,clean_middle_name,residency_year(s),residency_hospital,medical_school,internship_hospital_1,clean_last_name
0,1969-70,ROY,KENNETH,1970-71,U.S. PHS HOSPITAL,SUNY Downstate Medical Center College of Medicine,MONTEFIORE HOSPITAL ASSOCIATION,AARON
0,1966-67,STUART,ALAN,1967-68,HERBERT C. MOFFITT HOSPITAL,UCSF School of Medicine,HERBERT C. MOFFITT HOSPITAL,AARONSON
0,,JOHN,MORTON,,,,,AASE
0,,JON,MORTON,,,,,AASE
0,1971-72,LELAND,RUSSELL,,,Hahnemann University School of Medicine,MOUNT SINAI SCHOOL OF MEDICINE OF NYU,ABBEY


In [52]:
df3_unique.loc[:, app_to_att_mapping.values()].head()

Unnamed: 0,intern_dte,clean_firstname,clean_middlename,res_dtes,res_hosp,med_school,intern_hos,clean_lastname
0,1966-1967,ROBERT,F,1967-1968,"Peter Brent Brigham Hospital, Boston, Massachu...",Columbia University College of Physicians & Su...,"Peter Brent Brigham Hospital, Boston, Massachu...",ASHMAN
1,1965-1966,PHILIP,WILLIAM,1966-1967,Boston City,Yale University School of Medicine,Boston City,ASKENASE
2,1969-1970,FLOYD,L,1970-1971,Yale - New Haven Medical Center,Medical College of Virginia,Yale - New Haven Medical Center,ATKINS
3,1963-1964,ARTHUR,J,1964-1965,Massachusetts General Hospital,Cornell University Medical College,Massachusetts General Hospital,ATKINSON
4,,JOHN,PATTERSON,,,Kansas University,Massachusetts General Hospital,ATKINSON


In [53]:
# rename columns in df3 to match
# change residency and internship dates to be YYYY-YYYY instead of YYYY-YY
def long_form_date(dt_str):
    if pd.isnull(dt_str):
        return dt_str
    m = re.match(r'(\d{4})-(\d{2})', dt_str)
    if m:
        g = m.groups()
        return '{0}-19{1}'.format(g[0], g[1])
    m = re.match(r'(\d{4})', dt_str)
    if m:
        return dt_str
    print dt_str
    return np.nan
    

In [54]:
apps['res_dtes'] = apps['residency_year(s)'].apply(long_form_date)

"Open"


In [56]:
apps['intern_dte'] = apps['internship_year(s)'].apply(long_form_date)

Str. Medicine
Medicine


In [65]:
df3_unique.loc[:, 'clean_med_school'] = df3_unique['med_school'].apply(clean_names)

In [68]:
exact_name_matches = pd.merge(left=df3_unique, right=apps, left_on=['clean_firstname', 'clean_middlename', 'clean_lastname'], right_on=[
        'clean_first_name', 'clean_middle_name', 'clean_last_name'], how='inner')

In [71]:
not_matched_apps = apps.loc[~apps.uuid.isin(exact_name_matches.uuid), :]

In [72]:
not_matched_attendees = df3_unique.loc[~df3_unique.dno.isin(exact_name_matches.dno), :]

In [77]:
first_last_matches

Unnamed: 0,citizenship_x,data_source,dno,dob,eod_year,firstname,generation,institute,intern_dte_x,intern_hos,...,clean_middle_name,clean_last_name,clean_suffix,school_name_sim,clean_college,clean_college_trans,sanity_check,uuid,res_dtes_y,intern_dte_y
0,,1964-1973 associates.XLS,117,,1968.0,Robert,,NIAMD,1966-1967,"Peter Brent Brigham Hospital, Boston, Massachu...",...,FREDERICK,ASHMAN,,0.166667,,,ROBERT_FREDERICK_ASHMAN_nan_Columbia Universit...,a84404cc-b91b-4e64-b2a2-4f01b389a8d7,1967-1968,1966-1967
1,,1964-1973 associates.XLS,120,,1971.0,Floyd,,NIGMS,1969-1970,Yale - New Haven Medical Center,...,C,ATKINS,,0.181818,HOWARD UNIVERSITY,HOWARD,FLOYD_C_ATKINS_HOWARD_Medical College of Virgi...,b291479d-af1a-4666-a402-a3e7ea9d5df8,1970-1971,1969-1970
2,,1964-1973 associates.XLS,237,,1972.0,Richard,,NCI,,LA Co. Harbor General,...,ALAN,BENDER,,0.206897,UNIVERSITY OF CALIFORNIA SANTA BARBARA,UNIVERSITY OF CALIFORNIA SANTA BARBARA,RICHARD_ALAN_BENDER_UNIVERSITY OF CALIFORNIA S...,db46e911-a7d5-45a8-a048-06bb3938b5a4,,1970
3,,1964-1973 associates.XLS,309,,1971.0,John,,NIAMD,1969-1970,Columbia Presbyterian,...,PAUL,BILEZIKIAN,,0.25,,,JOHN_PAUL_BILEZIKIAN_nan_Columbia University C...,90c1c981-2fab-4ebd-a912-49d2757290fe,1970-1971,1969-1970
4,,1964-1973 associates.XLS,329,,1972.0,Thomas,,NCI,1970-1971,Children's Hospital of Philadelphia.,...,JOSEPH JOHN,BLANCK,,0.156863,,,THOMAS_JOSEPH JOHN_BLANCK_nan_University of Pe...,18fc79fe-d788-4575-b91b-c7f92d535ed3,,1970-1971
5,,1964-1973 associates.XLS,356,,1970.0,Samuel,,NCI,1968-1969,Bronx Municipal Hospital Center,...,HIEL,BOBROW,,0.0727273,,,SAMUEL_HIEL_BOBROW_nan_SUNY Downstate Medical ...,a3df442a-a441-4567-afb8-a38fb08e29b0,1969,1968-1969
6,,1964-1973 associates.XLS,398,,1972.0,William,,NIMH,1969-1970,San Francisco General Hospital,...,,BRADEN,III,0.322581,HARVARD UNIVERSITY,HARVARD,WILLIAM_nan_BRADEN_HARVARD_Harvard Medical Sch...,2ee1ff04-dc8b-4f00-912a-318026b9a75f,1970-1972,1969-1970
7,,1964-1973 associates.XLS,419,,1971.0,John,,NIAID,1970-1971,Hospital of University of Pennsylvania,...,CARL SUMMER,BREITNER,,0.188679,HARVARD UNIVERSITY,HARVARD,JOHN_CARL SUMMER_BREITNER_HARVARD_University o...,fc63f6a3-5110-4af1-a661-e571a16f16df,,1970-1971
8,,Associates alpha by institute.XLS,418,,,John,,,1970-1971,,...,CARL SUMMER,BREITNER,,0.188679,HARVARD UNIVERSITY,HARVARD,JOHN_CARL SUMMER_BREITNER_HARVARD_University o...,fc63f6a3-5110-4af1-a661-e571a16f16df,,1970-1971
9,,1964-1973 associates.XLS,55,,1973.0,John,,NCI,1972-1973,Duke Hospital,...,CHARLES,ALEXANDER,JR,0.186047,DUKE UNIVERSITY,DUKE,JOHN_C_ALEXANDER_nan_Duke University School of...,d8ec57d8-a909-4146-b376-d15f3adc9496,1972-1973,1972-1973


In [74]:
first_last_matches = pd.merge(left=not_matched_attendees, right=not_matched_apps, left_on=['clean_firstname', 'clean_lastname'], right_on=[
        'clean_first_name', 'clean_last_name'], how='inner')

In [76]:
# drop matches where middle names totally different or colleges totally different
first_last_matches[['res_dtes_x', 'res_dtes_yintern_dte_x', 'intern_dte_y' 'clean_middlename']]

KeyError: "['res_dtes' 'intern_dte'] not in index"

In [75]:
first_last_matches.shape

(170, 105)

In [None]:
fuzzy = pd.merge(left)

In [None]:
test2.loc[test2.clean_lastname=='ANDERSON', :]

In [None]:
# for med school, strip school of med and then check string sim
test2.loc[:, ['clean_firstname', 'dno', 'clean_middlename', 'uuid', 'clean_lastname']]
test2.loc[test2.duplicated(['uuid'], keep=False), ['medical_school', 'med_school', 'clean_firstname', 'dno', 'clean_middlename', 'clean_middle_name', 'uuid', 'clean_lastname']]

In [None]:
not_matched_apps2 = not_matched_apps.loc[~not_matched_apps.uuid.isin(test2.uuid), :]
not_matched_attendees2 = not_matched_attendees.loc[~not_matched_attendees.dno.isin(test2.dno), :]

In [None]:
# not_matched_apps2.loc[:, ['clean_last_name', 'clean_first_name', 'medical_school']]
not_matched_apps2.loc[not_matched_apps2.clean_first_name=='LOUIS', ['clean_last_name', 'clean_first_name', 'medical_school']]

In [None]:
not_matched_attendees2.loc[:, ['clean_lastname', 'clean_firstname', 'med_school']]

In [None]:
not_matched_attendees.sort_values(['clean_lastname', 'clean_firstname']).loc[:, ['clean_lastname', 'clean_firstname', 'clean_middlename', 'med_school']]

In [None]:
not_matched_apps.sort_values(['clean_last_name', 'clean_first_name']).loc[:, ['clean_last_name', 'clean_middle_name', 'clean_first_name', 'medical_school']]