In [27]:
# code to search for people in NIH directories
import pandas as pd
import numpy as np
import os
from collections import Counter
import funcy
from fuzzywuzzy import fuzz


DIR = '/Users/lrraymond13/MIT/Azoulay_RA_2016/Data/'
NIH_directory_filename = 'cleaned_NIH_directory.xlsx'
sample_filename = 'fuzzy_all_apps_plus_NIH_info.csv'

In [90]:
def lname_sim_fnc(control_lname, x):
    if pd.isnull(x) or isinstance(x, float):
        return False
    return fuzz.ratio(control_lname, x) > 80


def fname_sim_fnc(control_fname, x):
    if pd.isnull(x) or pd.isnull(control_fname) or isinstance(x, float):
        return False
    return x[0].upper() == control_fname[0].upper()


def search_for_control(to_search_df, control_row, index_num, lname_sim_fnc=lname_sim_fnc, fname_sim_fnc=fname_sim_fnc):
    # in the NIH directory, look
    year_grad = control_row['medschool_year_grad']
    lname = control_row['clean_last_name']
    fname = control_row['clean_first_name']
    # subset to search df
    if pd.isnull(year_grad):
        small_df = to_search_df.copy()
    else:
        small_df = to_search_df.loc[
            (to_search_df.year>= year_grad) & (
                to_search_df.year<=year_grad+4), :]
    print small_df.shape
    lname_mask = small_df['last'].apply(lambda x: lname_sim_fnc(lname, x))
    print small_df.loc[lname_mask, :].shape
    fname_mask = small_df['first'].apply(lambda x: fname_sim_fnc(fname, x))
    print small_df.loc[fname_mask, :].shape
    matches = small_df.loc[lname_mask & fname_mask, :]
    print matches.shape
    matches['matches_found'] = matches.shape[0]
    matches['control_first'] = fname
    matches['control_middle'] = control_row['clean_middle_name']
    matches['control_last'] = lname
    matches['control_medschool_year_grad'] = year_grad
    matches['control_undergrad_year_grad'] = control_row['undergrad_year_grad'] 
    matches['control_undergraduate_school'] = control_row['undergraduate_school'] 
    matches['control_flag'] = control_row['control_flag']
    matches['person_uuid'] = control_row['person_uuid']
    matches['internship_hospital'] = control_row['internship_hospital']
    matches['residency_hospital'] = control_row['residency_hospital']
    matches['residency'] = control_row['residency']
    matches['institute'] = control_row['institute']
    matches['internship_dates'] = control_row['internship_dates']
    matches['residency_dates'] = control_row['residency_dates']
    matches['medical_school'] = control_row['medical_school']
    matches['year_accepted'] = control_row['year_accepted']
    matches['rejection_date'] = control_row['rejection_date']
    matches['rejected'] = control_row['rejected']
    matches['withdrawal'] = control_row['withdrawal']
    matches['dno'] = control_row['dno']
    matches['index'] = index_num
    return matches

def create_empty_match_df(control_row, index_num):
    matches = pd.DataFrame({'matches_found': [0], 'index': [index_num]})
    matches['control_first'] = control_row['clean_first_name']
    matches['control_middle'] = control_row['clean_middle_name']
    matches['control_last'] = control_row['clean_last_name']
    matches['control_medschool_year_grad'] = control_row['medschool_year_grad']
    matches['control_undergrad_year_grad'] = control_row['undergrad_year_grad'] 
    matches['control_undergraduate_school'] = control_row['undergraduate_school'] 
    matches['control_flag'] = control_row['control_flag']
    matches['person_uuid'] = control_row['person_uuid']
    matches['internship_hospital'] = control_row['internship_hospital']
    matches['residency_hospital'] = control_row['residency_hospital']
    matches['residency'] = control_row['residency']
    matches['institute'] = control_row['institute']
    matches['internship_dates'] = control_row['internship_dates']
    matches['residency_dates'] = control_row['residency_dates']
    matches['medical_school'] = control_row['medical_school']
    matches['year_accepted'] = control_row['year_accepted']
    matches['rejection_date'] = control_row['rejection_date']
    matches['rejected'] = control_row['rejected']
    matches['withdrawal'] = control_row['withdrawal']
    matches['dno'] = control_row['dno']
    return matches
    


def process_control_dir_matches(search_df, NIH):
    match_res = []
    for ind, row in NIH.iterrows():
        search_df2 = search_for_control(search_df, row, ind)
        if search_df2.shape[0] == 0 or search_df2 is None:
            search_df2 = create_empty_match_df(row, ind)
        match_res.append(search_df2.set_index('index'))
    return pd.concat(match_res, axis=0)

In [91]:
sample = pd.read_csv(os.path.join(DIR, sample_filename))
directory = pd.read_excel(os.path.join(DIR, NIH_directory_filename))

In [100]:
# create counter of the least common names
lc_names = Counter(sample[(
            sample.control_flag==0) & (sample.time_period_flag==1)].clean_last_name.values) 

In [101]:
last_100 = lc_names.most_common()[-100:]
lst_last_100 = map(funcy.first, last_100)

In [106]:
least_common_df = sample.loc[
    (sample.clean_last_name.isin(lst_last_100)) & (
        sample.control_flag==0) & (sample.time_period_flag==1), :]



In [107]:
least_common_df.shape

(100, 109)

In [108]:
res = process_control_dir_matches(directory, least_common_df)

(58269, 10)
(20, 10)
(4794, 10)
(9, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

(66586, 10)
(69, 10)
(5699, 10)
(10, 10)
(64995, 10)
(0, 10)
(7587, 10)
(0, 10)
(66586, 10)
(34, 10)
(6726, 10)
(0, 10)
(55425, 10)
(73, 10)
(5559, 10)
(7, 10)
(59602, 10)
(9, 10)
(1305, 10)
(2, 10)
(64995, 10)
(5, 10)
(6562, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(64995, 10)
(2, 10)
(1513, 10)
(2, 10)
(45955, 10)
(5, 10)
(2547, 10)
(1, 10)
(63343, 10)
(3, 10)
(2666, 10)
(3, 10)
(64995, 10)
(27, 10)
(3795, 10)
(1, 10)
(64995, 10)
(37, 10)
(3203, 10)
(2, 10)
(60806, 10)
(23, 10)
(2019, 10)
(0, 10)
(68120, 10)
(37, 10)
(3552, 10)
(3, 10)
(64995, 10)
(4, 10)
(5565, 10)
(4, 10)
(66586, 10)
(0, 10)
(4450, 10)
(0, 10)
(58269, 10)
(19, 10)
(3265, 10)
(2, 10)
(59602, 10)
(19, 10)
(3372, 10)
(3, 10)
(55425, 10)
(30, 10)
(6497, 10)
(3, 10)
(41660, 10)
(0, 10)
(4855, 10)
(0, 10)
(41660, 10)
(1, 10)
(1634, 10)
(1, 10)
(66586, 10)
(2, 10)
(5699, 10)
(2, 10)
(63343, 10)
(3, 10)
(4219, 10)
(3, 10)
(59602, 10)
(13, 10)
(4974, 10)
(8, 10)
(59602, 10)
(27, 10)
(3378, 10)
(5, 10)
(55425, 10)
(1, 10)

In [113]:
res2 = res.sort_values(['clean', 'year', 'last', 'first', 'middle'])

In [114]:
res2.shape

(304, 31)

In [118]:
cols = [
    'control_first', 'control_middle', 'control_last', 'year_accepted',
    'year', 'first', 'middle', 'last', 'matches_found', 
     'residency_dates', 'internship_dates', 'control_medschool_year_grad',
    'internship_hospital', 'residency', 'residency_hospital',
    'medical_school','dno', 'control_undergrad_year_grad', 
    'control_undergraduate_school', 'institute', 'control_flag',
    'name', 'person_uuid', 'rejected', 'rejection_date', 'sheet',
    'suffix', 'unicode_flag', 'withdrawal']
  

In [119]:
res2 = res2[cols]

In [121]:
sample.loc[sample.clean_last_name=='DELFS', :]

Unnamed: 0,dno,person_uuid,application_year_min,application_year_max,eod_year,clean_first_name,clean_middle_name,clean_last_name,control_flag,time_period_flag,...,ssn,state,supervisor,teaching,to_drop,undergrad_year_grad,undergraduate_school,withdrawal,year_grad,zip_code
805,,1878.0,1973.0,1973.0,,JOHN,ROBERT,DELFS,0,1,...,,Texas,,1.0,False,1969.0,Tulane University,-9.0,,79109.0


In [122]:

res2.to_excel(os.path.join(DIR, 'control_sample_directory_check.xlsx'))

In [123]:
full_results = process_control_dir_matches(directory, sample)


(63343, 10)
(9, 10)
(5413, 10)
(3, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

(58269, 10)
(13, 10)
(2551, 10)
(4, 10)
(139266, 10)
(3, 10)
(16211, 10)
(0, 10)
(66586, 10)
(3, 10)
(3666, 10)
(0, 10)
(11544, 10)
(6, 10)
(577, 10)
(0, 10)
(58269, 10)
(25, 10)
(4794, 10)
(1, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(58269, 10)
(0, 10)
(5944, 10)
(0, 10)
(58269, 10)
(2, 10)
(2712, 10)
(2, 10)
(64995, 10)
(8, 10)
(5565, 10)
(0, 10)
(59602, 10)
(4, 10)
(2687, 10)
(0, 10)
(63343, 10)
(3, 10)
(3705, 10)
(3, 10)
(68120, 10)
(15, 10)
(3725, 10)
(2, 10)
(55425, 10)
(10, 10)
(4696, 10)
(0, 10)
(58269, 10)
(16, 10)
(1079, 10)
(1, 10)
(66586, 10)
(9, 10)
(7814, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(66586, 10)
(1, 10)
(4450, 10)
(0, 10)
(55425, 10)
(2, 10)
(1151, 10)
(2, 10)
(11544, 10)
(16, 10)
(1185, 10)
(2, 10)
(66586, 10)
(88, 10)
(2723, 10)
(5, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(63343, 10)
(44, 10)
(1344, 10)
(2, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(64995, 10)
(28, 10)
(5565, 10)
(0, 10)
(63343, 10)
(27, 10)
(5413, 10)
(0, 10)
(64995, 10)
(28, 10)
(3203, 10)
(2, 10)
(6

In [124]:
full_results2 = full_results[cols]
full_results2.to_excel(os.path.join(DIR, 'full_sample_directory_search.xlsx'))