In [66]:
# read in known applicant files, dedupe and try to merge with applicants file
from collections import Counter
import difflib
import uuid
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os

In [2]:
ASSOC_DATA_DIR = os.path.abspath('Data/applicant_data')
CARD_DATA_DIR = os.path.abspath('Data/raw_card_data')

print ASSOC_DATA_DIR
r1_file = '1964-1973 associates.XLS'
r2_file = 'Associates alpha by institute.XLS'
r3_file = 'Associates data.XLS'
r4_file = 'NIMH Associates Complete.XLS'
r5_file = 'NINDB Associates alpha by year.xls'
filenames = [r1_file, r2_file, r3_file, r4_file, r5_file]

/home/lraymond/MIT/Azoulay_2016/yellow_berets/yellow_beret/Data/applicant_data


In [3]:
file_df = map(lambda x: pd.read_excel(os.path.join(ASSOC_DATA_DIR, x)), filenames)



In [4]:
file_4_columns = [
    'dno', 'source', 'unknown', 'lastname', 'first_middle', 'institute', 'lab_brch', 
    'program', 'supervisor', 'eod_year', 'med_school', 'year_grad', 'intern_hos', 'intern_dte',
       'res_hosp', 'residency', 'res_dtes'
]

In [5]:
file_df[2].rename(columns={'lname':'lastname', 'fname': 'first_middle'}, inplace=True)
file_df[1].rename(columns={'lname':'lastname', 'fname': 'first_middle'}, inplace=True)

In [6]:
file_df[4].columns = file_4_columns
# for each files in the list, add a column to track source
for name, f in zip(filenames, file_df):
    f.loc[:, 'data_source'] = name
concat_df = pd.concat(file_df)

In [7]:
print sum(map(lambda x: x.shape[0], file_df)) == concat_df.shape[0]

True


In [8]:
# apply a basic string cleaning function to the names- removing all punctuation, changing to all uppercase

def trans_remov_punc(to_change, change_to):
    # removes specified punctuation using string maketrans (very fast, C lookups)
    #returns partially evaluated fnc
    trantab = string.maketrans(to_change, change_to)
    return funcy.func_partial(lambda x: x.translate(trantab))


def standardize_whitespace(pub_str):
    return ' '.join(filter(None, pub_str.split(' ')))


def remove_punc(pub_str):
    # function to remove punctuation
    nonelst = ' '*len(string.punctuation)
    fn = trans_remov_punc(string.punctuation, nonelst)
    pub_str2 = str(pub_str)
    new_str = fn(pub_str2)
    # standardize spaces
    return standardize_whitespace(new_str)


def clean_names(name):
    # if name is missing, return null
    if pd.isnull(name):
        return np.nan
    # uppercase 
    upp = name.upper()
    return remove_punc(upp)

    # pull off suffix in some last names into seperate column
def has_suffix(raw_last_name):
    # a boolean fnc to identify which rows may have a suffix
    last_lst = raw_last_name.split(' ')
    if len(last_lst) == 1:
        # if no white spaces in last name, only 1 word, so no suffix
        return False
    suffixes = ['JR', 'SR', 'I', 'II', 'III', 'IV', 'V', 'VI']
    # want to differentiate between suffix (JR, SR, I, II, III, IV, V) between last names with multiple parts (ex. st john)
    # check if last word in list
    return (last_lst[-1] in suffixes)

In [9]:
# college name standardization fnc
def clean_std_college_name(college_raw):
# need to change 'college to university' unless Boston college or BU remove ANDS, AT, THE expand UCLA to UCAL, UC Davis etc.
# remove mispellings 'collge', UNIVERWSITY, MASSACHUSSETTS 
# RENSSELAER UNIVERSITY and RENSSELAER POLYTECHNICAL INSTITUTE to RENSSELAER POLYTECHNIC INSTITUTE
# JOHN HOPKINS UNIVERSITY to JOHNS
# HOLY CROSS COLLEGE to HOLY CROSS
# FRANKLIN MARSHALL COLLEGE' to FRANKLIN MARSHALL
# DE PAUW to DEPAUW
# ASBURY ASHBURY 
# DREXEL INSTITUTE OF TECHNOLOGY to DREXEL UNIVERSITY
# A B BROWN UNIVERSITY to Brown
# DARTMOUTH MEDICAL SCHOOL to Dartmouth
# remove random 1961 at the end of strings, (anything after university unless univ is the first word)
    if pd.isnull(college_raw):
        return np.nan
    # if AT or AND or THE, remove
    to_remove = [
        ' AND ', ' AT ', 'THE ', ' COLLGE', 'UNIVERISTY', 'UNIVERWSITY', 'MASSACHUSSETTS', 'JOHN ', 'DE PAUW', 'ASBURY', 
    'DREXEL INSTITUTE OF TECHNOLOGY', 'A B BROWN UNIVERSITY', 'DARTMOUTH MEDICAL SCHOOL', 'RENSSELAER UNIVERSITY', 
    'RENSSELAER POLYTECHNICAL INSTITUTE', ' STE', 'COLLEGE OF HOLY CROSS', 'HOLLY CROSS', 'JOHNSS ',  'BERKLEY',
    'UC ', 'PITTSBURRGH', 'WESLYN', 'WILLAMS', 'GEORGIA TECH', 'NEW YORK UNIVERSITY UNIV', 
    'UNIVERSITY OF MICHIGAN IS A', 'OHIO', 'STATE UNIVERSITY OF NEW YORK AT BUFFALO']
    to_replace = [
        ' ', ' ', ' ', ' COLLEGE', 'UNIVERSITY', 'UNIVERSITY', 'MASSACHUSETTS', 'JOHNS ', 'DEPAUW', 'ASHBURY',
        'DREXEL UNIVERSITY', 'BROWN UNIVERSITY', 'DARTMOUTH', 'RENSSELAER POLYTECHNIC INSTITUTE', 
        'RENSSELAER POLYTECHNIC INSTITUTE', ' STATE', 'HOLY CROSS', 'HOLY CROSS', 'JOHNS ', 
        ' BERKELEY', 'UNIVERSITY OF CALIFORNIA ', 'PITTSBURGH', 'WESLEYAN', 'WILLIAMS', 
        'GEORGIA INSTITUTE OF TECHNOLOGY', 'NEW YORK', 'UNIVERSITY OF MICHIGAN', 'OHIO STATE', 'SUNY BUFFALO']
    word_pairs = zip(to_remove, to_replace)
    # list of words to replace
    words_in_str = filter(lambda (x, y): x in college_raw, word_pairs)
    trans_word = college_raw
    for to_remove_wrd, to_replace_wrd in words_in_str:
        trans_word = trans_word.replace(to_remove_wrd, to_replace_wrd)
        
    # after replacing the mispellings and removing and/at, remove everything after college/university
    if 'BOSTON' in trans_word:
        # then this string is BC or BU, so just return string
        return trans_word
    if trans_word.find('UNIVESITY ') == 0:
        return trans_word
    split_wrd = ' UNIVERSITY'
    if 'COLLEGE' in trans_word:
        split_wrd = ' COLLEGE'
    base_word = trans_word.split(split_wrd)[0]
    return standardize_whitespace(base_word)

In [10]:
def strip_first_middle(raw_str):
    # looks like first middle coded as first middle or first, middle or event first, middle initial suffix
    if pd.isnull(raw_str):
        return pd.Series({'firstname2': np.nan, 'middlename2': np.nan, 'suffix': np.nan})
    split_space = raw_str.split(' ') 
    find_per = raw_str.find('.')
    find_comma = raw_str.find(',')
    
    if len(split_space) == 1 and find_per == -1 and find_comma == -1:
        # just a one word sequence with first name
        return pd.Series({'firstname2': raw_str, 'middlename2': np.nan, 'suffix': np.nan})
    clean_str = remove_punc(raw_str)
    lst_clean_str = clean_str.split(' ')
    middle = lst_clean_str[1] if len(lst_clean_str) > 1 else np.nan
    suffix = lst_clean_str[2] if len(lst_clean_str) > 2 else np.nan
    return pd.Series({'firstname2': lst_clean_str[0], 'middlename2': middle, 'suffix': suffix})

In [11]:
# now we have all the associates, sep first middle into first and middle name, then sort and check 
# to see if we have any duplicates
df2 = pd.concat([concat_df, concat_df.loc[:, 'first_middle'].apply(strip_first_middle)], axis=1)
# consolidate firstname columns
df2.loc[~pd.isnull(df2.first_middle), 'firstname'] = df2.loc[~pd.isnull(df2.first_middle), 'firstname2']
df2.loc[pd.isnull(df2.middlename), 'middlename'] = df2.loc[pd.isnull(df2.middlename), 'middlename2']

In [12]:
df3 = df2.drop(['first_middle', 'firstname2', 'middlename2'], axis=1)

In [13]:
# dropnow where both first and last name are missing
df3 = df3.dropna(subset=['firstname', 'lastname'], how='all')

In [14]:
# df3.dropna(subset=['firstname', 'lastname'], how='all').loc[:, ['firstname', 'lastname', 'dno', 'data_source']]
df3.dropna(subset=['firstname', 'lastname'], how='all').loc[:, 'data_source'].unique()


array(['1964-1973 associates.XLS', 'Associates alpha by institute.XLS',
       'Associates data.XLS', 'NIMH Associates Complete.XLS',
       'NINDB Associates alpha by year.xls'], dtype=object)

In [15]:
df3_sorted = df3.sort_values(by=['dno'])

In [16]:
df3_unique = df3.drop_duplicates('dno')

In [17]:
df3_unique.loc[:, 'clean_firstname'] = df3_unique['firstname'].apply(clean_names)
df3_unique.loc[:, 'clean_middlename'] = df3_unique['middlename'].apply(clean_names)
df3_unique.loc[:, 'clean_lastname'] = df3_unique['lastname'].apply(clean_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [18]:
# it seems that dno does refer to unique person, so drop dups based on that 
# save this unique to pick
df3_unique.to_pickle(os.path.join(ASSOC_DATA_DIR, 'unique_attendees.p'))

In [19]:
# to csv
df3_unique.to_csv(os.path.join(ASSOC_DATA_DIR, 'unique_attendees.csv'))

In [20]:
# import applicants file and try to merge with attendees
# interested to see how many applicants were NOT accepted
apps = pd.read_pickle(os.path.join(CARD_DATA_DIR, 'unique_applicants.p'))

In [21]:
# rename columns in df3 to match
# change residency and internship dates to be YYYY-YYYY instead of YYYY-YY
def long_form_date(dt_str):
    if pd.isnull(dt_str):
        return dt_str
    m = re.match(r'(\d{4})-(\d{2})', dt_str)
    if m:
        g = m.groups()
        return '{0}-19{1}'.format(g[0], g[1])
    m = re.match(r'(\d{4})', dt_str)
    if m:
        return dt_str
    print dt_str
    return np.nan
    

In [22]:
apps['res_dates'] = apps['residency_year(s)'].apply(long_form_date)

"Open"


In [23]:
apps['intern_dates'] = apps['internship_year(s)'].apply(long_form_date)

Str. Medicine
Medicine


In [24]:
df3_unique.rename(columns={'res_dtes': 'res_dates', 'intern_dte': 'intern_dates', 'res_hosp': 'residency_hospital', 
                          'intern_hos': 'internship_hospital', 'clean_middlename': 'clean_middle_name', 
                          'clean_firstname': 'clean_first_name', 'clean_lastname': 'clean_last_name', 
                          'med_school': 'medical_school'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [31]:
apps.rename(columns={'residency_type': 'residency', 'internship_hospital_1': 'internship_hospital'}, inplace=True)

In [32]:
NAME_COLS = ['clean_middle_name', 'clean_last_name', 'clean_first_name']

In [33]:
MED_TRAINING_COLS = ['res_dates', 'intern_dates', 'residency_hospital', 'internship_hospital', 'medical_school', 'residency']

In [34]:
# apply string cleaning to each of the medical training info cols
for c in ['medical_school', 'residency_hospital', 'internship_hospital', 'residency', 'institute']:
    df3_unique.loc[:, c] = df3_unique[c].apply(clean_names)

In [37]:
# apply string cleaning to each of the medical training info cols
for c in ['medical_school', 'residency_hospital', 'internship_hospital', 'residency']:
    apps.loc[:, c] = apps[c].apply(clean_names)

In [68]:
# create counter objects for each data set that count the number of times the last name occurs in either data set
attendees_counter = Counter(df3_unique.clean_last_name)

In [71]:
apps_counter = Counter(apps.clean_last_name)

In [72]:
apps_counter['BROWN']

23

In [183]:
def clean_med_school(raw_med_school):
    # cleanup med school string so we can do useful string comparison on it
    # if contains COLUMBIA, return COLUMBIA
    # remove
    if pd.isnull(raw_med_school):
        return raw_med_school
    
    # after replacing the mispellings and removing and/at, remove everything after college/university
    if 'BOSTON UNIVERSITY' in raw_med_school:
        # then this string is BC or BU, so just return string
        return 'BOSTON UNIVERSITY'
    
    to_remove = [
        ' AND ', ' AT ', 'THE ', 'STATE UNIVERSITY OF NEW YORK',
        'STATE UNIVERSITY OF NEW YORK AT BUFFALO',  
        'STATE UNIVERSITY OF N Y', 'N Y UNIVERSITY', 'WASH ', 'MICH ', 'FO ',
        ' COLLGE', 'UNIVERISTY', 'UNIVERWSITY', 'BERKLEY',  'NEW YORK UNIVERSITY UNIV', 
        'STATE UNIVERSITY OF NEW YORK AT BUFFALO', 'UCSF', 'UCD', 'UNIVERSITY OF CALIFORNIA SF', 'NYU', 
        ' UNIVERSITY SCHOOL OF MEDICINE', ' UNIVERSITY SCHOOL OF MED', 
        ' UNIVERSITY OF MEDICINE', ' COLLEGE OF MEDICINE', ' SCHOOL OF MEDICINE', 'MEDICAL COLLEGE OF ', 
      ' MEDICAL SCHOOL', ' HEALTH SCIENCES CENTER', ' PRITZKER', ' MEDICAL']
        
    to_replace = [
        ' ', ' ', ' ', 'SUNY', 
        'SUNY BUFFALO', 'SUNY','NEW YORK UNIVERSITY',
        'WASHINGTON ', 'MICHIGAN ', 'OF ',
        ' COLLEGE', 'UNIVERSITY', 'UNIVERSITY', 'UNIVERSITY OF CALIFORNIA BERKELEY',
        'NEW YORK UNIVERSITY', 'SUNY BUFFALO', 'UNIVERSITY OF CALIFORNIA SAN FRANCISCO', 
        'UNIVERSITY OF CALIFORNIA DAVIS', 'UNIVERSITY OF CALIFORNIA SAN FRANCISCO', 'NEW YORK UNIVERSITY',
        ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
    word_pairs = zip(to_remove, to_replace)
    # list of words to replace
    words_in_str = filter(lambda (x, y): x in raw_med_school, word_pairs)
    trans_word = raw_med_school
    for to_remove_wrd, to_replace_wrd in words_in_str:
        trans_word = trans_word.replace(to_remove_wrd, to_replace_wrd)
    if 'COLUMBIA' in trans_word or ('PHYSICIANS' in trans_word and 'SURGEONS' in trans_word) or 'P S ' in trans_word:
        return 'COLUMBIA'
    if 'CORNELL ' in trans_word:
        return 'CORNELL'
    if 'ALBERT EINSTEIN' in trans_word:
        return 'ALBERT EINSTEIN'
    if 'THOMAS JEFFERSON' in trans_word:
        return 'THOMAS JEFFERSON'
    # search for various SUNY
    # search for various UC schools
    if trans_word.find(' ') == 0:
        return trans_word
    split_wrd = ' UNIVERSITY'
    if 'COLLEGE' in trans_word:
        split_wrd = ' COLLEGE'
    base_word = trans_word.split(split_wrd)[0]
    return standardize_whitespace(base_word)
    
        

In [184]:
apps.loc[:, 'clean_medical_school'] = apps.medical_school.apply(clean_med_school)
df3_unique.loc[:, 'clean_medical_school'] = df3_unique.medical_school.apply(clean_med_school)

In [185]:
exact_name_matches = pd.merge(left=df3_unique, right=apps, left_on=['clean_first_name', 'clean_middle_name', 'clean_last_name'], right_on=[
        'clean_first_name', 'clean_middle_name', 'clean_last_name'], how='inner')

In [186]:
not_matched_apps = apps.loc[~apps.uuid.isin(exact_name_matches.uuid), :]
not_matched_attendees = df3_unique.loc[~df3_unique.dno.isin(exact_name_matches.dno), :]

first_last_matches = pd.merge(left=not_matched_attendees, right=not_matched_apps, left_on=['clean_first_name', 'clean_last_name'], right_on=[
        'clean_first_name', 'clean_last_name'], how='inner')

In [187]:
# for the first and last matches, where the last name only occurs 1x in each data set, set confidence flag to 1
first_last_matches.loc[:, 'last_name_counts'] = first_last_matches.clean_last_name.apply(
    lambda x: apps_counter[x] + attendees_counter[x])

In [192]:
def define_med_school_junk(seq_elem):
    # difflib sequence matcher first element can take a fnc that inputs a
    # sequence element and returns True if it should be considered Junk
    return seq_elem in ['MEDICAL', 'SCHOOL', 'UNIVERSITY', 'COLLEGE', 'OF', 'THE', 'MEDICINE', 'CENTER', 'DENTISTRY']

In [193]:
def str_sim_fnc(row, index1, index2, junk_fnc=None):
    has_null = any(map(lambda x: pd.isnull(x), row.values))
    if has_null:
        return np.nan
    return difflib.SequenceMatcher(
        junk_fnc, row[index1], row['clean_medical_school_y']).ratio()

In [194]:
get_str_sim = funcy.rpartial(str_sim_fnc, 'clean_medical_school_x', 'clean_medical_school_y', define_med_school_junk)


In [195]:
first_last_matches.loc[:, 'med_school_sim'] = first_last_matches[[
        'clean_medical_school_x', 'clean_medical_school_y']].apply(get_str_sim, axis=1)

In [216]:
def get_years(dt_str):
    # return int date strings
    try:
        single_year = (dt_str.find('-') == -1)
        if single_year:
            return [dt_str]
        return dt_str.split('-')
    except ValueError as e:
        print dt_str
        return np.nan

def get_dts_sim(row, name_str):
    has_null = any(map(lambda x: pd.isnull(x), row.values))
    if has_null:
        return np.nan
    dt1 = row['{}_x'.format(name_str)]
    dt2 = row['{}_y'.format(name_str)]
    dates_tup1 = get_years(dt1)
    dates_tup2 = get_years(dt2)
    # if dates match exactly, return 1
    unique_dts = set(funcy.concat(dates_tup1, dates_tup2))
    return (len(unique_dts) < (len(dates_tup1) + len(dates_tup2)))
    

In [217]:
get_intern_dts_sim = funcy.rpartial(get_dts_sim, 'intern_dates')

In [218]:
first_last_matches.loc[:, 'internship_sim'] = first_last_matches[[
        'intern_dates_x', 'intern_dates_y']].apply(get_intern_dts_sim, axis=1)

In [233]:
# drop all first and last name matches where the med school sim < .6 
first_last_matches.loc[(first_last_matches['last_name_counts'] < 2), 'match_score'] = 1
first_last_matches.loc[((first_last_matches['last_name_counts'] > 1) & (
            first_last_matches['med_school_sim'] > .6) & (first_last_matches['internship_sim']==True)), 'match_score'] = 1
first_last_matches.loc[((first_last_matches['last_name_counts'] > 1) & (
            first_last_matches['med_school_sim'] > .6) & (pd.isnull(first_last_matches['internship_sim']))), 'match_score'] = 1

In [234]:
first_last_matches.loc[pd.isnull(first_last_matches.match_score), 'match_score'] = 0

In [219]:
# drop all people without a good match score


In [220]:
first_last_matches.loc[first_last_matches['clean_last_name']=='LARSON', ['medical_school_x', 'medical_school_y']]

Unnamed: 0,medical_school_x,medical_school_y
59,STATE UNIVERSITY OF NEW YORK AT BUFFALO SCHOOL...,SUNY BUFFALO SCHOOL OF MEDICINE BIOMEDICAL SCI...


In [221]:
first_last_matches.loc[(first_last_matches['last_name_counts'] > 2) & (first_last_matches['med_school_sim'] < .6), [
        'clean_first_name', 'clean_last_name', 'clean_middle_name_x', 'clean_middle_name_y', 
        'clean_medical_school_x', 'clean_medical_school_y', 'med_school_sim', 
        'intern_dates_x', 'intern_dates_y', 'internship_sim']]

Unnamed: 0,clean_first_name,clean_last_name,clean_middle_name_x,clean_middle_name_y,clean_medical_school_x,clean_medical_school_y,med_school_sim,intern_dates_x,intern_dates_y,internship_sim
12,JAMES,BROWN,EDWARD,KINGSBURY,YALE,UNIVERSITY OF ROCHESTER DENTISTRY,0.108108,1966-1967,1967-1968,True
18,JOHN,DAVIS,,LUCIAN,YALE,VANDERBILT,0.285714,1960-1961,1960-1961,True
28,TERRANCE,FISHER,N,,MARQUETTE,OTHER,0.142857,1962-1963,1962-1963,True
32,ROBERT,GORDON,,DANA,YALE,CORNELL,0.181818,1969-1970,1971-1972,False
47,JOHN,JOHNSON,DAVID,WILCOX,STANFORD,UNIVERSITY OF TENNESSEE HEALTH SCIENCE CENTER,0.188679,1965-1966,1965-1966,True
48,JOHN,JOHNSON,S,WILCOX,VANDERBILT,UNIVERSITY OF TENNESSEE HEALTH SCIENCE CENTER,0.218182,1961-1962,1965-1966,False
55,ROBERT,KRAMER,JEFFERY,JEFFREY,HARVARD,UNIVERSITY OF WISCONSIN,0.066667,,1971,
59,HAROLD,LARSON,ELLIOTT,ELLIOT,SUNY BUFFALO,SUNY BUFFALO BIOMEDICAL SCIENCES,0.545455,1965-1966,1965-1966,True
64,ROBERT,MARCUS,ALAN,MORTON,STANFORD,UNIVERSITY OF CINCINNATI,0.1875,1966-1967,1966-1967,True
65,DAVID,MARTIN,HUBERT,WILLIAM,HARVARD,DUKE,0.181818,1969-1970,1969-1970,True


In [167]:
IMP_COLS = list(NAME_COLS+MED_TRAINING_COLS)
print IMP_COLS
first_last_matches[['clean_first_name', 'clean_last_name']]

['clean_middle_name', 'clean_last_name', 'clean_first_name', 'res_dates', 'intern_dates', 'residency_hospital', 'internship_hospital', 'medical_school', 'residency']


Unnamed: 0,clean_first_name,clean_last_name
0,ROBERT,ASHMAN
1,FLOYD,ATKINS
2,RICHARD,BENDER
3,JOHN,BILEZIKIAN
4,THOMAS,BLANCK
5,SAMUEL,BOBROW
6,WILLIAM,BRADEN
7,JOHN,BREITNER
8,JOHN,BREITNER
9,JOHN,ALEXANDER


In [58]:
# for the columns in common, consolidate information
def consolidate_info(x, y):
    # take the non missing value
    # if both not missing, take the longest value, if string
    # not consolidating non string columns, so shouldn't hit other cases
    non_nan_vals = funcy.remove(lambda v: pd.isnull(v), [x, y])
    if not non_nan_vals:
        return np.nan
    if len(non_nan_vals) == 1:
        return non_nan_vals[0]
    # if both exist sort by len
    return sorted(non_nan_vals, key=lambda x: len(x), reverse=True)[0]

In [45]:
cols_to_consolidate = sorted(filter(lambda x: x.endswith('_y') or x.endswith('_x'), exact_name_matches.columns))

In [46]:
exact_name_matches

[u'citizenship_x',
 'citizenship_y',
 'intern_dates_x',
 'intern_dates_y',
 'internship_hospital_x',
 'internship_hospital_y',
 'medical_school_x',
 'medical_school_y',
 'res_dates_x',
 'res_dates_y',
 'residency_hospital_x',
 'residency_hospital_y',
 'residency_x',
 'residency_y']

In [50]:
funcy.partition(2, cols_to_consolidate)

[[u'citizenship_x', 'citizenship_y'],
 ['intern_dates_x', 'intern_dates_y'],
 ['internship_hospital_x', 'internship_hospital_y'],
 ['medical_school_x', 'medical_school_y'],
 ['res_dates_x', 'res_dates_y'],
 ['residency_hospital_x', 'residency_hospital_y'],
 ['residency_x', 'residency_y']]

In [None]:
list(first_last_matches.columns)


In [None]:
med_cols = ['intern_hos', 'intern_dte_x', 'intern_dte_y', 'residency', 'res_hosp', 'res_dtes_x', 'res_dtes_y', 'residency_hospital',
'medical_school', 'med_school', 'clean_med_school', 'clean_college_trans']


name_cols = ['clean_middlename', 'clean_middle_name', 'clean_first_name', 'clean_firstname', 'clean_last_name', 'clean_lastname']

In [None]:
first_last_matches[name_cols + df3_cols].head()

In [None]:
# drop matches where middle names totally different or colleges totally different
first_last_matches[['res_dtes_x', 'res_dtes_yintern_dte_x', 'intern_dte_y' 'clean_middlename']]

In [None]:
first_last_matches.shape

In [None]:
fuzzy = pd.merge(left)

In [None]:
test2.loc[test2.clean_lastname=='ANDERSON', :]

In [None]:
# for med school, strip school of med and then check string sim
test2.loc[:, ['clean_firstname', 'dno', 'clean_middlename', 'uuid', 'clean_lastname']]
test2.loc[test2.duplicated(['uuid'], keep=False), ['medical_school', 'med_school', 'clean_firstname', 'dno', 'clean_middlename', 'clean_middle_name', 'uuid', 'clean_lastname']]

In [None]:
not_matched_apps2 = not_matched_apps.loc[~not_matched_apps.uuid.isin(test2.uuid), :]
not_matched_attendees2 = not_matched_attendees.loc[~not_matched_attendees.dno.isin(test2.dno), :]

In [None]:
# not_matched_apps2.loc[:, ['clean_last_name', 'clean_first_name', 'medical_school']]
not_matched_apps2.loc[not_matched_apps2.clean_first_name=='LOUIS', ['clean_last_name', 'clean_first_name', 'medical_school']]

In [None]:
not_matched_attendees2.loc[:, ['clean_lastname', 'clean_firstname', 'med_school']]

In [None]:
not_matched_attendees.sort_values(['clean_lastname', 'clean_firstname']).loc[:, ['clean_lastname', 'clean_firstname', 'clean_middlename', 'med_school']]

In [None]:
not_matched_apps.sort_values(['clean_last_name', 'clean_first_name']).loc[:, ['clean_last_name', 'clean_middle_name', 'clean_first_name', 'medical_school']]