In [1]:
# import and merge NIH applicants with AAMC data to get AAMC unique id and use info to fill in name information
from collections import Counter
import funcy
from fuzzywuzzy import fuzz
import numpy as np 
import pandas as pd 
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, ATT_DATA_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES, FEMALE_MIDDLE_NAMES, 
    PICKLE_DIR, AAMC_DATA_DIR)

from merging_functions import *

OUTPUT_CSV = False 

PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']


# load autoreload extension
%load_ext autoreload
%autoreload 2

In [2]:
print AAMC_DATA_DIR

~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/aamc_data


In [3]:
# import aamc data file
aamc = pd.read_csv(os.path.join(AAMC_DATA_DIR, 'aamc_deduped_raw.csv'))

In [4]:
# read in applicant/nih merged data set
apps = pd.read_csv(os.path.join(APP_DATA_DIR, 'fuzzy_all_apps_plus_NIH_info.csv' ))

In [5]:
# add middle_initial column

In [6]:
# try to merge on first, middle initial and last name
apps2 = apps.sort_values(['clean_first_name', 'clean_middle_initial', 'clean_last_name', 'medschool_year_grad'])

aamc2 = aamc.sort_values(['clean_first_name', 'clean_middle_initial', 'clean_last_name', 'degree_year_1'])
aamc2.loc[:, 'medical_school'] = aamc2.degree_inst_1_desc.apply(clean_med_school)

In [7]:
aamc_counter = Counter(aamc2.clean_last_name.values)
apps_counter = Counter(apps2.clean_last_name.values)
apps2['last_name_counts'] = apps2.clean_last_name.apply(lambda x: apps_counter[x])
aamc2['last_name_counts'] = aamc2.clean_last_name.apply(lambda x: apps_counter[x])

In [8]:
aamc3 = aamc2[aamc2.last_name_counts>0]

In [9]:
aamc3['fuzzy_merge_col'] = aamc3[
    ['clean_first_name', 'clean_middle_initial', 'clean_last_name']].apply(create_str_merge, axis=1)
apps2['fuzzy_merge_col'] = apps2[
    ['clean_first_name', 'clean_middle_initial', 'clean_last_name']].apply(create_str_merge, axis=1)
# match1 = df_get_closest_matches(apps2, aamc2.iloc[:500,:], 'fuzzy_merge_col', suffixes=['_x', '_y']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [10]:
aamc3['clean_first_initial'] = aamc3.clean_first_name.apply(lambda x: np.nan if pd.isnull(x) else x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
match1 = pd.merge(left=apps2, right=aamc3, on='fuzzy_merge_col', how='inner')

In [12]:
def check_match(row):
    if row['dob_app_diff'] < 20 or row['dob_app_diff'] > 30:
        return 0
    if row['medical_school_sim'] < 40:
        return 0
    if row['clean_first_initial_x'] == row['clean_first_initial_y']:
        return 1
    if row['clean_first_name_sim']< 60:
        return 0
    return 1 

In [13]:
def get_dob_app_diff(row):
    return row['application_year'] - row['yobb']

In [14]:
match1['dob_app_diff'] = match1.apply(get_dob_app_diff, axis=1)

feature_dict = {
    'clean_first_name': get_name_str_sim,
    'clean_middle_name': get_name_str_sim,
    'clean_last_name': get_name_str_sim,
    'medical_school': get_name_str_sim,
}

match2 = add_similarity_features(match1, feature_dict, check_match, suffixes=['_x', '_y'])

def filter_one_match_per_group(df, dedupe_col, sim_cols):
    # to merge cols should be a dict the names of the extra cols to merge in
    # values should be col names to rename
    # sim cols should be name of the columns to use as features
    # sim mask should be mask that accounts as actual mask
    # dedupe col is name of col to dedupe on

    def count_matches(id_list_arr):
        # for each id, make sure matched on 1x in data set
        # should be applied with rolling apply so takes in a dataframe and must return single value
        # unpack already matched ids from string
        current_id1 = id_list_arr[-1]
        other_matches = id_list_arr[:-1]
        is_dup = np.any(other_matches[:] == current_id1)
        if is_dup:
            return True
        return False

    # for each uuid, check for duplicates and choose best match based on sim cols
    # order of the sim cols should be with most important first
    dup_flag = '{}_duplicate'.format(dedupe_col)
    df[dup_flag] = 0
    df.loc[:, dup_flag] = df[
        dedupe_col].expanding(center=False, min_periods=0).apply(func=count_matches)

    df_matches = df[df['is_match'] == 1].sort_values([dedupe_col] + sim_cols, ascending=False)
    return df_matches.drop_duplicates([dedupe_col], keep='first')

In [15]:
match3 = filter_one_match_per_group(match2, PERSON_ID, sim_cols=['clean_first_name_sim', 'clean_last_name_sim', 'medical_school_sim'])
match4 = filter_one_match_per_group(match3, 'aamc_id', sim_cols=['clean_first_name_sim', 'clean_last_name_sim', 'medical_school_sim'])
match4.shape

(1697, 134)

In [16]:
nm_aamc = get_nonmatched(df=aamc3, id_colname='aamc_id', matched_ids=match4.aamc_id.values)

nm_apps = get_nonmatched(df=apps2, id_colname=PERSON_ID, matched_ids=match4[PERSON_ID].values)

(12538, 25)
(2038, 103)


In [17]:
first_last_matches = pd.merge(
    left=nm_apps,
    right = nm_aamc,
    left_on=['clean_last_name', 'medschool_year_grad'],
    right_on=['clean_last_name', 'degree_year_1'],
    how='inner'
)
print first_last_matches.shape

(2036, 127)


In [18]:
first_last_matches['dob_app_diff'] = first_last_matches.apply(get_dob_app_diff, axis=1)

feature_dict = {
    'clean_first_name': get_name_str_sim,
    'clean_middle_name': get_name_str_sim,
    'medical_school': get_name_str_sim,
}

first_last_matches2 = add_similarity_features(first_last_matches, feature_dict, check_match, suffixes=['_x', '_y'])
fm2 = first_last_matches2[first_last_matches2.is_match==1]

In [19]:
fm3 = filter_one_match_per_group(fm2, PERSON_ID, sim_cols=['clean_first_name_sim', 'medical_school_sim'])
fm4 = filter_one_match_per_group(fm3, 'aamc_id', sim_cols=['clean_first_name_sim', 'medical_school_sim'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
fm4.shape

(416, 133)

In [21]:
print sorted(set(fm4.columns) - set(match4.columns))
print sorted(set(match4.columns) - set(fm4.columns))

['clean_last_name', 'fuzzy_merge_col_x', 'fuzzy_merge_col_y']
['clean_last_name_sim', 'clean_last_name_x', 'clean_last_name_y', 'fuzzy_merge_col']


In [22]:
def consolidate_cols(row):
    # each row comes in as a series
    NULL_STRINGS = ['UNKNOWN', 'OTHER']
    row[row.isin(NULL_STRINGS)]=np.nan
    non_nulls = row[~pd.isnull(row)].values
    if len(non_nulls) == 0:
        return np.nan
    # sort values by string len
    if isinstance(non_nulls[0], str):
        # sort in reverse so longest string returns
        return sorted(non_nulls, key=len, reverse=True)[0]
    # if not a string, just return first value
    return non_nulls[0]

In [23]:
match4.loc[:, 'clean_last_name'] = match4[[
        'clean_last_name_x', 'clean_last_name_y']].apply(consolidate_cols, axis=1)

In [24]:
match5 = match4.drop(['fuzzy_merge_col', 'clean_last_name_sim', 'clean_last_name_x', 'clean_last_name_y'], axis=1)
fm5 = fm4.drop(['fuzzy_merge_col_x', 'fuzzy_merge_col_y'], axis=1)

In [25]:
all_matches = pd.concat([match5, fm5], axis=0)

In [26]:
all_matches['medical_school'] = all_matches['degree_inst_1_desc']
mask = pd.isnull(all_matches['degree_inst_1_desc'])
all_matches.loc[mask, 'medical_school'] = all_matches.loc[mask, 'medical_school_x']
mask = pd.isnull(all_matches['medical_school'])

In [27]:
all_matches2 = all_matches.drop(['medical_school_x', 'medical_school_y', 'medical_school_sim'], axis=1)

In [28]:
all_matches3 = consolidate_merge_cols(all_matches2, ['_x', '_y'], [])

['clean_first_initial', 'clean_first_name', 'clean_middle_initial', 'clean_middle_name', 'last_name_counts']


In [29]:
# for the full matches, replace med school grad year with degree_
missing_degree_year = pd.isnull(all_matches3.degree_year_1)
all_matches3.loc[~missing_degree_year, 'medschool_year_grad'] = all_matches3[~missing_degree_year]['degree_year_1']



In [30]:
unmatched_apps3 = get_nonmatched(apps2, all_matches3[PERSON_ID], id_colname=PERSON_ID)

(1622, 103)


In [31]:
all_matches4 = pd.concat([all_matches3, unmatched_apps3], axis=0).drop(['aamc_id_duplicate', 'Unnamed: 0', 
                                                                       'clean_first_name_sim', 'clean_middle_name_sim', 
                                                                       'fuzzy_merge_col', 'lab_brch', 'person_uuid_duplicate', 'other'], axis=1)


In [32]:
all_matches4[pd.isnull(all_matches4.aamc_id)]

Unnamed: 0,aamc_id,aamc_id_2,address,age,application_date,application_year,application_year_1,application_year_2,application_year_3,associate_program_entered,...,suffix_cd,supervisor,teaching,undergrad_year_grad,undergraduate_school,withdrawal,year_accepted,year_grad,yobb,zip_code
2059,,,"101 Nob Hill Lane, Apt. 11",,1973-02-17,1973.0,,,,,...,,,1.0,1971.0,University of Louisville,-9.0,,,,40206.0
59,,,"2351 Warwick Avenue, Apt. 18",,1971-04-10,1971.0,,,,,...,,,0.0,,,0.0,,,,90032.0
2082,,,"11657 Lockwood Drive, Apt. 203",,1973-03-20,1973.0,,,,,...,,Wolff,0.0,,,0.0,1975.0,1973.0,,20904.0
100,,,374 Winthrop Avenue,,1966-02-24,1966.0,,,,SA,...,,Dingman,1.0,,,-9.0,1967.0,1967.0,,
3723,,,"200 Haven Avenue, Apt. 1N",,1968-03-28,1968.0,,,,,...,,Engel,1.0,,,-9.0,1973.0,1968.0,,10033
1400,,,151 Irwin Avenue,,1968-04-01,1968.0,,,,CA,...,,Wynne,1.0,1964.0,Amherst College,-9.0,1972.0,1968.0,,15202.0
1142,,,4022 Woodmont Boulevard,,1969-03-01,1969.0,,,,,...,,Cole,0.0,1965.0,Vanderbilt University,-9.0,1970.0,1969.0,,37205.0
2642,,,11 Lorraine Terrace,,1966-05-02,1966.0,,,,CA,...,,Engel,1.0,,,0.0,1971.0,1966.0,,
2170,,,Browertown Road,,1975-03-23,1975.0,,,,,...,,,1.0,,,0.0,,,,7424.0
2137,,,89 Delaware Avenue,,1970-03-20,1970.0,,,,,...,,,1.0,,,0.0,,,,12202.0


In [33]:
all_matches4.to_csv(os.path.join(APP_DATA_DIR, 'aamc_apps_nih.csv'))
all_matches4.to_pickle(os.path.join(PICKLE_DIR, 'aaamc_apps_nih.p'))