In [None]:
# import aamc data, merge all times together and look at unique npi numbers
# match to NIH applicants data set and check matches
import funcy
from collections import Counter
import uuid
import numpy as np
import pandas as pd
import os
import string
from fuzzywuzzy import fuzz

from dev import AMA_DIR, APP_DATA_DIR, CORRECTIONS_DIR, NAME_COLS, AMA_MERGE_IMPORTANT_COLS
ama_match_corrections_filename = 'manual_ama_matches.csv'

In [None]:
AMA_cols = ['person_uuid', 'person_uuid_match', 'AMA_fname', 'AMA_mname', 'AMA_lname', 'abs_grad_diff', 'abs_birth_diff',
            'AMA_grad_yr', 'medschool_year_grad', 'birth_year', 'AMA_med_school', 'medical_school',
            'AMA_stschgrad', 'AMA_research_id', 'fname_sim', 'reverse_fname_sim', 'mname_sim']

In [None]:
AMA_DATA_DIR = os.path.join(AMA_DIR, 'data')
AMA_DATA_DICT_DIR =  os.path.join(AMA_DIR, 'data_dictionary')
med_schools_fname = 'dbo_LU_AMA_Schools.txt'
res_fname = 'dbo_res_train.txt'
top_codes = 'PRIMARY TOP.xls' 
mpa_codes = 'Major Professional Activity.txt'
pe_codes = 'PRESEMP.txt'
names_ids = 'names_ids.txt'
middle_names_ids = 'middle_names_ids.txt'

In [None]:
nih_df = pd.read_csv(os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_grant_standardized.csv'))

In [None]:
# Load the hand-reviewed NIH -> AMA matches and attach them to the NIH applicant rows.
ama_manual_df = pd.read_csv(os.path.join(CORRECTIONS_DIR, ama_match_corrections_filename))
# Columns taken from the corrections file:
#   person_uuid     - id of the person in the NIH data set (the merge key below)
#   AMA_research_id - internal AMA id of the matched physician
#   ama_match       - 1 for a correct match, 0 if there is none
# NOTE(review): this merge runs before later cells modify nih_df (graduation-year
# replacement, residency/internship repairs, lname_freq, person_uuid_match), so
# manual_matches carries the pre-correction values of those columns - confirm
# that this ordering is intended.

manual_matches = pd.merge(left=nih_df, right=ama_manual_df[
        ['person_uuid', 'AMA_research_id', 'ama_match']],
                   on=['person_uuid'], how='inner')

In [None]:
# if not a match (ama_match=0), then replace ama research id with null
manual_matches.loc[manual_matches.ama_match==0, 'AMA_research_id'] = np.nan

In [None]:
# replace nonsensical year grad values (data-entry errors) with the intended years.
# The result is assigned back to the column: replace(inplace=True) on the
# attribute-accessed Series mutates an intermediate object and is not guaranteed
# to write through to nih_df.
nih_df['medschool_year_grad'] = nih_df['medschool_year_grad'].replace({15232: 1972, 11969: 1969})

In [None]:
def get_date(raw_str, delim, pos):
    """Extract one year from a delimited date string such as '1967-1968'.

    Args:
        raw_str: the delimited text.
        delim: separator between the parts, e.g. '-'.
        pos: 0-indexed position of the part to return.

    Returns:
        The selected part as a float; values below 100 are treated as
        two-digit years and shifted into the 1900s. np.nan when pos is past
        the last part or the part is not numeric.
    """
    vals = raw_str.split(delim)
    if pos >= len(vals):
        return np.nan
    try:
        ret_val = float(vals[pos])
    except (TypeError, ValueError):
        # float() raises ValueError (not TypeError) for text like '?' or ''
        return np.nan
    if ret_val < 100:
        return 1900+ret_val
    return ret_val

# need to strip leading 0 from top code coding
def strip_leading_zero(raw_str):
    """Convert a code such as '012' (or an already-numeric 12 / 12.0) to an int.

    Args:
        raw_str: a string of digits, possibly zero-padded, or a number.

    Returns:
        The integer value of the code.

    Raises:
        ValueError: if a string value is empty or not an integer literal.
    """
    if not isinstance(raw_str, (str, type(u''))):
        # numbers cannot be sliced; int() already gives the wanted value
        return int(raw_str)
    text = raw_str.strip()
    digits = text.lstrip('0')
    if text and not digits:
        # the code consisted only of zeros
        return 0
    return int(digits)
    
def avoid_null_wrapper(x, fnc, **kwargs):
    """Apply fnc to x, passing nulls through as np.nan.

    Returns np.nan when pd.isnull(x) is true, otherwise fnc(x, **kwargs).
    """
    return np.nan if pd.isnull(x) else fnc(x, **kwargs)


In [None]:
# internship dates are actually strings for a couple people (data error) ie: Medicine or Str. Medicine
# delete the string, take internship dates from internship_dates_NIH and then update internship start and end
# (that internship repair happens further down in this cell).
#
# Residency strings listed here are treated as unusable values.
bad_res_strs = ['Starting July 1,1962', 'not known',  '"Open"', '1971-?', '1 Year', '1967-1968 (pending)']
# Where residency_dates is unusable, take the value from residency_dates_NIH ...
bad_res_mask = nih_df.residency_dates.isin(bad_res_strs)
nih_df.loc[bad_res_mask, 'residency_dates'] = nih_df.loc[bad_res_mask, 'residency_dates_NIH']
# ... and where residency_dates_NIH is unusable, take the (already updated) residency_dates.
# Order matters: this second mask is computed after the first replacement.
bad_res_mask_nih = nih_df.residency_dates_NIH.isin(bad_res_strs)
nih_df.loc[bad_res_mask_nih, 'residency_dates_NIH'] = nih_df.loc[bad_res_mask_nih, 'residency_dates']
# fix data entry error: '1972073' is set to start 1972 / end 1973
nih_df.loc[nih_df.residency_dates=='1972073', ['residency_start', 'residency_end']] = [1972, 1973]

# A date string has "multiple entries" when it contains '&' or ',', or has more
# than two '-'-separated parts.
multiple_entries = lambda x: (len(str(x).split('&')) > 1) | (len(str(x).split(',')) > 1) | (len(str(x).split('-')) > 2)
bad_nih_internship = nih_df.internship_dates_NIH.apply(multiple_entries)
bad_nih_res = nih_df.residency_dates_NIH.apply(multiple_entries)
# residency strings that contain a '/'
bad_nih_res2 = nih_df.residency_dates_NIH.apply(lambda x: len(str(x).split('/')) > 1)
bad_nih_res3 = nih_df.residency_dates.apply(lambda x: len(str(x).split('/')) > 1)
# rows where internship_start holds text instead of a year
bad_internship_mask = nih_df.internship_start.isin(['Medicine', 'Str. Medicine'])
nih_df.loc[bad_internship_mask, 'internship_dates'] = nih_df.loc[bad_internship_mask, 'internship_dates_NIH']


def _fill_year_from_dates(mask, target_col, source_col, pos):
    """Overwrite nih_df[target_col] on the rows in mask with part `pos` of source_col.

    source_col is split on '-' by get_date; nulls stay null. The same mask
    selects the rows that are read and the rows that are written, so the
    index-aligned assignment never leaves a selected row without a value
    to receive.
    """
    nih_df.loc[mask, target_col] = nih_df.loc[mask, source_col].apply(
        lambda x: avoid_null_wrapper(x, get_date, delim='-', pos=pos))


# Internship start: first part of the NIH string, for text-valued and multi-entry
# rows alike. The original read only the bad_internship_mask rows while writing the
# union of both masks, which set the start of multi-entry-only rows to NaN.
# (For bad_internship_mask rows internship_dates was just copied from
# internship_dates_NIH, so reading the NIH column gives the same value.)
_fill_year_from_dates(
    bad_internship_mask | bad_nih_internship, 'internship_start', 'internship_dates_NIH', 0)
# Internship end: second part for text-valued rows ...
_fill_year_from_dates(bad_internship_mask, 'internship_end', 'internship_dates_NIH', 1)
# ... and third part for multi-entry rows (applied last, so it wins on overlap).
# The original read bad_internship_mask rows here, leaving NaN for the others.
_fill_year_from_dates(bad_nih_internship, 'internship_end', 'internship_dates_NIH', 2)

# Residency start / end for multi-entry or previously unusable strings.
bad_res_any = bad_nih_res | bad_res_mask | bad_res_mask_nih
_fill_year_from_dates(bad_res_any, 'residency_start', 'residency_dates_NIH', 0)
_fill_year_from_dates(bad_res_any | bad_nih_res2, 'residency_end', 'residency_dates_NIH', 2)

# Rows whose residency_dates contains '/': first and second part of the NIH string.
_fill_year_from_dates(bad_nih_res3, 'residency_start', 'residency_dates_NIH', 0)
_fill_year_from_dates(bad_nih_res3, 'residency_end', 'residency_dates_NIH', 1)

In [None]:
# convert internship start, residency start and ends to floats
# nih_df[[ 'residency_end' ]].astype(float)
year_cols = ['internship_start', 'internship_end', 'residency_start', 'residency_end', 'birth_year']
nih_df[year_cols] = nih_df[year_cols].astype(float)

In [None]:
lname_counter = Counter(nih_df.clean_last_name)
nih_df['lname_freq'] = nih_df.clean_last_name.apply(lambda x: lname_counter[x])

In [None]:
# should have 5 text files for 1978, 1985, 1995, 2005, 2015
ama_data_files = [
    file_name for file_name in os.listdir(AMA_DATA_DIR) if file_name.startswith(
                'QUO-161256-FS8YTU-')] 

In [None]:
# per-column codes that pandas should read as missing
na_values = {'MPA': ['NCL'], 'TOP': [100, 'X', '100'], 'PE': [110, 100, 12], 'STSCHGRAD': [0, 0.0], 'MEDTRINST': [0, 0.0]}


def _load_ama_snapshot(f_name):
    """Read one AMA data file and tag it with the year in its file name.

    The observation year is the last four characters before '.txt'.
    """
    snapshot = pd.read_csv(os.path.join(AMA_DATA_DIR, f_name), na_values=na_values)
    snapshot['observation_year'] = int(f_name.split('.txt')[0][-4:])
    return snapshot


file_list = [_load_ama_snapshot(f_name) for f_name in ama_data_files]
ama_dfs_raw = pd.concat(file_list, axis=0)

# names/ids and middle names are read separately, then left-merged on their shared columns
# (fname and lname columns are in propercase format)
names_ids_df = pd.read_csv(os.path.join(AMA_DATA_DIR, names_ids))
middle_df = pd.read_csv(os.path.join(AMA_DATA_DIR, middle_names_ids))
fname_ids_df = pd.merge(left=names_ids_df, right=middle_df, how='left')

In [None]:
unique_names = fname_ids_df.drop_duplicates('RESEARCH ID')

In [None]:
np.average(pd.isnull(unique_names.MNAME))

In [None]:
# research ids should be an int64
ama_dfs = pd.merge(left=ama_dfs_raw, right=fname_ids_df, on=['RESEARCH ID'], how='left')

# check for any missing first and last names
ama_dfs.loc[(pd.isnull(ama_dfs.FNAME)) | (pd.isnull(ama_dfs.LNAME)), ['FNAME', 'MNAME', 'LNAME', 'RESEARCH ID']]

In [None]:
# variable definitions
# 
# LIC_year = license year
# locum tenens = short term/flexible staffing position
# MPA = major professional activity
            # OFF=Office-based
            # HPI=interns(discontinued in 1992)
            # HPR=hospital based-all other years resident
            # HPP=hospital based physician
            # MTC=medical teacher
            # ADM=administration physician
            # RES=research physician
            # OTH=other physician
            # INA=inactive physician
            # NCL=Not classified
            # UNA=address unknown physician
            # TFG=temporary foreign physician
            # CUT=cut physician
            # LOC=locum tenens (began in 1996)
# dead = deceased indicator
# TOP = type of practice 
#             Code	Description
#             012	Resident
#             020	Direct Patient Care
#             030	Administration
#             040	Medical Teaching
#             050	Medical Research
#             062	Non-Patient Care
#             071	Retired
#             072	Semi-Retired
#             074	Temporarily not in Practice
#             075	Not active for other reasons
#             100	No classification
# PE = present employment
#             "010","SELF EMPLOYED"
#             "011","SELF EMPLOYED SOLO PRACTICE"
#             "013","TWO PHYSICIAN PRACTICE - OWNER"
#             "014","TWO PHYSICIAN PRACTICE - EMPL."
#             "021","OTHER PATIENT CARE"
#             "022","Locum Tenens"
#             "030","GROUP PRACTICE"
#             "035","HMO"
#             "040","MEDICAL SCHOOL"
#             "050","NON-GOVERNMENT HOSPITAL"
#             "060","-CITY/COUNTY/STATE GOVERNMENT-"
#             "063","CITY/COUNTY/STATE GOVT HOSP"
#             "064","CITY/COUNTY/STATE GOVT OTHER"
#             "080","-FEDERAL GOVERNMENT HOSPITAL-"
#             "081","FEDERAL GOVT HOSP ARMY"
#             "082","FEDERAL GOVT HOSP NAVY"
#             "083","FEDERAL GOVT HOSP AIR FORCE"
#             "084","FEDERAL GOVT HOSP U.S.P.H.S."
#             "085","FEDERAL GOVT HOSP VET ADMIN"
#             "086","FEDERAL GOVT HOSP OTHER"
#             "090","-FEDERAL GOVERNMENT NON-HOSP-"
#             "091","FEDERAL GOVT N-H ARMY"
#             "092","FEDERAL GOVT N-H NAVY"
#             "093","FEDERAL GOVT N-H AIR FORCE"
#             "094","FEDERAL GOVT N-H U.S.P.H.S."
#             "095","FEDERAL GOVT N-H VET ADMIN"
#             "096","FEDERAL GOVT N-H OTHER"
#             "101","OTHER NON-PATIENT CARE"
#             "110","NO CLASSIFICATION"

# MED_TRFROM = date of medical training start/end. The date the physician entered 
        # the current graduate medical training program and the anticipated completion date.
        # 000000000000 is the same as 00  0000  00. All 0’s = not reported.
        # For years 1978, 1985, 1995, the date is formatted MMYYYYMMYYYY, no spaces. 
        # For 2005, the date is formatted M YYYYM YYYY for single digit months and MMYYYYMMYYYY for
        # double digit months. 2015 is just a year. It’s not clear whether it is the start year or 
        # the completion year.

# MEDTRINST = Medical Training Institution Code - dbo_res_train.txt file contains codes 
# STSCHGRAD = school of graduation; corresponds to Dbo_LU_AMA_Schools.txt
# ECFMG = Education Commission for Foreign Medical Graduates. A unique identifying number 
#     assigned by the Education Commission for Foreign Medical Graduates to foreign medical 
#     graduates applying for ECFMG certification. 000000 = no ECFMG # reported.
# GRAD_YR = med school graduation year (range from 1955-1975)
# FED_CODE = federal code, 1 = federal physician, 0 = non federal physician
# B_DATE = birth_date
# spec1 = specialty 1
# spec2 = specialty 2
# B_PLACE = birth place

In [None]:
# import and merge TOP codes, mpa, pe codes
# top codes should be integers with no leading 0s, MPA should be a string
# pe codes must be floats since column contains missing values

top_df = pd.read_excel(os.path.join(AMA_DATA_DICT_DIR, top_codes))
top_df.columns = ['TOP', 'TOP_description']
top_df['TOP'] = top_df['TOP'].astype(int)
mpa_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, mpa_codes))
mpa_df.columns = ['MPA', 'MPA_description']
pe_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, pe_codes))
pe_df.columns = ['PE', 'PE_description']
pe_df['PE'] = pe_df['PE'].astype(float)

In [None]:
# Null-safe converters: each passes nulls through as np.nan via avoid_null_wrapper.
# zero_fnc: code with a leading zero -> int
zero_fnc = funcy.rpartial(avoid_null_wrapper, strip_leading_zero)
# float_fnc: value -> int; NOTE(review): despite the name this applies int, not float
float_fnc = funcy.rpartial(avoid_null_wrapper, int)
# clean_str_fnc: upper-case, then strip surrounding whitespace. Uses the str methods
# rather than string.upper / string.strip, which are deprecated module functions
# that do not exist in Python 3; the result is the same.
clean_str_fnc = funcy.rpartial(avoid_null_wrapper, lambda raw: raw.upper().strip())

ama_dfs['TOP'] = ama_dfs['TOP'].apply(zero_fnc)
ama_dfs['PE'] = ama_dfs['PE'].apply(float_fnc)

In [None]:
ama1 = pd.merge(left=ama_dfs, right=top_df, on=['TOP'], how='left')
ama2 = pd.merge(left=ama1, right=mpa_df, on=['MPA'], how='left')
ama3 = pd.merge(left=ama2, right=pe_df, on=['PE'], how='left')

# print ama3['TOP_description'].unique()
# print ama3['MPA_description'].unique()
# print ama3['PE_description'].unique()

In [None]:
# check for places in the data set that have a top, mpa or pe code and don't merge correctly
missing_top = (~pd.isnull(ama3['TOP'])) & (pd.isnull(ama3['TOP_description']))
missing_mpa = (~pd.isnull(ama3['MPA'])) & (pd.isnull(ama3['MPA_description']))
missing_pe = (~pd.isnull(ama3['PE'])) & (pd.isnull(ama3['PE_description']))

In [None]:
missing_mpa_arr = ama3.loc[(missing_mpa)].MPA.unique()
missing_top_arr = ama3.loc[(missing_top)].TOP.unique()
missing_pe_arr = ama3.loc[(missing_pe)].PE.unique()

In [None]:
# set 0 values to np.nan
ama3.loc[ama3.MEDTRINST==0, 'MEDTRINST'] = np.nan
ama3.loc[ama3.STSCHGRAD==0, 'STSCHGRAD'] = np.nan

In [None]:
# import med school and med training institution strings
med_school_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, med_schools_fname))
med_school_df.columns = ['STSCHGRAD', 'MED_SCHOOL', 'MED_SCHOOL_STATE']
train_school_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, res_fname))
train_school_df.columns = [
    'MEDTRINST', 'MEDTRINST_NAME',
    'MEDTRINST_ADD1', 'MEDTRINST_ADD2', 
    'MEDTRINST_CITY', 'MEDTRINST_ST', 
    'MEDTRINST_ZIP']
train_school_df.MEDTRINST = train_school_df.MEDTRINST.astype(float)
med_school_df.STSCHGRAD = med_school_df.STSCHGRAD.astype(float)

In [None]:
ama3.STSCHGRAD = ama3.STSCHGRAD.astype(float)
ama3.MEDTRINST = ama3.MEDTRINST.astype(float)

In [None]:
ama3.shape

In [None]:
# merge in medical school and train inst and check for data values not in the dictionary
ama4 = pd.merge(left=ama3, right=med_school_df, how='left')
ama5 = pd.merge(left=ama4, right=train_school_df, how='left')

In [None]:
# find codes not in corresponding data dictionaries
missing_med_schools = (~pd.isnull(ama5.STSCHGRAD) & pd.isnull(ama5.MED_SCHOOL))
missing_tr_schools = (~pd.isnull(ama5.MEDTRINST) & pd.isnull(ama5.MEDTRINST_NAME))
print missing_med_schools.sum()
print missing_tr_schools.sum()

In [None]:
# missing school and training codes
missing_med_schools_arr = ama5[missing_med_schools]['STSCHGRAD'].sort_values().unique()
missing_tr_schools_arr = ama5[missing_tr_schools]['MEDTRINST'].sort_values().unique()

# output missing values to csv 
missing_dict = {
        'missing_MPA_codes': missing_mpa_arr, 'missing_TOP_codes': missing_top_arr,
        'missing_PE_codes': missing_pe_arr, 'missing_STSCHGRAD_codes': missing_med_schools_arr,
        'missing_MEDTRINST': missing_tr_schools_arr
}

missing_data_dict = pd.DataFrame.from_dict(missing_dict, orient='index').T
# output missing AMA data dictionary to files
missing_data_dict.to_csv(os.path.join(AMA_DIR, 'missing_MMS_data_dictionary_codes.csv'), index=False)

In [None]:
# clean and upcase first, middle and last names
ama5[['FNAME', 'MNAME', 'LNAME']] = ama5[['FNAME', 'MNAME', 'LNAME']].applymap(clean_str_fnc)

# birth year is the last four characters of B_DATE
ama5['birth_year'] = ama5['B_DATE'].apply(lambda x: int(str(x)[-4:]))
ama_merge_df = ama5.drop(['first_initial', 'clean_last_name', 'hash_id', 'match_id', 'dno'], axis=1)
# Prefix every AMA column with 'AMA_' (lower-cased) and leave person_uuid as is.
# Each label is renamed in place by name; the previous version rebuilt the list
# as ['person_uuid'] + the rest, which mislabels every column ahead of
# person_uuid unless person_uuid happens to be the first column.
ama_merge_df.columns = [
    col if col == 'person_uuid' else 'AMA_{}'.format(col.lower())
    for col in ama_merge_df.columns]
ama_merge_df.rename(columns={'AMA_research id': 'AMA_research_id', 'person_uuid': 'old_uuid'}, inplace=True)

In [None]:
# compute the first and last observation year per AMA research id and attach both
# to every row of the AMA data set
# (agg(['min', 'max']) + rename replaces the dict-of-renamers form, which was
# deprecated in pandas 0.20 and raises SpecificationError in pandas >= 1.0;
# min/max do not depend on row order, so no sort is needed beforehand)
ama_merge_df2 = ama_merge_df.groupby('AMA_research_id')['AMA_observation_year'].agg(
    ['min', 'max']).rename(
        columns={'min': 'AMA_obs_year_min', 'max': 'AMA_obs_year_max'})
ama_merge_df3 = ama_merge_df.join(ama_merge_df2, on='AMA_research_id', how='left')

In [None]:
ama_merge_df3.loc[(pd.isnull(ama_merge_df3['AMA_obs_year_min'])) | (pd.isnull(ama_merge_df3['AMA_obs_year_max']))]

In [None]:
# inspect all AMA rows of one research id
r_id = '2001591012'
# AMA_cols also lists columns that only exist after later cells run
# (e.g. person_uuid_match, abs_grad_diff, fname_sim); select the ones present
# so this cell works on a fresh top-to-bottom run
available_cols = [c for c in AMA_cols if c in ama_merge_df3.columns]
ama_merge_df3.loc[ama_merge_df3.AMA_research_id==int(r_id), available_cols]

In [None]:
# NIH side: the match id starts as person_uuid, with one hand-set exception
nih_df['person_uuid_match'] = nih_df['person_uuid']
cahan_leslie = (nih_df.clean_last_name=='CAHAN') & (nih_df.clean_first_name=='LESLIE')
nih_df.loc[cahan_leslie, 'person_uuid_match'] = 2270

# AMA side: start from the old uuid, then shift two id ranges up by one.
# Both masks are computed before any value changes, and the ranges do not overlap.
ama_merge_df3['person_uuid_match'] = ama_merge_df3['old_uuid']
convert_mask = (ama_merge_df3['person_uuid_match'] >= 200) & (ama_merge_df3['person_uuid_match'] < 300)
convert_mask2 = (ama_merge_df3['person_uuid_match'] > 2270) & (ama_merge_df3['person_uuid_match'] < 4145)
ama_merge_df3.loc[convert_mask | convert_mask2, 'person_uuid_match'] += 1

# hand-set ids, applied in this order: (last name, first name or None for any, new id)
uuid_overrides = [
    ('CAHAN', None, 2270),
    ('GARFIN', 'STEVEN', 4149),  # 4151 -> 4149
    ('BULL', 'BRIAN', 4151),     # 4152 -> 4151
    ('HERSH', 'EVAN', 4147),     # 4148 -> 4147
    ('LIST', 'NOEL', 4150),      # make sure list is 4150
]
for ovr_lname, ovr_fname, ovr_uuid in uuid_overrides:
    ovr_mask = ama_merge_df3.AMA_lname == ovr_lname
    if ovr_fname is not None:
        ovr_mask = ovr_mask & (ama_merge_df3.AMA_fname == ovr_fname)
    ama_merge_df3.loc[ovr_mask, 'person_uuid_match'] = ovr_uuid


In [None]:
# optionally output data set
# ama_merge_df3.to_csv(os.path.join(AMA_DIR, 'full_ama.csv'), index=False)

In [None]:
# remove manual matches from the nih data set and merge them with ama separately
nih_df_m = nih_df.loc[~(nih_df.person_uuid.isin(manual_matches.person_uuid)), :]
manual_matches_ama = pd.merge(
    left=manual_matches, right=ama_merge_df3, on='AMA_research_id', how='inner')
manual_matches_ama_un = manual_matches_ama.drop_duplicates('person_uuid')

In [None]:
# every non-manual NIH person joined to all AMA rows sharing the match id
full_df = pd.merge(left=nih_df_m, right=ama_merge_df3, on=['person_uuid_match'], how='left')

# signed and absolute year differences (NIH value minus AMA value)
for diff_name, nih_col, ama_col in [
        ('internship', 'internship_start', 'AMA_grad_yr'),
        ('grad', 'medschool_year_grad', 'AMA_grad_yr'),
        ('birth', 'birth_year', 'AMA_birth_year')]:
    signed_col = '{}_diff'.format(diff_name)
    full_df[signed_col] = full_df[nih_col]-full_df[ama_col]
    full_df['abs_' + signed_col] = full_df[signed_col].abs()

In [None]:
# get unique combos of NIH people, research id matches
unique_match_combos = full_df.sort_values(
    ['person_uuid_match', 'abs_grad_diff', 'abs_birth_diff']).drop_duplicates(
    ['person_uuid_match', 'AMA_research_id'])

In [None]:
no_matches = unique_match_combos.loc[
    pd.isnull(unique_match_combos['AMA_research_id']), [c for c in unique_match_combos.columns if not c.startswith('AMA_')]]
print no_matches.shape

In [None]:
unique_match_combos2 = unique_match_combos.loc[~pd.isnull(unique_match_combos['AMA_research_id'])]
print unique_match_combos2.shape

In [None]:
# name similarity score between an NIH name and an AMA name
def name_sim(nih_fname, ama_fname):
    """Score how similar two names are.

    Returns np.nan if either name is null. When either name is a single
    character, only the first characters are compared (100 if equal, else 0).
    Otherwise returns fuzz.ratio of the two names.
    """
    for candidate in (nih_fname, ama_fname):
        if pd.isnull(candidate):
            return np.nan
    if 1 in (len(nih_fname), len(ama_fname)):
        return 100 if nih_fname[0] == ama_fname[0] else 0
    return fuzz.ratio(nih_fname, ama_fname)
    
def medschool_sim(nih_name, ama_name):
    """Return fuzz.ratio of two medical school names, or np.nan if either is null."""
    either_missing = pd.isnull(nih_name) or pd.isnull(ama_name)
    return np.nan if either_missing else fuzz.ratio(nih_name, ama_name)

In [None]:
# keep candidate pairs whose graduation-year OR birth-year difference is at most 2
# (a null difference fails its comparison, so the other one has to pass).
# NOTE(review): the earlier comment read "get rid of matches where grad diff or
# birth year diff >= 3", which would require both tests to pass - confirm `|`.
# .copy() gives an independent frame, so adding the similarity columns below does
# not write into a slice of unique_match_combos2 (SettingWithCopyWarning).
unique_match_combos3 = unique_match_combos2.loc[
    (unique_match_combos2.abs_grad_diff <= 2) | (unique_match_combos2.abs_birth_diff <= 2)].copy()
unique_match_combos3['fname_sim'] = unique_match_combos3[
    ['clean_first_name', 'AMA_fname']].apply(lambda x: name_sim(*x), axis=1)
unique_match_combos3['mname_sim'] = unique_match_combos3[
    ['clean_middle_name', 'AMA_mname']].apply(lambda x: name_sim(*x), axis=1)

In [None]:
# calculate reverse sim score for sorting
unique_match_combos3['reverse_fname_sim'] = 100 - unique_match_combos3['fname_sim']
unique_match_combos3['reverse_mname_sim'] = 100 - unique_match_combos3['mname_sim']

In [None]:
# sort by grad year, birth year differences then name sim
unique_match_combos4 = unique_match_combos3.sort_values(
    ['person_uuid_match', 'abs_grad_diff', 'abs_birth_diff', 
         'abs_internship_diff', 'reverse_fname_sim', 'reverse_mname_sim'])

In [None]:
# for every person, take the first candidate in sort order (least bad).
# .copy() because later cells add is_match and blank AMA_research_id on this frame;
# without it those writes target a derived slice (SettingWithCopyWarning).
matches = unique_match_combos4.drop_duplicates('person_uuid_match', keep='first').copy()
print matches.shape

In [None]:
# decide whether the best AMA candidate of a person is accepted as a match
def is_match(row):
    """Classify one candidate row as 0 (no match), 1 (match) or 2 (middle-name match).

    Rules, in order:
      * first-name similarity <= 70           -> 0
      * last name occurs once in the NIH data -> 1
      * birth-year or grad-year difference 0  -> 1
      * first-name similarity >= 95           -> 1
      * internship_diff < 2                   -> 1
      * middle-name similarity >= 60          -> 2
      * otherwise                             -> 0
    Comparisons against null values are false, so a null never triggers a rule.

    NOTE(review): internship_diff is the signed difference, so any negative
    value passes `< 2`; confirm abs_internship_diff was not intended.
    NOTE(review): a return value of 2 is not treated as a match by code that
    filters on is_match == 1 - confirm that 2 is a deliberate separate flag.
    """
    if row['fname_sim'] <= 70:
        return 0
    accepted = (
        row['lname_freq'] == 1
        or row['abs_birth_diff'] == 0
        or row['abs_grad_diff'] == 0
        or row['fname_sim'] >= 95
        or row['internship_diff'] < 2)
    if accepted:
        return 1
    return 2 if row['mname_sim'] >= 60 else 0

In [None]:
matches['is_match'] = matches.apply(is_match, axis=1)
matches.loc[matches.is_match==0, 'AMA_research_id'] = np.nan

In [None]:
matches.is_match.sum()

In [None]:
matched_ids = matches.loc[matches.is_match==1, 'person_uuid_match']
matched_ama_ids = matches.loc[matches.is_match==1, 'AMA_research_id']

In [None]:
non_matches = nih_df_m.loc[~(nih_df_m.person_uuid_match.isin(matched_ids))]

non_matches.to_csv(os.path.join(AMA_DIR, 'unmatched_nih.csv'), index=False)

print non_matches.loc[non_matches.time_period_flag==1].shape
print non_matches.shape

In [None]:
# read data corrections
reviewed_unmatched = pd.read_excel(os.path.join(CORRECTIONS_DIR, 'data_corrections_final.xlsx'))


In [None]:
# new_unmatched = non_matches.loc[(non_matches.time_period_flag==1) & ~(non_matches.person_uuid.isin(reviewed_unmatched.person_uuid))]

new_unmatched = non_matches.loc[(non_matches.time_period_flag==1)]

In [None]:
new_unmatched.to_excel(os.path.join(CORRECTIONS_DIR, 'unmatched_manual_review.xlsx'))

In [None]:
unmatched_ama = ama_merge_df3.loc[
    ~((ama_merge_df3.AMA_research_id.isin(matched_ama_ids)) | (ama_merge_df3.person_uuid_match.isin(matched_ids))), :]
unmatched_ama.shape

# get data set of unique people
unique_unmatched = unmatched_ama[
    [c for c in unmatched_ama if c.startswith('AMA_')]]

unique_unmatched2 = unique_unmatched.drop_duplicates('AMA_research_id')

unique_unmatched2.to_csv(os.path.join(AMA_DIR, 'unmatched_ama.csv'), index=False)
print unique_unmatched2.shape

matches.loc[
#     ((matches.is_match==1) & (matches.time_period_flag==1)),:].to_csv(os.path.join(AMA_DIR, 'matched_nih.csv'))
    (matches.is_match==1),:].to_csv(os.path.join(AMA_DIR, 'matched_nih.csv'))

In [None]:
# Build the review tables: rejected best candidates, plus possible alternatives.
# matches.loc[(
#         (matches.abs_internship_diff == 0) & (
#             matches.abs_birth_diff != 0) & (
#                 matches.abs_grad_diff != 0)), NAME_COLS + AMA_cols + ['is_match']]
# test_m: best candidates that is_match rejected (is_match == 0)
test_m = matches.loc[(
        (matches.is_match==0)), NAME_COLS + AMA_cols + ['is_match']].sort_values(['person_uuid', 'reverse_fname_sim'])

# test_m_options: candidate rows for the people listed in no_matches.
# NOTE(review): unique_match_combos2 was built from rows with a non-null
# AMA_research_id and no_matches from rows with a null one, so this selection
# looks like it is always empty. It also requests fname_sim / reverse_fname_sim /
# mname_sim (via AMA_cols), which were added to unique_match_combos3, not
# unique_match_combos2. Presumably the intended source is unique_match_combos3
# restricted to test_m.person_uuid - confirm.
test_m_options = unique_match_combos2.loc[(
        unique_match_combos2.person_uuid.isin(
            no_matches.person_uuid)), NAME_COLS + AMA_cols + ['internship_start', 'eod_year', 'year_accepted']].sort_values(
                ['person_uuid', 'reverse_fname_sim'])
# merge not matched people to ama data set to unmatched ama
# (left join on last name: every unmatched NIH person flagged time_period_flag == 1
# and not already in test_m_options, against the unmatched AMA people)
other_matches_options = pd.merge(left=non_matches.loc[
        (non_matches.time_period_flag==1) & ~(non_matches.person_uuid.isin(test_m_options.person_uuid)), 
            NAME_COLS + ['internship_start', 'eod_year', 'year_accepted']], right=unique_unmatched2, 
                left_on='clean_last_name', right_on='AMA_lname', how='left')
# stack both option tables for the manual review file
all_options = pd.concat([test_m_options, other_matches_options], axis=0)
print test_m.shape
print test_m_options.shape
test_m.to_csv(os.path.join(CORRECTIONS_DIR, 'ama_test_matches.csv'), index=False)
all_options[NAME_COLS + AMA_cols + ['internship_start', 'eod_year', 'year_accepted']].to_csv(os.path.join(CORRECTIONS_DIR, 'ama_unmatched_possible_matches.csv'), index=False)

In [None]:
# non AMA columns
non_ama_cols = [c for c in matches.columns if not c.startswith('AMA_')]

In [None]:
# combine manual matches with unique matches and combine with ama data set to get a panel 
# get nih people not able to be matched to AMA data set
print matches.loc[matches.person_uuid.isin(manual_matches_ama.person_uuid)].shape
# stack the automatic best candidates and the (de-duplicated) manual matches
full_matches = pd.concat([matches, manual_matches_ama_un], axis=0) 
# NIH people in neither set get a null AMA id. .copy() makes this an independent
# frame, so the column assignment below does not write into a slice of nih_df
# (SettingWithCopyWarning).
no_matches = nih_df.loc[~nih_df.person_uuid.isin(full_matches.person_uuid)].copy()
no_matches['AMA_research_id'] = np.nan
# keep only the NIH-side columns plus the matched AMA id
full_matches = full_matches[non_ama_cols+['AMA_research_id']]
all_nih = pd.concat([full_matches, no_matches], axis=0)

In [None]:
# these should be equal in len
print all_nih.shape
print nih_df.shape

In [None]:
# now, merge all_nih into AMA data to get a panel data set
ama_cols = [c for c in ama_merge_df3.columns if c.startswith('AMA_')]
panel_nih = pd.merge(
    left=all_nih, right=ama_merge_df3[ama_cols], on='AMA_research_id', how='left')

In [None]:
nih_ama_unique = panel_nih.sort_values(
    ['person_uuid', 'AMA_observation_year']).drop_duplicates('person_uuid', keep='last')

In [None]:
# similarity between the AMA and the NIH medical school name of each person.
# The row is unpacked with *x (as in the name_sim calls) instead of x[0] / x[1]:
# integer keys on a label-indexed row rely on the deprecated positional fallback.
nih_ama_unique['medschool_name_sim'] = nih_ama_unique[
    ['AMA_med_school', 'medical_school']].apply(lambda x: medschool_sim(*x), axis=1)

In [None]:
nih_ama_unique.loc[
    (~pd.isnull(nih_ama_unique.AMA_research_id) & ~(pd.isnull(nih_ama_unique.medschool_name_sim))), ['medical_school', 'AMA_med_school', 'medschool_name_sim']]

In [None]:
old_cols = ['residency_1', 'birth_year.1', 'old_uuid', 
     'reverse_fname_sim', 'reverse_mname_sim', 'is_match', 'suffix_cd', 'correct_match_flag', 
     'person_uuid_match', 'grad_diff', 'abs_grad_diff', 'birth_diff', 'abs_birth_diff']
nih_ama_unique2 = nih_ama_unique.drop(old_cols, axis=1)
panel_nih2 = panel_nih.drop(old_cols, axis=1).sort_values(['person_uuid', 'AMA_observation_year'])


In [None]:
col_order = sorted([c for c in panel_nih2.columns if c not in AMA_MERGE_IMPORTANT_COLS])


In [None]:
nih_ama_unique2[AMA_MERGE_IMPORTANT_COLS+col_order].to_csv(os.path.join(APP_DATA_DIR, 'ama_nih_merge.csv'), index=False)
panel_nih2[AMA_MERGE_IMPORTANT_COLS+col_order].to_csv(os.path.join(APP_DATA_DIR, 'ama_nih_merge_panel.csv'), index=False)

In [None]:
# percentage missing
print np.average(pd.isnull(nih_ama_unique2.AMA_research_id))

# percentage missing ama medical school
print np.average(pd.isnull(nih_ama_unique2.AMA_med_school))

# percentage missing ama middle name
print np.average(pd.isnull(nih_ama_unique2.AMA_mname))

In [None]:
print np.average(pd.isnull(nih_ama_unique2.AMA_b_date))
print np.average(pd.isnull(nih_ama_unique2[nih_ama_unique2.control_flag==1].AMA_b_date))
print np.average(pd.isnull(nih_ama_unique2[nih_ama_unique2.control_flag==0].AMA_b_date))

In [None]:
sorted(nih_ama_unique2.AMA_birth_year.unique())

In [None]:
nih_ama_unique2.loc[pd.isnull(nih_ama_unique2.AMA_b_date), ['AMA_birth_year', 'AMA_b_date']]