In [None]:
# import aamc data, merge all times together and look at unique npi numbers
# match to NIH applicants data set and check matches
import funcy
import numpy as np
import pandas as pd
import os

from dev import AMA_DIR, APP_DATA_DIR
# Cleaned name columns, selected together with the ids when inspecting matches.
NAMES = [
    'clean_first_name',
    'clean_middle_name',
    'clean_last_name',
]

In [None]:
# Code-book file names; each one is later joined onto AMA_DATA_DICT_DIR.
top_codes = 'PRIMARY TOP.xls'
mpa_codes = 'Major Professional Activity.txt'
pe_codes = 'PRESEMP.txt'
med_schools_fname = 'dbo_LU_AMA_Schools.txt'
res_fname = 'dbo_res_train.txt'

# Yearly extracts live under 'data', code books under 'data_dictionary'.
AMA_DATA_DIR = os.path.join(AMA_DIR, 'data')
AMA_DATA_DICT_DIR = os.path.join(AMA_DIR, 'data_dictionary')

In [None]:
# should have 5 text files for 1978, 1985, 1995, 2005, 2015
# os.listdir() returns entries in arbitrary order; sorting makes the load
# order (and anything indexing into this list) reproducible between runs.
ama_data_files = sorted(
    file_name for file_name in os.listdir(AMA_DATA_DIR)
    if file_name.startswith('QUO-161256-FS8YTU-'))

In [None]:
# NIH-side records that the AMA data is matched against.
nih_df = pd.read_csv(
    os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_grant_standardized.csv')
)

In [None]:
# Rows with control_flag == 0 that have no eod_year, followed by the total
# number of control_flag == 0 rows.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
is_flag_zero = nih_df.control_flag == 0
print(nih_df.loc[is_flag_zero & pd.isnull(nih_df.eod_year)].shape[0])
print(nih_df.loc[is_flag_zero].shape[0])

In [None]:
# Spot-check: show every NIH row for person_uuid 501.
nih_df.loc[nih_df.person_uuid==501]

In [None]:
# The four characters right before '.txt' in the file name; the load loop
# parses the same slice into observation_year.
ama_data_files[0].split('.txt')[0][-4:]

In [None]:
# Per-column sentinel codes that read_csv should turn into missing values.
na_values = {
    'MPA': ['NCL'],
    'TOP': [100, 'X', '100'],
    'PE': [110],
    'STSCHGRAD': [0, 0.0],
    'MEDTRINST': [0, 0.0],
}

In [None]:
# Read every yearly extract, tag its rows with the year encoded in the file
# name, and stack the extracts into one long frame.
file_list = []
for f_name in ama_data_files:
    new_f = pd.read_csv(os.path.join(AMA_DATA_DIR, f_name), na_values=na_values)
    # the four characters right before '.txt' are the observation year
    new_f['observation_year'] = int(f_name.split('.txt')[0][-4:])
    # read_csv returns a fresh frame on every iteration, so no copy is needed
    file_list.append(new_f)

# ignore_index gives the stacked frame unique row labels; without it every
# file contributes its own 0..n-1 labels and the index contains duplicates.
ama_dfs = pd.concat(file_list, axis=0, ignore_index=True)

In [None]:
# variable definitions
# LIC_year = license year
# locum tenens = short term/flexible staffing position
# MPA = major professional activity
            # OFF=Office-based
            # HPI=interns(discontinued in 1992)
            # HPR=hospital based-all other years resident
            # HPP=hospital based physician
            # MTC=medical teacher
            # ADM=administration physician
            # RES=research physician
            # OTH=other physician
            # INA=inactive physician
            # NCL=Not classified
            # UNA=address unknown physician
            # TFG=temporary foreign physician
            # CUT=cut physician
            # LOC=locum tenens (began in 1996)
# dead = deceased indicator
# TOP = type of practice 
#             Code	Description
#             012	Resident
#             020	Direct Patient Care
#             030	Administration
#             040	Medical Teaching
#             050	Medical Research
#             062	Non-Patient Care
#             071	Retired
#             072	Semi-Retired
#             074	Temporarily not in Practice
#             075	Not active for other reasons
#             100	No classification
# PE = present employment
#             "010","SELF EMPLOYED"
#             "011","SELF EMPLOYED SOLO PRACTICE"
#             "013","TWO PHYSICIAN PRACTICE - OWNER"
#             "014","TWO PHYSICIAN PRACTICE - EMPL."
#             "021","OTHER PATIENT CARE"
#             "022","Locum Tenens"
#             "030","GROUP PRACTICE"
#             "035","HMO"
#             "040","MEDICAL SCHOOL"
#             "050","NON-GOVERNMENT HOSPITAL"
#             "060","-CITY/COUNTY/STATE GOVERNMENT-"
#             "063","CITY/COUNTY/STATE GOVT HOSP"
#             "064","CITY/COUNTY/STATE GOVT OTHER"
#             "080","-FEDERAL GOVERNMENT HOSPITAL-"
#             "081","FEDERAL GOVT HOSP ARMY"
#             "082","FEDERAL GOVT HOSP NAVY"
#             "083","FEDERAL GOVT HOSP AIR FORCE"
#             "084","FEDERAL GOVT HOSP U.S.P.H.S."
#             "085","FEDERAL GOVT HOSP VET ADMIN"
#             "086","FEDERAL GOVT HOSP OTHER"
#             "090","-FEDERAL GOVERNMENT NON-HOSP-"
#             "091","FEDERAL GOVT N-H ARMY"
#             "092","FEDERAL GOVT N-H NAVY"
#             "093","FEDERAL GOVT N-H AIR FORCE"
#             "094","FEDERAL GOVT N-H U.S.P.H.S."
#             "095","FEDERAL GOVT N-H VET ADMIN"
#             "096","FEDERAL GOVT N-H OTHER"
#             "101","OTHER NON-PATIENT CARE"
#             "110","NO CLASSIFICATION"

# MED_TRFROM = date of medical training start/end. The date the physician entered 
        # the current graduate medical training program and the anticipated completion date.
        # 000000000000 is the same as 00  0000  00. All 0’s = not reported.
        # For years 1978, 1985, 1995, the date is formatted MMYYYYMMYYYY, no spaces. 
        # For 2005, the date is formatted M YYYYM YYYY for single digit months and MMYYYYMMYYYY for
        # double digit months. 2015 is just a year. It’s not clear whether it is the start year or 
        # the completion year.

# MEDTRINST = Medical Training Institution Code - dbo_res_train.txt file contains codes 
# STSCHGRAD = school of graduation; corresponds to Dbo_LU_AMA_Schools.txt
# ECFMG = Educational Commission for Foreign Medical Graduates. A unique identifying number
#     assigned by the Educational Commission for Foreign Medical Graduates to foreign medical
#     graduates applying for ECFMG certification. 000000 = no ECFMG # reported.
# GRAD_YR = med school graduation year (range from 1955-1975)
# FED_CODE = federal code, 1 = federal physician, 0 = non federal physician
# B_DATE = birth_date
# spec1 = specialty 1
# spec2 = specialty 2
# B_PLACE = birth place

In [None]:
# Load the TOP, MPA and PE code books that are merged onto the AMA data.
# Key dtypes: TOP -> int (no leading zeros), MPA -> left as read (string
# codes), PE -> float because the PE column in the data has missing values.

# read the raw dictionaries
top_df = pd.read_excel(os.path.join(AMA_DATA_DICT_DIR, top_codes))
mpa_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, mpa_codes))
pe_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, pe_codes))

# give each dictionary a (code, description) pair of column names
top_df.columns = ['TOP', 'TOP_description']
mpa_df.columns = ['MPA', 'MPA_description']
pe_df.columns = ['PE', 'PE_description']

# align the key dtypes with the data columns they are merged against
top_df['TOP'] = top_df['TOP'].astype(int)
pe_df['PE'] = pe_df['PE'].astype(float)

In [None]:
# need to strip leading 0 from top code coding
def strip_leading_zero(raw_str):
    """Convert a TOP code to an int, ignoring any leading zeros.

    Base-10 ``int()`` parsing already tolerates leading zeros in strings
    ('012' -> 12), so no slicing is needed. Slicing the raw argument raised
    TypeError for the numbers 0 / 0.0 and ValueError for the string '0'.

    Args:
        raw_str: the code as a string (e.g. '012') or a number (e.g. 12, 12.0).

    Returns:
        The code as an int.

    Raises:
        ValueError: if a string value is not an integer literal.
    """
    return int(raw_str)
    
def avoid_null_wrapper(x, fnc):
    """Return ``fnc(x)``, or NaN when ``x`` is null so ``fnc`` never sees it."""
    return np.nan if pd.isnull(x) else fnc(x)

# Null-safe converters: rpartial fixes the converter as the trailing ``fnc``
# argument, so each resulting callable takes just the cell value.
# Note that float_fnc casts with int(); the column ends up float only when
# NaNs are present.
zero_fnc = funcy.rpartial(avoid_null_wrapper, strip_leading_zero)
float_fnc = funcy.rpartial(avoid_null_wrapper, int)

for code_col, converter in (('TOP', zero_fnc), ('PE', float_fnc)):
    ama_dfs[code_col] = ama_dfs[code_col].apply(converter)

In [None]:
# Attach the description for each code, one code book at a time. Left joins
# keep every AMA row even when its code has no dictionary entry.
ama1 = ama_dfs.merge(top_df, on='TOP', how='left')
ama2 = ama1.merge(mpa_df, on='MPA', how='left')
ama3 = ama2.merge(pe_df, on='PE', how='left')

# print ama3['TOP_description'].unique()
# print ama3['MPA_description'].unique()
# print ama3['PE_description'].unique()

In [None]:
# Flag rows that carry a TOP / MPA / PE code but did not pick up a
# description in the merges, i.e. codes absent from the code books.
missing_top = ama3['TOP'].notnull() & ama3['TOP_description'].isnull()
missing_mpa = ama3['MPA'].notnull() & ama3['MPA_description'].isnull()
missing_pe = ama3['PE'].notnull() & ama3['PE_description'].isnull()

In [None]:
# List the distinct codes that have no code-book entry (MPA, then TOP, then PE).
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(ama3.loc[missing_mpa].MPA.unique())
print(ama3.loc[missing_top].TOP.unique())
print(ama3.loc[missing_pe].PE.unique())

In [None]:
# A code of 0 in either institution column is treated as missing.
for inst_col in ('MEDTRINST', 'STSCHGRAD'):
    ama3.loc[ama3[inst_col] == 0, inst_col] = np.nan

In [None]:
# Load the look-up tables that turn STSCHGRAD / MEDTRINST codes into names.

# medical school of graduation
med_school_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, med_schools_fname))
med_school_df.columns = ['STSCHGRAD', 'MED_SCHOOL', 'MED_SCHOOL_STATE']
med_school_df['STSCHGRAD'] = med_school_df['STSCHGRAD'].astype(float)

# medical training institution
train_school_df = pd.read_csv(os.path.join(AMA_DATA_DICT_DIR, res_fname))
train_school_df.columns = [
    'MEDTRINST',
    'MEDTRINST_NAME',
    'MEDTRINST_ADD1',
    'MEDTRINST_ADD2',
    'MEDTRINST_CITY',
    'MEDTRINST_ST',
    'MEDTRINST_ZIP',
]
train_school_df['MEDTRINST'] = train_school_df['MEDTRINST'].astype(float)

In [None]:
# Cast both code columns to float so they share a dtype with the look-up keys.
for school_col in ('STSCHGRAD', 'MEDTRINST'):
    ama3[school_col] = ama3[school_col].astype(float)

In [None]:
# merge in medical school and train inst and check for data values not in the dictionary
# Join keys are spelled out (as in the TOP/MPA/PE merges) so the join cannot
# silently widen to any other column name the two frames happen to share.
ama4 = pd.merge(left=ama3, right=med_school_df, on=['STSCHGRAD'], how='left')
ama5 = pd.merge(left=ama4, right=train_school_df, on=['MEDTRINST'], how='left')

In [None]:
# find codes not in corresponding data dictionaries: a code is present but the
# merged-in name column is null
missing_med_schools = (~pd.isnull(ama5.STSCHGRAD) & pd.isnull(ama5.MED_SCHOOL))
missing_tr_schools = (~pd.isnull(ama5.MEDTRINST) & pd.isnull(ama5.MEDTRINST_NAME))
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(missing_med_schools.sum())
print(missing_tr_schools.sum())

In [None]:
# print missing school and training codes
# A cell only displays its last expression, so the first result used to be
# discarded; print both explicitly.
print(ama5[missing_med_schools]['STSCHGRAD'].sort_values().unique())
print(ama5[missing_tr_schools]['MEDTRINST'].sort_values().unique())

In [None]:
# check fill rates on variables
# try to match med school name to our medical school name
# merge in first and last name plus ID file

In [None]:
# Rows whose MEDTRINST code has no entry in the training-institution table.
ama5.loc[missing_tr_schools, ['STSCHGRAD', 'MED_SCHOOL', 'MEDTRINST', 'MEDTRINST_NAME']]

In [None]:
# Rows where a training-institution name was merged in.
ama5.loc[~pd.isnull(ama5['MEDTRINST_NAME'])]

In [None]:
# Distinct MEDTRINST codes present in the data.
ama3['MEDTRINST'].unique()

In [None]:
# Number of distinct observation years in which each RESEARCH ID appears.
id_year_keys = ['RESEARCH ID', 'observation_year']
unique_id_years = ama_dfs.sort_values(id_year_keys).drop_duplicates(id_year_keys)
id_years = unique_id_years.groupby(['RESEARCH ID']).size()


In [None]:
# How many RESEARCH IDs are observed in 1, 2, ... distinct years.
id_years.value_counts()


In [None]:
# Distinct PE values after the null-safe int conversion.
ama_dfs['PE'].unique()

In [None]:
# rename ama_dfs columns: every AMA column except the join key gets an
# 'AMA_' prefix and a lower-cased name
def _birth_year(raw_date):
    """Return the last four characters of a B_DATE value as an int year.

    Null dates give NaN. Integral floats are converted to int first: when a
    numeric column contains NaN, pandas reads it as float, and the trailing
    '.0' in the string form would otherwise end up inside the slice.
    """
    if pd.isnull(raw_date):
        return np.nan
    if isinstance(raw_date, float) and raw_date.is_integer():
        raw_date = int(raw_date)
    return int(str(raw_date)[-4:])

ama_dfs['birth_year'] = ama_dfs['B_DATE'].apply(_birth_year)
ama_merge_df = ama_dfs.drop(['first_initial', 'clean_last_name', 'hash_id', 'match_id', 'dno'], axis=1)
# Rename by label rather than by position: assigning a rebuilt list to
# .columns silently mislabels every column unless person_uuid is first.
ama_merge_df = ama_merge_df.rename(columns={
    col: 'AMA_{}'.format(col.lower())
    for col in ama_merge_df.columns if col != 'person_uuid'})
ama_merge_df = ama_merge_df.rename(columns={'AMA_research id': 'AMA_research_id'})

In [None]:
# Left join: keep every NIH row and attach all AMA rows sharing its person_uuid.
full_df = nih_df.merge(ama_merge_df, on='person_uuid', how='left')

In [None]:
# Signed and absolute NIH-minus-AMA differences for graduation and birth year.
for diff_col, nih_col, ama_col in (
        ('grad_diff', 'medschool_year_grad', 'AMA_grad_yr'),
        ('birth_diff', 'birth_year', 'AMA_birth_year')):
    full_df[diff_col] = full_df[nih_col] - full_df[ama_col]
    full_df['abs_' + diff_col] = full_df[diff_col].abs()

In [None]:
# One row per (person_uuid, AMA_research_id) pair: after sorting, the row kept
# for each pair is the one with the smallest absolute graduation-year
# difference, then the smallest absolute birth-year difference.
ranked_pairs = full_df.sort_values(['person_uuid', 'abs_grad_diff', 'abs_birth_diff'])
id_combos = ranked_pairs.drop_duplicates(['person_uuid', 'AMA_research_id'])

In [None]:
# NIH rows with no AMA candidate at all, restricted to the columns that do not
# carry the AMA_ prefix.
non_ama_cols = [c for c in id_combos.columns if not c.startswith('AMA_')]
no_matches = id_combos.loc[pd.isnull(id_combos['AMA_research_id']), non_ama_cols]
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(no_matches.shape)

In [None]:
# Keep only the pairs that actually have an AMA candidate.
id_combos2 = id_combos.loc[id_combos['AMA_research_id'].notnull()]

In [None]:
# Exact matches agree on both birth year and graduation year.
is_exact = (id_combos2['birth_diff'] == 0) & (id_combos2['grad_diff'] == 0)
exact_matches = id_combos2.loc[is_exact, NAMES + ['person_uuid', 'AMA_research_id']]
# number of exact AMA candidates per NIH person
counts = exact_matches.groupby(['person_uuid']).size()

In [None]:
# Turn the per-person counts into a frame. The size Series is unnamed, so the
# count column is labelled 0 (the filter below relies on that label).
counts2 = counts.reset_index()

In [None]:
# No `on=` is given, so the merge uses the columns both frames share.
exact_matches2 = pd.merge(left=exact_matches, right=counts2)
# Keep people with exactly one exact AMA candidate (unambiguous matches).
exact_matches3 = exact_matches2.loc[exact_matches2[0]==1]

In [None]:
exact_matches3.shape

In [None]:
# NIH rows for people with a unique exact match, plus the matched AMA id.
unique_match_ids = exact_matches3[['person_uuid', 'AMA_research_id']]
matches = nih_df.merge(unique_match_ids, how='inner')

In [None]:
# Stack the uniquely matched rows on top of the rows with no AMA candidate.
matches2 = pd.concat([matches, no_matches], axis=0)

In [None]:
# Shapes of the matched, combined, unmatched and full NIH frames, for comparison.
matches.shape

In [None]:
matches2.shape

In [None]:
no_matches.shape

In [None]:
nih_df.shape

In [None]:
# Ids on each side that are already part of a unique exact match.
matched_nih_ids = list(matches.person_uuid.values)
matched_ama_ids = list(matches.AMA_research_id.values)

In [None]:
# Peek at the first matched NIH id.
matched_nih_ids[0]

In [None]:
# Remaining candidate pairs: neither the NIH person nor the AMA id is already
# used by a unique exact match. (Rebinds id_combos2.)
nih_unmatched = ~id_combos['person_uuid'].isin(matched_nih_ids)
ama_unmatched = ~id_combos['AMA_research_id'].isin(matched_ama_ids)
id_combos2 = id_combos.loc[nih_unmatched & ama_unmatched]
id_combos2.shape

In [None]:
# Side-by-side view of the NIH and AMA graduation / birth years for the
# remaining candidate pairs.
id_combos2[NAMES+['medschool_year_grad', 'AMA_grad_yr', 'AMA_research_id', 'person_uuid', 'AMA_observation_year', 'birth_year', 'AMA_birth_year']]

In [None]:
full_df.shape

In [None]:
# full_df carries the AMA columns under their renamed, AMA_-prefixed labels;
# the raw 'RESEARCH ID' / 'GRAD_YR' names only exist on ama_dfs.
full_df.loc[full_df['AMA_research_id']==3403630117, ['AMA_grad_yr', 'medschool_year_grad']]

In [None]:
# NOTE(review): the original comparison had no right-hand side (a syntax
# error). The id below is the one inspected in the full_df cell just above --
# confirm it is the record intended here.
ama_dfs.loc[ama_dfs['RESEARCH ID']==3403630117]

In [None]:
# Number of stacked rows per RESEARCH ID.
ama_dfs['RESEARCH ID'].value_counts()

In [None]:
# NOTE(review): df1 is not defined anywhere in the visible notebook; this cell
# presumably raises NameError on a fresh kernel -- confirm and remove or fix.
df1.head()

In [None]:

# File names of the yearly AMA extracts that were loaded.
ama_data_files