In [1]:
from collections import Counter
import funcy
from fuzzywuzzy import fuzz
import numpy as np 
import pandas as pd 
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, GRANT_DATA_DIR, CORRECTIONS_DIR,  NAME_COLS, RAW_NAME_COLS, FEMALE_FIRST_NAMES,
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, PICKLE_DIR, STD_DIR, ATT_DATA_DIR)

from merging_functions import *

OUTPUT_CSV = False 

PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']

manual_corrections_filename = 'NIH_grant_merge_manual_corrections.xlsx'

# load autoreload extension
%load_ext autoreload
%autoreload 2

In [2]:
apps = pd.read_csv(os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_standardized.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
grant_data = pd.read_stata(os.path.join(GRANT_DATA_DIR, 'grant_data.dta'))

stars_data = pd.read_stata(os.path.join(GRANT_DATA_DIR, 'stars_data.dta'))

grant_data.columns = map(lambda x: x.lower(), grant_data.columns)

stars_data.columns = map(lambda x: x.lower(), stars_data.columns)

In [11]:
stars_data['is_star'] = 1

In [23]:
unique_grants = grant_data.sort_values('year').drop_duplicates(
    ['newsetnb'], keep='first')[
        ['first', 'middle', 'last', 'suffix', 'newsetnb', 'mdyear', 'phdyear', 'year', 'hphdyear', 'deg_cgaf']].rename(columns={'year': 'first_grant_year'})

In [24]:
unique_grants2 = pd.concat([stars_data.drop(['fullname'], axis=1), unique_grants], axis=0)

In [25]:
unique_grants2['clean_first_name'] = unique_grants2['first'].apply(clean_names)
unique_grants2['clean_middle_name'] = unique_grants2['middle'].apply(clean_names)

unique_grants2['clean_last_name'] = unique_grants2['last'].apply(clean_names)

In [26]:
female_mask = (unique_grants2['clean_first_name'].isin(FEMALE_FIRST_NAMES))
unique_grants2['is_female'] = 0
unique_grants2.loc[female_mask, 'is_female'] = 1

In [27]:
def get_initial(raw_str):
    if pd.isnull(raw_str):
        return np.nan
    if len(raw_str) == 1:
        return raw_str
    return raw_str[0]

unique_grants2['clean_middle_initial'] = unique_grants2.clean_middle_name.apply(get_initial)
unique_grants2['clean_first_initial'] = unique_grants2.clean_first_name.apply(get_initial)

In [28]:
# drop females and people with masters degrees
unique_grants_2a = unique_grants2.loc[unique_grants2.is_female==0, :]
unique_grants_2b = unique_grants_2a.loc[unique_grants_2a['deg_cgaf']!='Masters', :]

In [30]:
unique_grants3 = unique_grants_2b[
    unique_grants_2b.is_female==0].sort_values(['newsetnb', 'is_star'], ascending=False).drop_duplicates(['newsetnb'], keep='first')

In [31]:
apps_counter = Counter(apps.clean_last_name.values)
unique_grants3['last_name_counts'] = unique_grants3.clean_last_name.apply(lambda x: apps_counter[x])
unique_grants4 = unique_grants3[((
        (unique_grants3.last_name_counts>0) & (unique_grants3.mdyear < 1975)) |
        (unique_grants3.last_name_counts>0) & (pd.isnull(unique_grants3.mdyear)))]
#     ((unique_grants3.last_name_counts>1) & (unique_grants3.mdyear<1975) & (unique_grants3.mdyear>1960))]

In [32]:
unique_grants4 = unique_grants4.rename(columns={
            'mdyear': 'medschool_year_grad', 'phdyear': 'phd_year_grad', 
            'star_yob':'birth_year', 'star_yod': 'death_year', 'first': 'grant_first_name', 
            'middle': 'grant_middle_name', 'last': 'grant_last_name', 'hphdyear': 'grant_hphd_year_grad'})

In [35]:
unique_grants4[
    ['grant_medschool_year_grad', 'grant_phd_year_grad', 'grant_birth_year', 'grant_degree_type']] = unique_grants4[
        ['medschool_year_grad', 'phd_year_grad', 'birth_year', 'deg_cgaf']]

In [36]:
unique_grants4.head()

Unnamed: 0,deg,deg_cgaf,exit,grant_first_name,first_grant_year,grant_hphd_year_grad,is_star,grant_last_name,main_dept,medschool_year_grad,...,clean_middle_name,clean_last_name,is_female,clean_middle_initial,clean_first_initial,last_name_counts,grant_medschool_year_grad,grant_phd_year_grad,grant_birth_year,grant_degree_type
1867084,,MD/PhD,,Eric,1998.0,,,Rubin,,,...,,RUBIN,0,,E,5,,,,MD/PhD
1867083,,PhD,,William,2000.0,,,Randall,,,...,C,RANDALL,0,C,W,1,,1983.0,,PhD
1867070,,PhD,,Michael,1987.0,,,Smith,,,...,E,SMITH,0,E,M,23,,1987.0,,PhD
1867068,,PhD,,Suzanne,2000.0,,,Bradshaw,,,...,F,BRADSHAW,0,F,S,2,,1991.0,,PhD
1867050,,PhD/OhD,,Fred,1981.0,,,Bryan,,,...,A,BRYAN,0,A,F,1,,,,PhD/OhD


In [37]:
unique_grants3.shape

(174957, 23)

In [38]:
del unique_grants2, unique_grants, unique_grants3, grant_data, stars_data

In [39]:
exact = pd.merge(
    left=apps, right=unique_grants4, on=['clean_first_name', 'clean_last_name'], how='inner', suffixes=['_x', '_y'])
exact.shape

(4277, 161)

In [40]:
print apps.shape

(4106, 136)


In [41]:
exact['grad_sim'] = exact.apply(lambda x: abs(x['medschool_year_grad_x'] - x['medschool_year_grad_y']), axis=1)
missing_phd_mask = pd.isnull(exact['medschool_year_grad_y'])
exact.loc[missing_phd_mask, 'grad_sim'] = exact[
    missing_phd_mask].apply(lambda x: abs(x['medschool_year_grad_x'] - x['grant_phd_year_grad']), axis=1)
exact['hphd_grad_sim'] = exact.apply(lambda x: abs(x['medschool_year_grad_x'] - x['grant_hphd_year_grad']), axis=1)
exact['activity_year_sim'] = exact.apply(lambda x: x['first_grant_year'] - x['medschool_year_grad_x'], axis=1)

def check_match(row):
    grad_diffs = funcy.remove(lambda x: pd.isnull(x), [row['grad_sim'], row['hphd_grad_sim']])
    grad_sim = 0
    if len(grad_diffs) > 0:
        grad_sim = np.min(grad_diffs)
    if grad_sim > 5:
        return 0
    # first grant shouldn't be given prior to med school graduation
    if row['activity_year_sim'] < 0:
        return 0
    if ~pd.isnull(row['grant_phd_year_grad']) and row['grant_phd_year_grad'] != row['medschool_year_grad_x']:
        return 0
    if not pd.isnull(row['clean_middle_initial_sim']) and row['clean_middle_initial_sim'] < 80:
        return 0
    if row['last_name_counts'] == 1:
        return 1
    return 1

feature_dict = {
    'clean_middle_name': get_name_str_sim,
    'clean_middle_initial': get_name_str_sim,
} 

exact2 = add_similarity_features(exact, feature_dict, check_match, suffixes=['_x', '_y'])

print unique_grants4.is_star.sum()
print exact2[exact2['is_match']==1].is_star.sum()
print exact2.is_star.sum()
print exact2['is_match'].sum()

1131.0
12.0
589.0
47


In [42]:
# exact2.loc[exact2['is_match']==1,NAME_COLS+['grant_first_name', 'grant_middle_name', 'grant_last_name', 'medschool_year_grad_x', 'medschool_year_grad_y', 'birth_year_x', 'birth_year_y']]

In [43]:
# change raw sim columns to 100- so can be sorted with small values first

exact2.loc[:, 'clean_middle_name_sim'] = 100 - exact2['clean_middle_name_sim']
exact2.loc[:, 'clean_middle_initial_sim'] = 100 - exact2['clean_middle_initial_sim']

In [44]:
# do sort in ascending, because grad year sim and activity sim should be as small as possible
sim_cols =  ['grad_sim', 'activity_year_sim', 'clean_middle_name_sim', 'clean_middle_initial_sim']
exact3 = filter_one_match_per_group_simple(exact2[exact2['is_match']==1], PERSON_ID, sim_cols, True)
exact4 = filter_one_match_per_group_simple(exact3, 'newsetnb', sim_cols, True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[dup_flag] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [45]:
exact5 = consolidate_merge_cols(exact4, ['_x', '_y'], [])

['clean_middle_name', 'medschool_year_grad', 'birth_year', 'is_female', 'clean_first_initial', 'clean_middle_initial']


In [46]:
nm_apps = get_nonmatched(apps, exact4[PERSON_ID].values, PERSON_ID)
nm_grant = get_nonmatched(unique_grants4, exact4['newsetnb'].values, 'newsetnb')

(4059, 136)
(46535, 27)


In [47]:
nm_apps_c = Counter(nm_apps.clean_last_name.values)
nm_grants_c = Counter(nm_grant.clean_last_name.values)
nm_apps.loc[:, 'last_name_counts'] = nm_apps.clean_last_name.apply(lambda x: nm_apps_c[x])

nm_grant.loc[:, 'last_name_counts'] = nm_grant.clean_last_name.apply(lambda x: nm_grants_c[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [48]:
last_merge = pd.merge(left=nm_apps, right=nm_grant, how='inner', on=['clean_last_name'], suffixes=['_x', '_y'])
last_merge.shape

(198341, 163)

In [49]:
del exact, exact2, exact3, exact4

In [50]:
last_merge['grad_sim'] = last_merge.apply(lambda x: abs(x['medschool_year_grad_x'] - x['medschool_year_grad_y']), axis=1)
# missing_phd_mask = pd.isnull(last_merge['medschool_year_grad_y'])
# last_merge.loc[missing_phd_mask, 'grad_sim'] = last_merge[
#     missing_phd_mask].apply(lambda x: abs(x['medschool_year_grad_x'] - x['grant_phd_year_grad']), axis=1)
last_merge['hphd_grad_sim'] = last_merge.apply(
    lambda x: abs(x['medschool_year_grad_x'] - x['grant_hphd_year_grad']), axis=1)
last_merge['activity_year_sim'] = last_merge.apply(
    lambda x: x['first_grant_year'] - x['medschool_year_grad_x'], axis=1)


def check_match(row):
    grad_diffs = funcy.remove(lambda x: pd.isnull(x), [row['grad_sim'], row['hphd_grad_sim']])
    grad_sim = 0
    if len(grad_diffs) > 0:
        grad_sim = np.min(grad_diffs)
    if grad_sim > 5:
        return 0
    # first grant shouldn't be given prior to med school graduation
    if row['activity_year_sim'] < 0:
        return 0
    if row['clean_first_initial_sim'] < 95:
        return 0
    if row['clean_middle_initial_sim'] < 95:
        return 0
    if row['clean_first_name_sim'] < 80:
        return 0
    if row['clean_middle_name_sim'] < 80:
        return 0
    if ~pd.isnull(row['grant_phd_year_grad']) and row['grant_phd_year_grad'] > 1975: 
        return 0
    if ~pd.isnull(row['birth_year_y']) and row['birth_year_y'] > 1955:
        return 0
    return 1

feature_dict = {
    'clean_middle_name': get_name_str_sim,
    'clean_middle_initial': get_name_str_sim,
    'clean_first_name': get_name_str_sim,
    'clean_first_initial': get_name_str_sim,
} 

last_merge2 = add_similarity_features(last_merge, feature_dict, check_match, suffixes=['_x', '_y'])
# print unique_grants4.is_star.sum()
print last_merge2[last_merge2['is_match']==1].is_star.sum()
print last_merge2.is_star.sum()
print last_merge2['is_match'].sum()

553.0
3712.0
2034


In [51]:
# last_merge2.loc[(
#         ~pd.isnull(last_merge2['grant_phd_year_grad']) & (last_merge2['is_match']==1)), NAME_COLS+['clean_first_name_x', 'clean_first_name_y', 'grant_first_name', 'grant_middle_name', 'grant_last_name', 'medschool_year_grad_x', 'medschool_year_grad_y', 'birth_year_x', 'birth_year_y', 'grant_phd_year_grad']]

In [52]:
#filter to one match per id
last_merge2.loc[:, 'clean_first_name_sim'] = 100 - last_merge2['clean_first_name_sim']
last_merge2.loc[:, 'clean_middle_name_sim'] = 100 - last_merge2['clean_middle_name_sim']
last_merge2.loc[:, 'clean_middle_initial_sim'] = 100 - last_merge2['clean_middle_initial_sim']
last_merge2.loc[:, 'clean_first_initial_sim'] = 100 - last_merge2['clean_first_initial_sim']

In [53]:
sim_cols =  ['grad_sim', 'activity_year_sim', 'clean_first_name_sim', 'clean_first_initial_sim', 'clean_middle_name_sim', 'clean_middle_initial_sim']
last_merge3 = filter_one_match_per_group_simple(last_merge2[last_merge2['is_match']==1], PERSON_ID, sim_cols, True)
last_merge4 = filter_one_match_per_group_simple(last_merge3, 'newsetnb', sim_cols, True)

In [54]:
last_merge4.shape

(1737, 173)

In [55]:
last_merge5 = consolidate_merge_cols(last_merge4, ['_x', '_y'], [])

['clean_first_name', 'clean_middle_name', 'medschool_year_grad', 'birth_year', 'is_female', 'clean_first_initial', 'clean_middle_initial', 'last_name_counts']


In [56]:
all_matches = pd.concat([last_merge5, exact5], axis=0)

In [57]:
nm_apps = get_nonmatched(apps, all_matches[PERSON_ID], PERSON_ID)

(2322, 136)


In [58]:
apps2 = pd.concat([all_matches, nm_apps], axis=0)

In [80]:
# merge in manual corrections
manual_corrections_df = pd.read_excel(os.path.join(CORRECTIONS_DIR, manual_corrections_filename))

In [81]:
apps2[PERSON_ID] = apps2[PERSON_ID].apply(lambda x: int(x))

In [87]:
manual_merges = pd.merge(
    left=apps2, right=manual_corrections_df[[
        PERSON_ID, 'correct_match_flag', 'flag_manual_review', 'correct_newsetnb', 'old_newsetnb']], 
                on=[PERSON_ID], how='left')
print manual_corrections_df['person_uuid'].unique().shape
print manual_merges['person_uuid'].unique().shape

(88,)
(4106,)


In [91]:
manual_review_mask = manual_merges.flag_manual_review==1
manual_merges.loc[manual_review_mask, 'newsetnb'] = manual_merges.loc[manual_review_mask, 'correct_newsetnb']

In [95]:
to_drop_cols = [c for c in apps2.columns if c.endswith('_missing') or c.endswith('_duplicate') or '_counts' in c
               or c.endswith('_sim')]
print to_drop_cols

['activity_year_sim', 'clean_first_initial_sim', 'clean_first_name_sim', 'clean_middle_initial_sim', 'clean_middle_name_sim', 'grad_sim', 'hphd_grad_sim', 'last_name_counts', 'newsetnb_duplicate', 'person_uuid_duplicate']


In [97]:
apps3 = manual_merges.drop(to_drop_cols+['correct_newsetnb', 'old_newsetnb'], axis=1)

In [98]:
apps3.loc[apps3.duplicated(NAME_COLS, keep=False), NAME_COLS+['grant_id', PERSON_ID, 'application_year', 'medical_school']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,grant_id,person_uuid,application_year,medical_school
105,ARNOLD,L,SMITH,,4749,1964.0,UNIVERSITY OF PENNSYLVANIA SCHOOL OF MEDICINE
3996,ARNOLD,L,SMITH,,192,1963.0,COLUMBIA UNIVERSITY COLLEGE OF PHYSICIANS AND ...


In [99]:
has_birth_dt = ~pd.isnull(apps3.date_of_birth)
apps3.loc[has_birth_dt, 'birth_dt'] =  apps3.loc[has_birth_dt, 'date_of_birth']
apps3.loc[has_birth_dt, 'birth_year'] =  apps3.loc[has_birth_dt, 'date_of_birth'].apply(lambda x: pd.to_datetime(x).year)
apps4 = apps3.drop('date_of_birth', axis=1).rename(columns={'birth_dt': 'birth_date'})


In [100]:
def improper_formatting(raw_last_name):
    # a boolean fnc to identify which rows may have a suffix
    if pd.isnull(raw_last_name):
        return False
    last_lst = raw_last_name.split(' ')
    if len(last_lst) == 1:
        # if no white spaces in last name, only 1 word, so no suffix
        return False
    return True


In [101]:
# check for improperly formatted last names

bad_lastnames = ['E ROSS HARVARD','MCCLURE MCCHURE', 'S COHEN NYU']
to_fix_lastnames = ['ROSS', 'MCCLURE', 'COHEN']
# apps4.loc[apps4.clean_last_name.isin(bad_lastnames), 'clean_last_name'] = to_fix_lastnames
apps4.loc[apps4.clean_last_name=='MCCLURE MCCHURE', 'clean_last_name'] = 'MCCLURE'


In [102]:
# check for improper first names
# apps4.loc[apps4.clean_first_name.apply(improper_formatting), NAME_COLS]
bad_firstnames = ['ANNE FRANCES',  
                  'PHILIP R', 'J HAROLD', 'HENRY N', 'H BENFER', 'G JAMES', 'G DAVID', 'A LELAND', 'W STEVES']
to_fix_firstnames = ['FRANCES',  'PHILIP', 'J', 'HENRY', 'H', 'G', 'G', 'A', 'W']
apps4.loc[apps4.clean_first_name.isin(bad_firstnames), 'clean_first_name'] = ['W',  'HENRY', 'G']
# apps4.loc[apps4.clean_first_name.isin(bad_firstnames), 'clean_first_name'] 

In [103]:
# check for improper middle names
bad_middlenames = ['ANNE FRANCES',  'PHILIP R', 'J HAROLD', 'HENRY N', 'H BENFER', 'G JAMES', 'G DAVID', 'A LELAND', 'W STEVES']
to_fix_middlenames = ['FRANCES',  'PHILIP', 'J', 'HENRY', 'H', 'G', 'G', 'A', 'W']

has_suff = apps4.clean_middle_name.apply(has_suffix)
apps4.loc[has_suff, 'clean_suffix'] = apps4.loc[has_suff, 'clean_middle_name'].apply(get_suffix)
apps4.loc[has_suff, 'clean_middle_name'] = apps4.loc[has_suff, 'clean_middle_name'].apply(remove_suffix_from_last_name)

In [104]:
has_suff = apps4.clean_last_name.apply(has_suffix)
apps4.loc[has_suff, 'clean_suffix'] = apps4.loc[has_suff, 'clean_last_name'].apply(get_suffix)
apps4.loc[has_suff, 'clean_last_name'] = apps4.loc[has_suff, 'clean_last_name'].apply(remove_suffix_from_last_name)

In [105]:
def remove_dr_from_first_name(raw_str):
    if pd.isnull(raw_str):
        return np.nan
    raw_str2 = raw_str.split(' DR')
    return raw_str2[0]

In [106]:
apps4.loc[(pd.isnull(apps4.clean_suffix) & (~pd.isnull(apps4.suffix))), 'clean_suffix'] =  apps4.loc[
    (pd.isnull(apps4.clean_suffix) & (~pd.isnull(apps4.suffix))), 'suffix']


In [107]:
apps4.loc[:, 'clean_suffix'] = apps4.clean_suffix.apply(clean_names)

In [108]:
apps4.loc[pd.isnull(apps4.clean_suffix), 'clean_suffix'] = np.nan

In [109]:
mask = apps4.clean_middle_name.apply(improper_formatting)
apps4.loc[mask, 'clean_middle_name'] = apps4.loc[mask, 'clean_middle_name'].apply(remove_dr_from_first_name)

In [110]:
def same_name(row):
    # check if first and last name are the same
    if row[0] == row[1]:
        print row
        return True
    return False

def is_weird_name(raw_str):
    rs = raw_str.split(' ')
    if len(rs) == 1:
        return False
    return True

In [111]:
apps4.loc[
    apps4[['clean_first_name', 'clean_last_name']].apply(same_name, axis=1), 'clean_first_name'] = 'JOHN'
#     NAME_COLS+['medical_school', 'first_name', 'middle_name', 'last_name']]

In [112]:
apps4.loc[apps4.clean_first_name==apps4.clean_middle_name, NAME_COLS+['grant_first_name', 'grant_middle_name',
        'NIH_first_name', 'NIH_middle_name', 'first_name', 'middle_name']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,grant_first_name,grant_middle_name,NIH_first_name,NIH_middle_name,first_name,middle_name
446,ARTHUR,ARTHUR,GOTTLIEB,A.,Arthur,ARTHUR,,Arthur,
4044,ALLAN,ALLAN,HOBSON,,,,,Allan,Allan


In [113]:
apps4.loc[
    apps4[['clean_first_name', 'clean_middle_name']].apply(same_name, axis=1), 'clean_middle_name'] = np.nan

clean_first_name     ARTHUR
clean_middle_name    ARTHUR
Name: 446, dtype: object
clean_first_name     ALLAN
clean_middle_name    ALLAN
Name: 4044, dtype: object


In [114]:
apps4.loc[
    apps4[['clean_middle_name', 'clean_last_name']].apply(same_name, axis=1), NAME_COLS]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name


In [115]:
# ADD LAST NAME COUNTS
nih = pd.read_csv(os.path.join(ATT_DATA_DIR, 'NIH_attendee_deduped_raw.csv'))

In [116]:
last_name_counts = Counter(nih.clean_last_name.values)

In [117]:
apps4['nih_last_name_counts'] = apps4.clean_last_name.apply(lambda x: last_name_counts[x])

In [118]:
def total_number_applications(app_years):
    unique_apps_years = app_years.dropna().unique()
    return unique_apps_years.shape[0]

apps4['number_applications'] = apps4[
    ['application_year', 'application_year_1', 'application_year_2', 'application_year_3']].apply(
        total_number_applications, axis=1)

In [119]:
to_drop = ['clean_first_initial', 'clean_middle_initial',  'exit', 'dob_app_diff', 'duplicate_dno', 'inst',
          'main_dept', 'sim', 'nih_last_name_counts']

In [120]:

apps5 = apps4.sort_values(['clean_last_name', 'clean_first_name']).drop(to_drop, axis=1)

IMP_COLS = [
    PERSON_ID, 'aamc_id', 'newsetnb', 'dno', 'clean_first_name', 'clean_middle_name', 'clean_last_name',
    'clean_suffix','control_flag', 'application_year_min', 'application_year_max', 'eod_year',
    'medical_school', 'birth_year', 'is_star',
    'residency_dates', 'residency_dates_NIH', 'internship_dates', 'internship_dates_NIH',
    'medschool_year_grad', 'birth_year', 
    'is_female', 'is_foreign', 'number_applications', 'NIH_first_name', 
    'NIH_middle_name', 'NIH_last_name', 'NIH_medical_school',
    'grant_first_name', 'grant_middle_name', 'grant_last_name',
    'grant_medschool_year_grad', 'grant_phd_year_grad', 'grant_hphd_year_grad',
    'aamc_first_name', 'aamc_middle_name', 'aamc_last_name',
    'year_accepted', 'rejected', 'rejection_date',
    'internship_start', 'internship_end', 
    'internship_hospital', 'residency_start', 'residency_end', 'residency_hospital', 
    'undergrad_year_grad', 
    'address', 'city', 'state', 'zip_code']

IMPORTANT_COLS = [NIH_ID, PERSON_ID, 'application_year_min', 'application_year_max', 'eod_year',
                  'clean_first_name', 'clean_middle_name', 
                 'clean_last_name', 'control_flag', 'time_period_flag', 'year_accepted', 'rejected',
                  'rejection_date', 'clean_college', 'medical_school',
                'residency_dates', 'internship_dates', 'is_female', 'is_foreign', 'number_applications']

RESEARCH_COLS = ['bob', 'ca', 'cc', 'clinical', 'cord', 'dbs', 'fifth', 'generation', 'honor_societies_first',
                 'honor_societies_fourth', 'honor_societies_second', 'honor_societies_third', 'ic', 'institute',
                  'nci', 'nei', 'nhi', 'nhli', 'niaid', 'niamd', 'niamdd', 'nichd', 'nichhd',
                 'nidr', 'niehs', 'nigms', 'nimh', 'nindb', 'ninds', 'oir', 
                  'pharm_ra', 'phd_year_grad', 'pi', 'program', 'ra',  'research', 'sa', 'sixth',
                 'supervisor', 'teaching', 'withdrawal']

In [121]:
col_ordered =  funcy.remove(lambda x: x in IMP_COLS or x in RESEARCH_COLS, apps5.columns)
print col_ordered

['aamc_id_2', 'aamc_medical_school', 'aamc_medschool_year_grad', 'age', 'application_date', 'application_year', 'application_year_1', 'application_year_2', 'application_year_3', 'associate_program_entered', 'birth_country_cd', 'birth_country_desc', 'birth_date', 'birth_state_cd', 'citizenship', 'clean_college', 'data_source', 'death_year', 'deg', 'deg_cgaf', 'degree_country_1', 'degree_type', 'department', 'first_grant_year', 'first_name', 'grant_birth_year', 'grant_degree_type', 'institution_aamc_id', 'internship_1', 'internship_hospital_ipfcode', 'internship_hospital_std', 'is_match', 'last_name', 'middle_name', 'original_medical_school', 'race', 'raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3', 'raw_uuid_4', 'raw_uuid_5', 'residency', 'residency_1', 'residency_hospital_ipfcode', 'residency_hospital_std', 'reviewer', 'source', 'ssn', 'sub_department', 'suffix', 'suffix_cd', 'time_period_flag', 'undergraduate_school', 'year_grad', u'correct_match_flag', u'flag_manual_review']


In [122]:
apps5.loc[pd.isnull(apps5.is_star), 'is_star'] = 0

In [123]:
apps6 = apps5[IMP_COLS + col_ordered + RESEARCH_COLS]

In [124]:
apps7 = apps6[~pd.isnull(apps6.clean_last_name)]

In [126]:
apps7.to_csv(os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_grant_standardized.csv'), index=False)