In [1]:
from collections import Counter
import funcy
from fuzzywuzzy import fuzz
import numpy as np 
import pandas as pd 
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, ATT_DATA_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES, FEMALE_MIDDLE_NAMES, 
    PICKLE_DIR, AAMC_DATA_DIR, GRANT_ID)

from merging_functions import *

# Module-level switch; NOTE(review): not read by any cell visible in this
# section of the notebook -- confirm whether later cells still use it.
OUTPUT_CSV = False 

# Column names describing an applicant's personal details.
# NOTE(review): not referenced by any cell visible in this section.
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']


# load autoreload extension so edits to the imported project modules
# (data_cleaning_functions, dev, merging_functions) are picked up live
%load_ext autoreload
%autoreload 2

In [2]:
# Load the standardized applicant table from APP_DATA_DIR and discard the
# 'Unnamed: 0' column (presumably an index written by an earlier to_csv -- confirm).
apps = pd.read_csv(os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_grant_standardized.csv')).drop('Unnamed: 0', axis=1)

In [3]:
# Give the raw columns the names the summary-statistics cells below refer to
# (e.g. 'medical_school_std', 'residency_department', 'birth_country').
apps = apps.rename(columns={'inst': 'medical_school_std', 'department': 'residency_department', 
                           'birth_country_cd': 'birth_country', 'institute': 'NIH_institution', 
                           'sub_department':'residency_sub_dep'})

In [4]:
# Flag (1/0) applicants whose earliest and latest application years differ.
# A vectorized comparison replaces the row-wise apply that read each row
# positionally (x[0], x[1]); positional integer access on a label-indexed
# Series is deprecated in newer pandas. As before, a row where either year is
# NaN compares as "different" and is flagged 1.
apps['multiple_applications_flag'] = (
    apps['application_year_min'] != apps['application_year_max']).astype('int64')

In [5]:
def get_percent_df(df_raw, group_col_1, value_col):
    """Per-group share of each ``value_col`` value.

    For every group defined by ``group_col_1`` the number of distinct
    ``PERSON_ID`` values in each (group, ``value_col``) cell is divided by the
    number of rows in that group. Note the asymmetry: the numerator counts
    distinct persons, the denominator counts rows.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Must contain ``PERSON_ID``, the grouping column(s) and ``value_col``.
    group_col_1 : str or iterable of str
        One grouping column name, or several.
    value_col : str
        Column whose values are tabulated within each group.

    Returns
    -------
    pandas.DataFrame
        The per-group slices concatenated row-wise, indexed by the grouping
        column(s) plus ``value_col``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no group produced a slice.
    """
    if isinstance(group_col_1, str):
        grouping_1 = [group_col_1]
    else:
        grouping_1 = list(group_col_1)
    grouping_2 = grouping_1 + [value_col]

    # Denominator: row count per group.
    group_sizes = df_raw.groupby(grouping_1)[PERSON_ID].apply(lambda x: len(x))
    # Numerator: distinct persons per (group, value) cell.
    person_counts = pd.DataFrame(
        df_raw.groupby(grouping_2)[PERSON_ID].apply(lambda x: len(x.unique())))

    vals = []
    for v in group_sizes.index.values:
        try:
            group_slice = person_counts.xs(v, level=group_col_1, drop_level=False)
            vals.append(group_slice / group_sizes[v])
        except KeyError as e:
            # The group is absent from the finer grouping (no grouped values
            # in the index); report the key and continue with the next group.
            # print() with one argument behaves the same on Python 2 and 3.
            print(e)
    concat_df = pd.concat(vals, axis=0)
    return concat_df

In [6]:
def _unique_sheet_name(name, used_names, max_len=30):
    """Truncate ``name`` to ``max_len`` characters without reusing a name.

    If the truncated name is already in ``used_names``, the tail is replaced
    by ``_2``, ``_3``, ... until it is unique. The chosen name is added to
    ``used_names`` before being returned.
    """
    candidate = name[:max_len]
    counter = 2
    while candidate in used_names:
        suffix = '_{}'.format(counter)
        candidate = name[:max_len - len(suffix)] + suffix
        counter += 1
    used_names.add(candidate)
    return candidate


def calc_sum_stats(df, excel_writer, col_name, combos=None):
    """Write frequency tables for ``col_name`` to sheets of ``excel_writer``.

    The first sheet (named ``col_name``) holds the overall share of each value,
    NaN included. One further sheet is written per entry of ``combos`` with the
    within-group shares computed by ``get_percent_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    excel_writer : pandas.ExcelWriter
        Open writer; it is saved before returning.
    col_name : str
        Column to summarise.
    combos : list of (grouping, sheet_name_template), optional
        ``grouping`` is a column name or a tuple of column names;
        ``sheet_name_template`` is formatted with ``col_name``. When omitted
        or empty, the control_flag / application_year_min defaults are used.
    """
    # Overall distribution of the column.
    d1_size = df.shape[0]
    d1 = pd.DataFrame(df[col_name].value_counts(dropna=False)/d1_size)
    d1.to_excel(excel_writer, sheet_name=col_name)
    if not combos:
        combos = [('control_flag', '{}_control_flag'), ('application_year_min', '{}_app_year'), 
                     (('control_flag', 'application_year_min'), '{}_app_year_control')]

    # Sheet names are cut to 30 characters. Plain truncation made distinct
    # sheets collide (e.g. '<col>_app_year' and '<col>_app_year_control' for a
    # long <col>), so the later table landed on the earlier sheet's name.
    # Track the names used in this call and disambiguate any collision.
    used_names = {col_name}
    for grouping_combo, sheet_name in combos:
        formatted_sheetname = _unique_sheet_name(sheet_name.format(col_name), used_names)
        d2 = get_percent_df(df, grouping_combo, col_name)
        d2.to_excel(excel_writer, sheet_name=formatted_sheetname)

    # Persist everything written so far.
    # NOTE(review): ExcelWriter.save() is not available in recent pandas
    # releases; revisit if the notebook is moved off the pandas version it
    # was written for.
    excel_writer.save()

In [7]:
# Single workbook shared by every calc_sum_stats call below.
# NOTE(review): this is written to PICKLE_DIR while the missing-data CSV goes
# to SUM_STAT_DIR -- confirm the directory is intentional.
writer = pd.ExcelWriter(os.path.join(PICKLE_DIR, 'sample_summary_stats.xlsx'), engine='openpyxl')

In [8]:
# Summary sheets for the standardized medical school, using the default groupings.
calc_sum_stats(apps, writer, 'medical_school_std', None)

In [9]:
# Summary sheets for the training-related columns, using the default groupings.
for stat_col in ('residency_department', 'residency_hospital_std',
                 'internship_hospital_std', 'eod_year'):
    calc_sum_stats(apps, writer, stat_col, None)

# application_year_min is itself one of the default grouping columns, so only
# the control_flag breakdown is requested for it.
calc_sum_stats(apps, writer, 'application_year_min', [('control_flag', '{}_control_flag')])

1L
1L
1L
1L
1L
1L
1L
1L
1L
1L
1L
1L
1L
1L
1L


In [10]:
# Summary sheets for the remaining columns, using the default groupings.
for stat_col in ('birth_country', 'birth_year', 'multiple_applications_flag'):
    calc_sum_stats(apps, writer, stat_col)

1959.0
1960.0
1959.0
1960.0
1959.0
1959.0


In [11]:
def count_missing_values(df, col_name, NULL_VALS=None):
    """Return the share of missing / non-missing entries in ``df[col_name]``.

    Side effects on ``df`` (modified in place):
      * entries of ``col_name`` equal to one of ``NULL_VALS`` become NaN;
      * a ``'<col_name>_missing'`` column is (re)created, 1 where the value
        is null and 0 elsewhere.

    ``NULL_VALS`` defaults to ``['NONE', 'OTHER', '', 'None', 'NA']`` when it
    is omitted or empty. The result is the value counts of the indicator
    column divided by the number of rows.
    """
    placeholders = NULL_VALS if NULL_VALS else ['NONE', 'OTHER', '', 'None', 'NA']

    # Normalise placeholder strings to real NaN before testing for nulls.
    is_placeholder = df[col_name].isin(placeholders)
    df.loc[is_placeholder, col_name] = np.nan

    flag_col = '{}_missing'.format(col_name)
    df[flag_col] = df[col_name].isnull().astype('int64')

    n_rows = df.shape[0]
    return df[flag_col].value_counts() / n_rows

In [12]:
# Missing-value shares for every column of interest, one Series per column.
# A list comprehension is used instead of map + lambda so the calls run
# eagerly on Python 3 as well: count_missing_values mutates `apps` in place,
# and a lazy map object would defer that work and could only be consumed once.
res = [
    count_missing_values(apps, col_name)
    for col_name in [
        'clean_first_name', 'clean_middle_name', 'clean_last_name',
        'birth_country', 'birth_year', 'medical_school_std', 'residency_department',
        'residency_hospital', 'internship_hospital', 'undergrad_year_grad', 'race',
        'clean_college', 'medschool_year_grad', 'internship_start', 'internship_end',
        'residency_start', 'residency_end', 'address', 'city', 'state', 'aamc_id',
        'eod_year', 'application_year_min']
]

In [13]:
# Line the per-column missing-value shares up side by side (one column per
# '<col>_missing' indicator) and write the table to SUM_STAT_DIR.
# NOTE(review): the CSV is written unconditionally; OUTPUT_CSV is not consulted here.
missing_series = pd.concat(res, axis=1)
missing_series.to_csv(os.path.join(SUM_STAT_DIR, 'missing_data_summary_stats.csv'))

In [14]:
# Inspect the name and application-year columns of applicants whose earliest application year is 1959.
apps.loc[apps.application_year_min==1959, NAME_COLS+['application_year_min', 'application_year_max', 'application_year']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,application_year_min,application_year_max,application_year
628,JOSEPH,DANIEL,,1959.0,1970.0,1970.0
3110,DONALD,,PUGATCH,1959.0,1961.0,1961.0


In [15]:
# Distribution of control_flag among applicants whose earliest application year is 1968.
apps.loc[apps.application_year_min==1968, 'control_flag'].value_counts()

0    202
1     16
Name: control_flag, dtype: int64

In [16]:
# Number of rows per earliest application year (all applicants).
apps.application_year_min.value_counts()

1971.0    679
1970.0    460
1972.0    428
1967.0    223
1969.0    220
1968.0    218
1973.0    214
1963.0    200
1966.0    184
1965.0    180
1964.0    164
1962.0    152
1961.0    140
1975.0    136
1974.0    129
1960.0      6
1959.0      2
Name: application_year_min, dtype: int64

In [17]:
# Same per-year counts, restricted to rows with control_flag == 0.
apps.loc[apps.control_flag==0, :].application_year_min.value_counts()

1971.0    318
1970.0    256
1972.0    240
1967.0    211
1968.0    202
1963.0    198
1969.0    197
1966.0    180
1965.0    173
1964.0    164
1962.0    151
1973.0    139
1961.0    138
1975.0     93
1974.0     90
1960.0      6
1959.0      2
Name: application_year_min, dtype: int64

In [14]:
# change all numerical values to percents
# remove grant data

In [34]:
# NOTE(review): `nih` is not defined in any cell visible in this section and
# this cell's execution count is out of order -- it likely relies on leftover
# kernel state and may fail on Restart & Run All; confirm or remove.
nih.columns

Index([u'Unnamed: 0', u'citizenship', u'data_source', u'dno', u'dob',
       u'eod_year', u'firstname', u'generation', u'institute',
       u'internship_hospital', u'lab_brch', u'lastname', u'medical_school',
       u'middlename', u'program', u'residency_hospital', u'residency',
       u'source', u'ssn', u'supervisor', u'unknown', u'year_grad', u'suffix',
       u'clean_first_name', u'clean_middle_name', u'clean_last_name',
       u'res_dates', u'intern_dates', u'clean_medical_school'],
      dtype='object')

In [27]:
# TODO list for data-quality checks:
# check for any suffixes or incorrect names
# check for people with missing first/last names
# find people with year accepted but not matched to NIH data set
# add last name counts
# Rows with no cleaned first name, shown with their name columns, control flag and medical school.
apps.loc[pd.isnull(apps.clean_first_name), NAME_COLS+['control_flag', 'medical_school']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,control_flag,medical_school
629,,,DEPPERMAN,1,PENNSLYVANIA
630,,,NEIDENGARD,1,JOHNS HOPKINS
631,,,MESIEFIELD,0,HARVARD
632,,,LUCKY,0,YALE
633,,,LICHTER,0,
635,,,LAURENO,1,CORNELL
638,,,DREWS,0,COLORADO HEALTH SCIENCES CENTER
643,,WILLIAM,MCLAIN,0,DUKE
647,,VIRGIL,FAMIGLIETTI,1,YALE
648,,VINCENT,CHISARI,0,CORNELL
