In [1]:
from collections import Counter
import funcy
from fuzzywuzzy import fuzz
import numpy as np 
import pandas as pd 
import os

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, ATT_DATA_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES,
    PICKLE_DIR, AAMC_DATA_DIR, GRANT_ID)

from merging_functions import *

# NOTE(review): not referenced in the cells visible here; presumably a switch
# for optional CSV export elsewhere in the notebook -- confirm before relying on it.
OUTPUT_CSV = False 

# Names of the person-level identifying columns.
# NOTE(review): also not referenced in the visible cells -- confirm it is still needed.
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']


# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to the imported helper modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Read the applicant table from the application-data directory.
apps_csv_path = os.path.join(APP_DATA_DIR, 'NIH_AAMC_index_cards_grant_standardized.csv')
apps = pd.read_csv(apps_csv_path)

In [3]:
# Map raw column names (left) to the names used in the rest of the notebook (right).
column_renames = {
    'inst': 'medical_school_std',
    'department': 'residency_department',
    'birth_country_cd': 'birth_country',
    'institute': 'NIH_institution',
    'sub_department': 'residency_sub_dep',
}
apps = apps.rename(columns=column_renames)

In [4]:
# 1 when the earliest and latest application years differ, 0 otherwise.
# Vectorised comparison instead of a row-wise apply that indexed each row
# positionally (x[0], x[1]); positional lookup on a label-indexed Series is
# deprecated in recent pandas. As before, a row where both years are NaN is
# flagged 1 because NaN != NaN evaluates to True.
apps['multiple_applications_flag'] = (
    apps['application_year_min'] != apps['application_year_max']).astype(int)

In [5]:
def get_percent_df(df_raw, group_col_1, value_col):
    """Per-group share of distinct people for each value of ``value_col``.

    For every group defined by ``group_col_1`` the number of distinct
    ``PERSON_ID`` values per ``value_col`` value is divided by the number of
    rows in that group.  Note the denominator counts rows, while the numerator
    counts distinct ids.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Must contain the grouping column(s), ``value_col`` and ``PERSON_ID``.
    group_col_1 : str or iterable of str
        A single grouping column name, or several column names.
    value_col : str
        Column whose value distribution is computed within each group.

    Returns
    -------
    pandas.DataFrame
        One column (named ``PERSON_ID``) of ratios, indexed by the grouping
        column(s) followed by ``value_col``.

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when no group produced a result
        (for example when ``df_raw`` is empty).
    """
    if isinstance(group_col_1, str):
        group_cols = [group_col_1]
    else:
        group_cols = list(group_col_1)

    # Denominator: number of rows in each group.
    group_sizes = df_raw.groupby(group_cols)[PERSON_ID].apply(len)
    # Numerator: distinct ids per (group, value) combination.
    distinct_counts = pd.DataFrame(
        df_raw.groupby(group_cols + [value_col])[PERSON_ID].apply(
            lambda ids: len(ids.unique())))

    shares = []
    for group_key in group_sizes.index.values:
        try:
            per_value = distinct_counts.xs(
                group_key, level=group_col_1, drop_level=False)
            shares.append(per_value / group_sizes[group_key])
        except KeyError as e:
            # The group has no entry in the numerator frame (presumably
            # because every ``value_col`` entry in it is NaN and groupby
            # drops NaN keys); report it and move on.
            # print(e) is valid on both Python 2 and Python 3.
            print(e)
    return pd.concat(shares, axis=0)

In [6]:
def calc_sum_stats(df, excel_writer, col_name, combos=None):
    """Write frequency tables for ``col_name`` to an Excel workbook.

    One sheet holds the overall value shares of ``col_name`` (NaN included);
    one further sheet is written per entry of ``combos`` with the shares
    computed by ``get_percent_df`` within each grouping.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain ``col_name`` and every grouping column.
    excel_writer : pandas.ExcelWriter
        Open writer that receives the sheets; it is saved before returning.
    col_name : str
        Column to summarise; also used to build the sheet names.
    combos : list of (grouping, sheet_name_template) pairs, optional
        ``grouping`` is a column name or tuple of column names and the
        template is formatted with ``col_name``.  When omitted or empty the
        default groupings are control flag, application year, and both.

    Returns
    -------
    None
    """
    # Overall distribution of the column, as a share of all rows.
    overall = pd.DataFrame(df[col_name].value_counts(dropna=False) / df.shape[0])
    # Excel limits sheet names to 31 characters; pass sheet_name by keyword
    # because newer pandas deprecates passing it positionally.
    overall.to_excel(excel_writer, sheet_name=col_name[:31])

    if not combos:
        combos = [('control_flag', '{}_control_flag'),
                  ('application_year_min', '{}_app_year'),
                  (('control_flag', 'application_year_min'), '{}_app_year_control')]

    for grouping_combo, sheet_name in combos:
        # NOTE(review): with a long col_name the truncated names of different
        # combos can collide -- confirm the column names used are short enough.
        formatted_sheetname = sheet_name.format(col_name)[:30]
        grouped_shares = get_percent_df(df, grouping_combo, col_name)
        grouped_shares.to_excel(excel_writer, sheet_name=formatted_sheetname)

    # Flush everything written so far to disk.
    # NOTE(review): ExcelWriter.save() was removed in pandas 2.0; it is kept
    # because the writer appears to be reused across successive calls, which
    # close() would not allow on newer pandas -- revisit when upgrading.
    excel_writer.save()

In [7]:
# writer = pd.ExcelWriter(os.path.join(PICKLE_DIR, 'sample_summary_stats.xlsx'), engine='openpyxl')

# calc_sum_stats(apps, writer, 'medical_school_std', None)

# # calc_sum_stats(apps, writer, 'residency_department', None)
# # calc_sum_stats(apps, writer, 'residency_hospital_std', None)
# # calc_sum_stats(apps, writer, 'internship_hospital_std', None)
# calc_sum_stats(apps, writer, 'eod_year', [('control_flag', '{}_control_flag')])
# calc_sum_stats(apps, writer, 'is_foreign', None)
# calc_sum_stats(apps, writer, 'is_female', None)
# calc_sum_stats(apps, writer, 'application_year_min', [('control_flag', '{}_control_flag')])

# calc_sum_stats(apps, writer, 'birth_country')
# calc_sum_stats(apps, writer, 'birth_year')
# calc_sum_stats(apps, writer, 'multiple_applications_flag')

In [8]:
def count_missing_values(df, col_name, NULL_VALS=None):
    """Flag missing entries of ``col_name`` and return their share.

    ``df`` is modified in place: entries of ``col_name`` equal to one of
    ``NULL_VALS`` are overwritten with NaN, and a ``<col_name>_missing``
    column is added holding 1 where the (cleaned) entry is null and 0
    otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``; mutated as described above.
    col_name : str
        Column to inspect.
    NULL_VALS : list, optional
        Placeholder values treated as missing.  When omitted or empty,
        ``['NONE', 'OTHER', '', 'None', 'NA']`` is used.

    Returns
    -------
    pandas.Series
        Value counts of the indicator column divided by the number of rows.
    """
    placeholders = NULL_VALS if NULL_VALS else ['NONE', 'OTHER', '', 'None', 'NA']

    # Normalise placeholder strings to real NaN before testing for nulls.
    is_placeholder = df[col_name].isin(placeholders)
    df.loc[is_placeholder, col_name] = np.nan
    is_null = df[col_name].isnull()

    # 0/1 indicator column, initialised to 0 and switched on for null rows.
    flag_col = '{}_missing'.format(col_name)
    df[flag_col] = 0
    df.loc[is_null, flag_col] = 1

    return df[flag_col].value_counts() / df.shape[0]

In [9]:
# Columns whose share of missing values is reported.
MISSING_VALUE_COLS = [
    'clean_first_name', 'clean_middle_name', 'clean_last_name', 'birth_date',
    'birth_country', 'birth_year', 'medical_school', 'residency_department',
    'residency_hospital', 'internship_hospital', 'undergrad_year_grad', 'race',
    'clean_college', 'medschool_year_grad', 'internship_start', 'internship_end',
    'residency_start', 'residency_end', 'address', 'city', 'state', 'aamc_id',
    'eod_year', 'application_year_min']

# A list comprehension is evaluated eagerly on both Python 2 and Python 3.
# count_missing_values mutates `apps` in place, and a lazy Python 3 `map`
# would postpone those mutations until the result was consumed.
res = [count_missing_values(apps, col) for col in MISSING_VALUE_COLS]

In [10]:
# Place the per-column missing-value shares side by side (one column each)
# and write the table to the summary-statistics directory.
missing_series = pd.concat(res, axis=1)
missing_stats_path = os.path.join(SUM_STAT_DIR, 'missing_data_summary_stats.csv')
missing_series.to_csv(missing_stats_path)

In [11]:
# Show rows that have a `dno` value but no `eod_year`.
eod_missing_with_dno = apps.eod_year.isnull() & apps.dno.notnull()
apps.loc[eod_missing_with_dno, ['eod_year', 'application_year_max']]

Unnamed: 0,eod_year,application_year_max
1395,,1960.0


In [12]:
# For rows that have a `dno` value but no `eod_year`, copy the latest
# application year into `eod_year`.
eod_missing_with_dno = apps.eod_year.isnull() & apps.dno.notnull()
apps.loc[eod_missing_with_dno, 'eod_year'] = apps.loc[
    eod_missing_with_dno, 'application_year_max']

In [13]:
def _rows_per_year_and(frame, flag_col):
    """Row counts of ``frame`` per (application_year_max, flag_col) pair."""
    grouped = frame.groupby(['application_year_max', flag_col])
    return grouped['person_uuid'].apply(len)


control = _rows_per_year_and(apps, 'control_flag')
female = _rows_per_year_and(apps, 'is_female')
foreign = _rows_per_year_and(apps, 'is_foreign')

# NOTE: despite "nonmale" in the variable name, this subset keeps rows with
# is_female == 0 and is_foreign == 0.
not_female_not_foreign = apps.loc[(apps.is_female == 0) & (apps.is_foreign == 0), :]
control_nonmale_nonforeign = _rows_per_year_and(not_female_not_foreign, 'control_flag')

In [14]:
# Align the four count series on their (year, flag value) index, one column each.
summary_parts = [control, female, foreign, control_nonmale_nonforeign]
sum_df = pd.concat(summary_parts, axis=1)

In [15]:
# Label the columns in the order the series were concatenated: counts by
# control_flag, by is_female, by is_foreign, and by control_flag within the
# is_female == 0 / is_foreign == 0 subset.
sum_df.columns = ['is_control', 'is_female', 'is_foreign', 'restricted_control']

In [16]:
# Write the combined yearly counts to the summary-statistics directory.
broad_stats_path = os.path.join(SUM_STAT_DIR, 'broad_sample_sum_stats.csv')
sum_df.to_csv(broad_stats_path)