In [1]:
# import and clean AAMC data set

import funcy
import numpy as np 
import pandas as pd 
import os

# load autoreload extension
%load_ext autoreload
%autoreload 2

In [2]:
from data_cleaning_functions import clean_names

In [3]:
# set directories
from dev import (
    SUFFIXES, FEMALE_FIRST_NAMES, FEMALE_MIDDLE_NAMES, NAME_COLS, APP_DATA_DIR, ATT_DATA_DIR, PICKLE_DIR, 
        CORRECTIONS_DIR, SUM_STAT_DIR, AAMC_DATA_DIR)


In [4]:
# Read the raw AAMC match file (Stata format) from the AAMC data directory.
aamc_match_path = os.path.join(AAMC_DATA_DIR, 'aamc_match.dta')
aamc_raw = pd.read_stata(aamc_match_path)

In [5]:
# Run clean_names over every value of the three raw name fields and store the
# results in parallel clean_* columns (the raw fields are left as they are).
raw_name_cols = ['fname', 'mname', 'lname']
clean_name_cols = ['clean_first_name', 'clean_middle_name', 'clean_last_name']
aamc_raw[clean_name_cols] = aamc_raw[raw_name_cols].applymap(clean_names)

In [6]:
# Remaining text columns that get the same clean_names treatment, overwritten
# in place (unlike the name fields, no separate clean_* copies are made).
string_cols = [
    'suffix_cd',
    'birth_country_cd',
    'birth_country_desc',
    'birth_state_cd',
    'ident_cat_desc',
    'degree_inst_1_desc',
    'degree_country_1',
]
cleaned_strings = aamc_raw[string_cols].applymap(clean_names)
aamc_raw[string_cols] = cleaned_strings

In [7]:
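# Debugging aid, left disabled: list the rows that have no middle initial.
# Only valid after `clean_middle_initial` has been created in a later cell.
# aamc_raw.loc[pd.isnull(aamc_raw.clean_middle_initial), :]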

In [8]:
def get_middle_initial(raw_str):
    """Return the first character of ``raw_str``, or NaN when there is none.

    Missing inputs (anything ``pd.isnull`` reports as null, which includes
    ``None``) and empty values both yield ``np.nan``.
    """
    if not pd.isnull(raw_str):
        try:
            return raw_str[0]
        except IndexError:
            # blank entries coming from the .dta file have no first character
            pass
    return np.nan

In [9]:
# Derive the middle initial from the cleaned middle name, one value per row.
middle_initials = aamc_raw['clean_middle_name'].apply(get_middle_initial)
aamc_raw.loc[:, 'clean_middle_initial'] = middle_initials

In [10]:
# Order the records by last name, first name, middle initial and then yobb.
sort_keys = ['clean_last_name', 'clean_first_name', 'clean_middle_initial', 'yobb']
aamc_df = aamc_raw.sort_values(by=sort_keys)

In [11]:
# Flag every row whose (last name, first name, middle initial, birth_dt)
# combination occurs more than once; keep=False marks all members of a group.
# Plan for the following cells: take the first duplicate and then merge it with
# the second duplicate on first name, middle initial, last name and birth_dt.
# NOTE(review): the original note estimated "about 93" duplicate rows - TODO
# confirm against the current data.
dup_keys = ['clean_last_name', 'clean_first_name', 'clean_middle_initial', 'birth_dt']
is_dup = aamc_df.duplicated(dup_keys, keep=False)
dups = aamc_df.loc[is_dup, :]

In [12]:
# Split duplicated people into two frames that can be merged back together.
# Careful with the names: duplicated(keep='first') is True for every occurrence
# EXCEPT the first, so `first_dups` holds the later record(s) of each group and
# `second_dups` (keep='last') holds the earlier one(s).
person_keys = ['clean_first_name', 'clean_middle_initial', 'clean_last_name', 'birth_dt']
not_first_occurrence = aamc_df.duplicated(person_keys, keep='first')
not_last_occurrence = aamc_df.duplicated(person_keys, keep='last')
first_dups = aamc_df.loc[not_first_occurrence, :]
second_dups = aamc_df.loc[not_last_occurrence, :]

In [13]:
# Put the two records of each duplicated person side by side: columns that are
# not merge keys come out twice, suffixed _x (from first_dups) and _y (from
# second_dups).
fixed_dups = pd.merge(
    left=first_dups,
    right=second_dups,
    on=['clean_first_name', 'clean_middle_initial', 'clean_last_name', 'birth_dt'],
    suffixes=['_x', '_y'],
    how='inner')

# The pairing only works when every duplicated person appears exactly twice.
# A group of three or more would contribute several rows to each side and the
# inner merge would multiply them, so fail loudly instead of exporting that.
assert len(fixed_dups) == len(first_dups) == len(second_dups), (
    'expected each duplicated person to appear exactly twice')

# print() with a single argument gives the same output on Python 2 and 3.
print(first_dups.shape)
print(second_dups.shape)
print(fixed_dups.shape)

(30, 19)
(30, 19)
(30, 34)


In [14]:
# Keep both ids of a merged pair: the _x id becomes the main `aamc_id`, the _y
# id is kept as `aamc_id_2`. Renaming them here also removes them from the
# _x/_y columns that are consolidated afterwards.
id_renames = {'aamc_id_x': 'aamc_id', 'aamc_id_y': 'aamc_id_2'}
fixed_dups.rename(columns=id_renames, inplace=True)

In [15]:
# Base names of the columns that exist as an _x/_y pair after the merge.
# Strip only the trailing '_x': split('_x')[0] would cut at the FIRST '_x'
# anywhere in the name and so truncate a column such as 'foo_xref_x' to 'foo'.
paired_cols = [c[:-len('_x')] for c in fixed_dups if c.endswith('_x')]

In [16]:
def consolidate_col(row):
    """Collapse one row of paired values into a single value.

    ``row`` is a Series (one row of an ``_x``/``_y`` column pair when used
    with ``DataFrame.apply(..., axis=1)``). Nulls and the placeholder strings
    'UNKNOWN' and 'OTHER' are ignored. Of what remains:

    * nothing left -> ``np.nan``
    * first remaining value is a ``str`` -> the shortest remaining value
      (the earliest one wins a tie)
    * otherwise -> the first remaining value

    The input Series is not modified. pandas does not support functions that
    mutate the object handed to them by ``apply``, so the placeholders are
    filtered out rather than overwritten with NaN in place.
    """
    NULL_STRINGS = ['UNKNOWN', 'OTHER']
    usable = pd.notnull(row) & ~row.isin(NULL_STRINGS)
    non_nulls = row[usable].values
    if len(non_nulls) == 0:
        return np.nan
    # strings: prefer the shortest candidate
    if isinstance(non_nulls[0], str):
        return min(non_nulls, key=len)
    # if not a string, just return first value
    return non_nulls[0]

In [17]:
# For every paired column, reduce its _x/_y values to one value per row with
# consolidate_col; the result maps base column name -> consolidated Series.
to_replace_cols = {}
for colname in paired_cols:
    pair = fixed_dups[[colname + '_x', colname + '_y']]
    to_replace_cols[colname] = pair.apply(consolidate_col, axis=1)

In [18]:
# Swap every _x/_y pair for its consolidated column: attach the consolidated
# columns next to the merged frame, then drop the suffixed originals.
suffixed_cols = [
    '{}{}'.format(c, suffix) for c in paired_cols for suffix in ('_x', '_y')]
consolidated = pd.DataFrame(to_replace_cols)
fixed_dfs2 = pd.concat([fixed_dups, consolidated], axis=1).drop(suffixed_cols, axis=1)

In [19]:
# Preview the first five consolidated records.
# NOTE(review): the saved output of this cell shows individual names - consider
# clearing it before the notebook is shared.
fixed_dfs2.head(n=5)

Unnamed: 0,aamc_id,birth_dt,clean_first_name,clean_last_name,clean_middle_initial,aamc_id_2,birth_country_cd,birth_country_desc,birth_state_cd,clean_middle_name,degree_country_1,degree_inst_1_desc,degree_type,degree_year_1,fname,ident_cat_desc,lname,mname,suffix_cd,yobb
0,13758974,,KEITH,BALDWIN,,13707332,,,,,,,MD,,Keith,WHITE,Baldwin,,,
1,12416381,10/19/1946,MARK,BROWN,S,10967027,,,,S,,,MD,,Mark,,Brown,S,,1946.0
2,12222187,,JAMES,COLLINS,,12154436,,,,,,,MD,1965.0,James,BLACK,Collins,,,
3,13217221,,EDWARD,COOPER,C,12603243,,,,C,,,MD,,Edward,WHITE,Cooper,C,,
4,12881504,,RUDOLPH,CUMBERBATCH,,10967675,,,,,USA,HOWARD UNIVERSITY COLLEGE OF MEDICINE,MD,1959.0,Rudolph,BLACK,Cumberbatch,,,


In [20]:
# Combine the consolidated duplicates with every record that was never flagged
# as a duplicate (keep=False flags all members of a duplicated group, so its
# negation selects the records that occur only once).
# NOTE(review): fixed_dfs2 comes out of a merge while the other rows keep their
# labels from aamc_df, so index labels may repeat in the result - confirm that
# this is acceptable for whatever reads the export.
person_keys = ['clean_first_name', 'clean_middle_initial', 'clean_last_name', 'birth_dt']
never_duplicated = aamc_df.loc[~aamc_df.duplicated(person_keys, keep=False), :]
aamc_fixed = pd.concat([fixed_dfs2, never_duplicated])

# print() with a single argument gives the same output on Python 2 and 3.
print(aamc_df.shape)
print(aamc_fixed.shape)
print(fixed_dfs2.shape)

(40744, 19)
(40714, 20)
(30, 20)


In [21]:
# Export the de-duplicated records to CSV in the AAMC data directory. to_csv is
# called with its defaults, so the index is written as the first column.
deduped_csv_path = os.path.join(AAMC_DATA_DIR, 'aamc_deduped_raw.csv')
aamc_fixed.to_csv(deduped_csv_path)