In [1]:
import difflib
import itertools
import pandas as pd
from collections import Counter
import numpy as np
import string
import funcy
import re
import os
import uuid
import math

In [2]:
APP_DATA_DIR = '~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/applicants_data'
ATT_DATA_DIR = '~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/attendees_data'
RAW_ATT_DATA_DIR = '~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/attendees_data/raw_NIH_data'
CARD_DATA_DIR = '~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/applicants_data/raw_card_data'
SUM_STAT_DIR = '~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/summary stats/raw'
print CARD_DATA_DIR

r1_file = 'App card info spreadsheet_delaney.xlsx'
r2_file = 'App card info spreadsheet SavedDJH2.xlsx'
r3_file = 'App card_last484 (1).xlsx'


CLEAN_NAMES = ['clean_first_name', 'clean_middle_name', 'clean_last_name']
NAMES = ['first_name', 'middle_name', 'last_name']
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']

AWARDS_KEYWORDS = ['HONORS', 'AWARD', 'HONOR', 'SOCIETY', 'SCHOLAR', 'AOA', 'PME', 'FNHS', 'ODK']

~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/applicants_data/raw_card_data


In [3]:
r1 = pd.read_excel(os.path.join(CARD_DATA_DIR, r1_file), skiprows=1)
r2 = pd.read_excel(os.path.join(CARD_DATA_DIR, r2_file), skiprows=1)
r3 = pd.read_excel(os.path.join(CARD_DATA_DIR, r3_file), skiprows=1)

r1_2 = r1.drop([c for c in r1.columns if c.startswith('Unnamed:')], axis=1)
r2_2 = r2.drop([c for c in r2.columns if c.startswith('Unnamed:')], axis=1)
r3_2 = r3.drop([c for c in r3.columns if c.startswith('Unnamed:')], axis=1)

del r1, r2, r3

# check differing columns
print 'Extra r1 columns'
print set(r1_2.columns) - set(r2_2.columns)
print 'Extra r2 columns'
print set(r2_2.columns) - set(r1_2.columns)

Extra r1 columns
set([u'Sixth', u'Internship Hospital 1', u'Fifth', u'Internship Year(s)'])
Extra r2 columns
set([u'Intership Year(s)', u'Intership Hospital 1'])


In [4]:
r1_2.dropna(how='all', subset=['Sixth', 'Fifth']).loc[:, ['First', 'Second', 'Third', 'Fourth', 'Sixth', 'Fifth']]
# only 16 rows aren't totally blank for the columns fifth and sixth, and these columns are blank for all other honor societies
r1_2['First'] = r1_2.loc[(~r1_2['Fifth'].isnull()), 'Fifth']

# fill in any info from sixth (only 1 row with info)
r1_2['Second'] = r1_2.loc[(~r1_2['Sixth'].isnull()), 'Sixth']

# drop fifth and sixth columns
r1_2.drop(['Fifth', 'Sixth'], axis=1, inplace=True)

# some duplicate column names exist, find them
sorted(r1_2.columns)
# rename second column first_name
# rename 1st year graduated undergrad_graduated and second med_graduated
col_rename_dict = {
    'First': 'first_name', 'Last': 'last_name', 'Middle': 'middle_name',
    'First.1': 'honor_societies_first', 'Second': 'honor_societies_second', 
    'Third': 'honor_societies_third', 'Fourth': 'honor_societies_fourth',
    'Year Graduated': 'undergrad_year_grad', 'Year Graduated.1': 'medschool_year_grad'
}
r1_2.rename(columns=col_rename_dict, inplace=True)
r2_2.rename(columns=col_rename_dict, inplace=True)
r3_2.rename(columns=col_rename_dict, inplace=True)
# note that in data set r2 internship is spelled intership, correcting
r1_2.rename(
    columns={'Internship Year(s)': 'internship year(s)', 'Internship Hospital 1': 'internship hospital 1'}, inplace=True)
r2_2.rename(
    columns={'Intership Year(s)': 'internship year(s)', 'Intership Hospital 1': 'internship hospital 1'}, inplace=True)
r3_2.rename(
    columns={'Internship Year(s)': 'internship year(s)', 'Internship Hospital 1': 'internship hospital 1'}, inplace=True)
# add reviewer column
r1_2['reviewer'] = 1
r2_2['reviewer'] = 2
r3_2['reviewer'] = 3

In [5]:
# append the 2 data sets on top of each other, adding an indicator which reviewer they come from
all_appcards = pd.concat([r1_2, r2_2, r3_2], axis=0)

all_appcards2 = all_appcards.dropna(subset=['first_name', 'last_name'], axis=0, how='all')

all_appcards2.shape

(7788, 63)

In [6]:
# verify the lengths of pieces all up to len of new data set
print(r1_2.shape[0] + r2_2.shape[0] + r3_2.shape[0] == all_appcards.shape[0])

True


In [7]:
# change all variable names to lowercase and insert _ instead of spaces
def to_lower(str_var):
    # lowercase, remove extraneous spaces, join with '_'
    lower = str_var.lower()
    return '_'.join(filter(None, lower.split(' ')))

# apply column name cleaning fnc
all_appcards2.columns = map(to_lower, all_appcards2.columns)

# find and delete duplicate columns
count = Counter(all_appcards2.columns)
print count.most_common()

[('last_name', 1), (u'citizenship', 1), (u'pharm_ra', 1), (u'teaching', 1), (u'cc', 1), (u'ca', 1), (u'nhi', 1), ('honor_societies_fourth', 1), (u'niamdd', 1), ('reviewer', 1), (u'dbs', 1), (u'research', 1), (u'ic', 1), (u'fifth', 1), (u'nimh', 1), (u'city', 1), ('first_name', 1), (u'nigms', 1), ('honor_societies_third', 1), (u'nei', 1), (u'state', 1), (u'oir', 1), (u'niaid', 1), (u'date_of_birth', 1), (u'ra', 1), (u'clinical', 1), (u'rejection_date', 1), (u'pi', 1), (u'nichhd', 1), (u'niamd', 1), (u'zip_code', 1), (u'cord', 1), (u'nichd', 1), ('undergrad_year_grad', 1), (u'sixth', 1), ('medschool_year_grad', 1), (u'rejected', 1), (u'nci', 1), (u'niehs', 1), (u'residency_hospital', 1), (u'address', 1), ('middle_name', 1), (u'application_date', 1), (u'ninds', 1), (u"daniel's_comments", 1), (u'nidr', 1), (u'sa', 1), (u'undergraduate_school', 1), (u'age', 1), (u'residency_type', 1), ('honor_societies_first', 1), (u'medical_school', 1), (u'nhli', 1), (u'nindb', 1), (u'associate_program_ent

In [8]:
all_appcards3 = all_appcards2.reset_index(drop=True)

In [9]:
all_appcards4 = all_appcards3.reset_index(drop=False).rename(columns={'index': 'raw_uuid'})

In [10]:
# add an id column to save raw data set
str_cols = [c for c in all_appcards2.columns if all_appcards4.dtypes[c]!='float64']

In [11]:
def convert_to_str(row_val):
    if pd.isnull(row_val):
        return np.nan
    try:
        return str(row_val)
    except UnicodeEncodeError:
        return row_val.encode('ascii', 'ignore')

In [12]:
def get_year(x):
    if pd.isnull(x):
        return np.nan
    try:
        return pd.to_datetime(x).year
    except ValueError:
        print x
    

In [13]:
all_appcards4['application_year'] = all_appcards4.application_date.apply(get_year)

3/31971
41/8/1966


In [14]:
all_appcards4.loc[all_appcards4.last_name=='Aron', ['last_name', 'first_name', 'medical_school']]

Unnamed: 0,last_name,first_name,medical_school
53,Aron,,New York University School of Medicine
3760,Aron,Ber,New York University School of Medicine


In [15]:
all_appcards4.application_year.unique()

array([ 1959.,  1960.,  1961.,  1962.,  1963.,  1964.,  1965.,  1966.,
        1967.,  1968.,  1969.,  1970.,  1971.,  1972.,  1973.,  1974.,
        1975.,  1957.,  1958.,  1940.,  1932.,  1941.,  2022.,    nan,
        2016.])

In [16]:
all_appcards4.reviewer.value_counts()

1    3704
2    3600
3     484
Name: reviewer, dtype: int64

In [17]:
# convert unicode issues to string
all_appcards4.loc[:, str_cols] = all_appcards4[str_cols].applymap(convert_to_str)

all_appcards4.to_csv(os.path.join(CARD_DATA_DIR, 'raw_applicant_card_data.csv'))