In [None]:
import difflib
import itertools
import pandas as pd
from collections import Counter
import numpy as np
import string
import funcy
import re
import os
import uuid
import math

In [None]:
# Root of the project's intermediate data. Every directory constant below is
# built from it, so the Dropbox location is spelled out in exactly one place.
INTERMEDIATE_DATA_DIR = '~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data'

APP_DATA_DIR = INTERMEDIATE_DATA_DIR + '/applicants_data'
ATT_DATA_DIR = INTERMEDIATE_DATA_DIR + '/attendees_data'
RAW_ATT_DATA_DIR = ATT_DATA_DIR + '/raw_NIH_data'
CARD_DATA_DIR = APP_DATA_DIR + '/raw_card_data'
SUM_STAT_DIR = INTERMEDIATE_DATA_DIR + '/summary stats/raw'
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(CARD_DATA_DIR)

# Application-card workbooks, all located in CARD_DATA_DIR.
r1_file = 'App card info spreadsheet_delaney.xlsx'
r2_file = 'App card info spreadsheet SavedDJH2.xlsx'
r3_file = 'App card_last484 (1).xlsx'


# Column-name groups.
CLEAN_NAMES = ['clean_first_name', 'clean_middle_name', 'clean_last_name']
NAMES = ['first_name', 'middle_name', 'last_name']
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']

# Upper-case keywords related to awards and honor societies.
AWARDS_KEYWORDS = ['HONORS', 'AWARD', 'HONOR', 'SOCIETY', 'SCHOLAR', 'AOA', 'PME', 'FNHS', 'ODK']

In [None]:
def _load_card_sheet(filename):
    """Read one application-card workbook and drop its 'Unnamed:' columns.

    Parameters
    ----------
    filename : str
        Workbook file name, resolved relative to CARD_DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        The sheet read with the first spreadsheet row skipped (skiprows=1),
        minus every column whose label starts with 'Unnamed:'.
    """
    sheet = pd.read_excel(os.path.join(CARD_DATA_DIR, filename), skiprows=1)
    unnamed_cols = [c for c in sheet.columns if c.startswith('Unnamed:')]
    return sheet.drop(unnamed_cols, axis=1)


# One cleaned frame per workbook; the unfiltered reads are never kept around.
r1_2 = _load_card_sheet(r1_file)
r2_2 = _load_card_sheet(r2_file)
r3_2 = _load_card_sheet(r3_file)

# check differing columns (only r1 and r2 are compared here)
print('Extra r1 columns')
print(set(r1_2.columns) - set(r2_2.columns))
print('Extra r2 columns')
print(set(r2_2.columns) - set(r1_2.columns))

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# The first workbook (r1_2) has two overflow honor-society columns, 'Fifth'
# and 'Sixth'. Per the original inspection only 16 rows have anything in them
# (a single row for 'Sixth'), and in those rows the regular honor-society
# columns are blank, so the overflow values are moved into the first and
# second honor-society slots.
#
# Two things matter here:
#   * The write is limited to the rows that actually carry an overflow value.
#     Assigning the filtered Series to the whole column would align on the
#     index and blank out every other row of the target column.
#   * The first honor-society slot is 'First.1'. The plain 'First' column is
#     the applicant's first name (see col_rename_dict below), so it must not
#     be touched.
overflow_targets = [('Fifth', 'First.1'), ('Sixth', 'Second')]
for overflow_col, target_col in overflow_targets:
    if overflow_col not in r1_2.columns:
        # Already folded in and dropped by an earlier run of this cell.
        continue
    has_overflow = r1_2[overflow_col].notnull()
    r1_2.loc[has_overflow, target_col] = r1_2.loc[has_overflow, overflow_col]

# drop fifth and sixth columns (tolerating a re-run where they are already gone)
r1_2 = r1_2.drop([c for c in ('Fifth', 'Sixth') if c in r1_2.columns], axis=1)

# Some headers occur twice in the sheets; the second occurrence shows up with
# a '.1' suffix. Map both occurrences to unambiguous names:
#   'First' / 'First.1'                    -> first name / first honor society
#   'Year Graduated' / 'Year Graduated.1'  -> undergrad / medical school year
col_rename_dict = {
    'First': 'first_name', 'Last': 'last_name', 'Middle': 'middle_name',
    'First.1': 'honor_societies_first', 'Second': 'honor_societies_second',
    'Third': 'honor_societies_third', 'Fourth': 'honor_societies_fourth',
    'Year Graduated': 'undergrad_year_grad', 'Year Graduated.1': 'medschool_year_grad'
}

# The second workbook spells internship as 'Intership'; both spellings are
# mapped to the same lower-case labels so the frames line up when stacked.
internship_rename = {
    'Internship Year(s)': 'internship year(s)',
    'Internship Hospital 1': 'internship hospital 1',
}
internship_typo_rename = {
    'Intership Year(s)': 'internship year(s)',
    'Intership Hospital 1': 'internship hospital 1',
}

r1_2 = r1_2.rename(columns=col_rename_dict).rename(columns=internship_rename)
r2_2 = r2_2.rename(columns=col_rename_dict).rename(columns=internship_typo_rename)
r3_2 = r3_2.rename(columns=col_rename_dict).rename(columns=internship_rename)

# add reviewer column
r1_2['reviewer'] = 1
r2_2['reviewer'] = 2
r3_2['reviewer'] = 3

In [None]:
# Stack the three reviewer frames on top of each other; every row already
# carries the 'reviewer' value identifying the frame it came from.
reviewer_frames = [r1_2, r2_2, r3_2]
all_appcards = pd.concat(reviewer_frames, axis=0)

# Keep a row unless both of its name fields are missing.
name_columns = ['first_name', 'last_name']
all_appcards2 = all_appcards.dropna(how='all', subset=name_columns, axis=0)

all_appcards2.shape

In [None]:
# The row counts of the three pieces should add up to the stacked frame's row count.
expected_rows = sum(piece.shape[0] for piece in (r1_2, r2_2, r3_2))
print(expected_rows == all_appcards.shape[0])

In [None]:
# Column-name normaliser: lower-case, with '_' between words instead of spaces.
def to_lower(str_var):
    """Return `str_var` lower-cased with its space-separated words joined by '_'.

    Only the space character separates words; leading, trailing and repeated
    spaces produce empty pieces, which are discarded.
    """
    words = [piece for piece in str_var.lower().split(' ') if piece]
    return '_'.join(words)

# Apply the column-name cleaning function. A list (rather than a lazy map
# object) is assigned so the result is the same under Python 2 and Python 3.
all_appcards2.columns = [to_lower(col) for col in all_appcards2.columns]

# Find duplicate column names: any name whose count is above 1 occurs more
# than once. Nothing is deleted here; the counts are only printed.
count = Counter(all_appcards2.columns)
print(count.most_common())

In [None]:
# Replace the index left over from stacking the reviewer frames with a fresh
# 0..n-1 range; the old labels are discarded (drop=True).
all_appcards3 = all_appcards2.reset_index(drop=True)

In [None]:
# Turn the positional index into a regular column and call it 'raw_uuid', so
# every row of the raw data set carries an id.
indexed_cards = all_appcards3.reset_index(drop=False)
all_appcards4 = indexed_cards.rename(columns={'index': 'raw_uuid'})

In [None]:
# Columns to be converted to strings later: everything whose dtype is not
# float64. The names come from all_appcards2, so the 'raw_uuid' column that
# exists only in all_appcards4 is not part of the list.
card_dtypes = all_appcards4.dtypes
str_cols = [col for col in all_appcards2.columns if card_dtypes[col] != 'float64']

In [None]:
def convert_to_str(row_val):
    """Return `row_val` as a string, keeping missing values as NaN.

    Null values (as judged by pd.isnull) come back as np.nan. Anything else
    goes through str(); if that raises UnicodeEncodeError, the value is
    encoded to ASCII with unencodable characters dropped.
    """
    if not pd.isnull(row_val):
        try:
            converted = str(row_val)
        except UnicodeEncodeError:
            converted = row_val.encode('ascii', 'ignore')
        return converted
    return np.nan

In [None]:
def get_year(x):
    """Extract the calendar year from a date-like value.

    Parameters
    ----------
    x : object
        Anything pd.to_datetime accepts, or a null value.

    Returns
    -------
    int or float
        The year of the parsed date. np.nan is returned when `x` is null or
        when parsing raises ValueError; in the latter case the offending value
        is printed so it can be inspected.
    """
    if pd.isnull(x):
        return np.nan
    try:
        return pd.to_datetime(x).year
    except ValueError:
        # Show the unparseable value, then use the same missing marker as for
        # null input instead of falling through to an implicit None.
        print(x)
        return np.nan

In [None]:
# Derive the application year from each row's application date.
application_dates = all_appcards4['application_date']
all_appcards4['application_year'] = application_dates.apply(get_year)

In [None]:
# Spot check: show the name and medical school of every row with last name 'Aron'.
is_aron = all_appcards4['last_name'] == 'Aron'
all_appcards4.loc[is_aron, ['last_name', 'first_name', 'medical_school']]

In [None]:
# Distinct application years found in the data.
all_appcards4['application_year'].unique()

In [None]:
# Number of rows contributed by each reviewer.
all_appcards4['reviewer'].value_counts()

In [None]:
# Run every value in the str_cols columns through convert_to_str (missing
# values stay NaN) and write the result back into the same columns.
stringified = all_appcards4[str_cols].applymap(convert_to_str)
all_appcards4.loc[:, str_cols] = stringified

# Save the raw applicant card data as CSV inside CARD_DATA_DIR.
raw_csv_path = os.path.join(CARD_DATA_DIR, 'raw_applicant_card_data.csv')
all_appcards4.to_csv(raw_csv_path)