In [1]:
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import os

In [2]:
CARD_DATA_DIR = '/Users/lrraymond13/MIT/Azoulay_RA_2016/Data/raw_card_data'
r1_file = 'delaney_card_info.csv'
r2_file = 'R3_card_info.csv'
r3_file = 'non_double_entered_card_info.csv'


In [3]:
r1 = pd.read_csv(os.path.join(CARD_DATA_DIR, r1_file), skiprows=1)
r2 = pd.read_csv(os.path.join(CARD_DATA_DIR, r2_file), skiprows=1)
r3 = pd.read_csv(os.path.join(CARD_DATA_DIR, r3_file), skiprows=1)

In [4]:
# check differing columns
print 'Extra r1 columns'
print set(r1.columns) - set(r2.columns)
print 'Extra r2 columns'
print set(r2.columns) - set(r1.columns)


Extra r1 columns
set(['Sixth', 'Unnamed: 61', 'Internship Hospital 1', 'Fifth', 'Internship Year(s)'])
Extra r2 columns
set(['Intership Year(s)', 'Unnamed: 59', 'Intership Hospital 1'])


In [5]:
r1.dropna(how='all', subset=['Sixth', 'Fifth']).loc[:, ['First', 'Second', 'Third', 'Fourth', 'Sixth', 'Fifth']]
# only 16 rows aren't totally blank for the columns fifth and sixth, and these columns are blank for all other honor societies
r1['First'] = r1.loc[(~r1['Fifth'].isnull()), 'Fifth']

In [6]:
# fill in any info from sixth (only 1 row with info)
r1['Second'] = r1.loc[(~r1['Sixth'].isnull()), 'Sixth']

In [7]:
# drop fifth and sixth columns
r1.drop(['Fifth', 'Sixth'], axis=1, inplace=True)

In [8]:
# some duplicate column names exist, find them
sorted(r1.columns)
# rename second column first_name
# rename 1st year graduated undergrad_graduated and second med_graduated
col_rename_dict = {
    'First': 'first_name', 'Last': 'last_name', 'Middle': 'middle_name',
    'First.1': 'honor_societies_first', 'Second': 'honor_societies_second', 
    'Third': 'honor_societies_third', 'Fourth': 'honor_societies_fourth',
    'Year Graduated': 'undergrad_year_grad', 'Year Graduated.1': 'medschool_year_grad'
}
r1.rename(columns=col_rename_dict, inplace=True)
r2.rename(columns=col_rename_dict, inplace=True)
r3.rename(columns=col_rename_dict, inplace=True)
# note that in data set r2 internship is spelled intership, correcting
r2.rename(
    columns={'Intership Year(s)': 'internship year(s)', 'Intership Hospital 1': 'internship hospital 1'}, inplace=True)
# add reviewer column
r1['reviewer'] = 1
r2['reviewer'] = 2
r3['reviewer'] = 3

In [9]:
# append the 2 data sets on top of each other, adding an indicator which reviewer they come from
all_appcards = pd.concat([r1, r2, r3], axis=0)


In [10]:
# verify the lengths of pieces all up to len of new data set
print(r1.shape[0] + r2.shape[0] + r3.shape[0] == all_appcards.shape[0])

True


In [11]:
# change all variable names to lowercarse and insert _ instead of spaces
def to_lower(str_var):
    # lowercase, remove extraneous spaces, join with '_'
    lower = str_var.lower()
    return '_'.join(filter(None, lower.split(' ')))

In [12]:
# apply column name cleaning fnc
all_appcards.columns = map(to_lower, all_appcards.columns)

In [22]:
# drop all rows with first, last name NA
all_app_short = all_appcards.dropna(subset=['first_name', 'last_name'], how='all')

In [23]:
all_app_short.application_date.head()

0    6/18/1959
1    5/19/1959
2    7/17/1959
3    4/13/1959
4     1/1/1959
Name: application_date, dtype: object

In [24]:
def id_poorlyfmtdates(str_date):
    try:
        dt = pd.to_datetime(str_date, format='%m/%d/%Y')
        return True
    except (ValueError, AssertionError):
        return False

In [25]:
mask = all_app_short.application_date.apply(id_poorlyfmtdates)

In [26]:
all_app_short.loc[~mask, :]

Unnamed: 0,address,age,application_date,associate_program_entered,bob,ca,cc,cord,citizenship,city,...,honor_societies_fourth,honor_societies_second,honor_societies_third,internship_hospital_1,internship_year(s),last_name,medschool_year_grad,middle_name,reviewer,undergrad_year_grad
681,158 Fox Meadow Rd.,,3/31971,,0.0,1,0.0,,,Scarsdale,...,PBK,AOA,Sigma XI,UNIVERSITY OF CALIFORNIA SAN FRANCISCO,1971-72,Clyman,1971,Ian,2,
3317,600 Ford Ave.,,41/8/1966,RA,0.0,1,0.0,1.0,US,Kingston,...,,PBK,,H LEE MOFFITT CANCER CTR & RES INSTITUTE,1966-67,Siegel,1966,Charles,2,


In [27]:
# change '3/31971 to 3/3/1971
# change 41/8/1966 to 4/8/1966
all_app_short.loc[all_app_short.application_date == '3/31971', 'application_date'] = '3/3/1971'
all_app_short.loc[all_app_short.application_date == '41/8/1966', 'application_date'] = '4/8/1966'

In [28]:
# convert application date to date object
all_app_short['application_date_dt'] = all_app_short.loc[:, 'application_date'].apply(lambda x: pd.to_datetime(x, format='%m/%d/%Y'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [29]:
# do the same date check for birth date columns
bdate_mask = all_app_short.date_of_birth.apply(id_poorlyfmtdates)
all_app_short.loc[~bdate_mask, ['first_name', 'last_name', 'date_of_birth']]
all_app_short.loc[(
        all_app_short.last_name=='Cook') & 
                  (all_app_short.middle_name=='James') & 
                  (all_app_short.first_name.isnull()), 'date_of_birth'] = '1/27/1940'

In [30]:
# all_app_short.loc[all_app_short.last_name=='Cook', ['first_name', 'last_name', 'middle_name', 'address', 'age', 'date_of_birth']]

In [31]:
# apply a basic string cleaning function to the names- removing all punctuation, changing to all uppercase

def trans_remov_punc(to_change, change_to):
    # removes specified punctuation using string maketrans (very fast, C lookups)
    #returns partially evaluated fnc
    trantab = string.maketrans(to_change, change_to)
    return funcy.func_partial(lambda x: x.translate(trantab))


def standardize_whitespace(pub_str):
    return ' '.join(filter(None, pub_str.split(' ')))


def remove_punc(pub_str):
    # function to remove punctuation
    nonelst = ' '*len(string.punctuation)
    fn = trans_remov_punc(string.punctuation, nonelst)
    new_str = fn(pub_str)
    # standardize spaces
    return standardize_whitespace(new_str)


def clean_names(name):
    # if name is missing, return null
    if pd.isnull(name):
        return np.nan
    # uppercase 
    try:
        upp = name.upper()
        return remove_punc(upp)
    except AttributeError:
        # this should catch any other weird issues, like floats, unicode etc
        print name
        return None


In [32]:
names = ['first_name', 'last_name', 'middle_name']
for n in names:
    all_app_short.loc[:, 'clean_{}'.format(n)] = all_app_short[n].apply(clean_names)

In [33]:
# look at all the unique med school
# sorted(all_app_short.medical_school.unique())

In [34]:
all_app_short.loc[:, 'clean_college'] = all_app_short.undergraduate_school.apply(clean_names)

In [35]:
# college name standardization fnc
def clean_std_college_name(college_raw):
# need to change 'college to university' unless Boston college or BU remove ANDS, AT, THE expand UCLA to UCAL, UC Davis etc.
# remove mispellings 'collge', UNIVERWSITY, MASSACHUSSETTS 
# RENSSELAER UNIVERSITY and RENSSELAER POLYTECHNICAL INSTITUTE to RENSSELAER POLYTECHNIC INSTITUTE
# JOHN HOPKINS UNIVERSITY to JOHNS
# HOLY CROSS COLLEGE to HOLY CROSS
# FRANKLIN MARSHALL COLLEGE' to FRANKLIN MARSHALL
# DE PAUW to DEPAUW
# ASBURY ASHBURY 
# DREXEL INSTITUTE OF TECHNOLOGY to DREXEL UNIVERSITY
# A B BROWN UNIVERSITY to Brown
# DARTMOUTH MEDICAL SCHOOL to Dartmouth
# remove random 1961 at the end of strings, (anything after university unless univ is the first word)
    if pd.isnull(college_raw):
        return np.nan
    # if AT or AND or THE, remove
    to_remove = [
        ' AND ', ' AT ', 'THE ', ' COLLGE', 'UNIVERISTY', 'UNIVERWSITY', 'MASSACHUSSETTS', 'JOHN ', 'DE PAUW', 'ASBURY', 
    'DREXEL INSTITUTE OF TECHNOLOGY', 'A B BROWN UNIVERSITY', 'DARTMOUTH MEDICAL SCHOOL', 'RENSSELAER UNIVERSITY', 
    'RENSSELAER POLYTECHNICAL INSTITUTE', ' STE', 'COLLEGE OF HOLY CROSS', 'HOLLY CROSS', 'JOHNSS ',  'BERKLEY',
    'UC ', 'PITTSBURRGH', 'WESLYN', 'WILLAMS', 'GEORGIA TECH', 'NEW YORK UNIVERSITY UNIV', 
    'UNIVERSITY OF MICHIGAN IS A', 'OHIO', 'STATE UNIVERSITY OF NEW YORK AT BUFFALO']
    to_replace = [
        ' ', ' ', ' ', ' COLLEGE', 'UNIVERSITY', 'UNIVERSITY', 'MASSACHUSETTS', 'JOHNS ', 'DEPAUW', 'ASHBURY',
        'DREXEL UNIVERSITY', 'BROWN UNIVERSITY', 'DARTMOUTH', 'RENSSELAER POLYTECHNIC INSTITUTE', 
        'RENSSELAER POLYTECHNIC INSTITUTE', ' STATE', 'HOLY CROSS', 'HOLY CROSS', 'JOHNS ', 
        ' BERKELEY', 'UNIVERSITY OF CALIFORNIA ', 'PITTSBURGH', 'WESLEYAN', 'WILLIAMS', 
        'GEORGIA INSTITUTE OF TECHNOLOGY', 'NEW YORK', 'UNIVERSITY OF MICHIGAN', 'OHIO STATE', 'SUNY BUFFALO']
    word_pairs = zip(to_remove, to_replace)
    # list of words to replace
    words_in_str = filter(lambda (x, y): x in college_raw, word_pairs)
    trans_word = college_raw
    for to_remove_wrd, to_replace_wrd in words_in_str:
        trans_word = trans_word.replace(to_remove_wrd, to_replace_wrd)
        
    # after replacing the mispellings and removing and/at, remove everything after college/university
    if 'BOSTON' in trans_word:
        # then this string is BC or BU, so just return string
        return trans_word
    if trans_word.find('UNIVESITY ') == 0:
        return trans_word
    split_wrd = ' UNIVERSITY'
    if 'COLLEGE' in trans_word:
        split_wrd = ' COLLEGE'
    base_word = trans_word.split(split_wrd)[0]
    return standardize_whitespace(base_word)
    

In [36]:
# make college mispelling and different reference translation table
all_app_short.loc[:, 'clean_college_trans'] = all_app_short.clean_college.apply(clean_std_college_name)

In [37]:
personal_info = [
    'date_of_birth', 'medical_school', 'clean_college_trans', 
    'clean_first_name', 'clean_last_name', 'clean_middle_name']


In [39]:
# need to convert undergrad_year_grad and med_school_grad to numbers to maintain consistence
all_app_short.loc[:, ['undergrad_year_grad', 'medschool_year_grad']] = all_app_short.loc[:, ['undergrad_year_grad', 'medschool_year_grad']].apply(
    lambda x: pd.to_numeric(x, errors='coers'))

In [40]:
# now, sort by names, med school, undergrad school, 
all_app_shorted = all_app_short.sort_values(by=personal_info)

In [41]:
all_app_grouped = all_app_shorted.groupby(['clean_last_name', 'application_date_dt'])

In [42]:
# convert ca column to float62
all_app_shorted.loc[:, 'ca'] = all_app_shorted.loc[:, 'ca'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

In [43]:
# some of the first names contain 'research society', obviously a mistake
# so replace those with np.nan
all_app_shorted.loc[(all_app_shorted.clean_first_name.isnull()) &
    (all_app_shorted.clean_first_name.str.contains('RESEARCH SOCIETY')), 'clean_first_name'] = np.nan

In [44]:
def sorting_fnc(v):
    # if string entry, sort by length, otherwise return value
    if isinstance(v, str):
        return len(v)
    return v


def consolidate_holes(df_col):
    # for each dataframe of info on one person/application year pair, consolidate info
    # drop null values, get unique and take longest by string length or highest numeric value
    lst_vals = list(df_col.dropna().unique())
    srt = sorted(lst_vals, key=sorting_fnc, reverse=True)
    if len(srt) == 0:
        return np.nan
    return srt[0]


def format_consolidated_data(df):
    # applies consolidate fnc to dataframe, converts it into a df that can be merged
    d2 = df.apply(consolidate_holes)
    return pd.DataFrame(d2).T

In [49]:
def consolidate_candidate(candidate_df):
    # for each dataframe, with a common last name and application date, seperate into unique candidate pairs
    # easiest case, if 2 rows, 1 from reviewer 1 and 2, then consolidate
    df_len = candidate_df.shape[0]
    if df_len < 2:
        return candidate_df
    # check for different first names
    unique_first_names = candidate_df['clean_first_name'].dropna().unique()
    unique_undergrad = candidate_df['undergrad_year_grad'].dropna().unique()
    unique_college = candidate_df['clean_college_trans'].dropna().unique()
#     reviewers = candidate_df['reviewer'].dropna()
    if len(unique_first_names) < 2 and len(unique_undergrad) < 2:
        return format_consolidated_data(candidate_df)
        # otherwise there are more than 1 person to combine
    if len(unique_first_names) >= 2:
        new_grped = candidate_df.groupby(['clean_first_name', 'clean_last_name'])
        print 'group by first name'
        print candidate_df
        res = map(lambda (x, y): format_consolidated_data(y), new_grped)
        return pd.concat(res)
    if len(unique_undergrad) >= 2:
        print 'group by undergrad'
        print candidate_df
        new_grped = candidate_df.groupby(['clean_first_name', 'undergrad_year_grad'])
        res = map(lambda (x, y): format_consolidated_data(y), new_grped)
        return pd.concat(res)
    if len(unique_college) >= 2:
        print 'group by college name'
        print candidate_df
        new_grped = candidate_df.groupby(['clean_first_name', 'clean_college_trans'])
        res = map(lambda (x, y): format_consolidated_data(y), new_grped)
        return pd.concat(res)
    # otherise this is an edge case
    # otherwise, although the person doesn't match on unique first and undergrad years, they are still most likely one 
    # person
    print 'Hitting a bad edge case'
    print candidate_df
    raise AttributeError
    print candidate_df
    return candidate_df

In [50]:
combined_rows = map(lambda (x, y): consolidate_candidate(y), all_app_grouped) 

group by first name
                  address  age application_date associate_program_entered  \
105    2707 Duke Hospital  NaN        6/10/1971                       NaN   
104    2707 Duke Hospital  NaN        6/10/1971                       NaN   
106  2862 East 3335 South  NaN        6/10/1971                       NaN   
105  2862 East 3335 South  NaN        6/10/1971                       NaN   

     bob   ca   cc  cord citizenship            city         ...           \
105  0.0  1.0  0.0   NaN         NaN          Durham         ...            
104  0.0  1.0  0.0   NaN         NaN          Durham         ...            
106  0.0  1.0  0.0   NaN         NaN  Salt Lake City         ...            
105  0.0  1.0  0.0   NaN         NaN  Salt Lake City         ...            

     medschool_year_grad  middle_name reviewer undergrad_year_grad  \
105               1972.0      Kimball        2                 NaN   
104               1972.0      Kimball        1                 NaN  

In [53]:
unique_apps_df = pd.concat(combined_rows)
combined_rows[1]
# [c for c in list_cr if not isinstance(c, pd.DataFrame)]

Unnamed: 0,address,age,application_date,associate_program_entered,bob,ca,cc,cord,citizenship,city,...,medschool_year_grad,middle_name,reviewer,undergrad_year_grad,application_date_dt,clean_first_name,clean_last_name,clean_middle_name,clean_college,clean_college_trans
0,6 Val Mar Place,,5/17/1966,,0,1,0,0,US,San Carlos,...,1966,Alan,2,,1966-05-17 00:00:00,STUART,AARONSON,ALAN,,


In [55]:
# read to csv
unique_apps_df.to_csv(os.path.join(CARD_DATA_DIR, 'deduped_applicants.csv'))
unique_apps_df.to_pickle(os.path.join(CARD_DATA_DIR, 'deduped_applicants.p'))

In [None]:
g1 = list(all_app_grouped)[0][1]

In [51]:
g2 = all_app_grouped.get_group(('ALEXANDER', '1969-04-08'))
g3 = all_app_grouped.get_group(('DRACHMAN', '1958-11-01'))

In [52]:
consolidate_candidate(g3)

group by first name
                                               address  age application_date  \
752  New England Center Hospital, Dept. of Neurolog...  NaN        11/1/1958   
753                                      Duke Hospital  NaN        11/1/1958   
772                                      Duke Hospital  NaN        11/1/1958   
771                               Boston City Hospital  NaN        11/1/1958   

    associate_program_entered  bob   ca   cc  cord citizenship    city  \
752                       NaN  0.0  0.0  0.0   NaN         NaN  Boston   
753                       NaN  0.0  0.0  0.0   NaN          US  Durham   
772                       NaN  NaN  NaN  NaN   NaN          US  Durham   
771                       NaN  NaN  NaN  NaN   NaN         NaN  Boston   

            ...           medschool_year_grad  middle_name reviewer  \
752         ...                        1956.0            B        2   
753         ...                        1956.0            A        

Unnamed: 0,address,age,application_date,associate_program_entered,bob,ca,cc,cord,citizenship,city,...,medschool_year_grad,middle_name,reviewer,undergrad_year_grad,application_date_dt,clean_first_name,clean_last_name,clean_middle_name,clean_college,clean_college_trans
0,"New England Center Hospital, Dept. of Neurolog...",,11/1/1958,,0,0,0,,,Boston,...,1956,B,2,,1958-11-01 00:00:00,DANIEL,DRACHMAN,B,,
0,Duke Hospital,,11/1/1958,,0,0,0,,US,Durham,...,1956,A,2,,1958-11-01 00:00:00,DAVID,DRACHMAN,A,,
