In [143]:
import difflib
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os

In [88]:
CARD_DATA_DIR = os.path.abspath('Data/raw_card_data')
print CARD_DATA_DIR
r1_file = 'delaney_card_info.csv'
r2_file = 'R3_card_info.csv'
r3_file = 'non_double_entered_card_info.csv'


/home/lraymond/MIT/Azoulay_2016/yellow_berets/yellow_beret/Data/raw_card_data


In [89]:
CLEAN_NAMES = ['clean_first_name', 'clean_middle_name', 'clean_last_name']
NAMES = ['first_name', 'middle_name', 'last_name']
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']

AWARDS_KEYWORDS = ['HONORS', 'AWARD', 'HONOR', 'SOCIETY', 'SCHOLAR', 'AOA', 'PME', 'FNHS', 'ODK']

In [90]:
r1 = pd.read_csv(os.path.join(CARD_DATA_DIR, r1_file), skiprows=1)
r2 = pd.read_csv(os.path.join(CARD_DATA_DIR, r2_file), skiprows=1)
r3 = pd.read_csv(os.path.join(CARD_DATA_DIR, r3_file), skiprows=1)

In [91]:
# check differing columns
print 'Extra r1 columns'
print set(r1.columns) - set(r2.columns)
print 'Extra r2 columns'
print set(r2.columns) - set(r1.columns)


Extra r1 columns
set(['Sixth', 'Unnamed: 61', 'Internship Hospital 1', 'Fifth', 'Internship Year(s)'])
Extra r2 columns
set(['Intership Year(s)', 'Unnamed: 59', 'Intership Hospital 1'])


In [92]:
r1.dropna(how='all', subset=['Sixth', 'Fifth']).loc[:, ['First', 'Second', 'Third', 'Fourth', 'Sixth', 'Fifth']]
# only 16 rows aren't totally blank for the columns fifth and sixth, and these columns are blank for all other honor societies
r1['First'] = r1.loc[(~r1['Fifth'].isnull()), 'Fifth']

In [93]:
# fill in any info from sixth (only 1 row with info)
r1['Second'] = r1.loc[(~r1['Sixth'].isnull()), 'Sixth']

In [94]:
# drop fifth and sixth columns
r1.drop(['Fifth', 'Sixth'], axis=1, inplace=True)

In [95]:
# some duplicate column names exist, find them
sorted(r1.columns)
# rename second column first_name
# rename 1st year graduated undergrad_graduated and second med_graduated
col_rename_dict = {
    'First': 'first_name', 'Last': 'last_name', 'Middle': 'middle_name',
    'First.1': 'honor_societies_first', 'Second': 'honor_societies_second', 
    'Third': 'honor_societies_third', 'Fourth': 'honor_societies_fourth',
    'Year Graduated': 'undergrad_year_grad', 'Year Graduated.1': 'medschool_year_grad'
}
r1.rename(columns=col_rename_dict, inplace=True)
r2.rename(columns=col_rename_dict, inplace=True)
r3.rename(columns=col_rename_dict, inplace=True)
# note that in data set r2 internship is spelled intership, correcting
r2.rename(
    columns={'Intership Year(s)': 'internship year(s)', 'Intership Hospital 1': 'internship hospital 1'}, inplace=True)
# add reviewer column
r1['reviewer'] = 1
r2['reviewer'] = 2
r3['reviewer'] = 3

In [96]:
# append the 2 data sets on top of each other, adding an indicator which reviewer they come from
all_appcards = pd.concat([r1, r2, r3], axis=0)


In [97]:
# verify the lengths of pieces all up to len of new data set
print(r1.shape[0] + r2.shape[0] + r3.shape[0] == all_appcards.shape[0])

True


In [98]:
# change all variable names to lowercase and insert _ instead of spaces
def to_lower(str_var):
    # lowercase, remove extraneous spaces, join with '_'
    lower = str_var.lower()
    return '_'.join(filter(None, lower.split(' ')))

In [99]:
# apply column name cleaning fnc
all_appcards.columns = map(to_lower, all_appcards.columns)

In [100]:
# drop all rows with first, last name NA
all_app_short = all_appcards.dropna(subset=['first_name', 'last_name'], how='all')

In [101]:
def id_poorlyfmtdates(str_date):
    try:
        dt = pd.to_datetime(str_date, format='%m/%d/%Y')
        return True
    except (ValueError, AssertionError):
        return False

In [102]:
mask = all_app_short.application_date.apply(id_poorlyfmtdates)

In [103]:
all_app_short.loc[~mask, :]

Unnamed: 0,address,age,application_date,associate_program_entered,bob,ca,cc,cord,citizenship,city,...,honor_societies_fourth,honor_societies_second,honor_societies_third,internship_hospital_1,internship_year(s),last_name,medschool_year_grad,middle_name,reviewer,undergrad_year_grad
681,158 Fox Meadow Rd.,,3/31971,,0.0,1,0.0,,,Scarsdale,...,PBK,AOA,Sigma XI,UNIVERSITY OF CALIFORNIA SAN FRANCISCO,1971-72,Clyman,1971,Ian,2,
3317,600 Ford Ave.,,41/8/1966,RA,0.0,1,0.0,1.0,US,Kingston,...,,PBK,,H LEE MOFFITT CANCER CTR & RES INSTITUTE,1966-67,Siegel,1966,Charles,2,


In [104]:
# change '3/31971 to 3/3/1971
# change 41/8/1966 to 4/8/1966 
all_app_short.loc[all_app_short.application_date=='3/31971', 'application_date'] = '3/3/1971'
all_app_short.loc[all_app_short.application_date=='41/8/1966', 'application_date'] = '4/8/1966'

In [105]:
# convert application date to date object
all_app_short.loc[:, 'application_date_dt'] = all_app_short.loc[:, 'application_date'].apply(lambda x: pd.to_datetime(x, format='%m/%d/%Y'))

In [106]:
# do the same date check for birth date columns
bdate_mask = all_app_short.date_of_birth.apply(id_poorlyfmtdates)
all_app_short.loc[~bdate_mask, ['first_name', 'last_name', 'date_of_birth']]
all_app_short.loc[(
        all_app_short.last_name=='Cook') & 
                  (all_app_short.middle_name=='James') & 
                  (all_app_short.first_name.isnull()), 'date_of_birth'] = '1/27/1940'

In [107]:
# all_app_short.loc[all_app_short.last_name=='Cook', ['first_name', 'last_name', 'middle_name', 'address', 'age', 'date_of_birth']]

In [108]:
# apply a basic string cleaning function to the names- removing all punctuation, changing to all uppercase

def trans_remov_punc(to_change, change_to):
    # removes specified punctuation using string maketrans (very fast, C lookups)
    #returns partially evaluated fnc
    trantab = string.maketrans(to_change, change_to)
    return funcy.func_partial(lambda x: x.translate(trantab))


def standardize_whitespace(pub_str):
    return ' '.join(filter(None, pub_str.split(' ')))


def remove_punc(pub_str):
    # function to remove punctuation
    nonelst = ' '*len(string.punctuation)
    fn = trans_remov_punc(string.punctuation, nonelst)
    new_str = fn(pub_str)
    # standardize spaces
    return standardize_whitespace(new_str)


def clean_names(name):
    # if name is missing, return null
    if pd.isnull(name):
        return np.nan
    # uppercase 
    try:
        upp = name.upper()
        return remove_punc(upp)
    except AttributeError:
        # this should catch any other weird issues, like floats, unicode etc
        print name
        return None


In [109]:
for n in NAMES:
    all_app_short.loc[:, 'clean_{}'.format(n)] = all_app_short.loc[:, n].apply(clean_names)

In [110]:
# pull off suffix in some last names into seperate column
def has_suffix(raw_last_name):
    # a boolean fnc to identify which rows may have a suffix
    last_lst = raw_last_name.split(' ')
    if len(last_lst) == 1:
        # if no white spaces in last name, only 1 word, so no suffix
        return False
    suffixes = ['JR', 'SR', 'I', 'II', 'III', 'IV', 'V', 'VI']
    # want to differentiate between suffix (JR, SR, I, II, III, IV, V) between last names with multiple parts (ex. st john)
    # check if last word in list
    return (last_lst[-1] in suffixes)

In [111]:
def get_suffix(clean_last_name):
    # this assumes the fnc will only be applied to rows id as those with a suffix
    return clean_last_name.split(' ')[-1]

In [112]:
def remove_suffix_from_last_name(last_name_raw):
    non_suffix = last_name_raw.split(' ')[:-1]
    return ' '.join(non_suffix)

In [113]:
suffix_mask = all_app_short.clean_last_name.apply(has_suffix)
# all_app_short.loc[suffix_mask, ['clean_last_name', 'clean_first_name', 'clean_middle_name']]

In [114]:
# for those last names that seem to have a suffix, pull into seperate column and keep everything last word of last name
all_app_short.loc[suffix_mask, 'clean_suffix'] = all_app_short.loc[suffix_mask, 'clean_last_name'].apply(get_suffix)
all_app_short.loc[suffix_mask, 'clean_last_name'] = all_app_short.loc[suffix_mask, 'clean_last_name'].apply(remove_suffix_from_last_name)

In [115]:
# some first names also contain some honors such as 'Pfizer Award' or 'Honor Society'
# these should be pulled into the honors and awards columns
def has_award(raw_name):
    # if the first name has honor or award it it, return True
    if pd.isnull(raw_name):
        return False
    return any(map(lambda x: x in raw_name, AWARDS_KEYWORDS))

In [116]:
# get a list of all the med school honors columns
honors_columns = [c for c in all_app_short.columns if 'honor' in c]
print honors_columns

has_award_mask = all_app_short['clean_first_name'].apply(has_award)

all_app_short.loc[has_award_mask, 'extra_honor'] = all_app_short.loc[has_award_mask, 'clean_first_name']

['honor_societies_first', 'honor_societies_fourth', 'honor_societies_second', 'honor_societies_third']


In [117]:
# create column mask for each row where one of the honors columns is blank
for hc in honors_columns:
    hc_mask = (has_award_mask) & (pd.isnull(all_app_short[hc]))
    all_app_short.loc[hc_mask, hc] = all_app_short.loc[hc_mask, 'extra_honor']
# check for any columns that already have full honors and cant be filled
all_app_short.loc[hc_mask, honors_columns].dropna(how='any')

Unnamed: 0,honor_societies_first,honor_societies_fourth,honor_societies_second,honor_societies_third


In [118]:
# drop extra honor columns
all_app_short2 = all_app_short.drop('extra_honor', axis=1)

In [119]:
# replace those honors first names with np.nan
all_app_short2.loc[hc_mask, 'clean_first_name'] = np.nan

In [146]:
m = re.search(r'\d{4}-\d{2,4}', '1972-74')

In [149]:
def is_year_range(med_school_str):
    # check if a year range coded into the med school columns 1972-73
    if pd.isnull(med_school_str):
        return False
    if re.search(r'\d{4}-\d{2,4}', med_school_str) is not None:
        return True

In [151]:
difflib.SequenceMatcher(None, 'QUICKLY', 'QUICKLY').ratio()

1.0

In [162]:
def str_sim(row):
    med_school = row['medical_school']
    last = row['clean_last_name']
    if pd.isnull(med_school) and pd.isnull(last):
        return 0
    if isinstance(med_school, float):
        return 0
    upp_med_school = clean_names(med_school)
    res = difflib.SequenceMatcher(None, last, upp_med_school).ratio()
    return res

In [163]:
# for those columns where med school is equal to last name or med_school is a year range, delete
all_app_short2.loc[:, 'school_name_sim'] = all_app_short2.loc[:, ['clean_last_name', 'medical_school']].apply(str_sim, axis=1)

In [167]:
all_app_short2.loc[all_app_short2.school_name_sim > .6, :]

Unnamed: 0,address,age,application_date,associate_program_entered,bob,ca,cc,cord,citizenship,city,...,medschool_year_grad,middle_name,reviewer,undergrad_year_grad,application_date_dt,clean_first_name,clean_middle_name,clean_last_name,clean_suffix,school_name_sim


In [165]:
all_app_short2.loc[all_app_short2.clean_last_name=='GRIBBLE', ['clean_last_name', 'medical_school', 'school_name_sim']]

Unnamed: 0,clean_last_name,medical_school,school_name_sim
1237,GRIBBLE,Stanford University School of Medicine,0.177778
1238,GRIBBLE,Stanford University School of Medicine,0.177778
1203,GRIBBLE,Stanford University School of Medicine,0.177778


In [48]:
# look at all the unique med school
# sorted(all_app_short.medical_school.unique())

In [168]:
all_app_short2.loc[:, 'clean_college'] = all_app_short2.undergraduate_school.apply(clean_names)

In [169]:
# college name standardization fnc
def clean_std_college_name(college_raw):
# need to change 'college to university' unless Boston college or BU remove ANDS, AT, THE expand UCLA to UCAL, UC Davis etc.
# remove mispellings 'collge', UNIVERWSITY, MASSACHUSSETTS 
# RENSSELAER UNIVERSITY and RENSSELAER POLYTECHNICAL INSTITUTE to RENSSELAER POLYTECHNIC INSTITUTE
# JOHN HOPKINS UNIVERSITY to JOHNS
# HOLY CROSS COLLEGE to HOLY CROSS
# FRANKLIN MARSHALL COLLEGE' to FRANKLIN MARSHALL
# DE PAUW to DEPAUW
# ASBURY ASHBURY 
# DREXEL INSTITUTE OF TECHNOLOGY to DREXEL UNIVERSITY
# A B BROWN UNIVERSITY to Brown
# DARTMOUTH MEDICAL SCHOOL to Dartmouth
# remove random 1961 at the end of strings, (anything after university unless univ is the first word)
    if pd.isnull(college_raw):
        return np.nan
    # if AT or AND or THE, remove
    to_remove = [
        ' AND ', ' AT ', 'THE ', ' COLLGE', 'UNIVERISTY', 'UNIVERWSITY', 'MASSACHUSSETTS', 'JOHN ', 'DE PAUW', 'ASBURY', 
    'DREXEL INSTITUTE OF TECHNOLOGY', 'A B BROWN UNIVERSITY', 'DARTMOUTH MEDICAL SCHOOL', 'RENSSELAER UNIVERSITY', 
    'RENSSELAER POLYTECHNICAL INSTITUTE', ' STE', 'COLLEGE OF HOLY CROSS', 'HOLLY CROSS', 'JOHNSS ',  'BERKLEY',
    'UC ', 'PITTSBURRGH', 'WESLYN', 'WILLAMS', 'GEORGIA TECH', 'NEW YORK UNIVERSITY UNIV', 
    'UNIVERSITY OF MICHIGAN IS A', 'OHIO', 'STATE UNIVERSITY OF NEW YORK AT BUFFALO']
    to_replace = [
        ' ', ' ', ' ', ' COLLEGE', 'UNIVERSITY', 'UNIVERSITY', 'MASSACHUSETTS', 'JOHNS ', 'DEPAUW', 'ASHBURY',
        'DREXEL UNIVERSITY', 'BROWN UNIVERSITY', 'DARTMOUTH', 'RENSSELAER POLYTECHNIC INSTITUTE', 
        'RENSSELAER POLYTECHNIC INSTITUTE', ' STATE', 'HOLY CROSS', 'HOLY CROSS', 'JOHNS ', 
        ' BERKELEY', 'UNIVERSITY OF CALIFORNIA ', 'PITTSBURGH', 'WESLEYAN', 'WILLIAMS', 
        'GEORGIA INSTITUTE OF TECHNOLOGY', 'NEW YORK', 'UNIVERSITY OF MICHIGAN', 'OHIO STATE', 'SUNY BUFFALO']
    word_pairs = zip(to_remove, to_replace)
    # list of words to replace
    words_in_str = filter(lambda (x, y): x in college_raw, word_pairs)
    trans_word = college_raw
    for to_remove_wrd, to_replace_wrd in words_in_str:
        trans_word = trans_word.replace(to_remove_wrd, to_replace_wrd)
        
    # after replacing the mispellings and removing and/at, remove everything after college/university
    if 'BOSTON' in trans_word:
        # then this string is BC or BU, so just return string
        return trans_word
    if trans_word.find('UNIVESITY ') == 0:
        return trans_word
    split_wrd = ' UNIVERSITY'
    if 'COLLEGE' in trans_word:
        split_wrd = ' COLLEGE'
    base_word = trans_word.split(split_wrd)[0]
    return standardize_whitespace(base_word)
    

In [170]:
# make college mispelling and different reference translation table
all_app_short2.loc[:, 'clean_college_trans'] = all_app_short2.clean_college.apply(clean_std_college_name)

In [171]:
# need to convert undergrad_year_grad and med_school_grad to numbers to maintain consistence
all_app_short2.loc[:, ['undergrad_year_grad', 'medschool_year_grad']] = all_app_short2.loc[:, ['undergrad_year_grad', 'medschool_year_grad']].apply(
    lambda x: pd.to_numeric(x, errors='coerce'))

In [172]:
# now, sort by names, med school, undergrad school, 
all_app_shorted = all_app_short2.sort_values(by=PERSONAL_INFO)

In [173]:
all_app_grouped = all_app_shorted.groupby(['clean_last_name', 'application_date_dt'])

In [174]:
# convert ca column to float62
all_app_shorted.loc[:, 'ca'] = all_app_shorted.loc[:, 'ca'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

In [175]:
def sorting_fnc(v):
    # if string entry, sort by length, otherwise return value
    if isinstance(v, str):
        return len(v)
    return v

def consolidate_holes(df_col):
    # for each dataframe of info on one person/application year pair, consolidate info
    # drop null values, get unique and take longest by string length or first numeric value
    # all dataframes are coming in with reviewer 2 on top, who seems to be more accurate
    # so take that answer if possible
    lst_vals = list(df_col.dropna().unique())
    if len(lst_vals) == 0:
        return np.nan
    if isinstance(lst_vals[0], str):
        lst_vals = sorted(lst_vals, key=sorting_fnc, reverse=True)
    return lst_vals[0]

def stringify_personal_info(df_row):
    # accepts a series, returns a list object
    val_series = df_row[['clean_first_name', 'clean_middle_name', 
                        'clean_last_name', 'clean_college_trans', 'medical_school', 'date_of_birth']]
    str_vals = [str(v) for v in val_series]
    return '_'.join(str_vals)


def add_sanity_check_row(df, vals=None):
    if vals is None:
        df.loc[:, 'sanity_check'] = np.nan
    else:
        df.loc[:, 'sanity_check'] = vals
    return df


def format_consolidated_data(df):
    # applies consolidate fnc to dataframe, converts it into a df that can be merged
    # sort df so reviewer 2 is on top
    df_sort = df.sort_values(by=['reviewer'], ascending=False)
    vals = map(lambda x: stringify_personal_info(df_sort.loc[x, :]), df_sort.index)
    d2_series = df_sort.apply(consolidate_holes)
    df_trans = pd.DataFrame(d2_series).T
    # create a column that compares string values in another column
    sanity_checks = '\n'.join(vals)
    return add_sanity_check_row(df_trans, sanity_checks)

In [177]:
all_app_short2.loc[all_app_short2.clean_last_name=='HASELBY', ['medical_school']]

Unnamed: 0,medical_school
1685,Indiana University School of Medicine
1709,Indiana University School of Medicine


In [178]:
def consolidate_candidate(candidate_df):
    # for each dataframe, with a common last name and application date, seperate into unique candidate pairs
    # easiest case, if 2 rows, 1 from reviewer 1 and 2, then consolidate
    df_len = candidate_df.shape[0]
    if df_len < 2:
        # if only 1 row in data group, return row
        return add_sanity_check_row(candidate_df)
    unique_first_names = candidate_df['clean_first_name'].dropna().unique()
    unique_undergrad = candidate_df['undergrad_year_grad'].dropna().unique()
    unique_college = candidate_df['clean_college_trans'].dropna().unique()
    if (len(unique_first_names) < 2 and len(unique_undergrad) < 2) or (len(unique_first_names) < 2 and len(unique_college) < 2):
        # if unique first names < 2 and < 2 diff years of undergrad OR 
        # unique first names < 2 and <2 unique college names
        # most likely this is the same person
        res = format_consolidated_data(candidate_df)
        return res
    # otherwise there are more than 1 person to combine
    if len(unique_first_names) >= 2:
        # if there are 2 or more different first names, try to group by first and last name
        # and then analyze each group seperately
        print candidate_df['clean_last_name']
        print candidate_df['application_date']
        new_grped = candidate_df.groupby(['clean_first_name', 'clean_last_name'])
        print 'group by first name'
        res = map(lambda (x, y): format_consolidated_data(y), new_grped)
        res2 = pd.concat(res)
        return res2 
    if len(unique_college) >= 2:
        # if two different colleges, try to group by last name and college name
        print 'group by college name'
        print candidate_df
        new_grped = candidate_df.groupby(['clean_first_name', 'clean_college_trans'])
        res = map(lambda (x, y): format_consolidated_data(y), new_grped)
        res2 = pd.concat(res)
        return res2 
    # otherise this is an edge case
    # otherwise, although the person doesn't match on unique first and undergrad years, they are still most likely one 
    # person
    print candidate_df
    print 'Hitting a bad edge case'
    raise AttributeError('Hitting a Bad Edge Case')
    return None

In [179]:
combined_rows = map(lambda (x, y): consolidate_candidate(y), all_app_grouped) 

105    ANDERSON
106    ANDERSON
104    ANDERSON
105    ANDERSON
Name: clean_last_name, dtype: object
105    6/10/1971
106    6/10/1971
104    6/10/1971
105    6/10/1971
Name: application_date, dtype: object
group by first name
206    BANTA
203    BANTA
Name: clean_last_name, dtype: object
206    3/19/1963
203    3/19/1963
Name: application_date, dtype: object
group by first name
247    BLASS
251    BLASS
Name: clean_last_name, dtype: object
247    4/13/1965
251    4/13/1965
Name: application_date, dtype: object
group by first name
297    BRENSIKE
293    BRENSIKE
Name: clean_last_name, dtype: object
297    3/23/1967
293    3/23/1967
Name: application_date, dtype: object
group by first name
729    COHEN
713    COHEN
Name: clean_last_name, dtype: object
729    1/25/1972
713    1/25/1972
Name: application_date, dtype: object
group by first name
820    DICKLER
800    DICKLER
Name: clean_last_name, dtype: object
820    3/18/1968
800    3/18/1968
Name: application_date, dtype: object
group by

In [180]:
unique_apps_df = pd.concat(combined_rows)
# combined_rows[1]
# [c for c in combined_rows if c.shape[1]!=74]

In [181]:
# sort columns
unique_apps_df.sort_index(axis=1, inplace=True)
# order columns and sort values
unique_apps_df.sort_values(by=['clean_last_name', 'application_date'], axis=0, inplace=True)


In [188]:
unique_apps_df.loc[unique_apps_df.clean_last_name=='HUMPHREY', PERSONAL_INFO+['application_date']]

Unnamed: 0,clean_first_name,clean_last_name,clean_middle_name,date_of_birth,medical_school,clean_college_trans,application_date
0,GEORGE,HUMPHREY,BENNETT,,University of Chicago Pritzker School of Medicine,,7/24/1962
1534,,HUMPHREY,,,University of Chicago Pritzker School of Medicine,,8/24/1961


In [None]:
MISPELLINGS = {'clean_last_name': {'HOMCY': 'HOMEY', 'DROBIS': 'DROBIN', 'DEFRONZO':' DEFRENZO'}}

In [None]:
# create uuid for each person

In [62]:
unique_apps_df.head()

Unnamed: 0,address,age,application_date,application_date_dt,associate_program_entered,bob,ca,cc,citizenship,city,...,sixth,state,teaching,undergrad_year_grad,undergraduate_school,unnamed:_59,unnamed:_61,withdrawal,year_accepted,zip_code
0,154 Beach 142nd Street,,4/8/1970,1970-04-08 00:00:00,CA,0.0,1.0,1.0,,Neponsit,...,,New York,1.0,,,,1,-9,1972.0,11694.0
0,6 Val Mar Place,,5/17/1966,1966-05-17 00:00:00,,0.0,1.0,0.0,US,San Carlos,...,,California,1.0,,,,1,-9,1967.0,
33,University of Washington,,1/1/1965,1965-01-01 00:00:00,RA,,,,,Seattle,...,,Washington,,,,,2,-9,1965.0,
0,67 Frederick Place,,4/8/1971,1971-04-08 00:00:00,,0.0,1.0,0.0,,Mt. Vernon,...,,New York,1.0,,,,1,0,,
0,2280 Loring Place,,4/10/1962,1962-04-10 00:00:00,,0.0,0.0,0.0,US,New York 68,...,,New York,0.0,,,,2,-9,1964.0,


In [63]:
# read to csv
unique_apps_df.to_csv(os.path.join(CARD_DATA_DIR, 'deduped_applicants.csv'))
unique_apps_df.to_pickle(os.path.join(CARD_DATA_DIR, 'deduped_applicants.p'))

In [None]:
g1 = list(all_app_grouped)[0][1]

In [None]:

g1 = all_app_grouped.get_group(('HARLOW JR', '1971-02-28'))
g2 = all_app_grouped.get_group(('ALEXANDER', '1969-04-08'))
g3 = all_app_grouped.get_group(('DRACHMAN', '1958-11-01'))
g4 = all_app_grouped.get_group(('SACKS', '4/21/1967'))
g5 = all_app_grouped.get_group(('ZIVIN', '4/4/1975'))

In [None]:
g5

In [None]:
out = consolidate_candidate(g1)
out