In [1]:
import difflib
from fuzzywuzzy import fuzz
import itertools
import pandas as pd
from collections import Counter
import numpy as np
import string
import funcy
import re
import os
import uuid
import math

%load_ext autoreload

%autoreload 2

from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES, FEMALE_MIDDLE_NAMES, 
    PICKLE_DIR)

from merging_functions import *

OUTPUT_CSV = True 
RAW_APPLICANT_DATA_FILENAME = 'raw_applicant_card_data.csv'
MISSING_APPDATE_FILENAME = 'index_cards_no_application_date.csv'
APP_SPELLING_CORRECTIONS = 'index_card_manual_corrections.xlsx'


PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']


In [2]:
all_appcards2 = pd.read_csv(os.path.join(CARD_DATA_DIR, RAW_APPLICANT_DATA_FILENAME))

# drop all rows with first, last name NA
all_app3 = all_appcards2.dropna(subset=['application_date'], how='all')


def id_poorlyfmtdates(str_date):
    try:
        dt = pd.to_datetime(str_date, format='%m/%d/%Y')
        return True
    except (ValueError, AssertionError):
        return False

mask = all_app3.application_date.apply(id_poorlyfmtdates)
# all_app3.loc[~mask, :]

# change '3/31971 to 3/3/1971
# change 41/8/1966 to 4/8/1966 
all_app3.loc[all_app3.application_date=='3/31971', 'application_date'] = '3/3/1971'
all_app3.loc[all_app3.application_date=='41/8/1966', 'application_date'] = '4/8/1966'

# convert application date to date object
all_app3.loc[:, 'application_date'] = all_app3['application_date'].apply(lambda x: pd.to_datetime(x))

# do the same date check for birth date columns
bdate_mask = all_app3.date_of_birth.apply(id_poorlyfmtdates)
all_app3.loc[~bdate_mask, ['first_name', 'last_name', 'date_of_birth']]
all_app3.loc[(
        all_app3.last_name=='Cook') & 
                  (all_app3.middle_name=='James') & 
                  (all_app3.first_name.isnull()), 'date_of_birth'] = '1/27/1940'

all_app3[NAME_COLS] = all_app3[RAW_NAME_COLS].applymap(clean_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [3]:
all_app3.loc[(all_app3.clean_last_name=='RUBENSTEIN') & (all_app3.clean_middle_name=='ALLAN'), 'application_date'] = pd.to_datetime('1/21/1973')

all_app3.loc[(
        all_app3.clean_last_name=='ROOT') & 
                  (all_app3.clean_first_name=='RICHARD'), 'application_date'] = pd.to_datetime('5/23/1963')

all_app3.loc[(
        all_app3.clean_last_name=='FREIDMAN') & (all_app3.clean_first_name=='STANFORD'), 'application_date'] = pd.to_datetime('01/01/1960')


In [4]:
all_app3.rename(columns={'medical_school':'original_medical_school'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [5]:
# deal with suffixes and honors
suffix_mask = all_app3.clean_last_name.apply(has_suffix)
# all_app_short.loc[suffix_mask, ['clean_last_name', 'clean_first_name', 'clean_middle_name']]

# for those last names that seem to have a suffix, pull into seperate column and keep everything last word of last name
all_app3.loc[suffix_mask, 'clean_suffix'] = all_app3.loc[suffix_mask, 'clean_last_name'].apply(get_suffix)
all_app3.loc[suffix_mask, 'clean_last_name'] = all_app3.loc[suffix_mask, 'clean_last_name'].apply(remove_suffix_from_last_name)

# some first names also contain some honors such as 'Pfizer Award' or 'Honor Society'
# these should be pulled into the honors and awards columns
has_award_fnc = funcy.rpartial(has_award, AWARDS_KEYWORDS)


# get a list of all the med school honors columns
honors_columns = [c for c in all_app3.columns if 'honor' in c]
print honors_columns

has_award_mask = all_app3['clean_first_name'].apply(has_award_fnc)

all_app3.loc[has_award_mask, 'extra_honor'] = all_app3.loc[has_award_mask, 'clean_first_name']

# create column mask for each row where one of the honors columns is blank
for hc in honors_columns:
    hc_mask = (has_award_mask) & (pd.isnull(all_app3[hc]))
    all_app3.loc[hc_mask, hc] = all_app3.loc[hc_mask, 'extra_honor']
# check for any columns that already have full honors and cant be filled
all_app3.loc[hc_mask, honors_columns].dropna(how='any')

# drop extra honor columns
all_app4 = all_app3.drop('extra_honor', axis=1)

# replace those honors first names with np.nan
all_app4.loc[has_award_mask, 'clean_first_name'] = np.nan

# for those columns where med school is equal to last name or med_school is a year range, delete
med_school_str_sim = funcy.rpartial(str_sim, 'medical_school', 'clean_last_name')
all_app4.loc[:, 'school_name_sim'] = all_app4.loc[:, ['clean_last_name', 'medical_school']].apply(med_school_str_sim, axis=1)

all_app4.loc[all_app4.school_name_sim > .6, :]

all_app4.loc[:, 'clean_college'] = all_app4.undergraduate_school.apply(clean_names)

to_remove_college = [
    ' AND ', ' AT ', 'THE ', ' COLLGE', 'UNIVERISTY', 'UNIVERWSITY', 'MASSACHUSSETTS', 'JOHN ', 'DE PAUW', 'ASBURY', 
'DREXEL INSTITUTE OF TECHNOLOGY', 'A B BROWN UNIVERSITY', 'DARTMOUTH MEDICAL SCHOOL', 'RENSSELAER UNIVERSITY', 
'RENSSELAER POLYTECHNICAL INSTITUTE', ' STE', 'COLLEGE OF HOLY CROSS', 'HOLLY CROSS', 'JOHNSS ',  'BERKLEY',
'UC ', 'PITTSBURRGH', 'WESLYN', 'WILLAMS', 'GEORGIA TECH', 'NEW YORK UNIVERSITY UNIV', 
'UNIVERSITY OF MICHIGAN IS A', 'OHIO', 'STATE UNIVERSITY OF NEW YORK AT BUFFALO']
to_replace_college = [
    ' ', ' ', ' ', ' COLLEGE', 'UNIVERSITY', 'UNIVERSITY', 'MASSACHUSETTS', 'JOHNS ', 'DEPAUW', 'ASHBURY',
    'DREXEL UNIVERSITY', 'BROWN UNIVERSITY', 'DARTMOUTH', 'RENSSELAER POLYTECHNIC INSTITUTE', 
    'RENSSELAER POLYTECHNIC INSTITUTE', ' STATE', 'HOLY CROSS', 'HOLY CROSS', 'JOHNS ', 
    ' BERKELEY', 'UNIVERSITY OF CALIFORNIA ', 'PITTSBURGH', 'WESLEYAN', 'WILLIAMS', 
    'GEORGIA INSTITUTE OF TECHNOLOGY', 'NEW YORK', 'UNIVERSITY OF MICHIGAN', 'OHIO STATE', 'SUNY BUFFALO']

clean_college_fnc = funcy.rpartial(clean_std_college_name, to_remove_college, to_replace_college)

# make college mispelling and different reference translation table
all_app4.loc[:, 'clean_college_trans'] = all_app4.clean_college.apply(clean_college_fnc)

all_app4.drop(['clean_college', 'school_name_sim'], axis=1, inplace=True)

all_app4.loc[:, 'medical_school'] = all_app4.original_medical_school.apply(funcy.rcompose(clean_names, clean_med_school))

all_app4.medical_school.sort_values().unique()

all_app4.loc[pd.isnull(all_app4.medical_school), 'medical_school'] = np.nan

# need to convert undergrad_year_grad and med_school_grad to numbers to maintain consistency
all_app4.loc[:, ['undergrad_year_grad', 'medschool_year_grad']] = all_app4.loc[:, ['undergrad_year_grad', 'medschool_year_grad']].apply(
    lambda x: pd.to_numeric(x, errors='coerce'))

# now, sort by names, med school, undergrad school, 
all_app5 = all_app4.sort_values(by=PERSONAL_INFO)

# LAST_NAME_MISSPELLINGS = {
#         'HOMCY': 'HOMEY', 'DROBIS': 'DROBIN', 'DEFRONZO':'DEFRENZO', 
#         'BRADEN 3R': 'BRADEN', 'BORKER': 'BORER', 'CASTLES': 'CASTLE',
#         'CUONO': 'CUOMO', 'CYRULNIK': 'CYRULINK', 'EISENBATH': 'EISENBARTH', 
#         'ELLIOTT': 'ELIOT', 'FINKLESTEIN': 'FINKELSTEIN', 'HEINRICK': 'HEINRICH', 
#         'HERLIKY': 'HERLIHY', 'HIMMELHOCK': 'HIMMELHOCH', 'JANOWSKY': 'JANKOWSKY', 
#         'KLINENBERG': 'KLINEBERG', 'KORNFELD': 'KORNFIELD', 'NEIDORF': 'NEIDOFT',
#         'OLEINICK': 'OLENICK', 'ROSKES': 'ROSKE'
# }
# removed CUONO, DROBIS, 


LAST_NAME_MISSPELLINGS = {
        'HOMCY': 'HOMEY', 
        'BRADEN 3R': 'BRADEN', 'BORKER': 'BORER', 'CASTLES': 'CASTLE',
        'CYRULNIK': 'CYRULINK', 'EISENBATH': 'EISENBARTH', 
        'HEINRICK': 'HEINRICH', 
        'HERLIKY': 'HERLIHY', 'HIMMELHOCK': 'HIMMELHOCH', 'JANOWSKY': 'JANKOWSKY', 
        'KLINENBERG': 'KLINEBERG', 'KORNFELD': 'KORNFIELD', 'NEIDORF': 'NEIDOFT',
        'OLEINICK': 'OLENICK', 'ROSKES': 'ROSKE'
}

replace_last_name_fnc = funcy.rpartial(replace_last_name, LAST_NAME_MISSPELLINGS)

# correct last name mispellings
all_app5.loc[:, 'clean_last_name'] = all_app5.loc[:, 'clean_last_name'].apply(replace_last_name_fnc)
all_app5.loc[all_app5.clean_last_name=='MORTON', 'clean_first_name'] = 'JOHN'

# convert ca column to float62
all_app5.loc[:, 'ca'] = all_app5.loc[:, 'ca'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


['honor_societies_first', 'honor_societies_fourth', 'honor_societies_second', 'honor_societies_third']


In [6]:
female_mask = (
    (all_app5['clean_first_name'].isin(FEMALE_FIRST_NAMES)))
#     | (
#         all_app5['clean_middle_name'].isin(FEMALE_MIDDLE_NAMES)))
all_app5['is_female'] = 0
all_app5.loc[female_mask & ~pd.isnull(all_app5['clean_first_name']), 'is_female'] = 1

In [7]:
all_app5 = all_app5[all_app5.is_female==0]

In [8]:
def get_first_letter(str_var):
    if pd.isnull(str_var) or str_var=='':
        return np.nan
    return str_var[0]

In [28]:
last_names = 'BROWN'

In [29]:
all_app5.loc[all_app5.clean_last_name==last_names, NAME_COLS+['medical_school', 'medschool_grad_year']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,medical_school,medschool_grad_year
4073,BEN,MAURICE,BROWN,STANFORD,
4043,BRUCE,GREGORY,BROWN,JOHNS HOPKINS,
3972,CLARENCE,H,BROWN,EMORY,
3843,DONALD,D,BROWN,UNIVERSITY OF CHICAGO,
4126,EDWARD,MEIGS,BROWN,HARVARD,
4208,ERIC,JOEL,BROWN,HARVARD,
4209,FRANK,R,BROWN,WASHINGTON,
3999,JAMES,ALAN,BROWN,ROCHESTER,
3973,JAMES,EDWARD,BROWN,YALE,
4163,JAMES,KINGSBURY,BROWN,JOHNS HOPKINS,


In [9]:
# merge in manual corrections excel sheet
manual_fixes = pd.read_excel(os.path.join(CORRECTIONS_DIR, 'index_card_manual_corrections.xlsx')).rename(
    columns={
        'clean_medical_school': 'medical_school', 
        'to_fix_clean_medical_school': 'to_fix_medical_school'})

In [10]:
c = filter(lambda x: '_year' not in x, manual_fixes.columns)
manual_fixes.loc[:, c] = manual_fixes[c].applymap(clean_names)

In [11]:
# manual_fixes
# manual_fixes[manual_fixes.clean_last_name=='ARON']

In [12]:
# for the people who match, consolidate the columns

missing_names = pd.merge(
    left=all_app5, right=manual_fixes, on=['clean_first_name', 'clean_middle_name', 'clean_last_name', 'medical_school'], how='inner')

print manual_fixes.shape
print missing_names.shape
# consolidate columns
for x in ['medical_school', 'clean_first_name', 'clean_middle_name', 'medschool_year_grad']:
    mask = ~pd.isnull(missing_names['to_fix_{}'.format(x)])
    missing_names.loc[mask, x] = missing_names.loc[mask, 'to_fix_{}'.format(x)]
    
    
# drop extra columns
missing_names2 = missing_names.drop(
    (c for c in missing_names.columns if c not in all_app5.columns), axis=1).drop_duplicates([RAW_CARD_ID])

all_app6  = pd.concat(
    [all_app5.loc[~all_app5[RAW_CARD_ID].isin(missing_names2[RAW_CARD_ID]), :], missing_names2], axis=0)

(373, 9)
(328, 78)


In [13]:
manual_fixes.loc[manual_fixes['clean_last_name'].apply(lambda x: False if pd.isnull(x) else x.startswith('D')), :]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,medical_school,to_fix_clean_first_name,to_fix_clean_middle_name,to_fix_clean_last_name,to_fix_medical_school,to_fix_medschool_year_grad
81,,,DANFORTH,,DAVID,,DANFORTH,NORTHWESTERN,
104,,C,DAVIE,,JAMES,C,DAVIE,UNIVERSITY OF ALABAMA,
105,,JOHN A,DAVIES,MIAMI,PETER,JOHN,DAVIES,MIAMI,
106,PETER,RONALD,DEAL,EMORY,DAVEY,RONALD,DEAL,ARKANSAS,
107,,A,DEFRONZO,HARVARD,RALPH,A,DEFRENZO,HARVARD,
108,,ROBERT,DELONG,HARVARD,GEORGE,ROBERT,DELONG,HARVARD,
109,G,ROBERT,DELONG,HARVARD,GEORGE,ROBERT,DELONG,HARVARD,
110,,R,DELONG,HARVARD,MAHLON,R,DELONG,HARVARD,
111,,,DREWS,COLORADO HEALTH SCIENCES CENTER,GENEVIVE,,DREWS,COLORADO HEALTH SCIENCES CENTER,
112,,E,DIEZMAN,,DALE,E,DIETZMAN,VERMONT MEDICAL CENTER,


In [17]:
last_name = 'WOLFISH'


# f_name = 'ADAM'
# m_name = 'NORMAN'
# all_app5.loc[(
#         all_app5.clean_first_name==f_name) & (all_app5.clean_middle_name==m_name), NAME_COLS+['medical_school', 'residency_hospital', 'internship_hospital_1']]
all_app5.loc[all_app5.clean_last_name==last_name, NAME_COLS+['medical_school', 'residency_hospital', 'internship_hospital_1']]
# missing_names.loc[missing_names.clean_last_name==last_name, NAME_COLS+['medical_school']]
# manual_fixes.loc[manual_fixes.clean_first_name=='LAWRENCE',  NAME_COLS+['medical_school']]
# manual_fixes.loc[manual_fixes.clean_last_name==last_name,  NAME_COLS+['medical_school']]
all_app6.loc[all_app6.clean_last_name==last_name,  NAME_COLS+['medical_school', 'medschool_year_grad']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,medical_school,medschool_year_grad
7703,PAUL,STUART,WOLFISH,NYU,1973.0
7738,PAUL,,WOLFISH,NYU,1973.0


In [38]:
all_app6['clean_first_initial'] = all_app6.clean_first_name.apply(get_first_letter)
all_app6['clean_middle_initial'] = all_app6.clean_middle_name.apply(get_first_letter)
all_app6['application_year'] = all_app6.application_date.apply(lambda x: pd.to_datetime(x).year)


In [86]:
all_app6.loc[(all_app6['application_year'] > 1990) | (all_app6['application_year'] < 1950), 'application_year'] = np.nan

In [87]:
# drop people with female names and two columns
is_female_mask = ((all_app6.clean_first_name.isin(FEMALE_FIRST_NAMES)) | (
        all_app6.clean_middle_name.isin(FEMALE_MIDDLE_NAMES)))
# RENAME INTERNSHIP HOSPITAL COL
all_app7 = all_app6.loc[~is_female_mask, :].drop(['Unnamed: 0',"daniel's_comments"], axis=1).rename(
    columns={'internship_hospital_1': 'internship_hospital'})

In [88]:
# split into reviewer 1 and 2, and try to match
all_app7['fuzzy_merge_col'] = all_app7[
    ['clean_first_name', 'clean_middle_name', 'clean_last_name']].apply(create_str_merge, axis=1)
rev2 = df_get_closest_matches(all_app7[all_app7.reviewer==2], all_app7[all_app7.reviewer==1], 'fuzzy_merge_col') 


feature_dict = {
    'clean_first_name': get_name_str_sim,
    'clean_middle_name': get_name_str_sim,
    'clean_last_name': get_name_str_sim,
    'medical_school': get_name_str_sim,
    'application_year': get_dt_sim,
    'address': get_name_str_sim
}

rev3 = add_similarity_features(rev2, feature_dict, check_match)

rev1_counter = Counter(all_app7[all_app7.reviewer==1].clean_last_name.values)
rev2_counter = Counter(all_app7[all_app7.reviewer==2].clean_last_name.values)
rev3['last_name_counts_1'] = rev3.clean_last_name_1.apply(lambda x: rev1_counter[x])
rev3['last_name_counts_2'] = rev3.clean_last_name_2.apply(lambda x: rev2_counter[x])

# now, sort by is_match, similarity scores and only keep 1 uuid from each data set
last_name_unique_mask = (
    (rev3.last_name_counts_1==1) & (rev3.last_name_counts_2==1) & (
        rev3.application_year_sim<4) & (rev3.medical_school_sim > .8))
rev3.loc[last_name_unique_mask, 'is_match'] = 1

In [89]:
rev3.is_match.describe()

count    3687.000000
mean        0.778953
std         0.415008
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: is_match, dtype: float64

In [90]:
sims_cols = ['medical_school_sim', 'address_sim', 'clean_middle_name_sim', 'clean_first_name_sim']

rev4 = rev3.loc[~pd.isnull(rev3.index), :].sort_values([
        'raw_uuid_2', 'raw_uuid_1', 'is_match']+sims_cols, ascending=False)
print rev3.shape
print rev4.shape
rev5 = filter_one_match_per_group(rev4, 'raw_uuid_1', {'raw_uuid_2': 'raw_uuid_3'}, sims_cols)
print rev5.shape
rev6 = filter_one_match_per_group(rev5, 'raw_uuid_2', {'raw_uuid_1': 'raw_uuid_4'}, sims_cols)
print rev6.shape

(3687, 155)
(3535, 155)
                      raw_uuid_1  raw_uuid_3
JAMES L SCHERER           7084.0        3493
JOHN STONER               7078.0        3487
STEPHEN KARAS             5617.0        1971
OLIVER WILLIAM JONES      5449.0        1797
PAUL HINKES               5308.0        1656
MARC A FRADER             4801.0        1134
(2847, 157)
Empty DataFrame
Columns: [raw_uuid_2, raw_uuid_4]
Index: []
(2845, 159)


In [91]:
del rev6['raw_uuid_4']
rev6[rev6.clean_last_name_1=='BERG']

Unnamed: 0,raw_uuid_2,address_2,age_2,application_date_2,associate_program_entered_2,bob_2,ca_2,cc_2,cord_2,citizenship_2,...,clean_last_name_sim,application_year_sim,address_sim,clean_middle_name_sim,is_match,last_name_counts_1,last_name_counts_2,raw_uuid_1_duplicate,raw_uuid_3,raw_uuid_2_duplicate
2486,447,5710 Chadowes Road,,1972-02-25,,0.0,1.0,0.0,,,...,100.0,0.0,100.0,100.0,1,3,3,0.0,,0.0


In [92]:
matched_ids = np.concatenate([
        rev6.raw_uuid_1.dropna().unique(), rev6.raw_uuid_2.dropna().unique(), rev6.raw_uuid_3.dropna().unique()], 
                        axis=0)
print matched_ids.shape

(5696,)


In [93]:
# try to merge on middle initial and clean last name

unmatched_r1 = get_nonmatched(all_app7[all_app7.reviewer==1], matched_ids, 'raw_uuid')
unmatched_r2 = get_nonmatched(all_app7[all_app7.reviewer==2], matched_ids, 'raw_uuid')

unmatched_r1['fuzzy_merge_col'] = unmatched_r1[
    ['clean_middle_initial', 'clean_last_name']].apply(create_str_merge, axis=1)
unmatched_r2['fuzzy_merge_col'] = unmatched_r2[
    ['clean_middle_initial', 'clean_last_name']].apply(create_str_merge, axis=1)

(785, 74)
(616, 74)


In [94]:
rev2_counter = Counter(unmatched_r2.clean_last_name.values)
rev1_counter = Counter(unmatched_r1.clean_last_name.values)

match_round2 = df_get_closest_matches(unmatched_r2, unmatched_r1, 'fuzzy_merge_col')

In [95]:
middle_last3 = add_similarity_features(match_round2, feature_dict, check_match)

middle_last3['last_name_counts_1'] = middle_last3.clean_last_name_1.apply(lambda x: rev1_counter[x])
middle_last3['last_name_counts_2'] = middle_last3.clean_last_name_2.apply(lambda x: rev2_counter[x])

In [96]:
# now, sort by is_match, similarity scores and only keep 1 uuid from each data set
last_name_unique_mask = (
    (middle_last3.last_name_counts_1==1) & (middle_last3.last_name_counts_2==1) & (
        middle_last3.application_year_sim<2) & (middle_last3.medical_school_sim > .8))
middle_last3.loc[last_name_unique_mask, 'is_match'] = 1

In [97]:
middle_last4 = middle_last3.loc[~pd.isnull(middle_last3.index), :].sort_values([
        'raw_uuid_2', 'raw_uuid_1', 'is_match']+sims_cols, ascending=False)
print middle_last3.shape
print middle_last4.shape
middle_last5 = filter_one_match_per_group(middle_last4, 'raw_uuid_1', {'raw_uuid_2': 'raw_uuid_3'}, sims_cols)
print middle_last5.shape
middle_last6 = filter_one_match_per_group(middle_last5, 'raw_uuid_2', {'raw_uuid_1': 'raw_uuid_4'}, sims_cols)
print middle_last6.shape

(895, 155)
(866, 155)
Empty DataFrame
Columns: [raw_uuid_1, raw_uuid_3]
Index: []
(556, 157)
Empty DataFrame
Columns: [raw_uuid_2, raw_uuid_4]
Index: []
(556, 159)


In [98]:
middle_last6.loc[~pd.isnull(middle_last6.raw_uuid_4),['raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3']]

Unnamed: 0,raw_uuid_1,raw_uuid_2,raw_uuid_3


In [99]:
matched_ids = np.concatenate([
        rev6.raw_uuid_1.dropna().unique(), rev6.raw_uuid_2.dropna().unique(), rev6.raw_uuid_3.dropna().unique(), 
        middle_last6.raw_uuid_1.dropna().unique(), middle_last6.raw_uuid_2.dropna().unique(), middle_last6.raw_uuid_3.dropna().unique()], 
                        axis=0)
print matched_ids.shape

(6808,)


In [108]:
# people who don't match on first or last
all_app7a = all_app7.drop('fuzzy_merge_col', axis=1)
unmatched = get_nonmatched(all_app7a, matched_ids, 'raw_uuid')

(749, 73)


In [109]:
matches = pd.concat([rev6, middle_last6], axis=0).drop(['raw_uuid_1_duplicate', 'raw_uuid_2_duplicate'], axis=1)
matches1 = consolidate_merge_cols(matches, ['_1', '_2'], ['application_year', 'raw_uuid'])

['address', 'age', 'application_date', 'associate_program_entered', 'bob', 'ca', 'cc', 'citizenship', 'city', 'clean_college_trans', 'clean_first_initial', 'clean_first_name', 'clean_last_name', 'clean_middle_initial', 'clean_middle_name', 'clean_suffix', 'clinical', 'cord', 'date_of_birth', 'dbs', 'fifth', 'first_name', 'honor_societies_first', 'honor_societies_fourth', 'honor_societies_second', 'honor_societies_third', 'ic', 'internship_hospital', 'internship_year(s)', 'is_female', 'last_name', 'last_name_counts', 'medical_school', 'medschool_year_grad', 'middle_name', 'nci', 'nei', 'nhi', 'nhli', 'niaid', 'niamd', 'niamdd', 'nichd', 'nichhd', 'nidr', 'niehs', 'nigms', 'nimh', 'nindb', 'ninds', 'oir', 'original_medical_school', 'other', 'pharm_ra', 'pi', 'ra', 'rejected', 'rejection_date', 'research', 'residency_hospital', 'residency_type', 'residency_year(s)', 'reviewer', 'sa', 'sixth', 'state', 'teaching', 'undergrad_year_grad', 'undergraduate_school', 'withdrawal', 'year_accepted'

In [110]:
# append people by reviewer 3 and people not matched but reviewed by reviewer 1 or 2
full_matches = pd.concat([matches1, unmatched],
                      axis=0, ignore_index=True).sort_values(
                            ['clean_last_name', 'clean_middle_name', 'clean_first_name']).reset_index(drop=True)

In [111]:
full_matches1 = full_matches.drop([c for c in full_matches.columns if c.endswith('_sim') or '_counts' in c or c.endswith('_duplicate')], axis=1)

In [112]:
full_matches[~pd.isnull(full_matches.raw_uuid_4)]

Unnamed: 0,address,address_sim,age,application_date,application_year,application_year_1,application_year_2,application_year_sim,associate_program_entered,bob,...,reviewer,sa,sixth,state,teaching,undergrad_year_grad,undergraduate_school,withdrawal,year_accepted,zip_code


In [113]:
# remove duplicate data from application year
full_matches1.loc[pd.isnull(full_matches1.application_year), 'application_year'] = full_matches1.loc[
    pd.isnull(full_matches1.application_year), 'application_year_2'] 
full_matches1.loc[pd.isnull(full_matches1.application_year), 'application_year'] = full_matches1.loc[
    pd.isnull(full_matches1.application_year), 'application_year_1'] 

full_matches1.loc[pd.isnull(full_matches1.application_year_1), 'application_year_1'] = full_matches1.loc[
    pd.isnull(full_matches1.application_year_1), 'application_year_2'] 

full_matches1.loc[full_matches1.application_year_1==full_matches1.application_year_2, ]
full_matches1.loc[~pd.isnull(full_matches1.application_year_2), ]

dup_app_year_mask= full_matches1.application_year==full_matches1.application_year_2
full_matches1.loc[dup_app_year_mask, 'application_year_2'] = np.nan

dup_app_year_mask= full_matches1.application_year_1==full_matches1.application_year_2
full_matches1.loc[dup_app_year_mask, 'application_year_2'] = np.nan

dup_app_year_mask= full_matches1.application_year==full_matches1.application_year_1
full_matches1.loc[dup_app_year_mask, 'application_year_1'] = np.nan



full_matches1.loc[~pd.isnull(full_matches1.application_year_2), ['application_year', 'application_year_1', 'application_year_2']]

Unnamed: 0,application_year,application_year_1,application_year_2


In [114]:
# remove uuid dups

full_matches1.loc[pd.isnull(full_matches1.raw_uuid), 'raw_uuid'] = full_matches1.loc[
    pd.isnull(full_matches1.raw_uuid), 'raw_uuid_2'] 
full_matches1.loc[pd.isnull(full_matches1.raw_uuid), 'raw_uuid'] = full_matches1.loc[
    pd.isnull(full_matches1.raw_uuid), 'raw_uuid_1'] 

full_matches1.loc[pd.isnull(full_matches1.raw_uuid_1), 'raw_uuid_1'] = full_matches1.loc[
    pd.isnull(full_matches1.raw_uuid_1), 'raw_uuid_2'] 

dup_uuid_mask= full_matches1.raw_uuid==full_matches1.raw_uuid_2
full_matches1.loc[dup_uuid_mask, 'raw_uuid_2'] = np.nan

dup_uuid_mask= full_matches1.raw_uuid_1==full_matches1.raw_uuid_2
full_matches1.loc[dup_uuid_mask, 'raw_uuid_2'] = np.nan

dup_uuid_mask= full_matches1.raw_uuid==full_matches1.raw_uuid_1
full_matches1.loc[dup_uuid_mask, 'raw_uuid_1'] = np.nan

dup_uuid_mask= full_matches1.raw_uuid==full_matches1.raw_uuid_3
full_matches1.loc[dup_uuid_mask, 'raw_uuid_3'] = np.nan
# full_matches1.loc[full_matches1.raw_uuid_1==full_matches1.raw_uuid_2, ['raw_uuid_1', 'raw_uuid_2']]
full_matches1.loc[~pd.isnull(full_matches1.raw_uuid_3), ['raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3']]

Unnamed: 0,raw_uuid,raw_uuid_1,raw_uuid_2,raw_uuid_3


In [115]:
full_matches2 = full_matches1.drop(['raw_uuid_2', 'raw_uuid_3', 'application_year_2', 'raw_uuid_4'], axis=1)

In [120]:
# del all_app3, all_appcards2, all_app4, all_app5, all_app6, all_app7a

In [121]:
full_matches2['fuzzy_merge_col'] = full_matches2[
    ['clean_middle_initial', 'clean_last_name', 'medical_school']].apply(create_str_merge, axis=1)

last_name_counter = Counter(full_matches2.clean_last_name)
full_matches2.loc[:, 'last_name_counts'] = full_matches2.apply(
    lambda x: last_name_counter[x['clean_last_name']], axis=1)

possible_dups = full_matches2[full_matches2['last_name_counts']>1].sort_values(
    ['clean_last_name', 'clean_middle_name', 'medical_school', 'city', 'application_year'])

people_match = df_get_closest_matches(possible_dups, possible_dups, 'fuzzy_merge_col', suffixes=['_x', '_y'])

In [122]:
# for people with the same information, drop from the data set
same_person_mask = (
        (people_match.raw_uuid_x==people_match.raw_uuid_y) & (people_match.raw_uuid_1_x==people_match.raw_uuid_1_y))

# need to add a second mask for people who have only 1 uuid
same_person_mask2 = (
        (people_match.raw_uuid_x==people_match.raw_uuid_y) & (pd.isnull(people_match.raw_uuid_1_x)) &
            (pd.isnull(people_match.raw_uuid_1_y)))

people_match2 = people_match[~(same_person_mask | same_person_mask2)]
print people_match2.shape

(318, 154)


In [123]:
# NOW, need to redo the merging process, but merge in based on same people, not just same application year
def check_similar(row):
        # address and application year match
    if row['application_year_sim'] > 3:
        return 0
    if row['medical_school_sim'] > 80 and row['clean_middle_name_sim'] > 60:
        return 1
    if row['medical_school_sim'] > 80 and row['clean_first_name_sim'] > 60:
        return 1
    if row['address_sim'] > 60 and row['medical_school_sim'] > 80 and (
            pd.isnull(row['clean_first_name_sim']) or row['clean_first_name_sim'] > 80) :
        return 1
    return 0



In [127]:
people_match3 = add_similarity_features(people_match2, feature_dict, check_similar, suffixes=['_x', '_y'])

In [128]:
people_match3.loc[people_match3.clean_last_name_x.isin(['ALEXANDER', 'ALPERT']), 
                  ['is_match', 'clean_first_name_sim', 
                   'clean_middle_initial_x', 'clean_first_name_x', 'medical_school_sim', 'clean_middle_name_x', 'clean_middle_name_y']]

Unnamed: 0,is_match,clean_first_name_sim,clean_middle_initial_x,clean_first_name_x,medical_school_sim,clean_middle_name_x,clean_middle_name_y
C ALEXANDER DUKE,1,100.0,C,JOHN,100.0,CHARLES,C
C ALEXANDER DUKE,1,100.0,C,JOHN,100.0,C,CHARLES
S ALPERT HARVARD,1,100.0,S,JOSEPH,100.0,STEPHEN,STEPHEN
S ALPERT HARVARD,1,100.0,S,JOSEPH,100.0,STEPHEN,STEPHEN


In [129]:
people_match4 = people_match3[people_match3['is_match']==1].reset_index().drop_duplicates(subset='index',keep='first')
people_match4.shape

(131, 162)

In [130]:
people_match5 = consolidate_merge_cols(people_match4, ['_x', '_y'], ['application_year', 'application_year_1', 'raw_uuid', 'raw_uuid_1'])
people_match6 = people_match5.drop([c for c in full_matches.columns if c.endswith('_sim') or '_counts' in c or c.endswith('_duplicate')], axis=1)

['address', 'age', 'application_date', 'associate_program_entered', 'bob', 'ca', 'cc', 'citizenship', 'city', 'clean_college_trans', 'clean_first_initial', 'clean_first_name', 'clean_last_name', 'clean_middle_initial', 'clean_middle_name', 'clean_suffix', 'clinical', 'cord', 'date_of_birth', 'dbs', 'fifth', 'first_name', 'honor_societies_first', 'honor_societies_fourth', 'honor_societies_second', 'honor_societies_third', 'ic', 'internship_hospital', 'internship_year(s)', 'is_female', 'is_match', 'last_name', 'medical_school', 'medschool_year_grad', 'middle_name', 'nci', 'nei', 'nhi', 'nhli', 'niaid', 'niamd', 'niamdd', 'nichd', 'nichhd', 'nidr', 'niehs', 'nigms', 'nimh', 'nindb', 'ninds', 'oir', 'original_medical_school', 'other', 'pharm_ra', 'pi', 'ra', 'rejected', 'rejection_date', 'research', 'residency_hospital', 'residency_type', 'residency_year(s)', 'reviewer', 'sa', 'sixth', 'state', 'teaching', 'undergrad_year_grad', 'undergraduate_school', 'withdrawal', 'year_accepted', 'zip_c

In [131]:
# consolidate uuid columns
people_match6['raw_uuid'] = people_match6['raw_uuid_x']
people_match6['raw_uuid_1'] = people_match6['raw_uuid_1_x']
people_match6['raw_uuid_2'] = np.nan
people_match6['raw_uuid_3'] = np.nan

fill_in_y = (
    (people_match6.raw_uuid_y!=people_match6.raw_uuid_x) & (people_match6.raw_uuid_y!=people_match6.raw_uuid_1))

people_match6.loc[fill_in_y, 'raw_uuid_2'] = people_match6.loc[fill_in_y, 'raw_uuid_y']

fill_in_y_1 = (
    (people_match6.raw_uuid_1_y!=people_match6.raw_uuid) & (people_match6.raw_uuid_1_y!=people_match6.raw_uuid_1) &
        (people_match6.raw_uuid_1_y!=people_match6.raw_uuid_2))

people_match6.loc[fill_in_y_1, 'raw_uuid_3'] = people_match6.loc[fill_in_y_1, 'raw_uuid_1_y']

people_match6.loc[pd.isnull(people_match6.raw_uuid_1), 'raw_uuid_1'] = people_match6.loc[pd.isnull(people_match6.raw_uuid_1), 'raw_uuid_3']
people_match6.loc[pd.isnull(people_match6.raw_uuid_1), 'raw_uuid_1'] = people_match6.loc[pd.isnull(people_match6.raw_uuid_1), 'raw_uuid_2']

people_match6.loc[people_match6.raw_uuid_1==people_match6.raw_uuid_3, 'raw_uuid_3'] = np.nan

# drop all uuidds except for i, 2, 2
UUID_COLS = ['raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3']

people_match7 = people_match6.drop(['raw_uuid_x', 'raw_uuid_y', 'raw_uuid_1_x', 'raw_uuid_1_y'], axis=1)

In [132]:
# consolidate_app_year columns
# consolidate uuid columns
people_match7['application_year'] = people_match7['application_year_x']
people_match7['application_year_1'] = people_match7['application_year_1_x']
people_match7['application_year_2'] = np.nan
people_match7['application_year_3'] = np.nan

fill_in_y = (
    (people_match7.application_year_y!=people_match7.application_year_x) & (people_match7.application_year_y!=people_match7.application_year_1))

people_match7.loc[fill_in_y, 'application_year_2'] = people_match7.loc[fill_in_y, 'application_year_y']

fill_in_y_1 = (
    (people_match7.application_year_1_y!=people_match7.application_year) & (people_match7.application_year_1_y!=people_match7.application_year_1) &
        (people_match7.application_year_1_y!=people_match7.application_year_2))

people_match7.loc[fill_in_y_1, 'application_year_3'] = people_match7.loc[fill_in_y_1, 'application_year_1_y']

people_match7.loc[pd.isnull(people_match7.application_year_1), 'application_year_1'] = people_match7.loc[pd.isnull(people_match7.application_year_1), 'application_year_3']
people_match7.loc[pd.isnull(people_match7.application_year_1), 'application_year_1'] = people_match7.loc[pd.isnull(people_match7.application_year_1), 'application_year_2']

people_match7.loc[people_match7.application_year_1==people_match7.application_year_3, 'application_year_3'] = np.nan
people_match7.loc[people_match7.application_year_2==people_match7.application_year_3, 'application_year_3'] = np.nan
people_match7.loc[people_match7.application_year_1==people_match7.application_year_2, 'application_year_2'] = np.nan

# drop all uuidds except for i, 2, 2
APPLICATION_YEAR_COLS = ['application_year', 'application_year_1']

people_match8 = people_match7.drop(['application_year_2', 'application_year_3', 'application_year_x', 'application_year_y', 'application_year_1_x', 'application_year_1_y'], axis=1)

In [133]:
people_match4.loc[people_match6.clean_last_name.isin(['ALEXANDER', 'ALPERT']), [
        'application_year_x', 'application_year_1_x', 'application_year_y', 'application_year_1_y', 'clean_last_name_x']]

Unnamed: 0,application_year_x,application_year_1_x,application_year_y,application_year_1_y,clean_last_name_x
54,1971.0,,1971.0,,ALEXANDER
216,1969.0,,1968.0,,ALPERT


In [134]:
people_match8.loc[people_match8.clean_last_name.isin(['ALEXANDER', 'ALPERT'])]

Unnamed: 0,index,is_match,address,age,application_date,associate_program_entered,bob,ca,cc,citizenship,...,undergraduate_school,withdrawal,year_accepted,zip_code,raw_uuid,raw_uuid_1,raw_uuid_2,raw_uuid_3,application_year,application_year_1
54,C ALEXANDER DUKE,1.0,1416 Beal Street,,1971-03-11,,0.0,1.0,0.0,,...,Duke University,0.0,1973.0,,96.0,3802.0,95.0,3800.0,1971.0,
216,S ALPERT HARVARD,1.0,92 Curtis Street,,1969-01-01,,0.0,1.0,1.0,,...,Yale University,-9.0,,2144.0,62.0,6295.0,63.0,3758.0,1969.0,1968.0


In [135]:
# add back to main data set
multi_apps_ids = np.concatenate([
        people_match8.raw_uuid.dropna().unique(), people_match8.raw_uuid_1.dropna().unique(), 
        people_match8.raw_uuid_2.dropna().unique(), people_match8.raw_uuid_3.dropna().unique()], 
                        axis=0)

In [136]:
one_app = get_nonmatched(full_matches2, multi_apps_ids, 'raw_uuid')

(3888, 78)


In [137]:
full_apps = pd.concat([one_app, people_match8], axis=0).sort_values(NAME_COLS+UUID_COLS)
print full_apps.shape

(4019, 81)


In [138]:
dups1 = full_apps[full_apps.duplicated(['clean_last_name', 'clean_first_initial', 'clean_middle_initial', 'medical_school'], keep='first')].sort_values('clean_last_name')
dups2 = full_apps[full_apps.duplicated(['clean_last_name', 'clean_first_initial', 'clean_middle_initial', 'medical_school'], keep='last')].sort_values('clean_last_name')
dups2 = dups2[['clean_last_name', 'clean_first_name', 'clean_first_initial', 'medical_school', 'raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3', 'application_year', 'application_year_1']]

In [139]:
dups3 = dups2.rename(columns={'raw_uuid': 'raw_uuid_4', 'raw_uuid_1': 'raw_uuid_5', 'raw_uuid_2': 'raw_uuid_6',
                             'raw_uuid_3': 'raw_uuid_7', 'application_year': 'application_year_2', 
                              'application_year_1': 'application_year_3'})

In [140]:
dups4 = pd.merge(left=dups1, right=dups3, on=['clean_last_name', 'clean_first_initial', 'medical_school'], how='inner')
dups4.loc[:, ['raw_uuid', 'raw_uuid_4', 'raw_uuid_1', 'raw_uuid_5', 'raw_uuid_2', 'raw_uuid_6',
                             'raw_uuid_3', 'raw_uuid_7', 'application_year', 'application_year_2', 
                              'application_year_1', 'application_year_3']]
dups4.loc[pd.isnull(dups4.application_year_1), 'application_year_1'] =  dups4.loc[
    pd.isnull(dups4.application_year_1), 'application_year_3'] 
dups4.loc[pd.isnull(dups4.application_year_1), 'application_year_1'] =  dups4.loc[
    pd.isnull(dups4.application_year_1), 'application_year_2']
dup_years = dups4.application_year_1==dups4.application_year_2
dups4.loc[dup_years, 'application_year_2'] = np.nan


In [141]:
dups5 = dups4.join(dups4[['raw_uuid', 'raw_uuid_4', 'raw_uuid_1', 'raw_uuid_5', 'raw_uuid_2', 'raw_uuid_6',
                             'raw_uuid_3', 'raw_uuid_7']].apply(get_unique_vals, axis=1))

In [142]:
dups6 = dups5.rename(columns=dict(zip(range(6), ['raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3', 'raw_uuid_4',
                             'raw_uuid_5'])))
sorted(dups6.columns)
dups6 = dups5[['raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3', 'raw_uuid_4',
                'raw_uuid_5', 'clean_last_name', 'clean_first_initial', 'medical_school', 
                   'application_year', 'application_year_2', 
                              'application_year_1',  'application_year_3']]

In [143]:
not_dups = full_apps[
    ~full_apps.duplicated(
        ['clean_last_name', 'clean_first_initial', 'clean_middle_initial', 'medical_school'], keep=False)]

In [157]:
full_apps1 = pd.concat([not_dups, dups6], axis=0).reset_index(drop=True)

In [173]:
full_apps1.index.name = PERSON_ID
print PERSON_ID in full_apps1.columns
full_apps2 = full_apps1.reset_index(drop=False)
# .rename(columns={'index': PERSON_ID})
# sorted(full_apps2.columns)

False


In [175]:
last_name='AXELROD'
full_apps2.loc[full_apps2.clean_last_name==last_name, NAME_COLS+[PERSON_ID, 'medical_school']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,person_uuid,medical_school
37,ALAN,JAY,AXELROD,37,ILLINOIS
709,DAVID,,AXELROD,709,HARVARD


In [176]:
manual_fixes.loc[manual_fixes.clean_last_name==last_name]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,medical_school,to_fix_clean_first_name,to_fix_clean_middle_name,to_fix_clean_last_name,to_fix_medical_school,to_fix_medschool_year_grad


In [182]:
full_apps1.loc[full_apps1.clean_last_name=='ARON', NAME_COLS+['medical_school']]

Unnamed: 0_level_0,clean_first_name,clean_middle_name,clean_last_name,medical_school
person_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
293,BER,,ARON,NYU


In [178]:
# write to csv
full_apps2.to_csv(os.path.join(APP_DATA_DIR, 'index_cards_deduped_fuzzy.csv'), index=False)
full_apps2.to_pickle(os.path.join(PICKLE_DIR, 'index_cards_deduped_fuzzy.p'))


In [179]:
# also write out original raw index card (pre merge data set)
all_app7.to_csv(os.path.join(APP_DATA_DIR, 'index_cards_raw.csv'), idnex=False)
all_app7.to_pickle(os.path.join(PICKLE_DIR, 'index_cards_raw.p'))

In [181]:
last_name='ALTMAN'

full_apps2.loc[full_apps2.clean_last_name==last_name, NAME_COLS+[PERSON_ID, 'medical_school']]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name,person_uuid,medical_school
599,DAVID,F,ALTMAN,599,PITTSBURGH
2109,LEONARD,CHARLES,ALTMAN,2109,HARVARD


In [63]:
# do some sanity checks on the data
all_last_names = set(all_app7.clean_last_name.values)
merged_last_names = set(full_apps.clean_last_name.values)
diff_names = all_last_names - merged_last_names



In [64]:
print diff_names

set(['MACLOWRY', 'STABENAU', 'HARRIN', 'GARFIN', 'PENDERGAST', 'DEFRONZO', 'ROBINS', 'COLBERG', 'CHESEBRO', 'MOND', 'COLLIN', 'GLASSROTH', 'COSTANTIN', 'FRIEDLANDER', 'SCHUTZ', 'DAVISON', 'SARAL', 'KEISER', 'FINKLESTEIN', 'EILER', 'EISCH', 'DIEZMAN', 'KINNEY', 'KETOVER', 'FREY', 'HEIBY', 'BEAK', 'HERSH', 'LURIA', 'BRADEN', 'KEBIAN', 'CUONO', 'STEVENS', 'ARNSON', 'GREELEY', 'HUNT', 'LIST', 'DROBIS', 'STAMPER', 'BULKEY', 'BENETT'])


In [65]:
print UUID_COLS, APPLICATION_YEAR_COLS

['raw_uuid', 'raw_uuid_1', 'raw_uuid_2', 'raw_uuid_3'] ['application_year', 'application_year_1']


In [66]:
missing_ppl = all_app7.loc[all_app7.clean_last_name.isin(diff_names), NAME_COLS+['raw_uuid', 'application_year']]

In [67]:
missing_ids = all

In [68]:
full_apps.loc[full_apps.raw_uuid.isin(missing_ppl.raw_uuid) | full_apps.raw_uuid_1.isin(missing_ppl.raw_uuid), NAME_COLS]

Unnamed: 0,clean_first_name,clean_middle_name,clean_last_name
86,ALAN,N,ANDERSON
1167,ARTHUR,MICHAEL,FRIENDLANDER
1947,BART,PETER,KENTOVER
605,BRUCE,WILCOX,CHESEBRE
766,CHARLES,BOSETTI,CUOMO
246,CLAUDE,CLAUDE,BENNETT
850,DALE,E,DIETZMAN
3656,DAVID,ALEC,STEVENSON
217,DAVID,MICHAEL,BEAR
1431,DONALD,ALLEN,GREENLEY
