In [1]:
import difflib
import itertools
import pandas as pd
from collections import Counter
import numpy as np
import string
import funcy
import re
import os
import uuid
import math

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name)


In [5]:
CARD_DATA_DIR = os.path.abspath('Data/applicant_data/raw_card_data')
APP_DATA_DIR = os.path.abspath('Data/applicant_data')
ATT_DATA_DIR = os.path.abspath('Data/attendees_data')
print CARD_DATA_DIR

/home/lraymond/MIT/Azoulay_2016/yellow_berets/yellow_beret/Data/applicant_data/raw_card_data


In [6]:
OUTPUT_CSV = False
RAW_APPLICANT_DATA_FILENAME = 'raw_applicant_card_data.csv'
MISSING_APPDATE_FILENAME = 'index_cards_no_application_date.csv'


In [7]:
CLEAN_NAMES = ['clean_first_name', 'clean_middle_name', 'clean_last_name']
NAMES_COLS = ['first_name', 'middle_name', 'last_name']
PERSONAL_INFO = [
    'clean_first_name', 'clean_last_name', 'clean_middle_name',
    'date_of_birth', 'medical_school', 'clean_college_trans']

AWARDS_KEYWORDS = ['HONORS', 'AWARD', 'HONOR', 'SOCIETY', 'SCHOLAR', 'AOA', 'PME', 'FNHS', 'ODK']

# id column that links back to raw applicant data file
RAW_CARD_ID = 'raw_uuid'

# column where the raw id information is stored
RAW_INDEX_IDS = 'raw_card_ids'

# try to get one id per unique applicant in the dataset
PERSON_ID = 'person_uuid'
# id per deduped application-person - if someone applied multiple times, they will have multiple ids
PERSON_APPLICATION_ID = 'person_app_uuid' 

In [8]:
all_appcards2 = pd.read_csv(os.path.join(CARD_DATA_DIR, RAW_APPLICANT_DATA_FILENAME))

In [9]:
# drop all rows with first, last name NA
all_app3 = all_appcards2.dropna(subset=['application_date'], how='all')

In [10]:
all_app3.loc[pd.isnull(all_app3.application_date), 'flag_missing_app_date'] = 1
all_app3.loc[~pd.isnull(all_app3.application_date), 'flag_missing_app_date'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
missing_app_date = all_appcards2.loc[pd.isnull(all_appcards2.application_date), :]

In [12]:
if OUTPUT_CSV:
    missing_app_date.to_csv(os.path.join(CARD_DATA_DIR, MISSING_APPDATE_FILENAME), index=False)

In [13]:
def id_poorlyfmtdates(str_date):
    try:
        dt = pd.to_datetime(str_date, format='%m/%d/%Y')
        return True
    except (ValueError, AssertionError):
        return False

In [14]:
mask = all_app3.application_date.apply(id_poorlyfmtdates)
# all_app3.loc[~mask, :]

In [15]:
# change '3/31971 to 3/3/1971
# change 41/8/1966 to 4/8/1966 
all_app3.loc[all_app3.application_date=='3/31971', 'application_date'] = '3/3/1971'
all_app3.loc[all_app3.application_date=='41/8/1966', 'application_date'] = '4/8/1966'

In [16]:
# convert application date to date object
all_app3.loc[:, 'application_date'] = all_app3['application_date'].apply(lambda x: pd.to_datetime(x))

In [17]:
# do the same date check for birth date columns
bdate_mask = all_app3.date_of_birth.apply(id_poorlyfmtdates)
all_app3.loc[~bdate_mask, ['first_name', 'last_name', 'date_of_birth']]
all_app3.loc[(
        all_app3.last_name=='Cook') & 
                  (all_app3.middle_name=='James') & 
                  (all_app3.first_name.isnull()), 'date_of_birth'] = '1/27/1940'

In [18]:
for n in NAMES_COLS:
    all_app3.loc[:, 'clean_{}'.format(n)] = all_app3[n].apply(clean_names)

In [19]:
suffix_mask = all_app3.clean_last_name.apply(has_suffix)
# all_app_short.loc[suffix_mask, ['clean_last_name', 'clean_first_name', 'clean_middle_name']]

In [20]:
# for those last names that seem to have a suffix, pull into seperate column and keep everything last word of last name
all_app3.loc[suffix_mask, 'clean_suffix'] = all_app3.loc[suffix_mask, 'clean_last_name'].apply(get_suffix)
all_app3.loc[suffix_mask, 'clean_last_name'] = all_app3.loc[suffix_mask, 'clean_last_name'].apply(remove_suffix_from_last_name)

In [21]:
# some first names also contain some honors such as 'Pfizer Award' or 'Honor Society'
# these should be pulled into the honors and awards columns
has_award_fnc = funcy.rpartial(has_award, AWARDS_KEYWORDS)


In [22]:
# get a list of all the med school honors columns
honors_columns = [c for c in all_app3.columns if 'honor' in c]
print honors_columns

has_award_mask = all_app3['clean_first_name'].apply(has_award_fnc)

all_app3.loc[has_award_mask, 'extra_honor'] = all_app3.loc[has_award_mask, 'clean_first_name']

['honor_societies_first', 'honor_societies_fourth', 'honor_societies_second', 'honor_societies_third']


In [23]:
# create column mask for each row where one of the honors columns is blank
for hc in honors_columns:
    hc_mask = (has_award_mask) & (pd.isnull(all_app3[hc]))
    all_app3.loc[hc_mask, hc] = all_app3.loc[hc_mask, 'extra_honor']
# check for any columns that already have full honors and cant be filled
all_app3.loc[hc_mask, honors_columns].dropna(how='any')

Unnamed: 0,honor_societies_first,honor_societies_fourth,honor_societies_second,honor_societies_third


In [35]:
# drop extra honor columns
all_app4 = all_app3.drop('extra_honor', axis=1)

In [36]:
# replace those honors first names with np.nan
all_app4.loc[has_award_mask, 'clean_first_name'] = np.nan

In [37]:
# for those columns where med school is equal to last name or med_school is a year range, delete
med_school_str_sim = funcy.rpartial(str_sim, 'medical_school', 'clean_last_name')
all_app4.loc[:, 'school_name_sim'] = all_app4.loc[:, ['clean_last_name', 'medical_school']].apply(med_school_str_sim, axis=1)

In [27]:
all_app4.loc[all_app4.school_name_sim > .6, :]

Unnamed: 0.1,Unnamed: 0,address,age,application_date,associate_program_entered,bob,ca,cc,cord,citizenship,...,middle_name,reviewer,undergrad_year_grad,raw_uuid,flag_missing_app_date,clean_first_name,clean_middle_name,clean_last_name,clean_suffix,school_name_sim


In [38]:
all_app4.loc[:, 'clean_college'] = all_app4.undergraduate_school.apply(clean_names)

In [39]:
to_remove_college = [
    ' AND ', ' AT ', 'THE ', ' COLLGE', 'UNIVERISTY', 'UNIVERWSITY', 'MASSACHUSSETTS', 'JOHN ', 'DE PAUW', 'ASBURY', 
'DREXEL INSTITUTE OF TECHNOLOGY', 'A B BROWN UNIVERSITY', 'DARTMOUTH MEDICAL SCHOOL', 'RENSSELAER UNIVERSITY', 
'RENSSELAER POLYTECHNICAL INSTITUTE', ' STE', 'COLLEGE OF HOLY CROSS', 'HOLLY CROSS', 'JOHNSS ',  'BERKLEY',
'UC ', 'PITTSBURRGH', 'WESLYN', 'WILLAMS', 'GEORGIA TECH', 'NEW YORK UNIVERSITY UNIV', 
'UNIVERSITY OF MICHIGAN IS A', 'OHIO', 'STATE UNIVERSITY OF NEW YORK AT BUFFALO']
to_replace_college = [
    ' ', ' ', ' ', ' COLLEGE', 'UNIVERSITY', 'UNIVERSITY', 'MASSACHUSETTS', 'JOHNS ', 'DEPAUW', 'ASHBURY',
    'DREXEL UNIVERSITY', 'BROWN UNIVERSITY', 'DARTMOUTH', 'RENSSELAER POLYTECHNIC INSTITUTE', 
    'RENSSELAER POLYTECHNIC INSTITUTE', ' STATE', 'HOLY CROSS', 'HOLY CROSS', 'JOHNS ', 
    ' BERKELEY', 'UNIVERSITY OF CALIFORNIA ', 'PITTSBURGH', 'WESLEYAN', 'WILLIAMS', 
    'GEORGIA INSTITUTE OF TECHNOLOGY', 'NEW YORK', 'UNIVERSITY OF MICHIGAN', 'OHIO STATE', 'SUNY BUFFALO']

clean_college_fnc = funcy.rpartial(clean_std_college_name, to_remove_college, to_replace_college)

In [40]:
# make college mispelling and different reference translation table
all_app4.loc[:, 'clean_college_trans'] = all_app4.clean_college.apply(clean_college_fnc)

In [50]:
all_app4.drop(['clean_college', 'school_name_sim'], axis=1, inplace=True)

In [59]:
all_app4.loc[:, 'medical_school'] = all_app4.medical_school.apply(funcy.rcompose(clean_names, clean_med_school))

UC BERKELEY


In [60]:
all_app4.medical_school.sort_values().unique()

array(['ALABAMA', 'ALBERT EINSTEIN COLLEGE OF MEDICINE OF YESHIVA',
       'ARIZONA', 'ARKANSAS', 'BAYLOR', 'BOSTON', 'CASE WESTERN RESERVE',
       'CHICAGO', 'CINCINNATI', 'COLORADO', 'COLUMBIA', 'CONNECTICUT',
       'DARTMOUTH ', 'DUKE', 'EMORY', 'FLORIDA', 'GEORGE WASHINGTON',
       'GEORGETOWN', 'GEORGIA', 'HAHNEMANN', 'HARVARD ', 'HOWARD',
       'ILLINOIS', 'INDIANA', 'IOWA',
       'JEFFERSON MEDICAL COLLEGE OF THOMAS JEFFERSON',
       'JOAN SANFORD I WEILL MEDICAL COLLEGE CORNELL', 'JOHNS HOPKINS',
       'KANSAS', 'KENTUCKY', 'LOMA LINDA', 'LOUISIANA STATE', 'LOUISVILLE',
       'LOYOLA', 'MARYLAND', 'MIAMI', 'MICHIGAN', 'MICHIGAN STATE',
       'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'NEBRASKA', 'NEW', 'NORTH',
       'NORTHWESTERN', 'NYU', 'OHIO', 'OHIO STATE', 'OKLAHOMA',
       'OREGON HEALTH SCIENCES', 'PENNSYLVANIA', 'PENNSYLVANIA STATE',
       'PITTSBURGH', 'ROCHESTER', 'SAINT LOUIS', 'SOUTH', 'STANFORD',
       'SUNY', 'TEMPLE', 'TENNESSEE', 'TEXAS', 'TUFTS', 'TUL

In [61]:
# need to convert undergrad_year_grad and med_school_grad to numbers to maintain consistence
all_app4.loc[:, ['undergrad_year_grad', 'medschool_year_grad']] = all_app4.loc[:, ['undergrad_year_grad', 'medschool_year_grad']].apply(
    lambda x: pd.to_numeric(x, errors='coerce'))

In [62]:
# now, sort by names, med school, undergrad school, 
all_app5 = all_app4.sort_values(by=PERSONAL_INFO)

In [63]:
# LAST_NAME_MISSPELLINGS = {
#         'HOMCY': 'HOMEY', 'DROBIS': 'DROBIN', 'DEFRONZO':'DEFRENZO', 
#         'BRADEN 3R': 'BRADEN', 'BORKER': 'BORER', 'CASTLES': 'CASTLE',
#         'CUONO': 'CUOMO', 'CYRULNIK': 'CYRULINK', 'EISENBATH': 'EISENBARTH', 
#         'ELLIOTT': 'ELIOT', 'FINKLESTEIN': 'FINKELSTEIN', 'HEINRICK': 'HEINRICH', 
#         'HERLIKY': 'HERLIHY', 'HIMMELHOCK': 'HIMMELHOCH', 'JANOWSKY': 'JANKOWSKY', 
#         'KLINENBERG': 'KLINEBERG', 'KORNFELD': 'KORNFIELD', 'NEIDORF': 'NEIDOFT',
#         'OLEINICK': 'OLENICK', 'ROSKES': 'ROSKE'
# }
# removed CUONO, DROBIS, 


LAST_NAME_MISSPELLINGS = {
        'HOMCY': 'HOMEY', 
        'BRADEN 3R': 'BRADEN', 'BORKER': 'BORER', 'CASTLES': 'CASTLE',
        'CYRULNIK': 'CYRULINK', 'EISENBATH': 'EISENBARTH', 
        'HEINRICK': 'HEINRICH', 
        'HERLIKY': 'HERLIHY', 'HIMMELHOCK': 'HIMMELHOCH', 'JANOWSKY': 'JANKOWSKY', 
        'KLINENBERG': 'KLINEBERG', 'KORNFELD': 'KORNFIELD', 'NEIDORF': 'NEIDOFT',
        'OLEINICK': 'OLENICK', 'ROSKES': 'ROSKE'
}

In [64]:
replace_last_name_fnc = funcy.rpartial(replace_last_name, LAST_NAME_MISSPELLINGS)

In [65]:
# correct last name mispellings
all_app5.loc[:, 'clean_last_name'] = all_app5.loc[:, 'clean_last_name'].apply(replace_last_name_fnc)
all_app5.loc[all_app5.clean_last_name=='MORTON', 'clean_first_name'] = 'JOHN'

In [66]:
# convert ca column to float62
all_app5.loc[:, 'ca'] = all_app5.loc[:, 'ca'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

In [67]:
def sorting_fnc(v):
    # if string entry, sort by length, otherwise return value
    if isinstance(v, str):
        return len(v)
    return v

def consolidate_holes(df_col):
    # for each dataframe of info on one person/application year pair, consolidate info
    # drop null values, get unique and take longest by string length or first numeric value
    # all dataframes are coming in with reviewer 2 on top, who seems to be more accurate
    # so take that answer if possible
    lst_vals = list(df_col.dropna().unique())
    if len(lst_vals) == 0:
        return np.nan
    if isinstance(lst_vals[0], str):
        lst_vals = sorted(lst_vals, key=sorting_fnc, reverse=True)
    elif isinstance(lst_vals[0], np.datetime64):
        print lst_vals
        # if multiple dates of application, choose latest
        lst_vals = sorted(lst_vals, reverse=True)
    return lst_vals[0]

def _stringify_info(val_series):
    # accepts a series, returns a string
    str_vals = [str(v) for v in val_series]
    return '_'.join(str_vals)

def stringify_personal_info(df_row):
    # accepts a series, returns a list object
    val_series = df_row[['clean_first_name', 'clean_middle_name', 
                        'clean_last_name', 'clean_college_trans', 'medical_school', 'date_of_birth']]
    return _stringify_info(val_series)

def stringify_ids(df_row):
    # accepts a series, returns a list object
    # takes the raw id card row of consolidated cols and turns it into a string to save
    val_series = df_row[RAW_CARD_ID]
    return _stringify_info(val_series)

def add_sanity_check_row(df, vals=None):
    if vals is None:
        df.loc[:, 'sanity_check'] = np.nan
    else:
        df.loc[:, 'sanity_check'] = vals
    return df


def add_merged_ids(df, vals=None):
    # add combined id columns
    # add an id column that track what is the index of columns consolidated into one
    if vals is None:
        df.loc[:, RAW_INDEX_IDS] = np.nan
    else:
        df.loc[:, RAW_INDEX_IDS] = vals
    return df
    

def format_consolidated_data(df):
    # applies consolidate fnc to dataframe, converts it into a df that can be merged
    # sort df so reviewer 2 is on top
    df_sort = df.sort_values(by=['reviewer'], ascending=False)
    vals = map(lambda x: stringify_personal_info(df_sort.loc[x, :]), df_sort.index)
    d2_series = df_sort.apply(consolidate_holes)
    df_trans = pd.DataFrame(d2_series).T
    # create a column that compares string values in another column
    str_ids = stringify_ids(df)
    df_trans2 = add_merged_ids(df_trans, str_ids)
    sanity_checks = '\n'.join(vals)
    return add_sanity_check_row(df_trans2, sanity_checks)

In [68]:
# VERY IMPORTANT, NEED TO RESET THE INDEX
all_app6 = all_app5.reset_index(drop=True)

In [69]:
all_app6.loc[:, 'application_year'] = pd.DatetimeIndex(all_app6.application_date).year

In [70]:
all_app6.sort_values('clean_last_name', inplace=True)

In [71]:
last_name_grouped = all_app6.groupby('clean_last_name')

In [72]:
del all_app3, all_app4, all_app5

In [73]:
# function to consolidate multiple rows into one person
def consolidate_person(candidate_df):
    # for each dataframe, with a common last name, seperate into unique people
    # easiest case, if 2 rows, 1 from reviewer 1 and 2, then consolidate
    df_len = candidate_df.shape[0]
    if df_len < 2:
        # if only 1 row in data group, return row
        cd2 = add_sanity_check_row(candidate_df)
        # create a column that compares string values in another column
        str_ids = stringify_ids(candidate_df)
        return add_merged_ids(cd2, str_ids)
    unique_first_names = candidate_df['clean_first_name'].dropna().unique()
    unique_undergrad = candidate_df['undergrad_year_grad'].dropna().unique()
    unique_college = candidate_df['clean_college_trans'].dropna().unique()
    if (len(unique_first_names) < 2 and len(unique_undergrad) < 2) or (len(unique_first_names) < 2 and len(unique_college) < 2):
        # if unique first names < 2 and < 2 diff years of undergrad OR 
        # unique first names < 2 and <2 unique college names
        # most likely this is the same person
        return format_consolidated_data(candidate_df)
    # otherwise there are more than 1 person to combine
    if len(unique_first_names) >= 2:
        # if there are 2 or more different first names, try to group by first and last name
        # and then analyze each group seperately
        new_grped = candidate_df.groupby(['clean_first_name', 'clean_last_name'])
        res = map(lambda (x, y): format_consolidated_data(y), new_grped)
        return pd.concat(res)
    if len(unique_college) >= 2:
        # if two different colleges, try to group by last name and college name
        new_grped = candidate_df.groupby(['clean_first_name', 'clean_college_trans'])
        res = map(lambda (x, y): format_consolidated_data(y), new_grped)
        return pd.concat(res)
    # otherise this is an edge case
    # otherwise, although the person doesn't match on unique first and undergrad years, they are still most likely one 
    # person
    print candidate_df
    raise AttributeError('Hitting a Bad Edge Case')
    return None

In [74]:
combined_rows = map(lambda (x, y): consolidate_person(y), last_name_grouped)

In [75]:
unique_people_df = pd.concat(combined_rows)

In [76]:
# reset index
unique_people_df = unique_people_df.reset_index(drop=True)

In [77]:
unique_people_df.sort_values(by=['clean_last_name'], axis=0, inplace=True)

In [78]:
unique_people_df[PERSON_ID] = unique_people_df.clean_last_name.apply(lambda x: str(uuid.uuid4()))

In [79]:
# check to make sure no uuid dups
unique_people_df.sort_values(PERSON_ID, inplace=True)
unique_people_df[unique_people_df.duplicated(PERSON_ID, keep=False)]

Unnamed: 0.1,Unnamed: 0,address,age,application_date,application_year,associate_program_entered,bob,ca,cc,citizenship,...,sanity_check,sixth,state,teaching,undergrad_year_grad,undergraduate_school,withdrawal,year_accepted,zip_code,person_uuid


In [82]:
sorted(unique_people_df.columns)

['Unnamed: 0',
 'address',
 'age',
 'application_date',
 'application_year',
 'associate_program_entered',
 'bob',
 'ca',
 'cc',
 'citizenship',
 'city',
 'clean_college_trans',
 'clean_first_name',
 'clean_last_name',
 'clean_middle_name',
 'clean_suffix',
 'clinical',
 'cord',
 'date_of_birth',
 'dbs',
 'fifth',
 'first_name',
 'flag_missing_app_date',
 'honor_societies_first',
 'honor_societies_fourth',
 'honor_societies_second',
 'honor_societies_third',
 'ic',
 'internship_hospital_1',
 'internship_year(s)',
 'last_name',
 'medical_school',
 'medical_school2',
 'medschool_year_grad',
 'middle_name',
 'nci',
 'nei',
 'nhi',
 'nhli',
 'niaid',
 'niamd',
 'niamdd',
 'nichd',
 'nichhd',
 'nidr',
 'niehs',
 'nigms',
 'nimh',
 'nindb',
 'ninds',
 'oir',
 'person_uuid',
 'pharm_ra',
 'pi',
 'ra',
 'raw_card_ids',
 'raw_uuid',
 'rejected',
 'rejection_date',
 'research',
 'residency_hospital',
 'residency_type',
 'residency_year(s)',
 'reviewer',
 'sa',
 'sanity_check',
 'sixth',
 'state'

In [83]:
unique_people_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [89]:
# merge uuid back into person, app date data set
def update_person_uuid(indexs, g_uuid, df):
    # take the stringified information stored in the 'raw_index_ids' column,
    # which should hold the raw card ids related to each unique person
    # go to places in the data set with that location and update the uuid column
    if pd.isnull(indexs):
        return np.nan
    if isinstance(indexs, str):
        inds = indexs.split('_')
    else:
        raise TypeError('Unkown id type')
    df.loc[df[RAW_CARD_ID].isin(inds), PERSON_ID] = g_uuid
        
# note this is a gross mutation operation! Im mutating the ids column in all_app5 to add in the personal id of all people 
# matched into one column
unique_people_df[[RAW_INDEX_IDS, PERSON_ID]].apply(lambda x: update_person_uuid(x[RAW_INDEX_IDS], x[PERSON_ID], all_app6), axis=1)

2329    None
2141    None
3781    None
3724    None
1718    None
3433    None
3125    None
3046    None
921     None
1014    None
525     None
3237    None
724     None
2913    None
483     None
778     None
2222    None
474     None
2888    None
3789    None
713     None
2771    None
3100    None
1016    None
497     None
2877    None
2049    None
2930    None
209     None
1461    None
        ... 
563     None
2432    None
1374    None
83      None
918     None
2815    None
3655    None
692     None
2476    None
2656    None
2133    None
3723    None
753     None
2445    None
1997    None
3025    None
651     None
2663    None
1928    None
1686    None
2501    None
3984    None
2609    None
1840    None
1586    None
859     None
2848    None
3694    None
1833    None
2174    None
dtype: object

In [90]:
all_app6.sort_values([PERSON_ID, 'clean_last_name'], inplace=True)

In [91]:
unique_people_df.loc[:, 'application_year'] = pd.DatetimeIndex(unique_people_df.application_date).year

In [92]:
all_app6.loc[pd.isnull(all_app6[PERSON_ID]), 'not_matched'] = 1

In [93]:
def find_match(row, to_search):
    matching_rows = to_search.loc[
        (to_search.clean_last_name==row['clean_last_name']) & (
            to_search.application_date==row['application_date']) & 
        (to_search.city==row['city']), :]
    if matching_rows.shape[0] == 0:
         return np.nan
    if matching_rows.shape[0] == 1:
        return matching_rows[PERSON_ID].values[0]
    raise AttributeError('Multiple Matches')
    print row[['clean_last_name', 'city', 'clean_first_name', 'application_date']]
    print matching_rows[['clean_last_name', 'city', 'clean_first_name', 'application_date']]
    return np.nan

In [94]:
# for each not matched person, search for a name, application date
all_app6.loc[all_app6.not_matched==1, PERSON_ID] = all_app6.loc[all_app6.not_matched==1, :].apply(lambda x: find_match(x, unique_people_df), axis=1)

In [95]:
def add_extra_match_ids(row, to_search):
    to_add_ind = to_search.loc[to_search[PERSON_ID]==row[PERSON_ID], :][RAW_CARD_ID].values[0]
    existing = row[RAW_INDEX_IDS]
    if pd.isnull(existing):
        return str(to_add_ind)
    return '{0}_{1}'.format(existing, str(to_add_ind))

In [96]:
# update unique df ids columns with extra info
not_matched_people = all_app6.loc[all_app6.not_matched==1, :]

unique_people_df.loc[unique_people_df[PERSON_ID].isin(not_matched_people[PERSON_ID].dropna()), 'extra_ids'] = \
    unique_people_df.loc[unique_people_df[PERSON_ID].isin(not_matched_people[PERSON_ID].dropna()), :].apply(
        lambda x: add_extra_match_ids(x, not_matched_people), axis=1)

In [97]:
# replace ids columns with extra ids where applicable
unique_people_df.loc[~pd.isnull(unique_people_df.extra_ids), RAW_INDEX_IDS] = unique_people_df.loc[~pd.isnull(
        unique_people_df.extra_ids), 'extra_ids']

In [98]:
# now, I want to group all_app5 by personal uuid and then find people with two different application dates
all_apps_grouped = all_app6.groupby(PERSON_ID)

In [99]:
# all_app6.loc[all_app6.duplicated('uuid', keep=False), PERSONAL_INFO+['sanity_check','uuid']]

In [100]:
# for people in a group, flag those with multiple app dates
# flag those whose med schools don't match
mismatch_med_school = []
for g, items in all_apps_grouped:
    unique_ms = items['medical_school'].dropna().unique()
    if len(unique_ms) > 1:
        mismatch_med_school.append(g)

In [104]:
# all_app6.loc[all_app6[PERSON_ID].isin(mismatch_med_school), ['clean_last_name', 'clean_first_name', 'medical_school']]

In [105]:
# for each group, if med school and city are not close at all for string sim, add person to the unique data set
# func should output rows to add to unique data set or None
def compute_city_sim(row, main_app_year, main_city):
    if row['application_year'] != main_app_year or pd.isnull(row['city']):
        return np.nan
    return difflib.SequenceMatcher(None, main_city, row['city']).ratio()



def reconcile_med_school(group_to_reconcile):
    g_uuid = group_to_reconcile[PERSON_ID].unique()[0]
    main_ms = unique_people_df.loc[unique_people_df[PERSON_ID]==g_uuid, 'medical_school'].unique()[0]
    main_app_year = unique_people_df.loc[unique_people_df[PERSON_ID]==g_uuid, 'application_year'].unique()[0]
    main_college =  unique_people_df.loc[unique_people_df[PERSON_ID]==g_uuid, 'clean_college_trans'].unique()[0]
    main_city = unique_people_df.loc[unique_people_df[PERSON_ID]==g_uuid, 'city'].unique()[0]
    mismatched = group_to_reconcile.loc[
        (group_to_reconcile.medical_school!=main_ms) & (~pd.isnull(group_to_reconcile.medical_school)), :]
    mismatched.loc[:, 'med_school_sim'] = mismatched.medical_school.apply(
        lambda x: difflib.SequenceMatcher(None, main_ms, x).ratio())
    mismatched.loc[:, 'city_sim'] = mismatched.apply(lambda x: compute_city_sim(x, main_app_year, main_city), axis=1)
    mismatched.loc[:, 'app_year_diff'] = mismatched.application_year.apply(lambda x: abs(main_app_year-x))
    return mismatched.loc[
        (mismatched.med_school_sim < .8) | ((
                ~pd.isnull(mismatched.city_sim)) & (mismatched.city_sim < .8)) | (
            mismatched.app_year_diff > 3), :]

In [106]:
def find_mismatched_row(g_uuid, to_select):
    g = to_select.loc[to_select[PERSON_ID]==g_uuid, :]
    return reconcile_med_school(g)

In [107]:
to_unique = pd.concat(map(lambda x: find_mismatched_row(x, all_app6), mismatch_med_school), axis=0)

In [108]:
# for each of the rows accidentally grouped, find in data set, change uuid and add new row to unique_df with ids col
all_app6.loc[all_app6.index.isin(to_unique.index), 'unique_flag'] = 1
print len(to_unique.index)

111


In [109]:
all_app6.loc[all_app6.unique_flag==1, 'uuid2'] = all_app6.apply(lambda x: str(uuid.uuid4()), axis=1)

In [110]:
all_app6.loc[all_app6.unique_flag==1, PERSON_ID] = all_app6.loc[all_app6.unique_flag==1, 'uuid2']

In [111]:
to_add = all_app6.loc[all_app6.unique_flag==1, :]

In [112]:
# overwrite raw data set column with the raw card id of the row to add to unique data set
to_add.loc[:, RAW_INDEX_IDS] = to_add[RAW_CARD_ID]

In [113]:
del all_app6['uuid2']

In [114]:
unique_people2 = unique_people_df.append(to_add)

In [115]:
unique_people2.shape

(4167, 80)

In [116]:
all_g2 = all_app6.groupby(PERSON_ID)

In [117]:
multiple_apps = []
mismatch_med_school = []
for g, items in all_g2:
    unique_apps = items['application_date'].dropna().unique()
    if len(unique_apps) > 1:
        multiple_apps.append(g)
    unique_ms = items['medical_school'].dropna().unique()
    if len(unique_ms) > 1:
        mismatch_med_school.append(g)

In [118]:
len(mismatch_med_school)

2

In [119]:
len(multiple_apps)

139

In [120]:
all_app7 = all_app6.reset_index(drop=False)

In [121]:
all_app8 = all_app7.rename(columns={'index': PERSON_APPLICATION_ID})

In [122]:
# drop application dates prior to 1950
all_app8 = all_app8[(all_app8.application_year > 1950) & (all_app8.application_year < 1976)]

In [123]:
all_app9 = all_app8.drop_duplicates([PERSON_ID, 'application_year'])

In [124]:
def convert_year_accepted(x):
    if pd.isnull(x):
        return np.nan
    try:
        return float(x)
    except ValueError:
        return np.nan

In [125]:
all_app9.loc[:, 'flag_rejected'] = all_app9.apply(
    lambda x: 1 if not pd.isnull(x['rejection_date']) or x['rejected']==1 else 0, axis=1)

In [126]:
all_app9.loc[:, 'year_accepted'] = all_app9.year_accepted.apply(convert_year_accepted)

In [127]:
# now, we want to make data set wide- add multiple applications horizontally
all_app9.sort_values([PERSON_ID, 'application_year'], ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [128]:
# at most 3 application dates
grouped_ppl = all_app9.groupby(PERSON_ID)
multiple_apps = []

for key, df in grouped_ppl:
    if df['application_date'].shape[0] == 2:
        multiple_apps.append(
            pd.DataFrame(
            {PERSON_ID: [df.iloc[1][PERSON_ID]], 'application_date_2': [df.iloc[1]['application_date']]}))
    elif df['application_date'].shape[0] == 3:
        multiple_apps.append(pd.DataFrame(
            {PERSON_ID: [df.iloc[1][PERSON_ID]], 'application_date_2': [df.iloc[1]['application_date']], 
            'application_date_3': [df.iloc[2]['application_date']]}))


In [129]:
add_app_dates = pd.concat(multiple_apps)

In [130]:
all_app10 = pd.merge(left=all_app9, right=add_app_dates, left_on=PERSON_ID, right_on=PERSON_ID, how='left')
unique_people3 = pd.merge(left=unique_people2, right=add_app_dates, left_on=PERSON_ID, right_on=PERSON_ID, how='left')

In [137]:
all_app11 = all_app10.drop(['Unnamed: 0', "daniel's_comments", 'other',  'fifth', 'sixth', 'unique_flag'], axis=1)

In [138]:
all_app9 = all_app9.drop(['Unnamed: 0', "daniel's_comments", 'other',  'fifth', 'sixth', 'unique_flag'], axis=1)

In [142]:
unique_people4 = unique_people3.drop([
        'Unnamed: 0', "daniel's_comments", 'other',  'fifth', 'sixth', 
        'unique_flag', 'extra_ids', 'medical_school2', 'uuid2'], axis=1)

In [144]:
all_app9.sort_values(['clean_last_name', 'application_year']).to_pickle(os.path.join(APP_DATA_DIR, 'person_application_date.p'))
all_app9.sort_values(['clean_last_name', 'application_year']).to_csv(os.path.join(APP_DATA_DIR, 'person_application_date.csv'))

In [140]:
all_app11.sort_values(['clean_last_name', 'application_year']).to_pickle(os.path.join(APP_DATA_DIR, 'person_application_date_wide.p'))
all_app11.to_csv(os.path.join(APP_DATA_DIR, 'person_application_date_wide.csv'))

In [None]:
unique_people3.to_csv(os.path.join(APP_DATA_DIR, 'unique_applicants.csv'))
unique_people3.to_pickle(os.path.join(APP_DATA_DIR, 'unique_applicants.p'))