In [1]:
# import and compile NIH attendees
# read in known applicant files, dedupe and try to merge with applicants file
from collections import Counter
import difflib
import uuid
import itertools
import pandas as pd
import numpy as np
import string
import funcy
import re
import os


%load_ext autoreload
%autoreload 2

In [2]:
from data_cleaning_functions import (trans_remov_punc, standardize_whitespace, remove_punc, remove_suffix_from_last_name,
                                     clean_names, has_award, has_suffix, get_suffix, replace_last_name, 
                                     is_year_range, str_sim, clean_med_school, clean_std_college_name, long_form_date, 
                                    correct_mispellings)

from dev import (
    APP_DATA_DIR, SUM_STAT_DIR, RAW_ATT_DATA_DIR, ATT_DATA_DIR, CARD_DATA_DIR, CORRECTIONS_DIR, AWARDS_KEYWORDS, NAME_COLS, RAW_NAME_COLS, 
    RAW_CARD_ID, RAW_INDEX_IDS, PERSON_APPLICATION_ID, PERSON_ID, NIH_ID, FEMALE_FIRST_NAMES, FEMALE_MIDDLE_NAMES, 
    PICKLE_DIR)

In [3]:
# concat_df['first_middle']

In [31]:
def strip_first_middle(raw_str):
    if pd.isnull(raw_str):
        return pd.Series({'firstname2': np.nan, 'middlename2': np.nan, 'suffix': np.nan})
    # try to split on comma
    lst = raw_str.split(', ')
    lst2 = raw_str.split(' ')
    if len(lst) == 1 and len(lst2)==1:
        return pd.Series({'firstname2': lst[0], 'middlename2': np.nan, 'suffix': np.nan})
    # now split on spaces
    lst3 = map(remove_punc, lst2)
    return pd.Series({'firstname2': lst3[0], 'middlename2': lst3[1], 'suffix': np.nan if len(lst3) < 3 else lst3[2]})
    

In [11]:
r1_file = '1964-1973 associates.XLS'
r2_file = 'Associates alpha by institute.XLS'
r3_file = 'Associates data.XLS'
r4_file = 'NIMH Associates Complete.XLS'
r5_file = 'NINDB Associates alpha by year.xls'
filenames = [r1_file, r2_file, r3_file, r4_file, r5_file]

file_df = map(lambda x: pd.read_excel(os.path.join(RAW_ATT_DATA_DIR, x)), filenames)

file_4_columns = [
    'dno', 'source', 'unknown', 'lastname', 'first_middle', 'institute', 'lab_brch', 
    'program', 'supervisor', 'eod_year', 'med_school', 'year_grad', 'intern_hos', 'intern_dte',
       'res_hosp', 'residency', 'res_dtes'
]

file_df[2].rename(columns={'lname':'lastname', 'fname': 'first_middle'}, inplace=True)
file_df[1].rename(columns={'lname':'lastname', 'fname': 'first_middle'}, inplace=True)

file_df[4].columns = file_4_columns
# for each files in the list, add a column to track source
for name, f in zip(filenames, file_df):
    f.loc[:, 'data_source'] = name
concat_df = pd.concat(file_df)

print sum(map(lambda x: x.shape[0], file_df)) == concat_df.shape[0]

True


In [32]:
df2_a = concat_df.loc[:, 'first_middle'].apply(strip_first_middle)

In [33]:
# now we have all the associates, sep first middle into first and middle name, then sort and check 
# to see if we have any duplicates
df2 = pd.concat([concat_df, df2_a], axis=1)

In [34]:
df2.loc[~pd.isnull(df2.first_middle), ['first_middle', 'firstname2', 'middlename2', 'suffix']]

Unnamed: 0,first_middle,firstname2,middlename2,suffix
0,"Thomas, Bruce",Thomas,Bruce,
1,Carmie,Carmie,,
2,"Michael, C.",Michael,C,
3,"Daniel, N.",Daniel,N,
4,Roland,Roland,,
5,"Oren, Wyatt",Oren,Wyatt,
6,"Howard, Laurence",Howard,Laurence,
7,"William, Zane",William,Zane,
8,"Anthony, John",Anthony,John,
9,"Philip, Allen",Philip,Allen,


In [35]:
# consolidate firstname columns
df2.loc[~pd.isnull(df2.first_middle), 'firstname'] = df2.loc[~pd.isnull(df2.first_middle), 'firstname2']
df2.loc[pd.isnull(df2.middlename), 'middlename'] = df2.loc[pd.isnull(df2.middlename), 'middlename2']

df3 = df2.drop(['first_middle', 'firstname2', 'middlename2'], axis=1)

In [37]:
# dropnow where both first and last name are missing
df3 = df3.dropna(subset=['firstname', 'lastname'], how='all')

# df3.dropna(subset=['firstname', 'lastname'], how='all').loc[:, ['firstname', 'lastname', 'dno', 'data_source']]
df3.dropna(subset=['firstname', 'lastname'], how='all').loc[:, 'data_source'].unique()

df3_sorted = df3.sort_values(by=['dno'])

df3_unique = df3.drop_duplicates('dno')

df3_unique.loc[:, 'clean_firstname'] = df3_unique['firstname'].apply(clean_names)
df3_unique.loc[:, 'clean_middlename'] = df3_unique['middlename'].apply(clean_names)
df3_unique.loc[:, 'clean_lastname'] = df3_unique['lastname'].apply(clean_names)

df3_unique.sort_values(['clean_firstname', 'clean_middlename', 'clean_lastname'], inplace=True)

df3_unique.loc[df3_unique.duplicated(['clean_firstname', 'clean_middlename', 'clean_lastname'], keep=False), :].to_csv(
os.path.join(CARD_DATA_DIR, 'attendees_appearing_twice.csv'))

df4 = df3_unique.drop_duplicates(['clean_firstname', 'clean_middlename', 'clean_lastname'], keep='first')

# it seems that dno does refer to unique person, so drop dups based on that 
# save this unique to pick
df3_unique.to_pickle(os.path.join(ATT_DATA_DIR, 'unique_attendees.p'))

# to csv
df3_unique.to_csv(os.path.join(ATT_DATA_DIR, 'unique_attendees.csv'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


IOError: [Errno 2] No such file or directory: '~/Dropbox (MIT)/yellowberets/lindsey/intermediate_data/attendees_data/unique_attendees.p'