In [3]:
import cPickle as pickle
import io
import os

import funcy
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

In [4]:
from dev import BIOGRAPHY_DATA_DIR 

In [5]:
# Spreadsheet holding one row per searched name with its person_uuid.
# NOTE(review): this relative path already starts with 'biography_data/' and is
# later joined onto BIOGRAPHY_DATA_DIR -- confirm the directory is not duplicated.
name_uuid_filename = 'biography_data/search_name_person_uuid.xlsx'


In [6]:
# Load the name / person_uuid table; every later cell adds its summary columns to this frame.
names_uuid = pd.read_excel(os.path.join(BIOGRAPHY_DATA_DIR, name_uuid_filename))

In [7]:
# d is a dictionary of bio results:
#   key   -> the uuid of the name search
#   value -> None, or a tuple (num_results, search string, list of scraped results)
# The list of scraped results holds either dicts with
# {'biography': ..., 'full_text': ...} sections, or (matched name, bio dict)
# tuples; individual entries (and the bio dict of a tuple) may be None.
#
# The file is opened in binary mode and closed deterministically.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# raw_bio_scrape.p that was produced by our own scraper.
with open('raw_bio_scrape.p', 'rb') as pickle_file:
    d = pickle.load(pickle_file)

In [8]:
# Searches whose stored value is None (no result tuple at all).
possible_errors = funcy.select_values(lambda scraped: scraped is None, d)

In [9]:
# Complement of the None-valued searches: every uuid that has a result tuple.
no_errors_d = funcy.select_values(lambda scraped: scraped is not None, d)

In [10]:
# Searches that have a result tuple but whose hit count is missing or zero.
no_results = funcy.select_values(
    lambda scraped: scraped[0] is None or scraped[0] == 0,
    no_errors_d,
)

In [11]:
# Rows of the name table whose search produced no result tuple.
possible_errors_df = names_uuid[names_uuid['person_uuid'].isin(list(possible_errors))]

In [12]:
# Spot-check the name-table row for person_uuid 175.
names_uuid[names_uuid['person_uuid'] == 175.0]

Unnamed: 0,person_uuid,aamc_id,newsetnb,dno,clean_first_name,clean_middle_name,clean_last_name,bio_results,gale_id,clean_suffix,control_flag,application_year_min,application_year_max,eod_year,medical_school,birth_year
2045,175,,,,ANTHONY,,LOMONACO,,,,1,1971,1971,,SUNY,


In [13]:
def get_search_string(person_uuid, res_dict):
    """Return the search string stored for ``person_uuid``.

    The search string is the item at position 1 of the result tuple.
    ``np.nan`` is returned when ``person_uuid`` is not a key of ``res_dict``.
    """
    return res_dict[person_uuid][1] if person_uuid in res_dict else np.nan

def get_number_results(person_uuid, res_dict):
    """Return the number of search results stored for ``person_uuid``.

    The count is the item at position 0 of the result tuple.
    ``np.nan`` is returned when ``person_uuid`` is not a key of ``res_dict``.
    """
    return res_dict[person_uuid][0] if person_uuid in res_dict else np.nan

In [14]:
# Peek at one stored result tuple to see its (num_results, search string, results) layout.
no_errors_d[1]

(1,
 'Miller, Frances',
 [{'biography': u'Born: October 15, 1937 in New York, New York, United States\nNationality: American',
   'full_text': u'Nationality: American. Born: New York City, 15 October 1937. Education: Wellesley College, Wellesley, Massachusetts, B.A. 1959; California State University, Hayward, Teaching Credential, 1976; graduate study at San Jose State University, California. Family: Married John David Miller; two daughters and two sons. Career: Reading tutor and volunteer worker at public schools in Oakland and San Ramon, California, 1966-75; reading and English teacher at middle school in Hayward, California, 1976-77; member of executive board, Adult Literacy Program, Sydney, Australia, 1979-83; writer and public speaker, since 1983. Coordinator of "Aussie Books for Kids" exhibit, 1984-88. Awards: ALA Best Book for Young Adults, California Young Reader Medal, both 1985, both for The Truth Trap..'}])

In [15]:
def parse_occupation(raw_bio_dict):
    """Extract the occupation from one scraped biography record.

    The ``'biography'`` section is a block of ``Label: value`` lines separated
    by newlines. The value of the last line that starts with ``Occupation: ``
    is returned, trimmed of surrounding whitespace.

    Only the occupation line itself is returned; any ``Label: value`` lines
    that follow it are ignored. The label is also recognised when it is the
    very first line of the biography.

    Returns ``None`` when ``raw_bio_dict`` is ``None`` or empty, when it has no
    usable ``'biography'`` entry, or when no non-empty occupation is found.
    """
    if not raw_bio_dict:
        return None
    biography = raw_bio_dict.get('biography')
    if not biography:
        return None

    label = 'Occupation: '
    occupation = None
    for line in biography.split('\n'):
        if line.startswith(label):
            # Keep overwriting so the last occurrence wins.
            occupation = line[len(label):].strip()
    return occupation or None

In [16]:
def _wrapper(person_id, fnc, res_dict):
    if person_id not in res_dict:
        return np.nan
    bios = res_dict[person_id][2]
    if bios is None or len(bios)==0:
        return 0
    # list could be a list with 1 item a dict
    bios = filter(None, bios)
    if len(bios) == 1 and isinstance(bios[0], dict):
        return fnc(bios[0])
    # otherwise, list is a list of tuples of two items
    # first is string matched and second is bio dict
    bios_text = filter(None, map(funcy.second, bios))
    if bios_text:
        res = filter(None, map(fnc, bios_text))
        return ' | '.join(res)
    return np.nan

In [17]:
# Smoke-test the parser on the first scraped result of search uuid 1.
parse_occupation(no_errors_d[1][2][0])

In [18]:
# For every person_uuid, attach the number of hits and the search string that
# was submitted (NaN when the uuid has no usable result tuple).
# The columns are assigned directly from the mapped Series. Pre-filling them
# with 0 / NaN and then overwriting through `.loc[:, col]` makes pandas write
# floats / strings into an int / float column in place, which recent pandas
# versions warn about. Column order is unchanged (count first, then string).
names_uuid['number_search_results'] = names_uuid['person_uuid'].apply(
    funcy.rpartial(get_number_results, no_errors_d))
names_uuid['search_string'] = names_uuid['person_uuid'].apply(
    funcy.rpartial(get_search_string, no_errors_d))

In [19]:
# Parsed occupation(s) per person_uuid: NaN when there is no usable bio,
# 0 when the search returned an empty result list, otherwise the occupation text.
# Assigned directly rather than by overwriting a NaN-filled float column through
# `.loc[:, col]`, which would force an in-place write of strings into that column.
names_uuid['occupations'] = names_uuid['person_uuid'].apply(
    funcy.rpartial(_wrapper, parse_occupation, no_errors_d))

In [21]:
# Persist the enriched name table. The scraped columns hold unicode text, so the
# encoding is stated explicitly instead of relying on the pandas default, which
# depends on the Python / pandas version in use.
names_uuid.to_csv(
    os.path.join(BIOGRAPHY_DATA_DIR, 'biographies_data_summary.csv'),
    index=False,
    encoding='utf-8',
)

In [None]:
# Sample result tuple used by the one-off file-writing cell further down.
f1 = no_errors_d[1461]
# Alternative sample record:
# f1 = no_errors_d[632]

In [None]:
def write_bio_dict(filename, bio_name, bio_dict):
    """Write one biography dict to ``filename`` as UTF-8 text.

    The file starts with ``bio_name`` on its own line, followed by each key of
    ``bio_dict`` and its value, one per line.

    Every section is written, including values with non-ASCII characters, so a
    key is never emitted without its value. A non-ASCII ``bio_name`` no longer
    raises.

    Parameters
    ----------
    filename : path of the text file to create or overwrite.
    bio_name : heading written as the first line.
    bio_dict : mapping of section name to section text.
    """
    def _as_text(value):
        # Byte strings are decoded (undecodable bytes are replaced, not fatal);
        # text passes through; anything else is formatted.
        if isinstance(value, bytes):
            return value.decode('utf-8', 'replace')
        if isinstance(value, type(u'')):
            return value
        return u'{0}'.format(value)

    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(_as_text(bio_name) + u'\n')
        for key, val in bio_dict.items():
            f.write(_as_text(key) + u'\n')
            f.write(_as_text(val) + u'\n')
    

In [None]:
def write_bio_tuple(filename, bio_tups):
    """Write several ``(matched name, bio dict)`` results to ``filename`` as UTF-8 text.

    For each pair the matched name is written, then a blank line, then each key
    of the bio dict and its value, one per line. A pair whose bio dict is
    ``None`` contributes only its name and the blank line.

    Every section is written, including values with non-ASCII characters, so a
    key is never emitted without its value. A non-ASCII name no longer raises.

    Parameters
    ----------
    filename : path of the text file to create or overwrite.
    bio_tups : iterable of ``(res_name, res_dict)`` pairs.
    """
    def _as_text(value):
        # Byte strings are decoded (undecodable bytes are replaced, not fatal);
        # text passes through; anything else is formatted.
        if isinstance(value, bytes):
            return value.decode('utf-8', 'replace')
        if isinstance(value, type(u'')):
            return value
        return u'{0}'.format(value)

    with io.open(filename, 'w', encoding='utf-8') as f:
        for res_name, res_dict in bio_tups:
            f.write(_as_text(res_name) + u'\n')
            f.write(u'\n')
            if res_dict is None:
                continue
            for key, val in res_dict.items():
                f.write(_as_text(key) + u'\n')
                f.write(_as_text(val) + u'\n')
                
                    

In [None]:
def write_bios(bios_dict):
    """Write one text file per search that has at least one scraped biography.

    ``bios_dict`` maps a person id to ``None`` or to a tuple
    ``(num_results, search string, list of scraped results)``. Entries that are
    ``None`` or whose ``num_results`` is ``None`` are skipped silently.

    For the remaining entries the non-empty results are inspected:

    * first result is a dict -> written with ``write_bio_dict``;
    * results are ``(matched name, bio dict)`` tuples and at least one bio dict
      is present -> written with ``write_bio_tuple``;
    * nothing usable (no results, or every bio dict missing) -> the tuple is
      printed and recorded as a bad name; no file is written.

    Files go to ``BIOGRAPHY_DATA_DIR/txt_files/<search string with ', ' replaced
    by '_'>.txt``. The path is only built for entries that are actually written.

    Returns
    -------
    list of ``(person_id, res_tup)`` for the entries that had nothing to write.
    """
    bad_names = []
    for person_id, res_tup in bios_dict.items():
        if res_tup is None or res_tup[0] is None:
            continue

        results = []
        if res_tup[2] is not None:
            results = [entry for entry in res_tup[2] if entry]

        first_is_dict = bool(results) and isinstance(results[0], dict)
        # Test emptiness, not `is not None`: a filtered list is never None, so
        # the old check could never reach the "all bios missing" branch.
        has_any_bio = first_is_dict or any(
            len(entry) > 1 and entry[1] for entry in results)
        if not has_any_bio:
            print(res_tup)
            bad_names.append((person_id, res_tup))
            continue

        search_string = res_tup[1]
        out_path = os.path.join(
            BIOGRAPHY_DATA_DIR,
            'txt_files/{}.txt'.format(search_string.replace(', ', '_')))
        if first_is_dict:
            write_bio_dict(out_path, search_string, results[0])
        else:
            write_bio_tuple(out_path, results)
    return bad_names

In [None]:
# Write the text files for every search that has a result tuple.
write_bios(no_errors_d)

In [None]:
# Write the single sample record `f1` to its own text file, picking the writer
# that matches the shape of its first result.
filename = f1[1].replace(', ', '_')
filename2 = os.path.join(BIOGRAPHY_DATA_DIR, 'txt_files/{}.txt'.format(filename))
results = f1[2]
print(results)
if not isinstance(results[0], dict):
    write_bio_tuple(filename2, results)
else:
    write_bio_dict(filename2, f1[1], results[0])

In [None]:
# Rows that have at least one search hit but no parsed occupation.
# Alternative views kept for reference:
# names_uuid.loc[pd.isnull(names_uuid.search_string)]
# names_uuid.loc[~pd.isnull(names_uuid.occupations) & (names_uuid.occupations != 0), 'occupations']
names_uuid.loc[(names_uuid['number_search_results'] > 0) & names_uuid['occupations'].isnull(), :]

In [None]:
# Inspect the raw result tuple stored for search uuid 632.
no_errors_d[632]