In [None]:
import pickle
import pandas as pd
import numpy as np
import os
import math

In [None]:
# Pickle file holding the crawled biography records that this notebook loads.
BIOS_FILE_CRAWLED = "data/BIOS.pkl"
# Output directory for the per-job review CSVs. The file name is appended by plain
# string concatenation, so the trailing slash is required.
REVIEW_DIR = "data/review/"

In [None]:
# SECURITY: pickle.load can execute arbitrary code while unpickling. Only run this
# cell on a BIOS.pkl obtained from a trusted source.
with open(BIOS_FILE_CRAWLED, "rb") as openfile:
    full_data = pickle.load(openfile)

# Attach the review bookkeeping fields to every record. 'review' and 'comment' are
# filled in by the automated checks further down in this notebook; 'valid', 'label'
# and 'stylish valid' are presumably completed during manual review (TODO confirm).
for entry in full_data:
    entry.update({'review': 0, 'comment': '', 'valid': '', 'label': '', 'stylish valid': ''})

In [None]:
# Display the first record to inspect which fields each entry carries.
full_data[0]

## Label potentially fictitious, falsely labeled, or non-biography samples for review

Every sample is checked for key phrases or patterns that indicate it might not be a bio, might not contain information relevant for classification, might refer to a fictitious person, or might be falsely labeled. For every suspicion, the review counter is increased and a comment is added to make these samples easier to review.

In [None]:
def add_comment(comment, new_comment):
    """Return `comment` extended by `new_comment`.

    An empty `comment` is replaced by `new_comment`; otherwise the two are
    joined with ', '. Strings are immutable, so the caller has to use the
    returned value - the argument itself is never modified.
    """
    return new_comment if comment == '' else comment + ', ' + new_comment

In [None]:
# Distinct lower-cased job labels ('title') and raw titles ('raw_title'), in order of
# first appearance. dict.fromkeys de-duplicates while preserving insertion order,
# which avoids re-scanning a growing list for every record in the dataset.
jobs = list(dict.fromkeys(entry['title'].lower() for entry in full_data))
raw_titles = list(dict.fromkeys(entry['raw_title'].lower() for entry in full_data))

### Case 1: False entity
e.g. name that is not really a name or contains some prefix, or where job and bio refer to different persons

In [None]:
# Tokens that suggest the extracted "name" is not an actual person name.
# filter_entities looks for them as case-sensitive substrings of the name.
false_names = ["Dad", "Mom", "Brother", "Sister", "If", "The", "His", "Her", "Is", "Share", "What", "Why", "Who", "Where", "Would"]

def filter_entities(bio, name, review, comment):
    """Flag entries whose extracted name does not look like a person name.

    Args:
        bio: biography text; not used by this check, kept so all filters share
            a similar call shape.
        name: extracted name of the person.
        review: running count of suspicions for this entry.
        comment: running comma-separated review notes for this entry.

    Returns:
        Tuple `(review, comment)`; `review` is incremented and a note is
        appended once for every matching token or job title.
    """
    for wn in false_names:
        # Case-sensitive substring match, so e.g. "The" also hits "Theodore".
        if wn in name:
            review += 1
            # add_comment returns the extended string; without re-assigning it
            # the note would be silently dropped.
            comment = add_comment(comment, 'entity: check name')
    for job in jobs:
        # A capitalised job title inside the name suggests the title was parsed as part of it.
        if job.capitalize() in name:
            review += 1
            comment = add_comment(comment, 'entity: check name')
    return review, comment

### Case 2: Fictitious person or movie/book review

We simply check for phrases that are more likely to occur in other kinds of text, such as fiction or movie reviews, than in biographies. Furthermore, we identified some URIs that are likely to contain movie/book reviews, fan fiction or video descriptions.

In [None]:
# Phrases implying that the text refers to a book/movie/piece of fiction (e.g. exaggerations
# are likely to occur for fictitious persons). filter_fiction matches them as case-sensitive
# substrings of the whole bio, which is why several entries appear in two capitalisations.
fiction_phrases = ["written by", "voiced by", "produced by", "comments", "watch the trailer", "starring", " in \"", "M for mature", "Crime", "Romance", "Mystery", "Fantasy", "played by",
                   "Rated", "Chapters", "Words", "Reviews", "Based on", "Realizado por", "narrated by", "imdb", "Avenger", "Description: ", "Vol.", "Season ", "Played by", "this DVD", "this movie", "this book",
                   ", but", "Why ", "and why ", "But ",
                   "the only problem is", "The only problem is", "Years later", "years later", "Until", "When", "Desperate", "paranormal", "superhero", "dwarv", "slutty", "cock", "stockings", "horny", "Fetish", "boobs",
                   "mysterious", "supernatural", "desperately", "outrageous", "pirate", "forced to", "Occasionally", "Then", "one day", "One day", "with a secret", "with a Secret", "wannabe", "having sex", 
                   "demon", "vampire",
                   "vicious killer", "they fall in love", "during the night", "blowjob", "througout this video", "this video", "what he is not expecting", "what she is not expecting", "naughty nurse", "hot model"]
# Phrases checked only in the first sentence (bio[:start_pos]): e.g. it is a question,
# contains exaggerations, or uses "model" ambiguously.
start_phrases = ["unpopular", "?", "with a past", "struggling", "perfect", "famous", "brilliant", "poor", "talented", "heavy past", "gifted"]

# Substrings of the source URI that point to review, fan-fiction or video sites
# (matched case-sensitively by filter_fiction).
fiction_uri = ['imdb', 'episodecalendar', 'filmschoolrejects', 'fanfiction', 'movie', 'tvtropes', 'marvel', 'idols69', 'sexgaypics', 'fleshandskin', 'lustyguide', 'gotgayporn', 'star-trek', 'criticker', 'goodreads', 'thebestporn', 'madtubes', 'naughty', 
               'tracktv', 'nude', 'bigboob', 'porn', 'tits', 'story', 'sexy', 'book', 'cinema', 'costume', 'novel', 'cinespider', 'mobileread', 'wikia', 'Characters', 'characters', 'filmjabber']

def filter_fiction(bio, name, start_pos, bio_uri, review, comment):
    """Flag entries that look like fiction, a movie/book review or a video description.

    Args:
        bio: biography text.
        name: extracted name; not used by this check.
        start_pos: index splitting `bio` into its opening part (`bio[:start_pos]`,
            presumably the first sentence - TODO confirm) and the remainder.
        bio_uri: URI the bio was crawled from.
        review: running count of suspicions for this entry.
        comment: running comma-separated review notes for this entry.

    Returns:
        Tuple `(review, comment)`; `review` is incremented and a note is
        appended once for every matching phrase or URI fragment.
    """
    bio_start = bio[:start_pos]
    for phrase in start_phrases:
        if phrase in bio_start:
            review += 1
            # add_comment returns the extended string; it has to be re-assigned
            # or the note is lost.
            comment = add_comment(comment, 'fiction (start): '+phrase)
    for phrase in fiction_phrases:
        # Case-sensitive match against the complete bio, including its opening part.
        if phrase in bio:
            review += 1
            comment = add_comment(comment, 'fiction (bio): '+phrase)
    for uri in fiction_uri:
        if uri in bio_uri:
            review += 1
            comment = add_comment(comment, 'fiction (uri)')
    return review, comment

### Case 3: Mislabeled bios
e.g. bios that contain multiple job titles in the first sentence, or bios where the job refers to a third person ("pastor's wife")

In [None]:
# Uses of the word "model" that do not refer to the occupation.
model_phrases = ["role model", "model student", "model citizen", "model for", "model of", "model employee"]

# Phrases that indicate a change of occupation.
label_phrases = [" turned ", " retired ", " former "]

# Characters accepted directly after a job title (e.g. "job." is fine, "job's wife" is not,
# because it likely refers to a third person's job).
job_suffixes = ['.', ' ', ',', ';', '/', '-']# '\'s wife', '\'s husband', '\'s son', '\'s daughter']

# These raw titles contain another, shorter raw title, so counting them as well would
# count the same job twice.
raw_titles_to_skip = ['certified public accountant', 'trial attorney', 'plastic surgeon', 'senior software engineer', 'orthopedic surgeon', 'certified personal trainer']

def check_label(bio, label, start_pos, review, comment):
    """Flag entries whose opening part suggests the job label may be wrong.

    Three checks run on the lower-cased opening part of the bio
    (`bio[:start_pos]`): non-occupational uses of "model", phrases hinting at
    a change of occupation, and the number of distinct job titles mentioned
    (anything other than exactly one is suspicious).

    Args:
        bio: biography text.
        label: job label assigned to this entry (e.g. 'software_engineer').
        start_pos: index where the opening part of `bio` ends.
        review: running count of suspicions for this entry.
        comment: running comma-separated review notes for this entry.

    Returns:
        Tuple `(review, comment)` with the counter and notes updated.
    """
    start_lower = bio[:start_pos].lower()

    for phrase in model_phrases:
        if phrase in start_lower:
            review += 1
            # add_comment returns the extended string; it has to be re-assigned
            # or the note is lost.
            comment = add_comment(comment, 'label (false model)')

    for phrase in label_phrases:
        if phrase in start_lower:
            review += 1
            comment = add_comment(comment, 'label (changed jobs)')

    # Count how many known job titles occur in the opening part.
    n_jobs = 0
    for job in raw_titles:
        if job in raw_titles_to_skip:
            continue
        # Software engineers/architects cause many false positives. The entry's own
        # label is taken from the `label` argument, not from a notebook-level global,
        # so the function gives the right answer for whichever entry it is called on.
        if label == 'software_engineer' and job == 'architect':
            continue
        # Often both "cpa" and "certified public accountant" are mentioned.
        if job == 'cpa' and ' accountant' in start_lower:
            continue

        job = job.replace('_', ' ')  # for jobs written like software_engineer
        # The title must start after a space and be followed by an accepted suffix.
        if any(' '+job+suffix in start_lower for suffix in job_suffixes):
            n_jobs += 1

    if n_jobs != 1:
        review += 1
        comment = add_comment(comment, 'number of labels: '+str(n_jobs))

    return review, comment

### Case 4: Filter very short bios and those that contain text passages that do not belong to the bio
e.g. some bios are followed by disclaimers or lengthy contact information, other links or contain texts from other web elements that are not part of the bio.

In [None]:
# Substrings that hint at scraped web boilerplate rather than biography text.
# NOTE(review): filter_content compares these against lower-cased text, so the entries
# containing capitals (' Mb ', ' Min ', 'More about') can never match as written.
web_phrases = ['download', 'copyright', ' Mb ', 'watch stream', ' Min ', 'More about', 'more about', 'facebook:', 'instagram:', 'https://www.', 'http://www.']
# Characters that filter_content treats as suspicious when they occur more than three times.
multi_phrases = ['@', '#']

def filter_content(bio, start_pos, review, comment):
    """Flag bios that are very short or contain text that is not part of the bio.

    Only the part after the opening (`bio[start_pos:]`), lower-cased, is inspected.

    Args:
        bio: biography text.
        start_pos: index where the opening part of `bio` ends.
        review: running count of suspicions for this entry.
        comment: running comma-separated review notes for this entry.

    Returns:
        Tuple `(review, comment)` with the counter and notes updated.
    """
    bio_lower = bio[start_pos:].lower()
    # NOTE(review): the text is lower-cased but the phrases are not, so any phrase in
    # web_phrases that contains a capital letter cannot match here.
    for phrase in web_phrases:
        if phrase in bio_lower:
            review += 1
            # add_comment returns the extended string; it has to be re-assigned
            # or the note is lost.
            comment = add_comment(comment, 'content: '+phrase)
    for phrase in multi_phrases:
        # Many '@' or '#' characters suggest handles/hashtags rather than prose.
        if bio_lower.count(phrase) > 3:
            review += 1
            comment = add_comment(comment, 'content: '+phrase)
    # Fewer than 20 space-separated tokens counts as a very short bio.
    if len(bio_lower.split(' ')) < 20:
        review += 1
        comment = add_comment(comment, 'content very short')

    return review, comment

### Go through all bios

In [None]:
# Run all four checks on every record, store the outcome on the record itself and
# count how many records raised at least one suspicion.
# The loop variable stays named `entry`: it is a notebook-level global that the
# check helpers may read.
count = 0
for entry in full_data:
    # Start from a clean slate so re-running this cell gives the same result.
    review, comment = 0, ''

    review, comment = filter_entities(entry['raw'], entry['name'], review, comment)
    review, comment = filter_fiction(entry['raw'], entry['name'], entry['start_pos'], entry['URI'], review, comment)
    review, comment = check_label(entry['raw'], entry['title'], entry['start_pos'], review, comment)
    review, comment = filter_content(entry['raw'], entry['start_pos'], review, comment)

    entry['review'], entry['comment'] = review, comment
    if review > 0:
        count += 1

print("counted ", count, " samples with potential problems (of ", len(full_data), " datapoints in total)")

## Save automatically labeled data with indices for 10 classes of choice

In [None]:
# These are the largest classes (except for professor, which accounts for a third of the dataset).
# The names are compared against the lower-cased 'title' values collected in `jobs`.
jobs10 = ['architect', 'surgeon', 'dentist', 'teacher', 'psychologist', 'nurse', 'journalist', 'photographer', 'physician', 'attorney']

In [None]:
# One table with every record, including the 'review' and 'comment' fields.
# NOTE(review): `df` is not referenced by any of the cells visible below - confirm it is still needed.
df = pd.DataFrame(data=full_data)

In [None]:
# Group the records by their lower-cased job title: one (initially empty) bucket per
# known job, then every record is appended to the bucket of its own title.
data_by_jobs = {job: [] for job in jobs}

for entry in full_data:
    job_key = entry['title'].lower()
    data_by_jobs[job_key].append(entry)

In [None]:
# Write one tab-separated CSV per selected job containing only the samples flagged
# for review, and report how many samples were flagged.

# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(REVIEW_DIR, exist_ok=True)

n_jobs_review = 0
for job in jobs:
    if job not in jobs10:
        continue
    dataframe = pd.DataFrame(data=data_by_jobs[job])
    df_review = dataframe[dataframe['review'] > 0]
    print(job, ':', len(df_review), " samples for review (of", len(dataframe),")")
    n_jobs_review += len(df_review)
    # index=True keeps each sample's row position within its job's dataframe.
    df_review.to_csv(REVIEW_DIR+job+".csv", index=True, sep='\t')

print()
print("found", n_jobs_review, "samples for review in the 10 classes")