In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re
from nltk.tokenize import TweetTokenizer
import string
import contractions
from nltk.stem import WordNetLemmatizer 
from textblob import TextBlob
# Add the parent directory to sys.path (presumably so project-local modules can be imported
# from this notebook -- no such import is visible in this section).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Module-level WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()
# Copy the line below to chunks where we want to set the column width to max
pd.set_option('display.max_colwidth', None)
import spacy
import torch
# Print the installed torch version so the run records it.
print(torch.__version__)

1.9.1


### Preparing Survey Data

In [2]:
# Load the preprocessed HR survey and keep only rows where all three free-text answers are present.
survey_hr_df = pd.read_csv('/Users/Lara/Documents/CompCulture/spacespace/Coco/analyses_data/preprocessed_survey_hr.csv')
free_text_cols = ['pros', 'cons', 'story']
survey_text_df = (
    survey_hr_df
    .dropna(subset=free_text_cols)
    .astype({col: 'str' for col in free_text_cols})
    # keeps the old row labels in an 'index' column
    .reset_index()
)

# Recode three of the Race categories as 'Other'; every other value passes through unchanged.
pooled_races = ['Black or African American', 'Missing', 'Native Hawaiian or Other Pacific Islander']
survey_text_df['race'] = survey_text_df['Race'].apply(
    lambda value: 'Other' if value in pooled_races else value)

# Drop ResponseId, the location columns, LINK, the original Race column and mael_1 .. mael_6.
mael_cols = ["mael_" + str(item) for item in range(1, 7)]
survey_text_df = survey_text_df.drop(
    columns=['ResponseId', 'LocationLatitude', 'LocationLongitude', "LINK", "Race"] + mael_cols)
                                
tweet_tokenizer = TweetTokenizer()
punctuation = string.punctuation + '–...…’“”'
# a token that starts with digits (optionally followed by a dot)
re_number = r"[0-9]+(\.)?"
# the same lowercase letter three times in a row (e.g. "sooo"); used to get rid of filler words
re_repeat_digits = r'([a-z])\1{2}'
# necessary to make sure we don't confuse US the country with us the first person pronoun
us_str = 'u_s_a'
# Spellings of the country name, substituted in this exact order.
# NOTE(review): these are plain substring replacements on the original-case text, so 'US'
# inside an upper-case word (e.g. "BONUS") is rewritten too -- confirm this is acceptable.
us_variants = ['USA', 'US', 'U.S.A.', 'U.S.A', 'U.S.', 'U.S']


def _mask_us_mentions(text):
    """Replace every spelling in `us_variants` with the `us_str` placeholder."""
    for variant in us_variants:
        text = text.replace(variant, us_str)
    return text


def _keep_token(tok):
    """True unless the token is punctuation, starts with a number, or has a tripled letter."""
    # `tok not in punctuation` is a substring test against the punctuation string
    return (tok not in punctuation
            and re.match(re_number, tok) is None
            and re.search(re_repeat_digits, tok) is None)


for col_name in ['pros', 'cons', 'story']:
    # placeholder for the country name -> expand contractions -> lower-case -> strip "n/a"
    cleaned_text = (
        survey_text_df[col_name]
        .apply(_mask_us_mentions)
        .apply(contractions.fix)
        .str.lower()
        .apply(lambda s: s.replace('n/a', ''))
    )
    survey_text_df[col_name+'_cleaned'] = cleaned_text

    # periods become spaces before tokenising, then unwanted tokens are filtered out
    kept_tokens = (
        cleaned_text
        .apply(lambda s: s.replace('.', ' '))
        .apply(tweet_tokenizer.tokenize)
        .apply(lambda toks: [t for t in toks if _keep_token(t)])
    )
    survey_text_df[col_name+'_toks'] = kept_tokens

    survey_text_df[col_name+'_toks_len'] = kept_tokens.apply(len)

# Pros and cons joined with a single space, mirroring the layout of a Glassdoor review.
survey_text_df['glassdoor_cleaned'] = survey_text_df.apply(
    lambda row: ' '.join([row['pros_cleaned'], row['cons_cleaned']]), axis=1)

# Respondent-level flag: 1 when the identical text was given for more than one question.
survey_text_df['low_quality_response'] = np.where((survey_text_df['pros'] == survey_text_df['cons']) |
                                                  (survey_text_df['cons'] == survey_text_df['story']) |
                                                  (survey_text_df['pros'] == survey_text_df['story']),
                                                  1, 0)

# changed threshold from 3 in exploring_survey_responses.ipynb to 5 as we are not removing stop words here
min_toks = 5


def _low_quality_flag(n_toks, raw_text, response_flag):
    """
    Return 1 when an answer has fewer than `min_toks` tokens or contains a tripled
    lowercase letter (re_repeat_digits); otherwise fall back to the respondent-level flag.
    """
    # Must be `or`, not `|`: `a < 5 | b` parses as `a < (5 | b)`, and 5 | True == 5 | False == 5,
    # so with `|` the repeated-letter check was silently ignored.
    if n_toks < min_toks or re.search(re_repeat_digits, raw_text) is not None:
        return 1
    return response_flag


# One flag per free-text question; the regex is applied to the raw (uncleaned) answer.
for col_name in ['pros', 'cons', 'story']:
    survey_text_df['low_quality_'+col_name] = survey_text_df.apply(
        lambda row, col=col_name: _low_quality_flag(
            row[col+'_toks_len'], row[col], row['low_quality_response']),
        axis=1)

### Preparing Glassdoor Data for Finetuning

In [3]:
glassdoor_data_dir = "/Users/Lara/Documents/Stanford/Research/Glassdoor/"
glassdoor_reviews = pd.read_csv(os.path.join(glassdoor_data_dir, "reviews_new_processed.csv"))
# keep reviews where is_current_job == 1
glassdoor_reviews = glassdoor_reviews.loc[glassdoor_reviews['is_current_job'] == 1,]

glassdoor_reviews = glassdoor_reviews.dropna(subset=['pros', 'cons']).astype({'pros':'str', 'cons':'str'})

# Same cleaning as the survey text: placeholder for the country name (so "US" is not read as
# the pronoun "us"), expand contractions, lower-case, strip "n/a".
for review_col in ['pros', 'cons']:
    glassdoor_reviews[review_col+'_cleaned'] = glassdoor_reviews[review_col].apply(
        lambda x: x.replace('USA', us_str).replace('US', us_str).replace(
            'U.S.A.', us_str).replace('U.S.A', us_str).replace('U.S.', us_str).replace('U.S', us_str)
    ).apply(contractions.fix).str.lower().apply(lambda x: x.replace('n/a', ''))

# Vectorised string concatenation; a row-wise apply(axis=1) builds a Series per row and is
# needlessly slow on the full review corpus.
glassdoor_reviews['glassdoor_cleaned'] = (
    glassdoor_reviews['pros_cleaned'] + ' ' + glassdoor_reviews['cons_cleaned'])

# One review per line.
# NOTE(review): a review that itself contains a newline will span several lines in this
# file -- confirm the finetuning script tolerates that.
filepath = '/Users/Lara/Documents/CompCulture/spacespace/Coco/helper_data/glassdoor_distilbert_corpus.txt'
np.savetxt(filepath, glassdoor_reviews['glassdoor_cleaned'], fmt="%s")


KeyboardInterrupt: 

### Finetuning Distilbert on Glassdoor Data 
This portion of the code is run on Google Colab because training is too slow on CPU.

trainer.save_model("./finetuned_distilbert")

### Calculating "We-ness" of Language

In [4]:
# Original Measure
# Pronouns grouped by grammatical type. In every list the second-to-last entry is the
# first-person plural form (we / us / our / ours / ourselves), which is what we_prob's
# default we_index=-2 selects. Some forms ("you", "her", "his") appear under more than one type.
type2pronouns = {'subj': ['i', 'you', 'he', 'she', 'we', 'they'],
           'obj': ['me', 'you', 'him', 'her', 'us', 'them'],
           'poss': ['my', 'your', 'his', 'her', 'our', 'their'],
           'attr': ['mine', 'yours', 'his', 'hers', 'ours', 'theirs'],
           'npadvmod': ['myself', 'yourself', 'yourselves', 'himself', 'herself', 'ourselves', 'themselves']}

def average_with_nan(elem1, elem2):
    """
    Mean of two numbers that treats NaN as "missing".

    Returns the plain average when both values are present, the present value when
    exactly one is NaN, and NaN when both are NaN.
    """
    first_missing = np.isnan(elem1)
    second_missing = np.isnan(elem2)
    if not (first_missing or second_missing):
        return (elem1+elem2)/2
    if first_missing and second_missing:
        return np.nan
    # exactly one value is present: hand it back untouched
    return elem2 if first_missing else elem1


In [5]:
nlp = spacy.load("en_core_web_sm")
def get_token_type(text):
    """
    Parse `text` with the spaCy pipeline `nlp` and return a dict that maps each token's
    (start, end) character span to its dependency label (`token.dep_`).
    """
    return {
        (token.idx, token.idx+len(token.text)): token.dep_
        for token in nlp(text)
    }

def we_prob(text, tokenizer, model, type2pronouns, filter_we=True, we_index=-2):
    """
    Returns a tuple containing the probability of we-words and probability of we-words
    as weighted by the total probability of relevant pronouns, where relevant pronouns
    refer to pronouns of the same type, i.e., subject to subject, possessive to possessive.
    The reason that type matching is important is due to the model's ability to
    infer correct type based on context, which needs to be accounted for when weighting probabilities.

    Every pronoun in the text is replaced by the mask token; the masked-LM distribution at
    each scored position is then read off and averaged over positions.

    Parameters
    ----------
    text : str
        Text to score; it is lower-cased before tokenisation.
    tokenizer
        Fast tokenizer: called with `return_offsets_mapping=True`; must provide
        `convert_ids_to_tokens`, `convert_tokens_to_ids`, `mask_token_id`, `all_special_ids`.
    model
        Masked-LM model whose output exposes `.logits`.
    type2pronouns : dict
        Maps a pronoun type ('subj', 'obj', ...) to its list of pronouns.
    filter_we : bool
        If True only positions that originally held a we-word are scored and texts without
        a we-word are skipped; if False every pronoun position is scored.
    we_index : int
        Position of the target ("we") form inside each pronoun list.

    Returns
    -------
    tuple of float
        (mean raw probability, mean proportional probability). (nan, nan) when the text has
        no pronoun, has no we-word while `filter_we` is set, or has nothing left to score.

    Raises
    ------
    ValueError
        If a scored "you" cannot be aligned with a parsed token, or its dependency label is
        neither a subject, an object nor a dative.
    """
    no_score = (np.nan, np.nan)
    text = text.lower()
    encoded_dict = tokenizer(text, return_offsets_mapping=True)
    toks = tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'])
    # a pronoun listed under several types ("you", "her", "his") keeps the last type listed
    pronoun2type = {pronoun : k for k, v in type2pronouns.items() for pronoun in v}
    pronouns = set(pronoun2type)
    # The target forms are derived from the arguments instead of a notebook-level `we_words`
    # list, so the filter always agrees with `we_index` and the function is self-contained.
    target_forms = {v[we_index] for v in type2pronouns.values()}

    found_we_word = not target_forms.isdisjoint(toks)
    found_pronouns = not pronouns.isdisjoint(toks)
    if (filter_we and not found_we_word) or not found_pronouns:
        # same shape as a real result, so callers can always unpack two values
        return no_score
    index2type = get_token_type(text)
    input_ids = torch.tensor([encoded_dict['input_ids']])
    # chopping input_ids to deal with cases where there are more than 512 tokens
    # there is currently only one instance with cons where more than 512 tokens exist
    # if this method is actually used to build model for glassdoor, should explore
    # other ways to deal with this problem - e.g., sliding window
    if input_ids.shape[1] > 512:
        input_ids = input_ids[:, 0:511]
        toks = toks[0:511]
    n_toks = input_ids.shape[1]
    special_ids = set(tokenizer.all_special_ids)
    masked_index = []
    skip = False
    # mask all pronouns to prevent leaking.
    for i in range(n_toks):
        if skip:
            skip = False
            continue
        curr_tok = toks[i]
        if curr_tok not in pronouns:
            continue
        input_ids[0, i] = tokenizer.mask_token_id
        # the following word has to be a verb, which needs to be
        # masked due to verb tenses leaking the right pronoun.
        # The key in type2pronouns is 'subj' (the previous comparison against 'nsubj' could
        # never match). "you" is typed 'obj' in pronoun2type, so it does not trigger this.
        if pronoun2type[curr_tok] == 'subj':
            nxt = i + 1
            # stay inside the (possibly truncated) sequence, never mask a special token, and
            # leave a following pronoun to be handled on its own iteration
            if (nxt < n_toks and toks[nxt] not in pronouns
                    and input_ids[0, nxt].item() not in special_ids):
                input_ids[0, nxt] = tokenizer.mask_token_id
                skip = True
        if not filter_we or curr_tok in target_forms:
            masked_index.append(i)
    if not masked_index:
        # e.g. the only we-word was cut off by the truncation above
        return no_score
    # inference only: no autograd graph needed
    with torch.no_grad():
        outputs = model(input_ids)
    prediction_logits = outputs.logits
    offsets = encoded_dict['offset_mapping']
    raw_prob, prop_prob = [], []
    for m in masked_index:
        orig_word = toks[m]
        target_words = type2pronouns[pronoun2type[orig_word]]
        # "you" could be a subject or an object
        if orig_word == 'you':
            span = tuple(offsets[m])
            if span not in index2type:
                raise ValueError('Token has no matching type. See text %s:' % text)
            dep = index2type[span]
            # obj, pobj, dobj, dative - indirect object
            if 'obj' in dep or dep == 'dative':
                target_words = type2pronouns['obj']
            elif 'subj' in dep:
                target_words = type2pronouns['subj']
            else:
                raise ValueError('Unexpected dependency type: %s' % dep)
        we_word = target_words[we_index]
        # distribution over the vocabulary at this masked position
        probs = prediction_logits[0, m, :].softmax(dim=-1)
        target_inds = torch.tensor(tokenizer.convert_tokens_to_ids(target_words))
        # probability mass of every pronoun of the matching type
        values = probs[target_inds]
        we_word_prob = probs[tokenizer.convert_tokens_to_ids(we_word)].item()
        we_word_prop = we_word_prob / values.sum().item()
        raw_prob.append(we_word_prob)
        prop_prob.append(we_word_prop)
    return (sum(raw_prob)/len(raw_prob), sum(prop_prob)/len(prop_prob))

In [8]:
from transformers import DistilBertForMaskedLM, BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
i_words = ['i', 'me', 'my', 'mine', 'myself']
we_words = ['we', 'us', 'our', 'ours', 'ourselves']
they_words = ['they', 'them', 'their', 'theirs', 'themselves']

# NOTE(review): the finetuning section saves to "./finetuned_distilbert" while this loads
# "./glassdoor_finetuned_distilbert" -- confirm the directory was renamed after download.
model = DistilBertForMaskedLM.from_pretrained("./glassdoor_finetuned_distilbert", return_dict=True)


def _scores_to_frame(scores, columns):
    """
    Turn a Series of we_prob results into a two-column frame on the same index.
    Any entry that is not a 2-tuple (e.g. a bare NaN) becomes (nan, nan); a list mixing
    scalars and tuples cannot be turned into a two-column DataFrame.
    """
    rows = [score if isinstance(score, tuple) else (np.nan, np.nan) for score in scores]
    return pd.DataFrame(rows, columns=columns, index=scores.index)


# Uses All Pronouns (except for it - too many typos)
we_prob_pros = survey_text_df['pros_cleaned'].apply(lambda text : we_prob(text, tokenizer, model, type2pronouns, filter_we=False))
we_prob_cons = survey_text_df['cons_cleaned'].apply(lambda text : we_prob(text, tokenizer, model, type2pronouns, filter_we=False))
we_prob_pros_df = _scores_to_frame(we_prob_pros, ['pros_finetuned_prob', 'pros_finetuned_prop'])
we_prob_cons_df = _scores_to_frame(we_prob_cons, ['cons_finetuned_prob', 'cons_finetuned_prop'])

# Drop score columns left over from a previous run of this cell so re-running it does not
# create duplicate column names.
score_cols = list(we_prob_pros_df.columns) + list(we_prob_cons_df.columns)
survey_text_df = pd.concat(
    [survey_text_df.drop(columns=score_cols, errors='ignore'), we_prob_pros_df, we_prob_cons_df],
    axis=1)

# Review-level score: mean of the pros and cons scores, ignoring whichever side is missing.
for measure in ['prob', 'prop']:
    survey_text_df['glassdoor_finetuned_'+measure] = survey_text_df.apply(
        lambda row, m=measure: average_with_nan(row['pros_finetuned_'+m], row['cons_finetuned_'+m]),
        axis=1)

In [13]:
# 'index' is the column left behind by the earlier reset_index(); errors='ignore' keeps this
# cell re-runnable once the column has already been removed.
survey_text_df = survey_text_df.drop(columns='index', errors='ignore')
# Write the survey with the finetuned we-probability columns for downstream analyses.
survey_text_df.to_csv('~/Documents/CompCulture/spacespace/Coco/analyses_data/survey_hr_glassdoor_distilbert.csv')