In [1]:
# Pronoun lexicons for first-person-singular, first-person-plural and
# third-person-plural counts. Tokens are matched against these lists verbatim;
# the token columns are lowercased before counting, so entries are lowercase.
i_words = ['i', 'me', 'my', 'mine', 'myself']
we_words = ['we', 'us', 'our', 'ours', 'ourselves']
# 'themselves' is the standard reflexive form and was missing from this list;
# the nonstandard 'theirselves' is kept so tokens matched before still count.
they_words = ['they', 'them', 'their', 'theirs', 'themselves', 'theirselves']
def count_we_i(df):
    """Return log(#we-words / #i-words) over the pros and cons tokens.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Must provide iterables of tokens under 'pros_toks' and 'cons_toks'.

    Returns
    -------
    float
        Natural log of the we/i count ratio, or NaN when either count is zero.
    """
    n_we = 0
    n_i = 0
    for toks_col in ('pros_toks', 'cons_toks'):
        for tok in df[toks_col]:
            if tok in we_words:
                n_we += 1
            if tok in i_words:
                n_i += 1

    # The ratio is only defined when both pronoun groups occur at least once.
    if n_i > 0 and n_we > 0:
        return np.log(n_we / n_i)
    return np.nan

def count_we_they(df):
    """Return log(#we-words / #they-words) over the pros and cons tokens.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Must provide iterables of tokens under 'pros_toks' and 'cons_toks'.

    Returns
    -------
    float
        Natural log of the we/they count ratio, or NaN when either count is
        zero.
    """
    n_we = 0
    n_they = 0
    for toks_col in ('pros_toks', 'cons_toks'):
        for tok in df[toks_col]:
            if tok in we_words:
                n_we += 1
            if tok in they_words:
                n_they += 1

    # The ratio is only defined when both pronoun groups occur at least once.
    if n_we > 0 and n_they > 0:
        return np.log(n_we / n_they)
    return np.nan

def count(df, words, colname):
    """Count how many tokens in ``df[colname]`` are members of ``words``.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Object indexable by ``colname`` that yields an iterable of tokens.
    words : container
        Collection the tokens are tested against with ``in``.
    colname : hashable
        Key of the token iterable inside ``df``.

    Returns
    -------
    int
        Number of matching tokens (repeats are counted each time).
    """
    hits = 0
    for token in df[colname]:
        if token in words:
            hits += 1
    return hits

In [5]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
import contractions
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import WordNetLemmatizer 
from textblob import TextBlob
import pandas as pd
import numpy as np
import re
# Tokenizer instance reused for every free-text column.
tokenizer = TweetTokenizer()

# Load the preprocessed survey table.
survey_hr_df = pd.read_csv('~/Documents/CompCulture/spacespace/Coco/analyses_data/preprocessed_survey_hr.csv')

# Keep only rows where all three free-text answers are present, and make sure
# those answers are stored as strings.
free_text_columns = ['pros', 'cons', 'story']
survey_text_df = (
    survey_hr_df
    .dropna(subset=free_text_columns)
    .astype({column: 'str' for column in free_text_columns})
)

# Flip these two items (new = 4 - old) before averaging -- presumably they are
# reverse-keyed relative to the other items; confirm against the survey codebook.
for flipped_item in ('disengagement_3', 'exhaustion_2'):
    survey_text_df[flipped_item] = 4 - survey_text_df[flipped_item]

# Row-wise means of the three items per subscale, then the mean of the two
# subscales as the overall burnout score.
disengagement_items = ['disengagement_{}'.format(k) for k in range(1, 4)]
exhaustion_items = ['exhaustion_{}'.format(k) for k in range(1, 4)]
survey_text_df['disengagement'] = survey_text_df[disengagement_items].mean(axis=1)
survey_text_df['exhaustion'] = survey_text_df[exhaustion_items].mean(axis=1)
survey_text_df['burnout'] = survey_text_df[['disengagement', 'exhaustion']].mean(axis=1)

# Pool the listed race labels into a single 'Other' category; every other
# value is passed through unchanged.
pooled_race_labels = ['Black or African American', 'Missing', 'Native Hawaiian or Other Pacific Islander']
survey_text_df['race'] = survey_text_df['Race'].apply(
    lambda label: 'Other' if label in pooled_race_labels else label
)

# Drop identifier/location columns, the original 'Race' column, and the
# item-level columns that are no longer needed after aggregation.
columns_to_drop = ['ResponseId', 'LocationLatitude', 'LocationLongitude', 'LINK', 'Race']
columns_to_drop += ['mael_{}'.format(k) for k in range(1, 7)]
columns_to_drop += disengagement_items
columns_to_drop += exhaustion_items
survey_text_df = survey_text_df.drop(columns=columns_to_drop)

# Extra stop words on top of spaCy's list; edited to be consistent with the
# stop words used when processing the glassdoor reviews.
custom_stop_words = ['people', 'collabera', 'employee', 'employees', 'collabera\'s', 'work', 'working', 'company', 'great', 'good', 'lot', 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa']

# Final stop-word set = spaCy defaults plus the custom additions.
stop_words = set(STOP_WORDS).union(custom_stop_words)

lemmatizer = WordNetLemmatizer()

# string.punctuation does not cover these tokens and no better ready-made list
# exists, so they are appended by hand.
extra_punctuation = '–...…’“”'
punctuation = string.punctuation + extra_punctuation
# A response is flagged as low quality (1) when any two of the three free-text
# answers are identical; otherwise the flag is 0.
same_pros_cons = survey_text_df['pros'] == survey_text_df['cons']
same_cons_story = survey_text_df['cons'] == survey_text_df['story']
same_pros_story = survey_text_df['pros'] == survey_text_df['story']
survey_text_df['low_quality_response'] = np.where(
    same_pros_cons | same_cons_story | same_pros_story, 1, 0
)

def _tokenize_response(text):
    """Expand contractions, lowercase, turn periods into spaces, then tokenize."""
    expanded = contractions.fix(text)
    normalized = expanded.lower().replace('.', ' ')
    return tokenizer.tokenize(normalized)


# Raw token lists for each free-text field; stop words are still present here.
for text_field in ('pros', 'cons', 'story'):
    survey_text_df[text_field + '_toks'] = survey_text_df[text_field].apply(_tokenize_response)

# Per-row pronoun counts for every (text field, pronoun group) pair, written to
# columns named '<field>_<group>' (e.g. 'pros_we'). The group order within each
# field is we, i, they.
pronoun_groups = (('we', we_words), ('i', i_words), ('they', they_words))
for text_field in ('pros', 'cons', 'story'):
    toks_column = text_field + '_toks'
    for group_name, group_words in pronoun_groups:
        survey_text_df[text_field + '_' + group_name] = survey_text_df.apply(
            count, args=(group_words, toks_column), axis=1
        )

# Pattern for bullet points and numbers. It is applied with re.match, which
# anchors only at the start, so any token that begins with a digit is dropped.
re_number = r"[0-9]+(\.)?"


def _keep_token(tok):
    """Return True if the token is not punctuation, a stop word, or number-like."""
    # `punctuation` is a string, so this is a substring test against it.
    if tok in punctuation or tok in stop_words:
        return False
    return re.match(re_number, tok) is None


def _clean_tokens(toks):
    """Lemmatize the tokens that pass `_keep_token`, preserving their order."""
    return [lemmatizer.lemmatize(tok) for tok in toks if _keep_token(tok)]


# The token columns are filtered only now, as a second pass, so that the
# pronoun counts could be taken from the unfiltered tokens first.
for toks_column in ('pros_toks', 'cons_toks', 'story_toks'):
    survey_text_df[toks_column] = survey_text_df[toks_column].apply(_clean_tokens)

# Caveat: the cleaned columns no longer contain stop words, so any algorithm
# that needs them (e.g. glove or sentence embeddings) should not use the
# '*_cleaned' / '*_toks' columns produced here.
text_fields = ('pros', 'cons', 'story')

# Space-joined version of each cleaned token list.
for text_field in text_fields:
    survey_text_df[text_field + '_cleaned'] = survey_text_df[text_field + '_toks'].apply(' '.join)

# Number of cleaned tokens per answer.
for text_field in text_fields:
    survey_text_df[text_field + '_toks_len'] = survey_text_df[text_field + '_toks'].apply(len)


def _concat_pros_cons(row):
    """Concatenate a row's pros tokens followed by its cons tokens."""
    return row['pros_toks'] + row['cons_toks']


# 'glassdoor' columns combine pros and cons into one document per row.
survey_text_df['glassdoor_toks'] = survey_text_df.apply(_concat_pros_cons, axis=1)
survey_text_df['glassdoor_cleaned'] = survey_text_df['glassdoor_toks'].apply(' '.join)
survey_text_df['glassdoor_toks_len'] = survey_text_df['glassdoor_toks'].apply(len)

def _flag_low_quality(row, length_column, min_tokens=3):
    """Return 1 for answers shorter than `min_tokens`, else the row-level flag."""
    if row[length_column] < min_tokens:
        return 1
    return row['low_quality_response']


# Per-field quality flag: an answer with fewer than 3 cleaned tokens is low
# quality on its own; otherwise it inherits the response-level flag.
for text_field in ('pros', 'cons', 'story'):
    survey_text_df['low_quality_' + text_field] = survey_text_df.apply(
        _flag_low_quality, args=(text_field + '_toks_len',), axis=1
    )

In [6]:
# Write the processed table to CSV without the index column.
topic_modeling_csv = '~/Documents/CompCulture/spacespace/Coco/analyses_data/survey_hr_topic_modeling.csv'
survey_text_df.to_csv(topic_modeling_csv, index=False)