In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Usage:
    3b_output_alternative_measuress.py test|actual
If a Negative Dimension Error occurs, check to see if there are empty embeddings
Two types of results are currently computed
1) Projections
2) Raw count frequencies
"""
import os
import sys
import multiprocessing
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import csv
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.matutils import cossim, any2sparse
from utils import *
import re
import random
from statistics import mean 
import ujson as json
from sklearn.decomposition import PCA

embedding_dim = 50
mincount = 150
home_dir = "/ifs/projects/amirgo-identification/"
email_dir = os.path.join(home_dir, "email_data/")
mittens_dir = os.path.join(home_dir, "mittens")
utils_dir = os.path.join(mittens_dir, "utils")
embeddings_dir = os.path.join(mittens_dir, "embeddings_{}d_mincount{}".format(embedding_dim, mincount))
email_file = os.path.join(email_dir, 'MessagesHashed.jsonl')
users_file = os.path.join(email_dir, 'Users.json')
activity_file = os.path.join(email_dir, 'Activities.json')
survey_dir = os.path.join(home_dir, "survey_hr_data")
user_qualtrics_file = os.path.join(survey_dir, "UsersQualtrics.csv")
perf_percentage = os.path.join(survey_dir, "perf_rating_percentages.csv")
perf_likert = os.path.join(survey_dir, "perf_rating_likert.csv")

analyses_data_dir = "/ifs/gsb/amirgo/spacespace/spacespace/Coco/analyses_data/"
survey_filename = os.path.join(analyses_data_dir, "preprocessed_survey_hr.csv")
company_embeddings_filename = "/ifs/gsb/amirgo/spacespace/spacespace/Coco/Embed/GloVe-master/vectors_{}d.txt".format(embedding_dim)

tmp_dir = os.path.join(mittens_dir, "tmp")
output_dir = os.path.join(home_dir, "email_idtf_data")


In [5]:
year_colname, quarter_colname = 'year', 'quarter'
hash2word = {
    '09f83385': 'mine', '20019fa4': 'i', '20b60145': 'us', '28969cb1': 'them', '3828d3d2': 'me', '4dd6d391': 'their', '5b4e27db': 'my',
    '64a505fc': 'ourselves', '6935bb23': 'ours', '6f75419e': 'myself', '86df0c8d': 'themselves', 'a7383e72': 'we', 'a9193217': 'theirs', 'b72a9dd7': 'our', 'fd0ccf1c': 'they', 
    'ce696289': 'home', 'b95eb14b': 'attached', '267430a0': 'good', '294fa7d1': 'collabera', '974811d0': 'pay', 'edbf568e': 'work', 'b71be0e8': 'team', '4c088971': 'great',
    'c74560f9': 'best', 'f18e6868': 'different', '1f4d7738': 'group', '255ddfcd': 'glad', 'aa829423': 'included', '17e1378b': 'money', '454ea538': 'salary', '311b8ad0': 'community',
    '3b75b927': 'happy', '9324aa22': 'organized', '63b8b7ea': 'bad', '643ce56f': 'responsive', 'f4732b84': 'enthusiastic', '2e32c475': 'competitive', 'b9625ccf': 'family',
    '900c73ff': 'unresponsive', 'cfe1bd08': 'income', '223deabb': 'worst', 'fa81b32a': 'pride', '1455e3bd': 'passionate', '9582e03b': 'awful', 'd9f0fe6c': 'promotion',
    'c40b5da1': 'excluded', 'cf9cb85a': 'ambitious', 'a0cb3a2b': 'sad', '8a4e04bd': 'honor', 'cafaa726': 'belong', '24cb6fe3': 'shame', 'b92208fc': 'disciplined', '68e0c9c9': 'undisciplined',
    '81bcf2f5': 'receptive', '8ca67680': 'disorganized', 'd22e4710': 'bitter', 'bf4db4c4': 'unenthusiastic', '8602bd25': 'dignity', '822f792d': 'detached', 'a7ca40f1': 'humiliation',
    '7911da73': 'noncompetitive', '627fcac3': 'dishonor', '84cadff4': 'unreceptive', '07ca39d6': 'lazy', '95a160e0': 'indifferent', '10a4d7ee': 'apathetic'}
word2hash = {v:k for k, v in hash2word.items()}
pronouns = ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves']
single_pronouns = ['i', 'we']
i_index, we_index = 0, 5
hash_pronouns = [word2hash[p] for p in pronouns]
hash_single_pronouns = [word2hash[p] for p in single_pronouns]

In [2]:
def build_dimension(words_start, words_end):
    """
    This method builds a dimension defined by words at separate end of a dimension.
    Multiple methods exist in previous literature when building such a dimension.
    1) Kozlowski et al. (2019) averages across differences between different word pairs, noted to be interchangeable with averaging words on each side of the dimension and
    then taking the difference between averages. They are empirically verified to be identical.
    2) Bolukbasi et al. (2016) defines gender direction using a simple difference between man and woman in the corresponding tutorial. In the same tutorial, 
    racial direction is defined as difference between two clusters of words that are each sum of the embeddings of its corresponding dimensions
    normalized by the L2 norm. Wang et al. (2020) note that normalization is unnecessary. If unnormalized, this method should be equivalent to #3.
    3) Bolukbasi et al. (2016) defines gender direction also by taking the differences across multiple pairs, doing PCA on these differences, and 
    taking the first component as the gender direction.
    Parameter
    ---------
    words_start : list
        List of hashed words at the positive end of the dimension, where positive implies more likely to affect identification positively
    words_end: list
        List of hashed words at the other end of dimension
    Returns
    -------
    (mean_dim, pca_dimension) : 2-tuple of numpy vector
        Two vector that represents the dimension of interest calculated using method #1 and #3.
    """
    assert len(words_start) == len(words_end)
    differences = [(np.array(words_start[i]) - np.array(words_end[i])) for i in range(len(words_start)) if not np.isnan(words_start[i]).any() and not np.isnan(words_end[i]).any()]
    mean_dim = np.array(differences).mean(axis=0)
    pca_dim = doPCA(words_start, words_end)
    if project(words_start[0], pca_dim) < 0:
        # convention used in the current script is that words_start should represent the positive dimension
        pca_dim = pca_dim * -1
    return (mean_dim, pca_dim)


def build_all_dimensions():
    """
    Returns a dictionary that matches dimension name to a 2-tuple of dimensions, where eacb dimension is represented using a numpy vector.
    """
    name2hashes = {'family_dim': ([word2hash[word] for word in ['family', 'home', 'community', 'team']],
        [word2hash[word] for word in ['money', 'pay', 'salary', 'income']]),
        'valence_dim': ([word2hash[word] for word in ["good", "great", "best"]],
            [word2hash[word] for word in ["bad", "awful", "worst"]]),
        'belonging_dim': ([word2hash[word] for word in ['included', 'attached']],
            [word2hash[word] for word in ['excluded', 'detached']]),
        'pride_dim': ([word2hash[word] for word in ["pride", "dignity", "honor"]],
            [word2hash[word] for word in ["shame", "humiliation", "dishonor"]]),
        'passionate_dim': ([word2hash[word] for word in ["passionate"]],
            [word2hash[word] for word in ["indifferent"]]),
        'competitive_dim': ([word2hash[word] for word in ["competitive"]], # noncompetitive is not included in GloVe, thus this word-pair is restricted to one word
            [word2hash[word] for word in ["lazy"]]),
        'responsive_dim': ([word2hash[word] for word in ["responsive"]],
            [word2hash[word] for word in ["unresponsive"]]),
        'disciplined_dim': ([word2hash[word] for word in ["disciplined"]],
            [word2hash[word] for word in ["undisciplined"]]),
        'we_dim': (hash_pronouns[we_index:], hash_pronouns[i_index:we_index])}
    dims = {k : build_dimension([company_model[h] for h in hashes[0]], [company_model[h] for h in hashes[1]]) for k, hashes in name2hashes.items()}
    return dims


def doPCA(words_start, words_end):
    """
    Performs PCA on differences between pairs of words and returns the first component
    Based on function doPCA in Bolukbasi et al. (2016) source code at https://github.com/tolga-b/debiaswe/blob/master/debiaswe/we.py
    Parameter
    ---------
    words_start : list
        List of hashed words at one end of interested dimension
    words_end: list
        List of hashed words at the other end of dimension
    Returns
    -------
    ndarray
        First component of PCA of differences between pairs of words
    """
    matrix = []
    for i in range(len(words_start)):
        center = (words_start[i] + words_end[i])/2
        matrix.append(words_end[i] - center)
        matrix.append(words_start[i] - center)
    matrix = np.array(matrix)
    # cannot have more components than the number of samples
    num_components = len(words_start)*2
    pca = PCA(n_components = num_components)
    pca.fit(matrix)
    return pca.components_[0]


In [3]:
sys.stderr.write("Building company model at %s.\n" % datetime.now())    
tmp_mittens = os.path.join(tmp_dir, "mittens_embeddings_all_word2vec.txt")
word2vec_mittens_file = get_tmpfile(tmp_mittens)
glove2word2vec(company_embeddings_filename, word2vec_mittens_file)
company_model = KeyedVectors.load_word2vec_format(word2vec_mittens_file)


Building company model at 2021-01-24 12:34:37.995404.


In [6]:
name2dims = build_all_dimensions()

In [75]:
vectors = extract_company_embedding(company_embeddings_filename, tmp_dir, name2hashes['belonging_dim'][0])
projs = [mean([project(v, dims[d][i]) for v in vectors]) for d in ['family_dim', 'belonging_dim'] for i in range(2)]

In [35]:
project(company_model[word2hash['belong']], name2dims['we_dim'][1])

1.2499301

In [96]:
model.wv['81bcf2f5']

  """Entry point for launching an IPython kernel.


array([ 0.22347 , -0.07838 , -0.156123, -0.631888, -1.007747,  0.772272,
       -0.313167,  1.189707,  0.213136,  0.686144, -0.148634,  0.280543,
       -0.905314,  0.104314,  0.530036, -1.37175 , -0.530577,  0.383711,
       -0.367105,  0.852505,  0.038253, -0.347726, -0.373564, -1.106648,
       -1.222034, -1.057687, -0.721559, -0.649887,  0.391958,  0.967776,
        1.256379, -0.173398, -1.677725, -1.276426, -0.438065, -1.254676,
        0.617702, -1.825377,  0.48235 , -0.734699,  0.955322,  1.889708,
       -1.071647, -0.34003 ,  1.383348,  0.571729,  0.295111, -1.366562,
       -0.222192,  1.937998], dtype=float32)

In [103]:
raw = [i for i in range(10)]
raw

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [112]:
tally=0

In [113]:
for i in raw:
    tally=(tally+i)

In [115]:
general_dims = ['i_we', 'family', 'belonging', "pride", "valence"]
post = ['_mean_proj', '_pca_proj']


In [116]:
cols = ['num_i_words', 'num_we_words'] + [d+p for d in general_dims for p in post]

In [126]:
toks = ['311b8ad0']
relevant_dims = ['we_dim', 'family_dim', 'belonging_dim', 'pride_dim', 'valence_dim']
[sum([project(model[t], dims[d][i]) for t in toks if t in model.vocab]) for d in relevant_dims for i in range(2)]

[0.23779483139514923,
 0.1259683072566986,
 2.5891361236572266,
 2.673535108566284,
 1.373583436012268,
 1.3155254125595093,
 2.7780447006225586,
 2.7770185470581055,
 2.6363561153411865,
 2.64739727973938]

In [124]:
toks

[array([ 0.467495, -0.889436, -0.919916, -0.471172,  0.732633, -0.018749,
         1.488797, -0.198742,  0.533597, -0.609984, -1.079914,  0.27499 ,
         0.352249,  0.343783, -1.058495,  0.346726, -0.856571,  0.706777,
         1.158149,  1.287304,  0.392948, -1.009327, -0.24711 ,  0.466427,
         0.381957, -0.69002 ,  0.157154,  0.802895,  0.559304, -0.007793,
        -0.615379, -0.621199, -1.14191 ,  0.529251,  0.982216, -0.463823,
         1.05352 , -1.32042 , -0.891966, -0.955758,  0.281336, -0.308861,
         0.617961, -0.129833, -0.214225, -0.394984, -0.932921,  0.770892,
        -0.175157,  1.052361], dtype=float32),
 array([ 0.804172, -0.161137, -0.113271, -0.102526, -0.832099, -0.463523,
        -0.18788 , -0.214295,  0.853695,  0.06118 ,  0.263742, -0.020479,
         0.175829,  0.13101 , -0.172181, -0.033704,  0.084849,  0.483015,
        -0.119548, -0.056045, -0.739555,  0.149796, -0.179506, -0.048233,
         0.030957,  0.587561, -0.43645 ,  0.136448, -0.287946,  0

In [11]:
import os
import ujson as json

total_emails = 0
non_english = 0
english = 0
ling_thres = 0.9

email_dir = "/ifs/projects/amirgo-identification/email_data/"
out_dir = "/ifs/gsb/amirgo/spacespace/spacespace/Coco/Embed/GloVe-master/"
email_file = os.path.join(email_dir, 'MessagesHashed.jsonl')
activity_file = os.path.join(email_dir, 'Activities.json')

def read_emails(in_file, target_users):
    sid2activity = {}
    with open(activity_file, encoding='utf-8') as f:
        for line in f:
            activity = json.loads(line)
            if activity['UserId'] in target_users:
                sid2activity[activity['MailSummarySid']] = activity
    target_sids = sid2activity.keys()
    
    with open(in_file, encoding='utf-8') as f:            
        for i, line in enumerate(f):
            if i % 1000000 == 0:
                print("Processed {} emails".format(i))
            global total_emails, english, non_english
            total_emails += 1
            email = json.loads(line)
            if email['sid'] in target_sids:
                lang = email['l']
                if len(body) > 0:
                    # original - if lang[0] == "__label__en" and (lang[1] > 0.5 or len(email['liwc']) > 0):
                    if lang[0] == "__label__en" and lang[1] > ling_thres:
                        english += 1
                    elif len(lang[0]) > 0:
                        non_english += 1
    return


In [12]:
target_file = 'target_users.txt'
with open(target_file, "r") as file:
    userids = []
    for line in file:
        userids.append(line.strip()) # removing newline



In [13]:
read_emails(email_file, userids)

Processed 0 emails


NameError: name 'body' is not defined