In [85]:
from gensim.utils import tokenize
import pandas as pd
import numpy as np
import sys
import os
from lexicalrichness import LexicalRichness
import lmppl
# Masked-LM scorer from the lmppl package, instantiated with 'distilbert-base-uncased'.
# NOTE(review): `scorer` is not referenced in any of the visible cells (perplexity below
# is computed with GPT-2 via ppl()) — confirm whether this is still needed.
scorer = lmppl.MaskedLM('distilbert-base-uncased')


In [86]:
# Shared size constant: passed to LexicalRichness.mattr() and .hdd() by the helpers below,
# and also used as the minimum token count a response must have to be kept.
window_size = 5
def mtld(text):
    """Return the MTLD score of ``text``.

    Wraps ``LexicalRichness(text).mtld()`` and calls it without arguments, so
    whatever defaults the library defines are used.
    """
    return LexicalRichness(text).mtld()

def mattr(text):
    """Return the MATTR score of ``text``.

    The window length is read from the module-level ``window_size`` at call
    time and handed to ``LexicalRichness.mattr`` as its only argument.
    """
    richness = LexicalRichness(text)
    score = richness.mattr(window_size)
    return score

def hdd(text, draws=None):
    """Return the HD-D score of ``text`` as computed by LexicalRichness.

    Parameters
    ----------
    text : str
        The text to score.
    draws : int, optional
        Value passed to ``LexicalRichness.hdd``. When omitted, the module-level
        ``window_size`` is used, which is what this helper did before the
        parameter existed, so existing callers get unchanged results.

    Returns
    -------
    Whatever ``LexicalRichness.hdd`` returns for the given arguments.

    NOTE(review): ``window_size`` is also the MATTR window; reusing it as the
    HD-D sample size looks incidental rather than deliberate. The library's own
    default for ``draws`` is believed to be 42 — confirm which value the
    analysis intends and pass it explicitly if it differs.
    """
    if draws is None:
        # Backward-compatible default: keep the historical coupling to window_size.
        draws = window_size
    return LexicalRichness(text).hdd(draws)



In [87]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch

# Language model and matching tokenizer that ppl() reads as module-level globals.
# Both are loaded from the same "gpt2" identifier so their vocabularies agree.
model_id = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
def ppl(text, stride=512):
    """Return the perplexity of ``text`` under the module-level ``model``.

    The text is tokenized with the module-level ``tokenizer`` and scored with a
    sliding window: each window holds at most ``model.config.n_positions``
    tokens, consecutive windows start ``stride`` tokens apart, and tokens that
    an earlier window already scored are masked out (label ``-100``) so they
    only serve as context.

    Parameters
    ----------
    text : str
        Text to score.
    stride : int, optional
        Distance in tokens between window starts (default 512, the value that
        used to be hard-coded). Values larger than the model's context length
        are clamped to it so that no token is skipped.

    Returns
    -------
    float
        ``exp`` of the average negative log-likelihood per scored token, or
        ``nan`` when there is nothing to score (fewer than two tokens).

    Raises
    ------
    ValueError
        If ``stride`` is not positive.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    max_length = model.config.n_positions
    # A stride longer than the context window would leave a gap of unscored tokens.
    stride = min(stride, max_length)
    encodings = tokenizer(text, return_tensors="pt")
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0  # summed negative log-likelihood over every scored token
    n_scored = 0   # number of tokens that contributed to nll_sum
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        # Everything before the last trg_len positions was scored by an earlier window.
        target_ids[:, :-trg_len] = -100

        # The model shifts labels left by one internally, so the label at position 0
        # never contributes; count the labels that actually enter the loss.
        window_scored = int((target_ids[:, 1:] != -100).sum())
        if window_scored > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # outputs.loss is the mean over this window's scored labels; turn it back
            # into a sum so windows of different sizes are weighted correctly.
            nll_sum += outputs.loss.item() * window_scored
            n_scored += window_scored

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored == 0:
        # Empty or single-token text: no next-token prediction exists to evaluate.
        return float("nan")

    perplexity = torch.exp(torch.tensor(nll_sum / n_scored))
    return perplexity.item()

In [88]:
# Survey responses: read the cleaned export and discard its 'Unnamed: 0' column in one step.
# NOTE(review): both paths are absolute and user-specific; the notebook will not run
# elsewhere without editing them.
df = pd.read_csv(
    "/Users/Lara/Dropbox/LanguageGender/Data/Advocates/Cleaned/cleaned_Advocates_Full_Apr13.csv"
).drop(columns=['Unnamed: 0'])
# Hand-coded sheet that carries the relevance coding used in the cells below.
coded = pd.read_csv(
    "~/Documents/Stanford/Research/Culture_Survey/Data/Coded_Apr13_Devon_Apr21.csv"
)

In [89]:
# Give the long coding column the short name 'relevance'.
# NOTE(review): the key presumably has to match the CSV header verbatim, including its
# 'accountabiliy' spelling — do not "fix" the typo here without checking the file.
coded = coded.rename(columns={'coding (1 = relevant to accountabiliy, 0 = irrelevant to accountability)':'relevance'})

In [90]:
# Attach the relevance code to each response by ResponseId. No `how` is given, so the
# pandas default join type applies.
df = df.merge(coded.loc[:, ['ResponseId', 'relevance']], on='ResponseId')

In [91]:
# Token count of every response, using gensim's tokenize().
df['text_length'] = df['text'].apply(lambda s: sum(1 for _ in tokenize(s)))
# Keep responses coded as relevant (relevance == 1) that have at least window_size tokens.
# NOTE(review): the length here comes from gensim's tokenizer, while the lexical measures
# tokenize inside LexicalRichness — confirm the two counts agree near the threshold.
is_relevant = df['relevance'] == 1
is_long_enough = df['text_length'] >= window_size
df = df.loc[is_relevant & is_long_enough]

In [92]:
# Display the filtered responses ordered by token count. The sorted frame is not
# assigned, so df itself keeps its current row order.
df.sort_values('text_length')

Unnamed: 0,ResponseId,Intro.Question.1,Intro.Question.2,Education,Age,Gender,Gender_3_TEXT,Attention.Check,Ethnicity,Ethnicity_6_TEXT,...,NeedChange,Prior.Experience,Advocate,Implementation,Outcome,Comments,text,treatment,relevance,text_length
733,R_0ian77UBWVwno89,accountability means taking responsibility for...,apologizing without excuses,Some college,27,Female,,Purple,"Latino or Latina,White or Caucasian",,...,Moderately,No,,,,no,take pride in being accountable,Treatment 2,1,5
190,R_x2Qd2x0JxjvZDmF,Accountability means that the customer can hol...,An example could be missing a deadline on a pr...,Some college,30,Male,,Purple,White or Caucasian,,...,Not at all,Yes,Moderately,Extremely\nProfessional,Extremely\nPositive,,Get your credit! Be accountable!,Treatment 2,1,5
361,R_1psago8ffhvqEU9,it means to keep your word as your honor and t...,when you accidentally delete a bunch of softwa...,Bachelor degree,27,Female,,Purple,Asian or Asian American,,...,Slightly,No,,,,,always be responsible and accountable,Treatment 2,1,5
106,R_ZjIHQjKWLdx0rst,Having to answer for the choices you make.,Someone sends an email that causes confusion a...,Graduate degree,41,Male,,Purple,White or Caucasian,,...,Moderately,Yes,Not at all,Neutral,Neutral,,Accountability is everyone's responsibility.,Treatment 2,1,5
694,R_31ynQWpZKubrO2S,taking responsibility for ones actions,admitting a mistake that you made,Some college,47,Female,,Purple,White or Caucasian,,...,Slightly,Yes,Extremely,Extremely\nProfessional,Extremely\nPositive,no,Accountability goes a long way,Treatment 2,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,R_1joH3CG8BGMdWdC,Accountability is being responsible for your o...,If somebody does poorly on a high school test ...,"High school degree of equivalent (e.g., GED)",46,Male,,Purple,White or Caucasian,,...,Not at all,No,,,,"Farm and Ranch labor, I selected the option th...",If you work hard and do good things as far as ...,Treatment 3,1,117
567,R_2pYmw1Ge1TjUfIV,taking responsibility for one's own actions,Let's say an employee has been assigned a proj...,Some college,44,Female,,Purple,Asian or Asian American,,...,Slightly,Yes,Somewhat,Somewhat Unprofessional,Somewhat\nPositive,no,"Sure, as an informal advocate for the adoption...",Treatment 1,1,126
378,R_2fq6H6azf0ACzex,Accountability means admitting when you're in ...,Admitting that you were wrong after fighting w...,Some college,31,Male,,Purple,White or Caucasian,,...,Slightly,Yes,Extremely,Somewhat Professional,Extremely\nPositive,No,AT Metrics is introducing a new core value to ...,Control,1,159
530,R_3MAnlveCqhco3o4,Accountability refers to the willingness and a...,One example would be an employee taking owners...,Bachelor degree,43,Male,,Purple,Black of African American,,...,Moderately,No,,,,,"Hello colleagues, I would like to talk to you ...",Treatment 3,1,161


In [93]:
# Add one column per lexical-diversity helper, named after the helper itself
# ('mtld', 'mattr', 'hdd'), each computed row by row from the response text.
for measure in (mtld, mattr, hdd):
    df[measure.__name__] = df['text'].apply(measure)

In [94]:
# Score every response with ppl(), one call (and at least one model forward pass) per row.
df['perplexity'] = df['text'].apply(ppl)

In [95]:
# Persist the filtered responses together with the added measure columns.
# index=False keeps the DataFrame index out of the file.
# NOTE(review): absolute, user-specific output path.
df.to_csv("/Users/Lara/Dropbox/LanguageGender/Data/Advocates/Cleaned/responses_linguistic_measures.csv", index=False)