In [6]:
import pandas as pd
import numpy as np
from nrclex import NRCLex
import nltk

# Fetch the NLTK 'punkt' package at cell-execution time (the recorded output
# shows tokenizers/punkt.zip being unzipped into the user's nltk_data dir).
# Presumably required by NRCLex's tokenisation - TODO confirm.
nltk.download('punkt')

# Numeric survey codes -> phrases spliced into the biography sentence.
# Any education code not listed falls back to _EDUCATION_FALLBACK; any
# ethnicity code not listed contributes an empty string.
_EDUCATION_PHRASES = {
    1: "with less than a high school diploma",
    2: "with a high school diploma",
    3: "went to a technical/vocational school",
    4: "went to college",
    5: "with a two year associate degree",
    6: "with a four year bachelor's degree",
}
# NOTE(review): "postgradute" is misspelled; the text is kept byte-identical
# because any prompt files already generated would contain it - confirm
# before correcting.
_EDUCATION_FALLBACK = "postgradute or with a professional degree"

# The leading space is intentional: the phrase is glued directly after
# "years old" in the biography template.
_ETHNICITY_PHRASES = {
    1: " white",
    2: " hispanic or latino",
    3: " black or african american",
    4: " native american or american indian",
    5: " asian/pacific islander",
}


def _intensity_label(score):
    """Bucket a numeric score: below 3 is "low", below 5 is "medium", else "high"."""
    if score < 3:
        return "low"
    if score < 5:
        return "medium"
    return "high"


def generate_prompt(essay, gender, education, ethnicity, age, income, empathy, distress):
    """Build the three textual prompts describing an essay and its author.

    Args:
        essay: Essay text, passed to ``NRCLex`` to extract its top emotions.
        gender: Gender code; ``1`` is rendered as "male", anything else as
            "female".
        education: Education code ``1``-``6``; any other value is rendered as
            the postgraduate/professional-degree phrase.
        ethnicity: Ethnicity code ``1``-``5``; any other value adds nothing
            to the sentence.
        age: Author age, inserted verbatim.
        income: Author income, inserted verbatim.
        empathy: Empathy score, or ``None``/NaN when unavailable.
        distress: Distress score, or ``None``/NaN when unavailable.

    Returns:
        A ``(text_prompt_bio, text_prompt_emp, text_prompt_emo)`` tuple of
        strings. ``text_prompt_emp`` is ``""`` when either score is missing.
    """
    gender_str = "male" if gender == 1 else "female"
    education_str = _EDUCATION_PHRASES.get(education, _EDUCATION_FALLBACK)
    ethnicity_str = _ETHNICITY_PHRASES.get(ethnicity, "")

    text_prompt_bio = "An essay written by a {} years old{} {}, {}, with an income of {}$.".format(
        age, ethnicity_str, gender_str, education_str, income
    )

    # Always bind the empathy prompt: unlabeled rows previously reached the
    # return statement with the name undefined (UnboundLocalError). NaN is
    # treated as missing too, since NaN fails both "<" tests and would
    # otherwise be reported as "high".
    text_prompt_emp = ""
    if not (pd.isna(empathy) or pd.isna(distress)):
        text_prompt_emp = "The essay expresses {} empathy and {} distress levels.".format(
            _intensity_label(empathy),
            _intensity_label(distress)
        )

    emotions = NRCLex(essay).top_emotions
    if sum(emo[1] for emo in emotions) == 0:
        # Keep the (name, score) pair shape of top_emotions. A dict here
        # would be iterated by key, and emo[0] would yield "n", not "neutral".
        emotions = [("neutral", 1)]
    emo_string = ", ".join(emo[0] for emo in emotions)
    text_prompt_emo = " The top emotions expressed in the essay are: {}.".format(emo_string)

    return text_prompt_bio, text_prompt_emp, text_prompt_emo

def add_prompt(dataframe):
    """Attach the three generated prompt columns to ``dataframe``.

    For every row, ``generate_prompt`` is called with the ``essay``,
    ``gender``, ``education``, ``race``, ``age`` and ``income`` values, plus
    ``empathy`` / ``distress`` when those columns exist (``None`` otherwise).
    The results are stored in the ``prompt_bio``, ``prompt_emp`` and
    ``prompt_emo`` columns.

    Args:
        dataframe: Frame holding the columns listed above. It is modified in
            place.

    Returns:
        The same ``dataframe`` object, with the three prompt columns added.
    """
    # Create the columns up front so an empty frame still gains them.
    dataframe["prompt_bio"] = ""
    dataframe["prompt_emp"] = ""
    dataframe["prompt_emo"] = ""

    # Each label column is checked on its own: the previous code guarded
    # row['distress'] with the presence of 'empathy', which raised KeyError
    # when only one of the two columns existed.
    has_empathy = 'empathy' in dataframe.columns
    has_distress = 'distress' in dataframe.columns

    prompts = [
        generate_prompt(
            row['essay'],
            row['gender'],
            row['education'],
            row['race'],
            row['age'],
            row['income'],
            row['empathy'] if has_empathy else None,
            row['distress'] if has_distress else None
        )
        for _, row in dataframe.iterrows()
    ]

    if prompts:
        # Assign whole columns positionally; per-row .at[idx, ...] writes are
        # label-based and clobber each other when index labels repeat.
        bio_col, emp_col, emo_col = zip(*prompts)
        dataframe["prompt_bio"] = list(bio_col)
        dataframe["prompt_emp"] = list(emp_col)
        dataframe["prompt_emo"] = list(emo_col)
    return dataframe

# Two-digit tag interpolated into every dataset file name below
# (presumably the shared-task edition - TODO confirm).
year = 23

# Tab-separated input files, relative to the notebook's working directory.
TRAIN = f"datasets/WASSA{year}_essay_level_original_internal_train_preproc.tsv"
VAL = f"datasets/WASSA{year}_essay_level_original_internal_val_preproc.tsv"
DEV = f"datasets/WASSA{year}_essay_level_dev_preproc.tsv"

# Read the three splits in the same order as their path constants.
train_df, val_df, dev_df = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (TRAIN, VAL, DEV)
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/irenetesta/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
def _prompt_and_save(frame, out_path):
    """Run add_prompt on ``frame``, write the result to ``out_path`` as TSV, and return it."""
    prompted = add_prompt(frame)
    prompted.to_csv(out_path, index=False, sep='\t')
    return prompted


# Each split is augmented and written before the next one is touched.
new_train = _prompt_and_save(
    train_df,
    f"datasets/WASSA{year}_essay_level_original_internal_train_preproc_prompt.tsv",
)
new_val = _prompt_and_save(
    val_df,
    f"datasets/WASSA{year}_essay_level_original_internal_val_preproc_prompt.tsv",
)
new_dev = _prompt_and_save(
    dev_df,
    f"datasets/WASSA{year}_essay_level_dev_preproc_prompt.tsv",
)



In [1]:
# Scratch check: print the number of entries in a hand-pasted token list
# (recorded output: 72). The '##ton' entry looks like a subword piece from a
# WordPiece-style tokenizer - TODO confirm where the list came from.
print(len(['Reading', 'about', 'the', 'attack', 'on', 'Paris', 'that', 'happened', 'years', 'ago', 'brought', 'up', 'a', 'lot', 'of', 'bad', 'feelings', 'and', 'thoughts', '.', 'I', 'had', 'completely', 'forgot', '##ton', 'about', 'it', 'because', 'of', 'how', 'often', 'things', 'like', 'that', 'happen', '.', 'It', 'makes', 'me', 'upset', 'to', 'think', 'that', 'we', 'are', 'becoming', 'numb', 'to', 'terror', 'attacks', 'in', 'a', 'way', '.', 'i', 'can', 'only', 'hope', 'that', 'the', 'people', 'affected', 'have', 'found', 'peace', 'and', 'eventually', 'we', 'all', 'will', 'find', 'peace']))

72


In [2]:
# Scratch: sample essay text bound to `s`.
# NOTE(review): `s` is reassigned by a later cell before anything in the
# visible notebook reads this value, so this assignment has no visible effect.
s = 'This was sad and I hope they can do something about it.  I hate that the wildlife is suffering since it is so innocent.  Maybe the government should make more efforts to fix this situation or at least improve it.  I get that it takes resources to do things but maybe they could find some small way to make a difference.  It seems like the ecosystem is in serious danger if nothing is done so someone needs to act fast for the sake of the animals who are helpless victims of this situation.'

In [8]:
# Scratch: sample essay text bound to `s`; the following cell counts its
# whitespace-separated words.
s = "There was this crazy story in the news the other day. Two guys were shot in Pennsylvania. One was shot twice in the stomach and the other was shot in the back. There were no arrests or anything and there was not stated motive. I wonder if it was a random thing or if they were involved with drugs or something? I guess we'll never know..."

In [9]:
# Number of whitespace-separated words in `s` (recorded output: 66).
len(s.split())

66

In [10]:
# Scratch check: count the entries of a hand-pasted list of numeric values
# (recorded output: 66, the same as the word count of `s` in the previous
# cell). Presumably one value per word of that essay - TODO confirm the
# source of these numbers.
len([
   4,
   4,
   3.05373436517482,
   2.42988039575994,
   3.76593087701793,
   2.59070569509,
   2.65099708126283,
   3.20764554684173,
   2.65099708126283,
   1.55666500530337,
   3.00144740042587,
   4,
   1.67445183677988,
   3.86634088988836,
   3.75049209572393,
   2.59070569509,
   4,
   4,
   4,
   3.75049209572393,
   3.21761047844568,
   2.59070569509,
   2.65099708126283,
   4.41641556729266,
   4.33489631791226,
   2.65099708126283,
   1.55666500530337,
   4,
   3.75049209572393,
   2.59070569509,
   2.65099708126283,
   3.59280108512106,
   4,
   3.86634088988836,
   1.8611702712882,
   4.01371924304526,
   3.02533050744174,
   1.18825635366757,
   4.33489631791226,
   2.43931093628622,
   4,
   1.68941456950517,
   2.79064480038931,
   3.6521765490467,
   4,
   3.08356133567987,
   1.85268951336544,
   3.19037090118426,
   4,
   2.69468883289707,
   1.94668636581287,
   1.9245234038762,
   3.02533050744174,
   1.85268951336544,
   3.5475058347319,
   3.86634088988836,
   2.59824727666639,
   3.41062457369878,
   3.03678746418871,
   3.02533050744174,
   2.29580716028414,
   4,
   1.17586902799098,
   4.79368933807434,
   1.78987218211962,
   3.63079591065561
  ])

66