In [1]:
from transformers import pipeline
from tqdm.auto import tqdm
import numpy as np

def roberta_model(classifier, text):
    """Run ``classifier`` on ``text`` and return its scores in a fixed label order.

    Args:
        classifier: Callable invoked as ``classifier(text)``. Element ``[0]`` of
            its result must be an iterable of dicts with ``'label'`` and
            ``'score'`` keys.
        text: Input passed to ``classifier`` unchanged.

    Returns:
        A 1-D ``numpy`` array with one score per label, ordered as in
        ``label_order`` below.

    Raises:
        KeyError: If any label in ``label_order`` is absent from the result.
    """
    label_order = (
        'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
        'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
        'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
        'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride',
        'realization', 'relief', 'remorse', 'sadness', 'surprise',
    )

    # Only the first element of the classifier result is consumed.
    predictions = classifier(text)[0]

    # Index scores by label so the output order does not depend on the
    # order in which the classifier reports them.
    score_by_label = {}
    for entry in predictions:
        score_by_label[entry['label']] = entry['score']

    return np.asarray([score_by_label[label] for label in label_order])

def fake_model(input):
    """Stand-in model: ignore ``input`` and return a constant list of five scores."""
    placeholder_scores = [0.1, 0.2, 0.3, 0.4, 0.5]
    return placeholder_scores

  from .autonotebook import tqdm as notebook_tqdm
2023-12-13 13:42:55.381075: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-13 13:42:55.931739: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import numpy as np
import pickle 
from functools import partial

def compute_emotions_per_segment(phase, model):
    """Apply ``model`` to every player line of ``phase``, one call per line.

    Args:
        phase: DataFrame with at least ``"Person"`` and ``"Text"`` columns.
            Only rows whose ``"Person"`` equals ``"Player"`` are used.
        model: Callable taking one text value and returning its emotion vector.

    Returns:
        ``np.array`` built from the per-line model outputs, in row order.
    """
    is_player = phase["Person"] == "Player"
    player_lines = phase[is_player].drop(columns=["Person"])["Text"].values

    # One emotion vector per player line.
    return np.array([model(line) for line in player_lines])

def compute_emotions_per_word_window(phase, model, window_size=10, verbose=True):
    """Score the player's text in ``phase`` over a sliding window of words.

    All player lines are joined with ``". "`` into one string, which is split
    on single spaces. ``model`` is then called once for every run of
    ``window_size`` consecutive words, stepping one word at a time.

    Args:
        phase: DataFrame with at least ``"Person"`` and ``"Text"`` columns.
            Only rows whose ``"Person"`` equals ``"Player"`` are used.
        model: Callable taking one text string and returning its emotion vector.
        window_size: Number of words per window; must be >= 1.
        verbose: If true (the default, matching the historical behaviour),
            print the joined player text. Pass ``False`` to keep the raw
            transcript out of the notebook output.

    Returns:
        ``np.array`` of the per-window model outputs. With ``N`` words this has
        ``N - window_size + 1`` rows; it is an empty array when the text has
        fewer than ``window_size`` words.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    player_df = phase[phase["Person"] == "Player"].drop(columns=["Person"])
    # Missing cells would make str.join raise TypeError, so drop them and
    # coerce the remaining values to str.
    player_text = player_df["Text"].dropna().astype(str).values

    full_text = ". ".join(player_text)
    if verbose:
        print(full_text)

    words = full_text.split(" ")

    # +1 so the final window (the one ending on the last word) is included;
    # a negative count simply yields an empty range.
    n_windows = len(words) - window_size + 1

    y = []
    for start in range(n_windows):
        text = " ".join(words[start:start + window_size])
        y.append(model(text))
    return np.array(y)


def compute_scores_from_convologs(filename, model, output_info=""):
    """Compute per-session emotion scores from a conversation-log CSV and pickle them.

    For every distinct value in the ``"Session"`` column, the session's rows
    are split into phases 1-3 by the ``"Phase"`` column and each phase is
    scored with ``compute_emotions_per_word_window``. Sessions where any phase
    yields NaN or zero-sum (e.g. empty) scores are skipped with a message.
    Otherwise a dict is pickled to
    ``data/results/{session_id}[_{output_info}]_scores.pkl`` with the keys:

    * ``"intro"``  - phase 1 scores averaged over windows (``mean(axis=0)``)
    * ``"outro"``  - phase 3 scores averaged over windows (``mean(axis=0)``)
    * ``"middle"`` - phase 2 scores, one row per window (not averaged)
    * ``"middle_emotion"`` - text between the first ``(`` and the following
      ``)`` of the phase 2 ``"Phase2 NPC emotion"`` value

    Args:
        filename: Path of the comma-separated conversation log to read.
        model: Callable mapping a text string to an emotion vector.
        output_info: Optional tag inserted into the output file name.

    Returns:
        None. Results are written to disk; progress is printed to stdout.

    Raises:
        AssertionError: If phase 2 of a session does not carry exactly one
            distinct ``"Phase2 NPC emotion"`` value.
    """
    # Local import keeps this function self-contained with respect to the
    # notebook's import cell.
    import os

    n_phases = 3
    output_dir = "data/results"

    # Load data
    df = pd.read_csv(filename, sep=",")

    sessions = df["Session"].unique()
    print(sessions)

    for session_id in sessions:
        print(session_id)

        # Split the session's rows into one frame per phase (phases are 1-based).
        session_df = df[df["Session"] == session_id].drop(columns=["Session"])
        phases = [
            session_df[session_df["Phase"] == phase_no].drop(columns=["Phase"])
            for phase_no in range(1, n_phases + 1)
        ]

        # Per-window emotion scores for each phase.
        intro_windows, middle_windows, outro_windows = [
            compute_emotions_per_word_window(phase_df, model) for phase_df in phases
        ]

        # Skip the session if any phase has NaN scores or sums to zero
        # (an empty score array sums to 0, so this also catches phases
        # that produced no windows).
        per_phase = (intro_windows, outro_windows, middle_windows)
        has_nan_in_scores = any(np.isnan(np.sum(s)) for s in per_phase)
        has_empty_scores = any(np.sum(s) == 0 for s in per_phase)
        if has_nan_in_scores or has_empty_scores:
            print(f"[s{session_id} - Some values are missing, skipping this session")
            continue

        # Averaging is done only after the emptiness check above, so the
        # mean is never taken over an empty array.
        intro_seg_avg = intro_windows.mean(axis=0)
        outro_seg_avg = outro_windows.mean(axis=0)

        # NPC emotion: the label is expected to look like "name (emotion)".
        npc_labels = phases[1]["Phase2 NPC emotion"].unique()
        print(npc_labels)
        # Exactly one label is required; zero labels used to slip past the
        # check and fail on the indexing below with a bare IndexError.
        assert len(npc_labels) == 1, (
            f"session {session_id}: expected exactly one 'Phase2 NPC emotion' "
            f"value, got {len(npc_labels)}"
        )
        npc_label = npc_labels[0]
        npc_emotion = npc_label.split("(")[1].split(")")[0]
        print(npc_label, npc_emotion)

        # Session scores
        scores = {
            "intro": intro_seg_avg,
            "outro": outro_seg_avg,
            "middle": middle_windows,
            "middle_emotion": npc_emotion,
        }

        # Use a dedicated name for the output path so the ``filename``
        # argument is not overwritten inside the loop.
        if output_info == "":
            out_path = f"{output_dir}/{session_id}_scores.pkl"
        else:
            out_path = f"{output_dir}/{session_id}_{output_info}_scores.pkl"

        # Create the results directory on demand so a fresh checkout
        # does not fail with FileNotFoundError.
        os.makedirs(output_dir, exist_ok=True)
        with open(out_path, 'wb') as f:
            pickle.dump(scores, f)



# Build the go_emotions text classifier. roberta_model looks up a score for
# every one of its 28 labels, so top_k=None is presumably there to make the
# pipeline return all label scores - confirm against the transformers docs.
classifier = pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
    max_length=512,
    truncation=True,
)
filename = "./data/VG4R-Blackstories-convologs - Data(3).csv"

# Score every session in the conversation log with the RoBERTa model;
# the tag ends up in the name of each pickled result file.
compute_scores_from_convologs(
    filename,
    partial(roberta_model, classifier),
    output_info="roberta_wordwindow10",
)

[61553240 55512844 32510682 78628126 40270253 50166190 76338465 62716964
 72424260  6501131 82997924 56305147 83855551 40661289 39788420  5814582
 97899560 66789081 51191418  2953150 78691368 69154657 33519628 33632988
 55312925  4952450 44585814 88275308 47180179 27800623 34979361 89803947
 64638453 67305395  4569003 64600148  2741789]
61553240
yeah! super excited. probably time travel. i dont care too much about traveling in time, i would use it for trivial things, but it could be cool to meet people of different times. maybe some scientist of the past, to see their perspective and maybe reveal some of the knowledge of the future. mmmm not sure... maybe i would like to know theories about medicine and the human body, and see what people of the past were thinking. but i would also enjoy just talking to normal people and see how they lived, it could be very interesting
yeah lets go. hmmm maybe what the person was doing when it happened?. Yeah, lets ask it. . or no?. were the young woma