### Sentiment Lexicon

In [4]:
import re
import pandas as pd

# Parse the raw polarity lexicon into a table with one row per entry and the
# columns lemma, pos, pos_score, neg_score, neu_score, then persist it as CSV.
# The context manager guarantees the handle is closed even if reading fails,
# and the explicit encoding keeps accented lemmas from being decoded with a
# platform-dependent default.
# NOTE(review): assumes the source file is UTF-8 encoded — confirm.
with open("../Data/DPLp-IT_lrec2016.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

sentiment_lexicon = []

for line in lines:
    temp = re.sub("\n", "", line)
    if not temp.strip():
        # A blank line would otherwise become a row with an empty lemma and
        # missing scores.
        continue
    # Fields are separated by "::", a tab, or a comma.
    results = re.split("::|\t|,|\n", temp)
    sentiment_lexicon.append(results)

sentiment_lexicon = pd.DataFrame(sentiment_lexicon, columns=["lemma", "pos", "pos_score", "neg_score", "neu_score"])
sentiment_lexicon.to_csv("../Data/IT_sentiment_lexicon.csv", index=False)

sentiment_lexicon.head()

Unnamed: 0,lemma,pos,pos_score,neg_score,neu_score
0,essere,v,0.3675422,0.4671061,0.16535169
1,avere,v,0.27894887,0.60701084,0.1140403
2,fare,v,0.4421229,0.40804362,0.14983346
3,stare,v,0.31763914,0.4706841,0.21167673
4,dire,v,0.44390386,0.3657567,0.1903395


In [6]:
# Quick sanity check of the parsed lexicon: distinct lemmas, total rows, and
# the set of part-of-speech codes.
# POS legend — v: verb, s: noun, a: adjective, b: adverb, h: hashtag
distinct_lemmas = sentiment_lexicon["lemma"].unique()
print(len(distinct_lemmas))
print(sentiment_lexicon.shape[0])
print(sentiment_lexicon["pos"].unique())

65273
75021
['v' 's' 'a' 'b' 'h']


### Emotion Lexicon

In [1]:
import re
import pandas as pd

# Build a one-hot emotion lexicon: one row per (word, pos) pair with a 0/1
# flag for each of the eight emotions, then persist it as CSV.
# NOTE(review): assumes the source file is UTF-8 encoded — confirm.
with open("../Data/ItEM.elicitated.lemmas.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

lexicon = []

# The first line is skipped (treated as a header row).
for line in lines[1:]:
    temp = re.sub("\n", "", line)
    results = re.split("\t", temp)
    lexicon.append(results)

lexicon = pd.DataFrame(lexicon, columns=["word", "emotion", "pos"])
print(len(lexicon))

# English column name -> Italian emotion label used in the lexicon file.
ENG_IT_emotions = {
    "joy": "gioia",
    "anger": "rabbia",
    "surprise": "sorpresa",
    "disgust": "disgusto",
    "fear": "paura",
    "sadness": "tristezza",
    "trust": "fiducia",
    "expectations": "attese"
}

ENG_emotions = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]

results = []

# Group by word, then by POS, so each (word, pos) pair yields exactly ONE row.
# Iterating over the raw "pos" column instead would repeat the pair once per
# emotion listed for it and emit identical duplicate rows.
# sort=False keeps the first-appearance order of words and of POS tags.
for word, word_rows in lexicon.groupby("word", sort=False):
    for pos, pos_rows in word_rows.groupby("pos", sort=False):
        IT_emotions = set(pos_rows["emotion"])
        temp = [word, pos.lower()]
        for eng_emo in ENG_emotions:
            temp.append(1 if ENG_IT_emotions[eng_emo] in IT_emotions else 0)
        results.append(temp)

emotion_lexicon = pd.DataFrame(results, columns=["word", "pos"]+ENG_emotions)
emotion_lexicon.to_csv("../Data/IT_emotion_lexicon.csv", index=False)

emotion_lexicon.head()

555


In [10]:
import pandas as pd

# Load the tab-separated cosine table and report how many rows it holds.
emotion = pd.read_csv("../Data/ItEM.FBNEWS15.cos", delimiter="\t")
print(emotion.shape[0])
emotion.head()

239946


Unnamed: 0,emotion,word,cosine
0,gioia,festoso-a,0.647874
1,gioia,euforico-a,0.622582
2,gioia,esilarante-a,0.622579
3,gioia,gaio-a,0.617334
4,gioia,divertito-a,0.614806


In [19]:
from tqdm.notebook import tqdm

# Reshape the long (emotion, word, cosine) table into one row per word with a
# cosine column for each of the eight emotions, then persist it as CSV.

# English column name -> Italian emotion label used in the cosine table.
ENG_IT_emotions = {
    "joy": "gioia",
    "anger": "rabbia",
    "surprise": "sorpresa",
    "disgust": "disgusto",
    "fear": "paura",
    "sadness": "tristezza",
    "trust": "fiducia",
    "expectations": "attese" # expectations -> expectancy
}

ENG_emotions = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]

# Index the table once: word -> {italian emotion -> first cosine seen}.
# A single pass replaces eight full-table boolean scans per word. Dict
# insertion order preserves the first-appearance order of the words, and
# setdefault keeps the first value when a (word, emotion) pair repeats.
cosine_by_word = {}
for word, it_emo, cosine in zip(emotion["word"], emotion["emotion"], emotion["cosine"]):
    cosine_by_word.setdefault(word, {}).setdefault(it_emo, cosine)

results = []
for word, cosines in tqdm(cosine_by_word.items()):
    # Entries look like "<lemma>-<pos>"; split on the LAST hyphen only so
    # lemmas that themselves contain a hyphen do not break the unpacking.
    actual_word, pos = word.rsplit("-", 1)
    temp = [actual_word, pos]
    for eng_emo in ENG_emotions:
        # A missing (word, emotion) pair is recorded as 0.
        temp.append(cosines.get(ENG_IT_emotions[eng_emo], 0))
    results.append(temp)

emotion_cosine = pd.DataFrame(results, columns=["word", "pos", "joy_cosine", "anger_cosine", "surprise_cosine", "disgust_cosine", "fear_cosine", "sadness_cosine", "trust_cosine", "expectations_cosine"])
emotion_cosine.to_csv("../Data/IT_emotion_lexicon_cosine.csv", index=False)
emotion_cosine.head()

  0%|          | 0/29999 [00:00<?, ?it/s]

Unnamed: 0,word,pos,joy_cosine,anger_cosine,surprise_cosine,disgust_cosine,fear_cosine,sadness_cosine,trust_cosine,expectations_cosine
0,festoso,a,0.647874,0.207943,0.232334,0.199992,0.220228,0.33385,0.214038,0.187132
1,euforico,a,0.622582,0.313017,0.382015,0.296726,0.333425,0.397094,0.31092,0.283578
2,esilarante,a,0.622579,0.239305,0.238832,0.325539,0.284841,0.347426,0.236829,0.235305
3,gaio,a,0.617334,0.242006,0.213137,0.238919,0.294861,0.322882,0.333053,0.225397
4,divertito,a,0.614806,0.362644,0.451704,0.327857,0.34244,0.434995,0.330037,0.302547


In [31]:
# Distinct part-of-speech codes present in the cosine lexicon.
# Legend — a: adjective, v: verb, s: noun
emotion_cosine["pos"].unique()

array(['a', 'v', 's'], dtype=object)

### Computing emotion profile vectors

In [6]:
import spacy
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Resources read as globals by calc_emotion_profile.
# `emotion_lexicon` is rebound here to the per-emotion COSINE table
# (columns "<emotion>_cosine"), not the 0/1 lexicon built earlier under the
# same name.
emotion_lexicon = pd.read_csv("../Data/IT_emotion_lexicon_cosine.csv")
# Polarity lexicon with columns lemma, pos, pos_score, neg_score, neu_score.
sentiment_lexicon = pd.read_csv("../Data/IT_sentiment_lexicon.csv")
# spaCy pipeline used for tokenisation, lemmatisation and tagging.
nlp = spacy.load("it_core_news_sm")

def calc_emotion_profile(text):
    """Compute an 11-value sentiment/emotion profile for a piece of text.

    Every non-stop-word, alphabetic token is looked up in the module-level
    ``sentiment_lexicon`` (by lemma and POS code) and ``emotion_lexicon``
    (by lower-cased surface form and POS code). For each dimension the
    matched scores are averaged and multiplied by 100.

    Parameters
    ----------
    text : str
        Text to analyse. ``None``, a float NaN (what pandas yields for an
        empty CSV cell) or a blank string produce an all-zero profile
        instead of an error from the spaCy pipeline.

    Returns
    -------
    list
        Eleven values in the order: positive, negative, neutral, joy, anger,
        surprise, disgust, fear, sadness, trust, expectations. A dimension
        with no lexicon match is ``0.0``.
    """
    # Output key -> column of sentiment_lexicon holding that score.
    sentiment_columns = {
        "positive": "pos_score",
        "negative": "neg_score",
        "neutral": "neu_score",
    }
    emotion_names = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]

    # Insertion order of this dict defines the order of the returned profile.
    scores = {key: [] for key in list(sentiment_columns) + emotion_names}

    # Missing or blank input has no tokens to score.
    is_missing = text is None or (isinstance(text, float) and text != text)
    is_blank = isinstance(text, str) and not text.strip()
    if is_missing or is_blank:
        return [0.0] * len(scores)

    doc = nlp(text)
    for token in doc:
        # Only content words are scored; a token with no tag cannot be
        # matched on POS, so it is skipped rather than raising on tag_[0].
        if token.is_stop or not token.is_alpha or not token.tag_:
            continue

        word = token.text.lower()
        lemma = token.lemma_.lower()
        # First letter of the tag, lower-cased, is compared with the lexicon
        # "pos" codes.
        # NOTE(review): presumably the model's tags start with the same
        # letters the lexicons use (v/s/a/b) — confirm.
        pos = token.tag_[0].lower()

        sentiments = sentiment_lexicon.loc[(sentiment_lexicon["lemma"]==lemma) & (sentiment_lexicon["pos"]==pos)]
        if len(sentiments) > 0:
            # If several rows match, the first one is used.
            for key, column in sentiment_columns.items():
                scores[key].append(sentiments[column].values[0])

        # NOTE(review): this lookup uses the surface form while the sentiment
        # lookup uses the lemma; inflected forms may be missed — confirm
        # whether the lemma was intended here.
        emotions = emotion_lexicon.loc[(emotion_lexicon["word"]==word) & (emotion_lexicon["pos"]==pos)]
        if len(emotions) > 0:
            for name in emotion_names:
                # Shift and halve the cosine so a value in [-1, 1] lands in [0, 1].
                scores[name].append((emotions[name + "_cosine"].values[0] + 1)/2)

    emo_profile = []
    for key in scores.keys():
        if len(scores[key]) > 0:
            emo_profile.append(np.average(scores[key])*100)
        else:
            emo_profile.append(0.0)

    return emo_profile

In [7]:
import pandas as pd

# Smoke test: profile the response stored under index label 0.
gemma = pd.read_csv("../Data/Gemma_2b_response.csv")

test = gemma["gemma_2b_resp"][0]
calc_emotion_profile(test)

[30.216516,
 28.448605499999996,
 41.334880166666665,
 63.23012,
 66.03608,
 63.6542,
 60.48443999999999,
 63.29338,
 63.02679,
 70.01613,
 62.794819999999994]

In [8]:
import pandas as pd
from tqdm.notebook import tqdm

queries = pd.read_csv("../Data/Queries_IT_final.csv")

# Profile every query; each output row is [query, source] followed by the
# values returned by calc_emotion_profile.
query_emotional_profile = []
for _, row in tqdm(queries.iterrows(), total=queries.shape[0]):
    query, source = row["Query"], row["Source"]
    emo_profile = calc_emotion_profile(query)
    query_emotional_profile.append([query, source, *emo_profile])

query_EP_columns = [
    "query", "source",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust",
    "fear", "sadness", "trust", "expectations",
]
query_EP_df = pd.DataFrame(query_emotional_profile, columns=query_EP_columns)
query_EP_df.to_csv("../Results/query_EP.csv", index=False)
query_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,source,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,IT-GUI,25.310403,30.146357,44.54324,64.76845,68.67025,65.5393,60.31605,62.6497,66.26995,67.7922,61.06825
1,Il figlio di Chefren,IT-GUI,26.91683,32.38262,40.700552,60.345,63.4265,59.32315,58.2606,59.5719,60.73895,60.1501,57.51105
2,Il padre di Micerino,IT-GUI,25.310403,30.146357,44.54324,64.76845,68.67025,65.5393,60.31605,62.6497,66.26995,67.7922,61.06825
3,Micerino,IT-GUI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Quanto e alta la tomba di Cheope?,IT-GUI,28.057936,31.374883,40.567177,64.8446,66.9227,65.53205,62.8362,63.68465,65.92305,62.94775,63.59865


In [9]:
gemma = pd.read_csv("../Data/Gemma_2b_response.csv")

# Profile every Gemma response; each output row is
# [query, response, "Gemma"] followed by the profile values.
gemma_emotion_profile = []
for _, row in tqdm(gemma.iterrows(), total=gemma.shape[0]):
    query, resp = row["Query"], row["gemma_2b_resp"]
    emo_profile = calc_emotion_profile(resp)
    gemma_emotion_profile.append([query, resp, "Gemma", *emo_profile])

gemma_EP_columns = [
    "query", "response", "LLM",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust",
    "fear", "sadness", "trust", "expectations",
]
gemma_EP_df = pd.DataFrame(gemma_emotion_profile, columns=gemma_EP_columns)
gemma_EP_df.to_csv("../Results/gemma_EP.csv", index=False)
gemma_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,Il padre di Micerino è Sostanza. Micerino era ...,Gemma,30.216516,28.448605,41.33488,63.23012,66.03608,63.6542,60.48444,63.29338,63.02679,70.01613,62.79482
1,Il figlio di Chefren,Il figlio di Chefren è **Herod** (19 a.C. - 4 ...,Gemma,26.91683,32.38262,40.700552,60.345,63.4265,59.32315,58.2606,59.5719,60.73895,60.1501,57.51105
2,Il padre di Micerino,Il padre di Micerino è **Papa Alessandro I**.\...,Gemma,25.437826,18.784768,55.777404,65.101856,66.545262,64.989269,63.514575,64.881488,64.493206,68.59255,64.635319
3,Micerino,"**Micerino** is a small, round, oval-shaped ob...",Gemma,24.373077,26.157535,49.469387,53.598045,53.152507,53.704803,52.480095,53.391532,53.0025,52.84391,53.336598
4,Quanto e alta la tomba di Cheope?,La tomba di Cheope è un sito archeologico in E...,Gemma,23.994172,25.168305,50.837523,61.6153,61.57903,63.06017,59.46562,62.2573,61.87732,62.7943,63.24337


In [10]:
gemma_RQ = pd.read_csv("../Data/Gemma_2b_response_RQ.csv")

# Profile every Gemma response to a rephrased query; each output row is
# [original query, gender, response, "Gemma"] followed by the profile values.
# Note: this rebinds gemma_emotion_profile and gemma_EP_df.
gemma_emotion_profile = []
for _, row in tqdm(gemma_RQ.iterrows(), total=gemma_RQ.shape[0]):
    query, resp, gender = row["Original Query"], row["gemma_2b_resp"], row["Gender"]
    emo_profile = calc_emotion_profile(resp)
    gemma_emotion_profile.append([query, gender, resp, "Gemma", *emo_profile])

gemma_RQ_EP_columns = [
    "query", "gender", "response", "LLM",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust",
    "fear", "sadness", "trust", "expectations",
]
gemma_EP_df = pd.DataFrame(gemma_emotion_profile, columns=gemma_RQ_EP_columns)
gemma_EP_df.to_csv("../Results/gemma_RQ_EP.csv", index=False)
gemma_EP_df.head()

  0%|          | 0/220 [00:00<?, ?it/s]

Unnamed: 0,query,gender,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,male,L'uomo che era il padre di Micerino non è desc...,Gemma,36.967342,26.112255,36.920401,64.761987,67.708063,67.294588,62.19905,65.25025,65.73725,73.803463,68.133337
1,Chi era il padre di Micerino ?,female,L'altra domanda è rivolta da una bambina di 10...,Gemma,32.945155,29.586058,37.468786,66.08175,71.77625,72.69655,63.76295,68.8467,67.10695,78.69425,73.01875
2,Il figlio di Chefren,male,"Certo, sono pronto per aiutarti! Qual'altra do...",Gemma,37.434453,22.651055,39.914489,64.81715,69.94995,67.433625,64.1889,68.56465,66.4142,74.329275,67.9236
3,Il figlio di Chefren,female,"Certo, la domanda è rivolta da una bambina di ...",Gemma,31.134702,28.604629,40.260666,66.08175,71.77625,72.69655,63.76295,68.8467,67.10695,78.69425,73.01875
4,Il padre di Micerino,male,"Il testo è rivolta da un bambino di 10 anni, c...",Gemma,27.775114,27.18256,45.042325,63.473175,64.2178,62.470775,60.795225,62.3373,62.984125,66.051175,62.846275


In [11]:
gpt = pd.read_csv("../Data/GPT_response.csv")

# Profile every GPT response; each output row is
# [query, response, "GPT"] followed by the profile values.
gpt_emotion_profile = []
for _, row in tqdm(gpt.iterrows(), total=gpt.shape[0]):
    query, resp = row["Query"], row["GPT"]
    emo_profile = calc_emotion_profile(resp)
    gpt_emotion_profile.append([query, resp, "GPT", *emo_profile])

gpt_EP_columns = [
    "query", "response", "LLM",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust",
    "fear", "sadness", "trust", "expectations",
]
gpt_EP_df = pd.DataFrame(gpt_emotion_profile, columns=gpt_EP_columns)
gpt_EP_df.to_csv("../Results/gpt_EP.csv", index=False)
gpt_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,Il padre di Micerino era Chefren.,GPT,25.310403,30.146357,44.54324,64.76845,68.67025,65.5393,60.31605,62.6497,66.26995,67.7922,61.06825
1,Il figlio di Chefren,"Il figlio di Chefren era Cheope, il faraone ch...",GPT,25.94082,20.336255,53.722925,61.249833,62.46455,61.45505,61.122142,62.370042,61.077425,62.998425,62.581475
2,Il padre di Micerino,"Il padre di Micerino era Chefren, faraone dell...",GPT,34.518486,20.13407,45.347442,64.1867,63.630613,62.089613,61.119512,61.6787,62.16975,63.065963,62.292175
3,Micerino,Micerino è stato un faraone dell'Antico Egitto...,GPT,26.451107,22.475452,51.07344,62.4027,63.04623,62.04556,62.72029,63.60115,61.83978,64.99685,63.85654
4,Quanto e alta la tomba di Cheope?,"La tomba di Cheope, la più grande delle tre pi...",GPT,24.581703,28.126154,47.292145,65.95075,67.251475,70.276975,62.438575,65.351975,66.163425,66.02935,65.613475


In [12]:
gpt_RQ = pd.read_csv("../Data/GPT_response_RQ.csv")

# Profile every GPT response to a rephrased query; each output row is
# [original query, gender, response, "GPT"] followed by the profile values.
# Note: this rebinds gpt_emotion_profile and gpt_EP_df.
gpt_emotion_profile = []
for _, row in tqdm(gpt_RQ.iterrows(), total=gpt_RQ.shape[0]):
    query, resp, gender = row["Original Query"], row["GPT"], row["Gender"]
    emo_profile = calc_emotion_profile(resp)
    gpt_emotion_profile.append([query, gender, resp, "GPT", *emo_profile])

gpt_RQ_EP_columns = [
    "query", "gender", "response", "LLM",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust",
    "fear", "sadness", "trust", "expectations",
]
gpt_EP_df = pd.DataFrame(gpt_emotion_profile, columns=gpt_RQ_EP_columns)
gpt_EP_df.to_csv("../Results/gpt_RQ_EP.csv", index=False)
gpt_EP_df.head()

  0%|          | 0/220 [00:00<?, ?it/s]

Unnamed: 0,query,gender,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,male,Il padre di Micerino era Chefren.,GPT,25.310403,30.146357,44.54324,64.76845,68.67025,65.5393,60.31605,62.6497,66.26995,67.7922,61.06825
1,Chi era il padre di Micerino ?,female,Il padre di Micerino era Chefren. Chefren è co...,GPT,27.803982,20.830385,51.365632,63.9722,63.976567,62.787025,61.368683,62.364033,62.725275,63.762592,62.541033
2,Il figlio di Chefren,male,"Mi dispiace, ma non posso fornire risposte a d...",GPT,35.877874,33.690702,30.431425,61.06485,60.415975,60.38665,61.204525,61.597975,60.926325,66.91415,62.87
3,Il figlio di Chefren,female,Il figlio di Chefren è Micerino. Chefren è sta...,GPT,26.442813,19.931005,53.626182,63.005656,63.674969,62.729412,62.2163,63.318856,61.867956,64.242431,64.6099
4,Il padre di Micerino,male,Il padre di Micerino si chiamava Chefren ed er...,GPT,33.697001,27.752175,38.550824,63.794683,64.105433,63.749506,61.8752,63.922383,63.750028,65.764806,64.687378


### ROUGE

In [6]:
from torchmetrics.text.rouge import ROUGEScore
from tqdm.notebook import tqdm

def compute_rouge(LLM, resp_col_name, og_resp_df, RQ_resp_df):
    """Score rephrased-query responses against the original responses with ROUGE-L.

    For every row of ``og_resp_df`` the response to the original query is
    compared with the "male" and "female" rephrased-query responses, and the
    two rephrased responses are compared with each other. The rougeL and
    rougeLsum F-measures of each pair are written to
    ``../Results/rouge_scores_<LLM>.csv``.

    Parameters
    ----------
    LLM : str
        Label appended to the output file name.
    resp_col_name : str
        Name of the response column; it must exist in both data frames.
    og_resp_df : pandas.DataFrame
        Original responses; needs the columns "Query" and ``resp_col_name``.
    RQ_resp_df : pandas.DataFrame
        Rephrased-query responses; needs the columns "Original Query",
        "Gender" and ``resp_col_name``.

    Raises
    ------
    IndexError
        If a query has no "male" or no "female" row in ``RQ_resp_df``.
    """
    RQ_male_resp_df = RQ_resp_df.loc[RQ_resp_df["Gender"]=="male"]
    RQ_female_resp_df = RQ_resp_df.loc[RQ_resp_df["Gender"]=="female"]

    rouge = ROUGEScore(rouge_keys=('rougeL', 'rougeLsum'))

    def _pair_scores(first_text, second_text):
        """Return (rougeL, rougeLsum) F-measures for one pair of texts."""
        # One metric call yields both keys; calling it separately for each
        # key would tokenise and score the same pair twice.
        scores = rouge(first_text, second_text)
        return float(scores['rougeL_fmeasure']), float(scores['rougeLsum_fmeasure'])

    rouge_results = []

    for _, row in tqdm(og_resp_df.iterrows(), total=len(og_resp_df)):
        query = row["Query"]
        OG_resp = row[resp_col_name]
        # If a query occurs more than once, the first matching row is used.
        RQ_resp_male = RQ_male_resp_df.loc[RQ_male_resp_df["Original Query"]==query][resp_col_name].values[0]
        RQ_resp_female = RQ_female_resp_df.loc[RQ_female_resp_df["Original Query"]==query][resp_col_name].values[0]

        OG_male_rougeL, OG_male_rougeLsum = _pair_scores(RQ_resp_male, OG_resp)
        OG_female_rougeL, OG_female_rougeLsum = _pair_scores(RQ_resp_female, OG_resp)
        female_male_rougeL, female_male_rougeLsum = _pair_scores(RQ_resp_female, RQ_resp_male)

        rouge_results.append([query, OG_male_rougeL, OG_male_rougeLsum, OG_female_rougeL, OG_female_rougeLsum, female_male_rougeL, female_male_rougeLsum])

    rouge_df = pd.DataFrame(rouge_results, columns=["Query", "OG_male_rougeL", "OG_male_rougeLsum", "OG_female_rougeL", "OG_female_rougeLsum", "female_male_rougeL", "female_male_rougeLsum"])
    rouge_df.to_csv("../Results/rouge_scores_" + LLM + ".csv", index=False)

In [7]:
import pandas as pd

# Load the original and rephrased-query responses of both models up front.
gemma_OG = pd.read_csv("../Data/Gemma_2b_response.csv")
gemma_RQ = pd.read_csv("../Data/Gemma_2b_response_RQ.csv")
gpt_OG = pd.read_csv("../Data/GPT_response.csv")
gpt_RQ = pd.read_csv("../Data/GPT_response_RQ.csv")

# Score each model; compute_rouge writes one CSV per model under ../Results/.
compute_rouge(
    LLM="Gemma",
    resp_col_name="gemma_2b_resp",
    og_resp_df=gemma_OG,
    RQ_resp_df=gemma_RQ,
)
compute_rouge(
    LLM="GPT",
    resp_col_name="GPT",
    og_resp_df=gpt_OG,
    RQ_resp_df=gpt_RQ,
)

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]