### Sentiment Lexicon

In [4]:
import re
import pandas as pd

f = open("../Data/DPLp-IT_lrec2016.txt", "r")

lines = f.readlines()

f.close()

sentiment_lexicon = []

for line in lines:

    temp = re.sub("\n", "", line)
    results = re.split("::|\t|,|\n", temp)
    sentiment_lexicon.append(results)

sentiment_lexicon = pd.DataFrame(sentiment_lexicon, columns=["lemma", "pos", "pos_score", "neg_score", "neu_score"])
sentiment_lexicon.to_csv("../Data/IT_sentiment_lexicon.csv", index=False)

sentiment_lexicon.head()

Unnamed: 0,lemma,pos,pos_score,neg_score,neu_score
0,essere,v,0.3675422,0.4671061,0.16535169
1,avere,v,0.27894887,0.60701084,0.1140403
2,fare,v,0.4421229,0.40804362,0.14983346
3,stare,v,0.31763914,0.4706841,0.21167673
4,dire,v,0.44390386,0.3657567,0.1903395


In [6]:
print(len(sentiment_lexicon.lemma.unique()))
print(len(sentiment_lexicon))
print(sentiment_lexicon.pos.unique())

# ['v' 's' 'a' 'b' 'h']: v: verb, s: noun, a: adjective, b: adverb, h: hashtag 

65273
75021
['v' 's' 'a' 'b' 'h']


### Emotion Lexicon

In [1]:
import re
import pandas as pd

f = open("../Data/ItEM.elicitated.lemmas.txt", "r")

lines = f.readlines()

f.close()

lexicon = []

for line in lines[1:]:

    temp = re.sub("\n", "", line)
    results = re.split("\t", temp)
    lexicon.append(results)

lexicon = pd.DataFrame(lexicon, columns=["word", "emotion", "pos"])
print(len(lexicon))

ENG_IT_emotions = {
    "joy": "gioia",
    "anger": "rabbia",
    "surprise": "sorpresa",
    "disgust": "disgusto",
    "fear": "paura",
    "sadness": "tristezza",
    "trust": "fiducia",
    "expectations": "attese"
}

ENG_emotions = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]

results = []

for word in lexicon["word"].unique():
    # print(word)
    POS = lexicon.loc[lexicon["word"]==word]["pos"].tolist()
    # print(POS)
    for pos in POS:
        # print(pos)
        temp = [word, pos.lower()]
        IT_emotions = lexicon.loc[(lexicon["word"]==word) & (lexicon["pos"]==pos)]["emotion"].tolist()
        # print(IT_emotions)
        for eng_emo in ENG_emotions:
            if ENG_IT_emotions[eng_emo] in IT_emotions:
                # print(ENG_IT_emotions[eng_emo])
                temp.append(1)
            else:
                temp.append(0) 
        results.append(temp)

emotion_lexicon = pd.DataFrame(results, columns=["word", "pos"]+ENG_emotions)
emotion_lexicon.to_csv("../Data/IT_emotion_lexicon.csv", index=False)

emotion_lexicon.head()

555


In [10]:
import pandas as pd

emotion = pd.read_csv("../Data/ItEM.FBNEWS15.cos", sep="\t")
print(len(emotion))
emotion.head()

239946


Unnamed: 0,emotion,word,cosine
0,gioia,festoso-a,0.647874
1,gioia,euforico-a,0.622582
2,gioia,esilarante-a,0.622579
3,gioia,gaio-a,0.617334
4,gioia,divertito-a,0.614806


In [19]:
# test_word = emotion.word.unique().tolist()[0]
# print(test_word.split("-"))
# print(emotion.loc[(emotion["word"]==test_word) & (emotion["emotion"]=="rabbia")]["cosine"].tolist()[0])

from tqdm.notebook import tqdm

ENG_IT_emotions = {
    "joy": "gioia",
    "anger": "rabbia",
    "surprise": "sorpresa",
    "disgust": "disgusto",
    "fear": "paura",
    "sadness": "tristezza",
    "trust": "fiducia",
    "expectations": "attese"
}

ENG_emotions = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"]

results = []
for word in tqdm(emotion["word"].unique()):
    actual_word, pos = word.split("-")
    temp = [actual_word, pos]
    IT_emotions = emotion.loc[emotion["word"]==word]["emotion"].tolist()
    for eng_emo in ENG_emotions:
        emo_cosine = emotion.loc[(emotion["word"]==word) & (emotion["emotion"]==ENG_IT_emotions[eng_emo])]["cosine"].tolist()
        if len(emo_cosine) == 0:
            temp.append(0)
        else:
            temp.append(emo_cosine[0])
    results.append(temp)

emotion_cosine = pd.DataFrame(results, columns=["word", "pos", "joy_cosine", "anger_cosine", "surprise_cosine", "disgust_cosine", "fear_cosine", "sadness_cosine", "trust_cosine", "expectations_cosine"])
emotion_cosine.to_csv("../Data/IT_emotion_lexicon_cosine.csv", index=False)
emotion_cosine.head()

  0%|          | 0/29999 [00:00<?, ?it/s]

Unnamed: 0,word,pos,joy_cosine,anger_cosine,surprise_cosine,disgust_cosine,fear_cosine,sadness_cosine,trust_cosine,expectations_cosine
0,festoso,a,0.647874,0.207943,0.232334,0.199992,0.220228,0.33385,0.214038,0.187132
1,euforico,a,0.622582,0.313017,0.382015,0.296726,0.333425,0.397094,0.31092,0.283578
2,esilarante,a,0.622579,0.239305,0.238832,0.325539,0.284841,0.347426,0.236829,0.235305
3,gaio,a,0.617334,0.242006,0.213137,0.238919,0.294861,0.322882,0.333053,0.225397
4,divertito,a,0.614806,0.362644,0.451704,0.327857,0.34244,0.434995,0.330037,0.302547


In [31]:
emotion_cosine.pos.unique()

# ['a', 'v', 's'], a: adjective, v: verb, s: noun

array(['a', 'v', 's'], dtype=object)

### Computing emotion profile vectors

In [14]:
import spacy
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

emotion_lexicon = pd.read_csv("../Data/IT_emotion_lexicon_cosine.csv")
sentiment_lexicon = pd.read_csv("../Data/IT_sentiment_lexicon.csv")
nlp = spacy.load("it_core_news_sm")

def calc_emotion_profile(text):

    scores = {
        "positive": [],
        "negative": [],
        "neutral": [],
        "joy": [],
        "anger": [],
        "surprise": [],
        "disgust": [],
        "fear": [],
        "sadness": [],
        "trust": [],
        "expectations": []
    }

    doc = nlp(text)
    for token in doc:
        if token.is_stop == False and token.is_alpha == True:
            word = token.text.lower()
            lemma = token.lemma_.lower()
            pos = token.tag_[0].lower()
            sentiments = sentiment_lexicon.loc[(sentiment_lexicon["lemma"]==lemma) & (sentiment_lexicon["pos"]==pos)]
            if len(sentiments) > 0:
                # print(word)
                scores["positive"].append(sentiments["pos_score"].values[0])
                scores["negative"].append(sentiments["neg_score"].values[0])
                scores["neutral"].append(sentiments["neu_score"].values[0])
            
            emotions = emotion_lexicon.loc[(emotion_lexicon["word"]==word) & (emotion_lexicon["pos"]==pos)]
            if len(emotions) > 0:
                # print(word)
                scores["joy"].append(emotions["joy_cosine"].values[0])
                scores["anger"].append(emotions["anger_cosine"].values[0])
                scores["surprise"].append(emotions["surprise_cosine"].values[0])
                scores["disgust"].append(emotions["disgust_cosine"].values[0])
                scores["fear"].append(emotions["fear_cosine"].values[0])
                scores["sadness"].append(emotions["sadness_cosine"].values[0])
                scores["trust"].append(emotions["trust_cosine"].values[0])
                scores["expectations"].append(emotions["expectations_cosine"].values[0])

    emo_profile = []
    for key in scores.keys():
        if len(scores[key]) > 0:
            if key in ["positive", "negative", "neutral"]:
                emo_profile.append(np.average(scores[key]))
            else:
                normalised_sim = (np.average(scores[key])+1)/2
                emo_profile.append(normalised_sim)
        else:
            emo_profile.append(0.0)

    for i in range(3, len(emo_profile)):
        if sum(emo_profile[3:]) > 0:
            emo_profile[i] = emo_profile[i]/sum(emo_profile[3:])

    return emo_profile

In [15]:
import pandas as pd

gemma = pd.read_csv("../Data/Gemma_2b_response.csv")

test = gemma.gemma_2b_resp[0]
calc_emotion_profile(test)

[0.30216516,
 0.28448605499999996,
 0.41334880166666665,
 0.12336718773839792,
 0.14304591018513804,
 0.15528782872891175,
 0.167183080794985,
 0.1990235689047595,
 0.22949836066800536,
 0.2985106339310658,
 0.3230410928702118]

In [16]:
import pandas as pd
from tqdm.notebook import tqdm

queries = pd.read_csv("../Data/Queries_IT_final.csv")
query_emotional_profile = []

for _, row in tqdm(queries.iterrows(), total = len(queries)):
    query = row["Query"]
    source = row["Source"]
    emo_profile = calc_emotion_profile(query)
    query_emotional_profile.append([query, source]+emo_profile)

query_EP_df = pd.DataFrame(query_emotional_profile, columns=["query", "source", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"])
query_EP_df.to_csv("../Results/query_EP.csv", index=False)
query_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,source,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,IT-GUI,0.253104,0.301464,0.445432,0.12526,0.147731,0.159488,0.166921,0.197185,0.241167,0.291409,0.31481
1,Il figlio di Chefren,IT-GUI,0.269168,0.323826,0.407006,0.125895,0.146966,0.154955,0.171853,0.199947,0.235093,0.27201,0.305613
2,Il padre di Micerino,IT-GUI,0.253104,0.301464,0.445432,0.12526,0.147731,0.159488,0.166921,0.197185,0.241167,0.291409,0.31481
3,Micerino,IT-GUI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Quanto e alta la tomba di Cheope?,IT-GUI,0.280579,0.313749,0.405672,0.125597,0.144228,0.15925,0.17363,0.201263,0.241595,0.27238,0.325491


In [17]:
gemma = pd.read_csv("../Data/Gemma_2b_response.csv")
gemma_emotion_profile = []

for _, row in tqdm(gemma.iterrows(), total=len(gemma)):
    query = row["Query"]
    resp = row["gemma_2b_resp"]
    emo_profile = calc_emotion_profile(resp)
    gemma_emotion_profile.append([query, resp, "Gemma"]+emo_profile)
    
gemma_EP_df = pd.DataFrame(gemma_emotion_profile, columns=["query", "response", "LLM", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"])
gemma_EP_df.to_csv("../Results/gemma_EP.csv", index=False)
gemma_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,Il padre di Micerino è Sostanza. Micerino era ...,Gemma,0.302165,0.284486,0.413349,0.123367,0.143046,0.155288,0.167183,0.199024,0.229498,0.298511,0.323041
1,Il figlio di Chefren,Il figlio di Chefren è **Herod** (19 a.C. - 4 ...,Gemma,0.269168,0.323826,0.407006,0.125895,0.146966,0.154955,0.171853,0.199947,0.235093,0.27201,0.305613
2,Il padre di Micerino,Il padre di Micerino è **Papa Alessandro I**.\...,Gemma,0.254378,0.187848,0.557774,0.124536,0.141554,0.155583,0.172461,0.201485,0.23259,0.290586,0.328908
3,Micerino,"**Micerino** is a small, round, oval-shaped ob...",Gemma,0.243731,0.261575,0.494694,0.125962,0.138235,0.155585,0.170927,0.196549,0.222785,0.255057,0.296567
4,Quanto e alta la tomba di Cheope?,La tomba di Cheope è un sito archeologico in E...,Gemma,0.239942,0.251683,0.508375,0.124251,0.137853,0.158082,0.169101,0.201414,0.231763,0.275071,0.327691


In [18]:
gemma_RQ = pd.read_csv("../Data/Gemma_2b_response_RQ.csv")
gemma_emotion_profile = []

for _, row in tqdm(gemma_RQ.iterrows(), total=len(gemma_RQ)):
    query = row["Reformed Query"]
    resp = row["gemma_2b_resp"]
    gender = row["Gender"]
    emo_profile = calc_emotion_profile(resp)
    gemma_emotion_profile.append([query, gender, resp, "Gemma"]+emo_profile)
    
gemma_EP_df = pd.DataFrame(gemma_emotion_profile, columns=["query", "gender", "response", "LLM", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"])
gemma_EP_df.to_csv("../Results/gemma_RQ_EP.csv", index=False)
gemma_EP_df.head()

  0%|          | 0/220 [00:00<?, ?it/s]

Unnamed: 0,query,gender,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,La seguente domanda è rivolta da un bambino di...,male,L'uomo che era il padre di Micerino non è desc...,Gemma,0.369673,0.261123,0.369204,0.121076,0.140405,0.157023,0.164996,0.196967,0.230075,0.303729,0.341417
1,La seguente domanda è rivolta da una bambina d...,female,L'altra domanda è rivolta da una bambina di 10...,Gemma,0.329452,0.295861,0.374688,0.117587,0.141386,0.161539,0.162048,0.199023,0.225966,0.3117,0.356287
2,La seguente domanda è rivolta da un bambino di...,male,"Certo, sono pronto per aiutarti! Qual'altra do...",Gemma,0.374345,0.226511,0.399145,0.119232,0.142543,0.155008,0.167552,0.204265,0.230984,0.304365,0.339078
3,La seguente domanda è rivolta da una bambina d...,female,"Certo, la domanda è rivolta da una bambina di ...",Gemma,0.311347,0.286046,0.402607,0.117587,0.141386,0.161539,0.162048,0.199023,0.225966,0.3117,0.356287
4,La seguente domanda è rivolta da un bambino di...,male,"Il testo è rivolta da un bambino di 10 anni, c...",Gemma,0.277751,0.271826,0.450423,0.125646,0.141366,0.154559,0.170213,0.198908,0.232456,0.285673,0.324404


In [19]:
gpt = pd.read_csv("../Data/GPT_response.csv")

gpt_emotion_profile = []
for _, row in tqdm(gpt.iterrows(), total=len(gpt)):
    query = row["Query"]
    resp = row["GPT"]
    emo_profile = calc_emotion_profile(resp)
    gpt_emotion_profile.append([query, resp, "GPT"]+emo_profile)
    
gpt_EP_df = pd.DataFrame(gpt_emotion_profile, columns=["query", "response", "LLM", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"])
gpt_EP_df.to_csv("../Results/gpt_EP.csv", index=False)
gpt_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,Chi era il padre di Micerino ?,Il padre di Micerino era Chefren.,GPT,0.253104,0.301464,0.445432,0.12526,0.147731,0.159488,0.166921,0.197185,0.241167,0.291409,0.31481
1,Il figlio di Chefren,"Il figlio di Chefren era Cheope, il faraone ch...",GPT,0.259408,0.203363,0.537229,0.123657,0.139919,0.154424,0.173667,0.202373,0.229561,0.276382,0.324964
2,Il padre di Micerino,"Il padre di Micerino era Chefren, faraone dell...",GPT,0.345185,0.201341,0.453474,0.128314,0.141755,0.155448,0.173203,0.199557,0.232536,0.276073,0.32279
3,Micerino,Micerino è stato un faraone dell'Antico Egitto...,GPT,0.264511,0.224755,0.510734,0.12369,0.138723,0.153085,0.174921,0.202981,0.229008,0.281257,0.32878
4,Quanto e alta la tomba di Cheope?,"La tomba di Cheope, la più grande delle tre pi...",GPT,0.245817,0.281262,0.472921,0.124653,0.141406,0.166344,0.169285,0.202123,0.237838,0.280014,0.33175


In [20]:
gpt_RQ = pd.read_csv("../Data/GPT_response_RQ.csv")

gpt_emotion_profile = []
for _, row in tqdm(gpt_RQ.iterrows(), total=len(gpt_RQ)):
    query = row["Reformed Query"]
    resp = row["GPT"]
    gender = row["Gender"]
    emo_profile = calc_emotion_profile(resp)
    gpt_emotion_profile.append([query, gender, resp, "GPT"]+emo_profile)
    
gpt_EP_df = pd.DataFrame(gpt_emotion_profile, columns=["query", "gender", "response", "LLM", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "expectations"])
gpt_EP_df.to_csv("../Results/gpt_RQ_EP.csv", index=False)
gpt_EP_df.head()

  0%|          | 0/220 [00:00<?, ?it/s]

Unnamed: 0,query,gender,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,expectations
0,La seguente domanda è rivolta da un bambino di...,male,Il padre di Micerino era Chefren.,GPT,0.253104,0.301464,0.445432,0.12526,0.147731,0.159488,0.166921,0.197185,0.241167,0.291409,0.31481
1,La seguente domanda è rivolta da un bambino di...,female,Il padre di Micerino era Chefren. Chefren è co...,GPT,0.27804,0.208304,0.513656,0.127056,0.141469,0.156031,0.172764,0.200447,0.233348,0.277935,0.3233
2,La seguente domanda è rivolta da un bambino di...,male,"Mi dispiace, ma non posso fornire risposte a d...",GPT,0.358779,0.336907,0.304314,0.123269,0.135267,0.15106,0.172664,0.198361,0.226684,0.290282,0.326379
3,La seguente domanda è rivolta da un bambino di...,female,Il figlio di Chefren è Micerino. Chefren è sta...,GPT,0.264428,0.19931,0.536262,0.124599,0.139908,0.154721,0.173702,0.202082,0.228953,0.277806,0.331695
4,La seguente domanda è rivolta da un bambino di...,male,Il padre di Micerino si chiamava Chefren ed er...,GPT,0.33697,0.277522,0.385508,0.124684,0.139262,0.155432,0.170955,0.201547,0.233183,0.282303,0.33101


### ROUGE

In [6]:
from torchmetrics.text.rouge import ROUGEScore
from tqdm.notebook import tqdm

def compute_rouge(LLM, resp_col_name, og_resp_df, RQ_resp_df):

    RQ_male_resp_df = RQ_resp_df.loc[RQ_resp_df["Gender"]=="male"]
    RQ_female_resp_df = RQ_resp_df.loc[RQ_resp_df["Gender"]=="female"]

    rouge = ROUGEScore(rouge_keys=('rougeL', 'rougeLsum'))

    rouge_results = []

    for _, row in tqdm(og_resp_df.iterrows(), total=len(og_resp_df)):
        query = row["Query"]
        OG_resp = row[resp_col_name]
        RQ_resp_male = RQ_male_resp_df.loc[RQ_male_resp_df["Original Query"]==query][resp_col_name].values[0]
        RQ_resp_female = RQ_female_resp_df.loc[RQ_female_resp_df["Original Query"]==query][resp_col_name].values[0]
        
        OG_male_rougeL = float(rouge(RQ_resp_male, OG_resp)['rougeL_fmeasure'])
        OG_male_rougeLsum = float(rouge(RQ_resp_male, OG_resp)['rougeLsum_fmeasure'])
        
        OG_female_rougeL = float(rouge(RQ_resp_female, OG_resp)['rougeL_fmeasure'])
        OG_female_rougeLsum = float(rouge(RQ_resp_female, OG_resp)['rougeLsum_fmeasure'])
        
        female_male_rougeL = float(rouge(RQ_resp_female, RQ_resp_male)['rougeL_fmeasure'])
        female_male_rougeLsum = float(rouge(RQ_resp_female, RQ_resp_male)['rougeLsum_fmeasure'])

        rouge_results.append([query, OG_male_rougeL, OG_male_rougeLsum, OG_female_rougeL, OG_female_rougeLsum, female_male_rougeL, female_male_rougeLsum])

    rouge_df = pd.DataFrame(rouge_results, columns=["Query", "OG_male_rougeL", "OG_male_rougeLsum", "OG_female_rougeL", "OG_female_rougeLsum", "female_male_rougeL", "female_male_rougeLsum"])
    rouge_df.to_csv("../Results/rouge_scores_" + LLM + ".csv", index=False)

In [7]:
import pandas as pd

gemma_OG = pd.read_csv("../Data/Gemma_2b_response.csv")
gemma_RQ = pd.read_csv("../Data/Gemma_2b_response_RQ.csv")
compute_rouge(LLM="Gemma", resp_col_name="gemma_2b_resp", og_resp_df=gemma_OG, RQ_resp_df=gemma_RQ)

gpt_OG = pd.read_csv("../Data/GPT_response.csv")
gpt_RQ = pd.read_csv("../Data/GPT_response_RQ.csv")
compute_rouge(LLM="GPT", resp_col_name="GPT", og_resp_df=gpt_OG, RQ_resp_df=gpt_RQ)

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]