### Sentiment Lexicon

In [4]:
import re
import pandas as pd

f = open("../Data/DPLp-IT_lrec2016.txt", "r")

lines = f.readlines()

f.close()

sentiment_lexicon = []

for line in lines:

    temp = re.sub("\n", "", line)
    results = re.split("::|\t|,|\n", temp)
    sentiment_lexicon.append(results)

sentiment_lexicon = pd.DataFrame(sentiment_lexicon, columns=["lemma", "pos", "pos_score", "neg_score", "neu_score"])
sentiment_lexicon.to_csv("../Data/IT_sentiment_lexicon.csv", index=False)

sentiment_lexicon.head()

Unnamed: 0,lemma,pos,pos_score,neg_score,neu_score
0,essere,v,0.3675422,0.4671061,0.16535169
1,avere,v,0.27894887,0.60701084,0.1140403
2,fare,v,0.4421229,0.40804362,0.14983346
3,stare,v,0.31763914,0.4706841,0.21167673
4,dire,v,0.44390386,0.3657567,0.1903395


In [6]:
print(len(sentiment_lexicon.lemma.unique()))
print(len(sentiment_lexicon))
print(sentiment_lexicon.pos.unique())

# ['v' 's' 'a' 'b' 'h']: v: verb, s: noun, a: adjective, b: adverb, h: hashtag 

65273
75021
['v' 's' 'a' 'b' 'h']


In [7]:
import pandas as pd

sent_lexicon = pd.read_csv("../Data/IT_sentiment_lexicon.csv")
len(sent_lexicon)

75021

### Emotion Lexicon

In [39]:
import pandas as pd

emotion = pd.read_csv("../Data/ItEM.FBNEWS15.cos", sep="\t")
print(len(emotion))
emotion.head()

239946


Unnamed: 0,emotion,word,cosine
0,gioia,festoso-a,0.647874
1,gioia,euforico-a,0.622582
2,gioia,esilarante-a,0.622579
3,gioia,gaio-a,0.617334
4,gioia,divertito-a,0.614806


In [40]:
# test_word = emotion.word.unique().tolist()[0]
# print(test_word.split("-"))
# print(emotion.loc[(emotion["word"]==test_word) & (emotion["emotion"]=="rabbia")]["cosine"].tolist()[0])

from tqdm.notebook import tqdm
import spacy

nlp = spacy.load("it_core_news_sm")

ENG_IT_emotions = {
    "joy": "gioia",
    "anger": "rabbia",
    "surprise": "sorpresa",
    "disgust": "disgusto",
    "fear": "paura",
    "sadness": "tristezza",
    "trust": "fiducia",
    "anticipation": "attese"
}

ENG_emotions = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]

results = []
for word in tqdm(emotion["word"].unique()):
    actual_word, pos = word.split("-")
    lemma = nlp(actual_word)[0].lemma_.lower()
    IT_emotions = emotion.loc[emotion["word"]==word]["emotion"].tolist()
    temp = []
    norm_factor = 0
    for eng_emo in ENG_emotions:
        emo_cosine = emotion.loc[(emotion["word"]==word) & (emotion["emotion"]==ENG_IT_emotions[eng_emo])]["cosine"].tolist()
        if len(emo_cosine) == 0:
            temp.append(0)
            norm_factor += 0
        else:
            temp.append(emo_cosine[0])
            norm_factor += emo_cosine[0]
    
    temp_new = [t/norm_factor for t in temp]
    emo_norm_score = [actual_word, lemma, pos] + temp_new
    results.append(emo_norm_score)

emotion_cosine = pd.DataFrame(results, columns=["actual_word", "lemma", "pos", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
emotion_cosine.to_csv("../Data/IT_emotion_lexicon_normalised_new.csv", index=False)
emotion_cosine.head()

  0%|          | 0/29999 [00:00<?, ?it/s]

Unnamed: 0,actual_word,lemma,pos,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,festoso,festoso,a,0.288792,0.092691,0.103564,0.089147,0.098167,0.148815,0.095408,0.083415
1,euforico,euforico,a,0.211809,0.106492,0.129965,0.100949,0.113435,0.135096,0.105778,0.096476
2,esilarante,esilarare,a,0.246015,0.094562,0.094376,0.128638,0.112556,0.137287,0.093584,0.092982
3,gaio,gaio,a,0.248166,0.097285,0.08568,0.096044,0.118533,0.129797,0.133886,0.090609
4,divertito,divertire,a,0.194127,0.114506,0.142627,0.103522,0.108127,0.137351,0.10421,0.09553


In [41]:
emotion_cosine.pos.unique()

# ['a', 'v', 's'], a: adjective, v: verb, s: noun

array(['a', 'v', 's'], dtype=object)

In [42]:
len(emotion_cosine)

29999

In [44]:
sum(emotion_cosine.drop(columns=["actual_word","lemma", "pos"]).iloc[50])

1.0

### Computing emotion profile vectors

### Lexicon-based

In [19]:
import nltk
from nltk.corpus import stopwords
import spacy

nlp = spacy.load("it_core_news_sm")
print(len(nlp.Defaults.stop_words))

nltk.download('stopwords')
stopwords_list = stopwords.words('italian')
print(stopwords_list)
print(len(stopwords_list))

624
['ad', 'al', 'allo', 'ai', 'agli', 'all', 'agl', 'alla', 'alle', 'con', 'col', 'coi', 'da', 'dal', 'dallo', 'dai', 'dagli', 'dall', 'dagl', 'dalla', 'dalle', 'di', 'del', 'dello', 'dei', 'degli', 'dell', 'degl', 'della', 'delle', 'in', 'nel', 'nello', 'nei', 'negli', 'nell', 'negl', 'nella', 'nelle', 'su', 'sul', 'sullo', 'sui', 'sugli', 'sull', 'sugl', 'sulla', 'sulle', 'per', 'tra', 'contro', 'io', 'tu', 'lui', 'lei', 'noi', 'voi', 'loro', 'mio', 'mia', 'miei', 'mie', 'tuo', 'tua', 'tuoi', 'tue', 'suo', 'sua', 'suoi', 'sue', 'nostro', 'nostra', 'nostri', 'nostre', 'vostro', 'vostra', 'vostri', 'vostre', 'mi', 'ti', 'ci', 'vi', 'lo', 'la', 'li', 'le', 'gli', 'ne', 'il', 'un', 'uno', 'una', 'ma', 'ed', 'se', 'perché', 'anche', 'come', 'dov', 'dove', 'che', 'chi', 'cui', 'non', 'più', 'quale', 'quanto', 'quanti', 'quanta', 'quante', 'quello', 'quelli', 'quella', 'quelle', 'questo', 'questi', 'questa', 'queste', 'si', 'tutto', 'tutti', 'a', 'c', 'e', 'i', 'l', 'o', 'ho', 'hai', 'ha',

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrishitachakra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import spacy
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import spacy

nlp = spacy.load("it_core_news_sm")

emotion_lexicon = pd.read_csv("../Data/IT_emotion_lexicon_normalised_new.csv")
sentiment_lexicon = pd.read_csv("../Data/IT_sentiment_lexicon.csv")
print(nlp("sono")[0].lemma_)
emotion_lexicon.loc[emotion_lexicon["lemma"]==nlp("sono")[0].lemma_]

essere


Unnamed: 0,actual_word,lemma,pos,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
331,sono,essere,a,0.15097,0.093324,0.103912,0.118336,0.129336,0.137559,0.107852,0.158711
6302,siete,essere,v,0.142785,0.12294,0.124728,0.139037,0.13647,0.120004,0.096479,0.117556
10523,sia,essere,s,0.127201,0.128075,0.108722,0.133005,0.125958,0.116055,0.120767,0.140217
11263,sono,essere,s,0.121728,0.148994,0.128282,0.094522,0.114921,0.111865,0.15882,0.120867
11531,state,essere,s,0.119226,0.158921,0.123238,0.095413,0.131816,0.122438,0.130755,0.118194
11581,esser,essere,s,0.127164,0.140194,0.106988,0.134255,0.127347,0.108279,0.128846,0.126926
11958,era,essere,s,0.127665,0.141613,0.149483,0.074146,0.114996,0.124307,0.154265,0.113524
13782,fosse,essere,s,0.132333,0.124531,0.13469,0.120681,0.127702,0.123359,0.10889,0.127814
14276,siete,essere,s,0.130849,0.173141,0.11656,0.123775,0.113511,0.11666,0.122738,0.102765
15271,ero,essere,s,0.195968,0.110389,0.14191,0.106685,0.109451,0.126707,0.094534,0.114356


In [44]:
import spacy
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords_list = stopwords.words('italian')

emotion_lexicon = pd.read_csv("../Data/IT_emotion_lexicon_normalised_new.csv")
sentiment_lexicon = pd.read_csv("../Data/IT_sentiment_lexicon.csv")
nlp = spacy.load("it_core_news_sm")
emotion_names = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]  

def calc_emotion_profile(text):

    scores = {
        "positive": [],
        "negative": [],
        "neutral": [],
        "joy": [],
        "anger": [],
        "surprise": [],
        "disgust": [],
        "fear": [],
        "sadness": [],
        "trust": [],
        "anticipation": []
    }

    doc = nlp(text)
    for token in doc:
        if token.is_alpha == True and token.text.lower() not in stopwords_list:
            word = token.text.lower()
            lemma = token.lemma_.lower()
            pos = token.tag_[0].lower()
            sentiments = sentiment_lexicon.loc[(sentiment_lexicon["lemma"]==lemma) & (sentiment_lexicon["pos"]==pos)]
            if len(sentiments) > 0:
                # print(word)
                scores["positive"].append(sentiments["pos_score"].values[0])
                scores["negative"].append(sentiments["neg_score"].values[0])
                scores["neutral"].append(sentiments["neu_score"].values[0])
            
            emotions = emotion_lexicon.loc[(emotion_lexicon["actual_word"]==word) & (emotion_lexicon["lemma"]==lemma) & (emotion_lexicon["pos"]==pos)]
            if len(emotions) > 0:
                # print(word)
                for emo in emotion_names:
                    scores[emo].append(emotions[emo].values[0])

    emo_profile = []
    for key in scores.keys():
        if len(scores[key]) > 0:
            emo_profile.append(round(np.average(scores[key])*100, 2))
        else:
            emo_profile.append(0.00)

    return emo_profile

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrishitachakra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
## sanity check of EP computation

text_examples = [
    "sono felice",
    "La deforestazione è un male",
    "Questo è spaventoso",
    "Questa è una mela marcia"
]

results = []

for text in text_examples:
    EP_score = calc_emotion_profile(text)
    sent_sum = sum(EP_score[:3])
    emo_sum = sum(EP_score[3:])
    results.append([text]+EP_score+[sent_sum, emo_sum])
    for word in text.split(" "):
        EP_score = calc_emotion_profile(word)
        sent_sum = sum(EP_score[:3])
        emo_sum = sum(EP_score[3:])
        results.append([word]+EP_score+[sent_sum, emo_sum])

results_df = pd.DataFrame(results, columns=["text", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation", "sent_sum", "emo_sum"])
results_df

Unnamed: 0,text,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation,sent_sum,emo_sum
0,sono felice,54.34,27.06,18.6,15.48,13.02,9.12,8.87,12.84,14.01,17.96,8.71,100.0,100.01
1,sono,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,felice,54.34,27.06,18.6,15.48,13.02,9.12,8.87,12.84,14.01,17.96,8.71,100.0,100.01
3,La deforestazione è un male,24.64,27.07,48.29,11.76,12.86,9.15,12.53,15.04,13.47,13.91,11.29,100.0,100.01
4,La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,deforestazione,23.09,18.56,58.35,11.15,11.74,8.18,14.2,17.58,11.5,12.49,13.17,100.0,100.01
6,è,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,un,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,male,26.74,50.32,22.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
9,Questo è spaventoso,26.36,38.84,34.8,8.9,7.9,10.01,13.53,20.91,12.85,11.87,14.03,100.0,100.0


In [46]:
import pandas as pd

gemma = pd.read_csv("../Data/SIGIR_gemma_resp.csv")

test = gemma.Resp[0]
emo_vals = calc_emotion_profile(test)
print(emo_vals)
print(sum(emo_vals[:3]))
sum(emo_vals[3:])

[np.float64(30.89), np.float64(35.87), np.float64(33.24), np.float64(10.95), np.float64(12.52), np.float64(13.94), np.float64(9.95), np.float64(11.72), np.float64(12.96), np.float64(15.94), np.float64(12.02)]
100.0


np.float64(99.99999999999999)

In [47]:
import pandas as pd
from tqdm.notebook import tqdm

queries = pd.read_csv("../Data/SIGIR_queries_IT.csv")
query_emotional_profile = []

for _, row in tqdm(queries.iterrows(), total = len(queries)):
    qid = row["QID"]
    query = row["Query"]
    # prompt_type = row["Prompt Type"]
    emo_profile = calc_emotion_profile(query)
    query_emotional_profile.append([qid]+emo_profile)

query_EP_df = pd.DataFrame(query_emotional_profile, columns=["QID", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
query_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_queryEP.csv", index=False)
query_EP_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,qGEN1,25.31,30.15,44.54,12.61,15.95,13.27,8.81,10.8,13.9,15.2,9.45
1,qGEN2,26.92,32.38,40.7,13.04,16.93,11.75,10.41,12.07,13.54,12.8,9.47
2,qGEN3,25.31,30.15,44.54,12.61,15.95,13.27,8.81,10.8,13.9,15.2,9.45
3,qGEN4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,qGEN5,28.06,31.37,40.57,12.77,14.55,13.36,11.04,11.77,13.69,11.13,11.69


In [48]:
bing = pd.read_csv("../Data/SIGIR_bing_resp.csv")
bing_emotion_profile = []

for _, row in tqdm(bing.iterrows(), total=len(bing)):
    qid = row["QID"]
    query = row["Query"]
    resp = row["Resp"]
    rank = row["Rank"]
    emo_profile = calc_emotion_profile(resp)
    bing_emotion_profile.append([qid, rank, "Bing"]+emo_profile)
    
bing_EP_df = pd.DataFrame(bing_emotion_profile, columns=["QID", "rank", "IAS", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
bing_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_bingEP.csv", index=False)
bing_EP_df.head()

  0%|          | 0/874 [00:00<?, ?it/s]

Unnamed: 0,QID,rank,IAS,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,qGEN1,1,Bing,30.08,27.79,42.14,13.05,12.82,13.4,10.77,12.51,11.66,13.03,12.77
1,qGEN1,2,Bing,33.23,31.46,35.31,12.57,13.08,11.3,11.76,13.26,12.44,13.69,11.91
2,qGEN1,3,Bing,29.5,22.54,47.96,11.55,12.65,12.21,10.18,12.46,10.54,15.66,14.75
3,qGEN1,4,Bing,31.1,23.7,45.2,12.15,12.1,11.84,10.98,13.18,12.03,14.36,13.37
4,qGEN1,5,Bing,26.93,29.98,43.09,12.56,12.89,12.02,11.35,13.08,11.99,12.59,13.53


In [53]:
gemma = pd.read_csv("../Data/SIGIR_gemma_resp.csv")
gemma_emotion_profile = []

for _, row in tqdm(gemma.iterrows(), total=len(gemma)):
    qid = row["QID"]
    query = row["Query"]
    resp = row["Resp"]
    emo_profile = calc_emotion_profile(resp)
    gemma_emotion_profile.append([qid, "Gemma"]+emo_profile)
    
gemma_EP_df = pd.DataFrame(gemma_emotion_profile, columns=["QID", "IAS", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
gemma_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_gemmaEP.csv", index=False)
gemma_EP_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,qGEN1,Gemma,30.89,35.87,33.24,10.95,12.52,13.94,9.95,11.72,12.96,15.94,12.02
1,qGEN2,Gemma,31.83,20.25,47.92,13.05,13.87,10.79,10.42,12.45,12.55,16.46,10.4
2,qGEN3,Gemma,26.06,22.59,51.35,12.8,14.64,12.49,10.65,11.13,12.32,15.3,10.65
3,qGEN4,Gemma,23.67,46.01,30.32,12.31,13.49,12.93,6.71,13.04,11.8,17.11,12.61
4,qGEN5,Gemma,33.7,33.14,33.16,11.39,12.08,13.35,10.38,12.14,12.87,15.01,12.77


In [48]:
gpt = pd.read_csv("../Data/SIGIR_gpt_resp.csv")

gpt_emotion_profile = []
for _, row in tqdm(gpt.iterrows(), total=len(gpt)):
    qid = row["QID"]
    query = row["Query"]
    resp = row["GPT"]
    emo_profile = calc_emotion_profile(resp)
    gpt_emotion_profile.append([qid, "GPT"]+emo_profile)
    
gpt_EP_df = pd.DataFrame(gpt_emotion_profile, columns=["QID", "IAS", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
gpt_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_gptEP.csv", index=False)
gpt_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,Chi era il padre di Micerino ?,Il padre di Micerino era Chefren.,GPT,25.31,30.15,44.54,12.53,13.28,12.68,11.66,12.12,12.82,13.11,11.81
1,Il figlio di Chefren,"Il figlio di Chefren era Cheope, il faraone ch...",GPT,25.94,20.34,53.72,12.37,12.61,12.4,12.34,12.59,12.34,12.72,12.62
2,Il padre di Micerino,"Il padre di Micerino era Chefren, faraone dell...",GPT,34.52,20.13,45.35,12.83,12.71,12.41,12.23,12.34,12.42,12.6,12.46
3,Micerino,Micerino è stato un faraone dell'Antico Egitto...,GPT,26.45,22.48,51.07,12.37,12.5,12.3,12.43,12.6,12.26,12.88,12.66
4,Quanto e alta la tomba di Cheope?,"La tomba di Cheope, la più grande delle tre pi...",GPT,24.58,28.13,47.29,12.47,12.72,13.27,11.81,12.35,12.51,12.47,12.4


#### ROUGE

In [16]:
from torchmetrics.text.rouge import ROUGEScore
from tqdm.notebook import tqdm

def compute_rouge(LLM, resp_col_name, og_resp_df, RQ_resp_df):

    RQ_male_resp_df = RQ_resp_df.loc[RQ_resp_df["Gender"]=="male"]
    RQ_female_resp_df = RQ_resp_df.loc[RQ_resp_df["Gender"]=="female"]
    RQ_neutral_resp_df = RQ_resp_df.loc[RQ_resp_df["Gender"]=="neutral"]

    rouge = ROUGEScore(rouge_keys=('rougeL', 'rougeLsum'))

    rouge_results = []

    for _, row in tqdm(og_resp_df.iterrows(), total=len(og_resp_df)):
        query = row["Query"]
        OG_resp = row[resp_col_name]
        RQ_resp_male = RQ_male_resp_df.loc[RQ_male_resp_df["Original Query"]==query][resp_col_name].values[0]
        RQ_resp_female = RQ_female_resp_df.loc[RQ_female_resp_df["Original Query"]==query][resp_col_name].values[0]
        RQ_resp_neutral = RQ_neutral_resp_df.loc[RQ_neutral_resp_df["Original Query"]==query][resp_col_name].values[0]
        
        OG_male_rougeL = float(rouge(RQ_resp_male, OG_resp)['rougeL_fmeasure'])
        OG_male_rougeLsum = float(rouge(RQ_resp_male, OG_resp)['rougeLsum_fmeasure'])
        
        OG_female_rougeL = float(rouge(RQ_resp_neutral, OG_resp)['rougeL_fmeasure'])
        OG_female_rougeLsum = float(rouge(RQ_resp_neutral, OG_resp)['rougeLsum_fmeasure'])

        OG_neutral_rougeL = float(rouge(RQ_resp_female, OG_resp)['rougeL_fmeasure'])
        OG_neutral_rougeLsum = float(rouge(RQ_resp_female, OG_resp)['rougeLsum_fmeasure'])
        
        # female_male_rougeL = float(rouge(RQ_resp_female, RQ_resp_male)['rougeL_fmeasure'])
        # female_male_rougeLsum = float(rouge(RQ_resp_female, RQ_resp_male)['rougeLsum_fmeasure'])

        rouge_results.append([query, OG_male_rougeL, OG_male_rougeLsum, OG_female_rougeL, OG_female_rougeLsum, OG_neutral_rougeL, OG_neutral_rougeLsum])

    rouge_df = pd.DataFrame(rouge_results, columns=["Query", "OG_male_rougeL", "OG_male_rougeLsum", "OG_female_rougeL", "OG_female_rougeLsum", "OG_neutral_rougeL", "OG_neutral_rougeLsum"])
    rouge_df.to_csv("../Results/rouge_scores_" + LLM + ".csv", index=False)

In [19]:
import pandas as pd

gemma_OG = pd.read_csv("../Data/Gemma_2b_response.csv")
gemma_RQ = pd.read_csv("../Data/Gemma_2b_response_RQ.csv")
compute_rouge(LLM="Gemma", resp_col_name="gemma_2b_resp", og_resp_df=gemma_OG, RQ_resp_df=gemma_RQ)

gpt_OG = pd.read_csv("../Data/GPT_response.csv")
gpt_RQ = pd.read_csv("../Data/GPT_response_RQ.csv")
compute_rouge(LLM="GPT", resp_col_name="GPT", og_resp_df=gpt_OG, RQ_resp_df=gpt_RQ)

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

### Semantic approach

In [50]:
# santiy check

from transformers import pipeline
import pandas as pd

text_examples = [
    "sono felice",
    "La deforestazione è un male",
    "Questo è spaventoso",
    "Questa è una mela marcia",
    "mostro nero",
    "lei è una bellezza terribile"
]
emo_classifier = pipeline("text-classification",model='MilaNLProc/feel-it-italian-emotion',top_k=4)
sent_classifier = pipeline("text-classification",model='MilaNLProc/feel-it-italian-sentiment',top_k=2)

emo_scores = emo_classifier(text_examples)
sent_scores = sent_classifier(text_examples)

EP_semantic_results = {
    "text": [],
    "positive": [],
    "negative": [],
    "sent_sum": [],
    "joy": [],
    "sadness": [],
    "anger": [],
    "fear": [],
    "emo_sum": []
}
print(emo_scores)
for text, sent_score, emo_score in zip(text_examples, sent_scores, emo_scores):
    EP_semantic_results["text"].append(text)
    sent_sum = 0
    emo_sum = 0
    for sent_dict in sent_score:
        EP_semantic_results[sent_dict["label"]].append(sent_dict["score"])
        sent_sum += sent_dict["score"]
    EP_semantic_results["sent_sum"].append(sent_sum)
    for emo_dict in emo_score:
        EP_semantic_results[emo_dict["label"]].append(emo_dict["score"])
        emo_sum += emo_dict["score"]
    EP_semantic_results["emo_sum"].append(emo_sum)

print(EP_semantic_results)
df = pd.DataFrame.from_dict(EP_semantic_results)
df

Device set to use cpu
Device set to use cpu


[[{'label': 'joy', 'score': 0.9990108013153076}, {'label': 'sadness', 'score': 0.00041838770266622305}, {'label': 'fear', 'score': 0.0004000047920271754}, {'label': 'anger', 'score': 0.00017075150390155613}], [{'label': 'sadness', 'score': 0.9292973279953003}, {'label': 'anger', 'score': 0.0687011107802391}, {'label': 'fear', 'score': 0.0017922078259289265}, {'label': 'joy', 'score': 0.0002093376824632287}], [{'label': 'fear', 'score': 0.9970308542251587}, {'label': 'sadness', 'score': 0.0010273955995216966}, {'label': 'joy', 'score': 0.0010075614554807544}, {'label': 'anger', 'score': 0.0009342431440018117}], [{'label': 'sadness', 'score': 0.7056401968002319}, {'label': 'anger', 'score': 0.2918979525566101}, {'label': 'fear', 'score': 0.0020267446525394917}, {'label': 'joy', 'score': 0.0004351035167928785}], [{'label': 'joy', 'score': 0.9361270070075989}, {'label': 'sadness', 'score': 0.035196058452129364}, {'label': 'fear', 'score': 0.02076711133122444}, {'label': 'anger', 'score': 0

Unnamed: 0,text,positive,negative,sent_sum,joy,sadness,anger,fear,emo_sum
0,sono felice,0.999726,0.000274,1.0,0.999011,0.000418,0.000171,0.0004,1.0
1,La deforestazione è un male,0.000216,0.999784,1.0,0.000209,0.929297,0.068701,0.001792,1.0
2,Questo è spaventoso,0.000217,0.999783,1.0,0.001008,0.001027,0.000934,0.997031,1.0
3,Questa è una mela marcia,0.000219,0.999781,1.0,0.000435,0.70564,0.291898,0.002027,1.0
4,mostro nero,0.000346,0.999654,1.0,0.936127,0.035196,0.00791,0.020767,1.0
5,lei è una bellezza terribile,0.000226,0.999774,1.0,0.998647,0.001071,0.000144,0.000137,1.0


In [51]:
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm

emo_classifier = pipeline("text-classification",model='MilaNLProc/feel-it-italian-emotion',top_k=4)
sent_classifier = pipeline("text-classification",model='MilaNLProc/feel-it-italian-sentiment',top_k=2)

def get_semantic_EP(texts, text_name):
    emo_scores = emo_classifier(texts)
    sent_scores = sent_classifier(texts)

    EP_semantic_results = {
        text_name: [],
        "positive": [],
        "negative": [],
        "joy": [],
        "sadness": [],
        "anger": [],
        "fear": [],
    }

    for text, sent_score, emo_score in tqdm(zip(texts, sent_scores, emo_scores), total=len(texts)):
        EP_semantic_results[text_name].append(text)
        for sent_dict in sent_score:
            EP_semantic_results[sent_dict["label"]].append(sent_dict["score"])
        for emo_dict in emo_score:
            EP_semantic_results[emo_dict["label"]].append(emo_dict["score"])
    df = pd.DataFrame.from_dict(EP_semantic_results)
    return df

Device set to use cpu
Device set to use cpu


In [57]:
queries = pd.read_csv("../Data/SIGIR_queries_IT.csv")
queries_df = get_semantic_EP(queries["Query"].tolist(), "Query")
queries_df["QID"] = queries["QID"]
queries_df = queries_df.loc[:, ["QID", "positive", "negative", "joy", "sadness", "anger", "fear"]]
queries_df.to_csv("../Results/SIGIR_semantic_queryWise_queryEP.csv", index=False)
queries_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,positive,negative,joy,sadness,anger,fear
0,qGEN1,0.000538,0.999462,0.003824,0.991401,0.001317,0.003458
1,qGEN2,0.999281,0.000719,0.994618,0.001798,0.000308,0.003275
2,qGEN3,0.002662,0.997338,0.571502,0.369252,0.004709,0.054537
3,qGEN4,0.000338,0.999662,0.003196,0.117101,0.287267,0.592436
4,qGEN5,0.000252,0.999748,0.000818,0.993707,0.003692,0.001782


In [55]:
bing_resp = pd.read_csv("../Data/SIGIR_bing_resp.csv")
bing_df = get_semantic_EP(bing_resp["Resp"].tolist(), "Resp")
bing_df["QID"] = bing_resp["QID"]
bing_df["rank"] = bing_resp["Rank"]
bing_df["IAS"] = ["Bing"]*len(bing_resp)
bing_df = bing_df.loc[:, ["QID", "rank", "IAS", "positive", "negative", "joy", "sadness", "anger", "fear"]]
bing_df.to_csv("../Results/SIGIR_semantic_queryWise_bingEP.csv", index=False)
bing_df.head()

  0%|          | 0/874 [00:00<?, ?it/s]

Unnamed: 0,QID,rank,IAS,positive,negative,joy,sadness,anger,fear
0,qGEN1,1,Bing,0.946896,0.053104,0.991536,0.005931,0.000301,0.002232
1,qGEN1,2,Bing,0.02517,0.97483,0.313124,0.678771,0.002188,0.005918
2,qGEN1,3,Bing,0.000221,0.999779,0.000316,0.612202,0.385068,0.002414
3,qGEN1,4,Bing,0.962078,0.037922,0.997938,0.000693,8.1e-05,0.001288
4,qGEN1,5,Bing,0.99902,0.00098,0.995422,0.003079,0.000194,0.001305


In [56]:
gemma_resp = pd.read_csv("../Data/SIGIR_gemma_resp.csv")
gemma_df = get_semantic_EP(gemma_resp["Resp"].tolist(), "Resp")
gemma_df["QID"] = gemma_resp["QID"]
gemma_df["IAS"] = ["Gemma"]*len(gemma_resp)
gemma_df = gemma_df.loc[:, ["QID", "IAS", "positive", "negative", "joy", "sadness", "anger", "fear"]]
gemma_df.to_csv("../Results/SIGIR_semantic_queryWise_gemmaEP.csv", index=False)
gemma_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,positive,negative,joy,sadness,anger,fear
0,qGEN1,Gemma,0.000426,0.999574,0.000447,0.998147,0.000668,0.000738
1,qGEN2,Gemma,0.999763,0.000237,0.998516,0.001097,8.3e-05,0.000305
2,qGEN3,Gemma,0.100981,0.899019,0.320656,0.664181,0.003621,0.011543
3,qGEN4,Gemma,0.000654,0.999346,0.000461,0.998269,0.0003,0.00097
4,qGEN5,Gemma,0.000412,0.999588,0.000396,0.997958,0.000765,0.000881


In [None]:
gpt_resp = pd.read_csv("../Data/SIGIR_gpt_resp.csv")
gpt_df = get_semantic_EP(gpt_resp["Resp"].tolist(), "Resp")
gpt_df["QID"] = gpt_resp["QID"]
gpt_df["IAS"] = ["GPT"]*len(gpt_resp)
gpt_df = gpt_df.loc[:, ["QID", "IAS", "positive", "negative", "joy", "sadness", "anger", "fear"]]
gpt_df.to_csv("../Results/SIGIR_semantic_queryWise_gptEP.csv", index=False)
gpt_df.head()