## Adjusting Lexicons

### EmoLex - Italian

In [25]:
import re
import pandas as pd

# Read the tab-separated Italian NRC EmoLex file. The handle is closed by the
# context manager even if reading fails. The encoding is pinned so accented
# entries decode identically on every platform instead of following the OS
# locale (assumes the file is UTF-8 — TODO confirm against the downloaded copy).
with open("../Data/Italian-NRC-EmoLex.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

emolex = []

# lines[0] is skipped (treated as the header row).
for line in lines[1:]:
    results = line.replace("\n", "").split("\t")
    # The last field may hold several ", "-separated words; emit one row per
    # word, each carrying the same preceding fields.
    words = results[-1].split(", ")
    if len(words) > 1:
        for word in words:
            emolex.append(results[:-1] + [word])
    else:
        emolex.append(results)

emolex = pd.DataFrame(emolex, columns=["English word", "Anger", "Anticipation", "Disgust", "Fear", "Joy", "Negative", "Positive", "Sadness", "Surprise", "Trust", "word"])
# Keep the word from the last field plus the ten score columns; drop "English word".
emolex = emolex.loc[: , [ "word", "Anger", "Anticipation", "Disgust", "Fear", "Joy", "Negative", "Positive", "Sadness", "Surprise", "Trust"]]
emolex.to_csv("../Data/IT_emolex.csv", index=False)

emolex.head()

Unnamed: 0,word,Anger,Anticipation,Disgust,Fear,Joy,Negative,Positive,Sadness,Surprise,Trust
0,sconcertato,0,0,0,0,0,0,0,0,0,0
1,abaco,0,0,0,0,0,0,0,0,0,1
2,abbandono,0,0,0,1,0,1,0,1,0,0
3,abbandonato,1,0,0,1,0,1,0,1,0,0
4,abbandono,1,0,0,1,0,1,0,1,1,0


### Sentiment Lexicon - DPLp-IT

In [4]:
import re
import pandas as pd

# Read the DPLp-IT lexicon. The context manager closes the handle on any exit
# path; the encoding is pinned so decoding does not depend on the OS locale
# (assumes the file is UTF-8 — TODO confirm against the downloaded copy).
with open("../Data/DPLp-IT_lrec2016.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

sentiment_lexicon = []

for line in lines:
    # Newlines are removed first, so the separators left to split on are
    # "::", tab and ",".
    temp = line.replace("\n", "")
    results = re.split("::|\t|,", temp)
    sentiment_lexicon.append(results)

sentiment_lexicon = pd.DataFrame(sentiment_lexicon, columns=["lemma", "pos", "pos_score", "neg_score", "neu_score"])
sentiment_lexicon.to_csv("../Data/IT_sentiment_lexicon.csv", index=False)

sentiment_lexicon.head()

Unnamed: 0,lemma,pos,pos_score,neg_score,neu_score
0,essere,v,0.3675422,0.4671061,0.16535169
1,avere,v,0.27894887,0.60701084,0.1140403
2,fare,v,0.4421229,0.40804362,0.14983346
3,stare,v,0.31763914,0.4706841,0.21167673
4,dire,v,0.44390386,0.3657567,0.1903395


In [6]:
# Distinct lemmas, total rows, and the set of POS tags in the sentiment lexicon.
print(sentiment_lexicon["lemma"].unique().size)
print(sentiment_lexicon.shape[0])
print(sentiment_lexicon["pos"].unique())

# ['v' 's' 'a' 'b' 'h']: v: verb, s: noun, a: adjective, b: adverb, h: hashtag 

65273
75021
['v' 's' 'a' 'b' 'h']


In [7]:
import pandas as pd

# Reload the exported CSV and display its row count.
sent_lexicon = pd.read_csv(filepath_or_buffer="../Data/IT_sentiment_lexicon.csv")
sent_lexicon.shape[0]

75021

### Emotion Lexicon - ItEM

In [39]:
import pandas as pd

# Tab-separated ItEM file; the displayed columns are emotion, word, cosine.
emotion = pd.read_csv("../Data/ItEM.FBNEWS15.cos", delimiter="\t")
print(emotion.shape[0])
emotion.head()

239946


Unnamed: 0,emotion,word,cosine
0,gioia,festoso-a,0.647874
1,gioia,euforico-a,0.622582
2,gioia,esilarante-a,0.622579
3,gioia,gaio-a,0.617334
4,gioia,divertito-a,0.614806


In [40]:
# test_word = emotion.word.unique().tolist()[0]
# print(test_word.split("-"))
# print(emotion.loc[(emotion["word"]==test_word) & (emotion["emotion"]=="rabbia")]["cosine"].tolist()[0])

from tqdm.notebook import tqdm
import spacy

nlp = spacy.load("it_core_news_sm")

# English emotion name -> label used in the "emotion" column of the ItEM file.
ENG_IT_emotions = {
    "joy": "gioia",
    "anger": "rabbia",
    "surprise": "sorpresa",
    "disgust": "disgusto",
    "fear": "paura",
    "sadness": "tristezza",
    "trust": "fiducia",
    "anticipation": "attese"
}

# Fixed output order of the emotion columns.
ENG_emotions = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]

# Single pass over the lexicon: word -> {Italian emotion label -> first cosine seen}.
# This replaces nine boolean scans of the whole frame per word. Dict insertion
# order follows first appearance, i.e. the order of emotion["word"].unique().
cosine_by_word = {}
for lex_word, it_emotion, cosine in zip(emotion["word"], emotion["emotion"], emotion["cosine"]):
    cosine_by_word.setdefault(lex_word, {}).setdefault(it_emotion, cosine)

results = []
for word, word_cosines in tqdm(cosine_by_word.items()):
    # Entries look like "<word>-<pos>". Split on the LAST hyphen only, so a
    # word that itself contains "-" does not break the unpacking.
    actual_word, pos = word.rsplit("-", 1)
    lemma = nlp(actual_word)[0].lemma_.lower()

    # Cosine per emotion in ENG_emotions order; 0 when the word has no row for it.
    temp = [word_cosines.get(ENG_IT_emotions[eng_emo], 0) for eng_emo in ENG_emotions]
    norm_factor = sum(temp)

    # Normalise so the eight scores sum to 1. A zero total (no mapped emotion
    # found) would divide by zero, so such words get an all-zero profile.
    if norm_factor == 0:
        temp_new = [0.0] * len(temp)
    else:
        temp_new = [t/norm_factor for t in temp]
    emo_norm_score = [actual_word, lemma, pos] + temp_new
    results.append(emo_norm_score)

emotion_cosine = pd.DataFrame(results, columns=["actual_word", "lemma", "pos", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
emotion_cosine.to_csv("../Data/IT_emotion_lexicon_normalised_new.csv", index=False)
emotion_cosine.head()

  0%|          | 0/29999 [00:00<?, ?it/s]

Unnamed: 0,actual_word,lemma,pos,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,festoso,festoso,a,0.288792,0.092691,0.103564,0.089147,0.098167,0.148815,0.095408,0.083415
1,euforico,euforico,a,0.211809,0.106492,0.129965,0.100949,0.113435,0.135096,0.105778,0.096476
2,esilarante,esilarare,a,0.246015,0.094562,0.094376,0.128638,0.112556,0.137287,0.093584,0.092982
3,gaio,gaio,a,0.248166,0.097285,0.08568,0.096044,0.118533,0.129797,0.133886,0.090609
4,divertito,divertire,a,0.194127,0.114506,0.142627,0.103522,0.108127,0.137351,0.10421,0.09553


In [41]:
# POS tags present in the normalised emotion lexicon.
emotion_cosine["pos"].unique()

# ['a', 'v', 's'], a: adjective, v: verb, s: noun

array(['a', 'v', 's'], dtype=object)

In [42]:
# Number of rows in the normalised emotion lexicon.
emotion_cosine.shape[0]

29999

In [44]:
# Spot check: sum of the score columns of the row at position 50
# (expected to be 1 for a normalised row).
row_scores = emotion_cosine.drop(["actual_word", "lemma", "pos"], axis=1).iloc[50]
sum(row_scores)

1.0

## Computing emotion profile vectors

### Lexicon-based

In [17]:
import spacy
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords_list = stopwords.words('italian')

emotion_lexicon = pd.read_csv("../Data/IT_emotion_lexicon_normalised_new.csv")
sentiment_lexicon = pd.read_csv("../Data/IT_sentiment_lexicon.csv")
nlp = spacy.load("it_core_news_sm")
emotion_names = ["joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"]  


def _first_row_lookup(lexicon, key_columns, value_columns):
    """Map each key tuple to the value tuple of its FIRST row in `lexicon`.

    Keeping only the first row per key mirrors taking `.values[0]` of a
    boolean-filtered frame, while making each later lookup a dict access
    instead of a scan over every lexicon row.
    """
    lookup = {}
    n_keys = len(key_columns)
    for row in lexicon[key_columns + value_columns].itertuples(index=False, name=None):
        lookup.setdefault(row[:n_keys], row[n_keys:])
    return lookup


# Built once per cell run. Re-run this cell if either lexicon frame is
# reloaded or modified, otherwise the lookups go stale.
_sentiment_lookup = _first_row_lookup(sentiment_lexicon, ["lemma", "pos"], ["pos_score", "neg_score", "neu_score"])
_emotion_lookup = _first_row_lookup(emotion_lexicon, ["actual_word", "lemma", "pos"], emotion_names)


def calc_emotion_profile(text):
    """Return the lexicon-based emotion profile of `text` as a list of 11 values.

    The text is parsed with the spaCy pipeline `nlp`; only alphabetic tokens
    are scored. For every such token:

    * sentiment: (lemma, pos) is looked up in the sentiment lexicon; a token
      without an entry contributes positive=0, negative=0, neutral=1;
    * emotion: (lower-cased word, lemma, pos) is looked up in the emotion
      lexicon; a token without an entry contributes nothing to the emotion
      averages.

    `pos` is the lower-cased first character of the token's fine-grained tag.

    Returns:
        list: values ordered positive, negative, neutral, then `emotion_names`
        order. Each value is the mean over contributing tokens times 100,
        rounded to 2 decimals, or 0.00 when no token contributed.
    """
    scores = {name: [] for name in ["positive", "negative", "neutral"] + emotion_names}

    doc = nlp(text)
    for token in doc:
        # if token.is_alpha == True and token.text.lower() not in stopwords_list:
        if not token.is_alpha:
            continue

        word = token.text.lower()
        lemma = token.lemma_.lower()
        # Slicing (rather than indexing) yields "" for an empty tag instead of
        # raising IndexError; "" simply matches no lexicon entry.
        pos = token.tag_[:1].lower()

        sentiment = _sentiment_lookup.get((lemma, pos))
        if sentiment is not None:
            pos_score, neg_score, neu_score = sentiment
            scores["positive"].append(pos_score)
            scores["negative"].append(neg_score)
            scores["neutral"].append(neu_score)
        else:
            # Unknown words count as fully neutral.
            scores["positive"].append(0)
            scores["negative"].append(0)
            scores["neutral"].append(1)

        emotions = _emotion_lookup.get((word, lemma, pos))
        if emotions is not None:
            for emo, value in zip(emotion_names, emotions):
                scores[emo].append(value)

    emo_profile = []
    for values in scores.values():
        if len(values) > 0:
            emo_profile.append(round(np.average(values)*100, 2))
        else:
            emo_profile.append(0.00)

    return emo_profile

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrishitachakra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
## sanity check of EP computation

text_examples = [
    "sono felice",
    "La deforestazione è un male",
    "Questo è spaventoso",
    "Questa è una mela marcia"
]

results = []

for text in text_examples:
    # Score the whole sentence first, then each space-separated word on its own.
    for fragment in [text, *text.split(" ")]:
        profile = calc_emotion_profile(fragment)
        # The first three entries are the sentiment scores, the rest the emotion scores.
        results.append([fragment, *profile, sum(profile[:3]), sum(profile[3:])])

results_df = pd.DataFrame(
    results,
    columns=[
        "text",
        "positive", "negative", "neutral",
        "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation",
        "sent_sum", "emo_sum",
    ],
)
results_df

Unnamed: 0,text,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation,sent_sum,emo_sum
0,sono felice,45.55,36.88,17.57,15.48,13.02,9.12,8.87,12.84,14.01,17.96,8.71,100.0,100.01
1,sono,36.75,46.71,16.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
2,felice,54.34,27.06,18.6,15.48,13.02,9.12,8.87,12.84,14.01,17.96,8.71,100.0,100.01
3,La deforestazione è un male,17.21,20.17,62.62,11.76,12.86,9.15,12.53,15.04,13.47,13.91,11.29,100.0,100.01
4,La,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
5,deforestazione,23.09,18.56,58.35,11.15,11.74,8.18,14.2,17.58,11.5,12.49,13.17,100.0,100.01
6,è,36.75,46.71,16.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
7,un,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
8,male,26.74,50.32,22.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0
9,Questo è spaventoso,21.04,28.52,50.45,8.9,7.9,10.01,13.53,20.91,12.85,11.87,14.03,100.01,100.0


In [19]:
# trial run

import pandas as pd

gemma = pd.read_csv("../Data/SIGIR_gemma_resp.csv")

# Profile the first response and sum its sentiment part and emotion part separately.
test = gemma["Resp"][0]
emo_vals = calc_emotion_profile(test)
print(emo_vals)
sentiment_part, emotion_part = emo_vals[:3], emo_vals[3:]
print(sum(sentiment_part))
sum(emotion_part)

[np.float64(16.55), np.float64(29.55), np.float64(53.9), np.float64(10.95), np.float64(12.52), np.float64(13.94), np.float64(9.95), np.float64(11.72), np.float64(12.96), np.float64(15.94), np.float64(12.02)]
100.0


np.float64(99.99999999999999)

In [20]:
# sanity check with Inside Out SERP

import pandas as pd

InsideOut = pd.read_csv("../Data/QL_IT_oct_8_exp_on_results.csv")

# One output row per input row: query, text sample, position, then the 11 lexicon scores.
IO_emotional_profile = []
for _, serp_row in tqdm(InsideOut.iterrows(), total=InsideOut.shape[0]):
    user_query = serp_row["user_query"]
    sample_text = serp_row["TextSample"]
    serp_position = serp_row["position"]
    IO_emotional_profile.append([user_query, sample_text, serp_position, *calc_emotion_profile(sample_text)])

IO_columns = [
    "Query", "TextSample", "position",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation",
]
IO_EP_df = pd.DataFrame(IO_emotional_profile, columns=IO_columns)
IO_EP_df.to_csv("../Results/InsideOut_lexicon_EP.csv", index=False)
IO_EP_df.head()

  0%|          | 0/218 [00:00<?, ?it/s]

Unnamed: 0,Query,TextSample,position,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,Cosa Ã¨ Ä¾isola di plastica,Cos'è la plastica e come si produce. La plasti...,1,16.43,13.85,69.71,11.0,12.51,12.16,11.13,12.8,11.0,16.87,12.53
1,Cosa Ã¨ Ä¾isola di plastica,Pacific Trash Vortex - Wikipedia. Il Pacific T...,2,15.86,15.47,68.67,12.36,12.6,11.87,11.78,13.31,11.98,13.26,12.84
2,Cosa Ã¨ Ä¾isola di plastica,isola di plastica che cos'è e perchè non può e...,3,15.07,16.05,68.87,12.62,12.91,12.4,10.21,13.41,12.08,14.15,12.22
3,Cosa Ã¨ Ä¾isola di plastica,Cosa sono e come si creano le isole di plastic...,4,14.21,14.79,71.01,12.73,12.08,12.67,10.16,12.76,12.97,13.39,13.23
4,Cosa Ã¨ Ä¾isola di plastica,Saldatura plastica fai da te - YouTube. Creazi...,5,13.74,12.93,73.33,12.12,12.04,14.0,10.13,12.96,11.89,13.96,12.91


In [21]:
import pandas as pd
from tqdm.notebook import tqdm

queries = pd.read_csv("../Data/SIGIR_queries_IT.csv")

# One output row per query: the QID followed by the 11 lexicon scores.
query_emotional_profile = []
for _, query_row in tqdm(queries.iterrows(), total=queries.shape[0]):
    query_id = query_row["QID"]
    query_text = query_row["Query"]
    query_emotional_profile.append([query_id, *calc_emotion_profile(query_text)])

query_EP_columns = [
    "QID",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation",
]
query_EP_df = pd.DataFrame(query_emotional_profile, columns=query_EP_columns)
query_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_queryEP.csv", index=False)
query_EP_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,qGEN1,10.34,12.81,76.85,12.61,15.95,13.27,8.81,10.8,13.9,15.2,9.45
1,qGEN2,6.73,8.1,85.18,13.04,16.93,11.75,10.41,12.07,13.54,12.8,9.47
2,qGEN3,6.33,7.54,86.14,12.61,15.95,13.27,8.81,10.8,13.9,15.2,9.45
3,qGEN4,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,qGEN5,8.02,8.96,83.02,12.77,14.55,13.36,11.04,11.77,13.69,11.13,11.69


In [22]:
bing = pd.read_csv("../Data/SIGIR_bing_resp.csv")

# One output row per Bing result: QID, rank, the constant system label, then the 11 lexicon scores.
bing_emotion_profile = []
for _, bing_row in tqdm(bing.iterrows(), total=bing.shape[0]):
    query_id = bing_row["QID"]
    query_text = bing_row["Query"]  # read as in the original cell; not used below
    response_text = bing_row["Resp"]
    result_rank = bing_row["Rank"]
    bing_emotion_profile.append([query_id, result_rank, "Bing", *calc_emotion_profile(response_text)])

bing_EP_columns = [
    "QID", "rank", "IAS",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation",
]
bing_EP_df = pd.DataFrame(bing_emotion_profile, columns=bing_EP_columns)
bing_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_bingEP.csv", index=False)
bing_EP_df.head()

  0%|          | 0/1756 [00:00<?, ?it/s]

Unnamed: 0,QID,rank,IAS,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,qGEN1,1,Bing,11.97,12.82,75.21,13.05,12.82,13.4,10.77,12.51,11.66,13.03,12.77
1,qGEN1,2,Bing,11.84,9.61,78.55,11.55,12.65,12.21,10.18,12.46,10.54,15.66,14.75
2,qGEN1,3,Bing,15.62,15.53,68.85,12.57,13.08,11.3,11.76,13.26,12.44,13.69,11.91
3,qGEN1,4,Bing,7.26,7.31,85.42,12.25,12.01,11.78,11.82,13.12,10.7,13.88,14.44
4,qGEN1,5,Bing,15.02,12.1,72.88,12.44,11.98,11.78,10.98,13.16,12.14,14.26,13.25


In [23]:
gemma = pd.read_csv("../Data/SIGIR_gemma_resp.csv")

# One output row per Gemma response: QID, the constant system label, then the 11 lexicon scores.
gemma_emotion_profile = []
for _, gemma_row in tqdm(gemma.iterrows(), total=gemma.shape[0]):
    query_id = gemma_row["QID"]
    query_text = gemma_row["Query"]  # read as in the original cell; not used below
    response_text = gemma_row["Resp"]
    gemma_emotion_profile.append([query_id, "Gemma", *calc_emotion_profile(response_text)])

gemma_EP_columns = [
    "QID", "IAS",
    "positive", "negative", "neutral",
    "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation",
]
gemma_EP_df = pd.DataFrame(gemma_emotion_profile, columns=gemma_EP_columns)
gemma_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_gemmaEP.csv", index=False)
gemma_EP_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,qGEN1,Gemma,16.55,29.55,53.9,10.95,12.52,13.94,9.95,11.72,12.96,15.94,12.02
1,qGEN2,Gemma,15.56,12.64,71.8,13.05,13.87,10.79,10.42,12.45,12.55,16.46,10.4
2,qGEN3,Gemma,14.1,13.71,72.19,12.8,14.64,12.49,10.65,11.13,12.32,15.3,10.65
3,qGEN4,Gemma,16.01,27.51,56.48,12.31,13.49,12.93,6.71,13.04,11.8,17.11,12.61
4,qGEN5,Gemma,17.53,26.91,55.56,11.39,12.08,13.35,10.38,12.14,12.87,15.01,12.77


In [48]:
gpt = pd.read_csv("../Data/SIGIR_gpt_resp.csv")

# NOTE(review): this notebook reads the same file through two different
# response columns ("GPT" here, "Resp" in the semantic-approach cell). Keep
# "GPT" when it exists and fall back to "Resp", so the cell runs against
# either export of the file — confirm which column name is current.
gpt_response_col = "GPT" if "GPT" in gpt.columns else "Resp"

# One output row per GPT response: QID, the constant system label, then the 11 lexicon scores.
gpt_emotion_profile = []
for _, row in tqdm(gpt.iterrows(), total=len(gpt)):
    qid = row["QID"]
    resp = row[gpt_response_col]
    emo_profile = calc_emotion_profile(resp)
    gpt_emotion_profile.append([qid, "GPT"]+emo_profile)
    
gpt_EP_df = pd.DataFrame(gpt_emotion_profile, columns=["QID", "IAS", "positive", "negative", "neutral", "joy", "anger", "surprise", "disgust", "fear", "sadness", "trust", "anticipation"])
gpt_EP_df.to_csv("../Results/SIGIR_lexicon_queryWise_gptEP.csv", index=False)
gpt_EP_df.head()

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,query,response,LLM,positive,negative,neutral,joy,anger,surprise,disgust,fear,sadness,trust,anticipation
0,Chi era il padre di Micerino ?,Il padre di Micerino era Chefren.,GPT,25.31,30.15,44.54,12.53,13.28,12.68,11.66,12.12,12.82,13.11,11.81
1,Il figlio di Chefren,"Il figlio di Chefren era Cheope, il faraone ch...",GPT,25.94,20.34,53.72,12.37,12.61,12.4,12.34,12.59,12.34,12.72,12.62
2,Il padre di Micerino,"Il padre di Micerino era Chefren, faraone dell...",GPT,34.52,20.13,45.35,12.83,12.71,12.41,12.23,12.34,12.42,12.6,12.46
3,Micerino,Micerino è stato un faraone dell'Antico Egitto...,GPT,26.45,22.48,51.07,12.37,12.5,12.3,12.43,12.6,12.26,12.88,12.66
4,Quanto e alta la tomba di Cheope?,"La tomba di Cheope, la più grande delle tre pi...",GPT,24.58,28.13,47.29,12.47,12.72,13.27,11.81,12.35,12.51,12.47,12.4


#### ROUGE

In [16]:
from torchmetrics.text.rouge import ROUGEScore
from tqdm.notebook import tqdm

def compute_rouge(LLM, resp_col_name, og_resp_df, RQ_resp_df):
    """Score each original response against its gendered rewritten-query responses.

    For every row of `og_resp_df`, the responses whose "Original Query" equals
    that row's "Query" are looked up in `RQ_resp_df` for the genders "male",
    "female" and "neutral". Each is scored against the original response with
    ROUGE-L and ROUGE-Lsum (F-measure), with the rewritten response passed as
    the first argument and the original response as the second.

    Args:
        LLM: label used in the output file name.
        resp_col_name: name of the response column in both frames.
        og_resp_df: frame with columns "Query" and `resp_col_name`.
        RQ_resp_df: frame with columns "Gender", "Original Query" and
            `resp_col_name`.

    Side effects:
        Writes "../Results/rouge_scores_<LLM>.csv" with one row per original
        query. Returns nothing.

    Raises:
        IndexError: if a query has no rewritten response for one of the genders.
    """
    genders = ("male", "female", "neutral")

    # One sub-frame per gender, filtered once up front.
    rq_by_gender = {gender: RQ_resp_df.loc[RQ_resp_df["Gender"]==gender] for gender in genders}

    rouge = ROUGEScore(rouge_keys=('rougeL', 'rougeLsum'))

    rouge_results = []

    for _, row in tqdm(og_resp_df.iterrows(), total=len(og_resp_df)):
        query = row["Query"]
        OG_resp = row[resp_col_name]

        result_row = [query]
        for gender in genders:
            gender_df = rq_by_gender[gender]
            RQ_resp = gender_df.loc[gender_df["Original Query"]==query][resp_col_name].values[0]
            # Score each pair once and read both measures from the same result.
            # Each gender's scores land in the column named after that gender
            # (the female and neutral columns used to hold each other's scores).
            pair_scores = rouge(RQ_resp, OG_resp)
            result_row.append(float(pair_scores['rougeL_fmeasure']))
            result_row.append(float(pair_scores['rougeLsum_fmeasure']))

        rouge_results.append(result_row)

    rouge_df = pd.DataFrame(rouge_results, columns=["Query", "OG_male_rougeL", "OG_male_rougeLsum", "OG_female_rougeL", "OG_female_rougeLsum", "OG_neutral_rougeL", "OG_neutral_rougeLsum"])
    rouge_df.to_csv("../Results/rouge_scores_" + LLM + ".csv", index=False)

In [19]:
import pandas as pd

# Gemma: original-query responses vs. rewritten-query (RQ) responses.
# compute_rouge writes ../Results/rouge_scores_Gemma.csv.
gemma_OG = pd.read_csv("../Data/Gemma_2b_response.csv")
gemma_RQ = pd.read_csv("../Data/Gemma_2b_response_RQ.csv")
compute_rouge(LLM="Gemma", resp_col_name="gemma_2b_resp", og_resp_df=gemma_OG, RQ_resp_df=gemma_RQ)

# GPT: same comparison; compute_rouge writes ../Results/rouge_scores_GPT.csv.
gpt_OG = pd.read_csv("../Data/GPT_response.csv")
gpt_RQ = pd.read_csv("../Data/GPT_response_RQ.csv")
compute_rouge(LLM="GPT", resp_col_name="GPT", og_resp_df=gpt_OG, RQ_resp_df=gpt_RQ)

  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/110 [00:00<?, ?it/s]

### Semantic approach

In [28]:
# sanity check

from transformers import pipeline
import pandas as pd
import re

# FEEL-IT text classifiers. top_k is set to 4 for the emotion model and 2 for
# the sentiment model, matching the four emotion and two sentiment columns
# collected below.
emo_classifier = pipeline(
    task="text-classification",
    model="MilaNLProc/feel-it-italian-emotion",
    top_k=4,
)
sent_classifier = pipeline(
    task="text-classification",
    model="MilaNLProc/feel-it-italian-sentiment",
    top_k=2,
)

text_examples = [
    "sono felice",
    "La deforestazione è un male",
    "Questo è spaventoso",
    "Questa è una mela marcia",
    "mostro nero?",
    "lei è una bellezza terribile",
    "L'Cleanup Ocean Ã¨ un movimento internazionale che si Ã¨ formato per combattere la pollution del mare.",
    """La (tigre) 

    bianca Ã¨ una rara variante genetica della tigre reale (Panthera tigris), 
    
    caratterizzata da una particolare colorazione del mantello causata da una mutazione genetica. Questi bellissimi felini sono spesso oggetto di ammirazione e curiositÃ  per la loro singolare bellezza. Vive principalmente in India e in alcune parti del sud-est asiatico."""
]

def clean_text(text):
    """Normalise whitespace and strip most punctuation from `text`.

    Steps, in order:
      1. every run of newlines (each optionally preceded by one whitespace
         character) becomes a single ".";
      2. every run of tabs becomes one space;
      3. every character that is not a word character or one of
         - . ' ! ? ( ) { } becomes a space;
      4. the brackets ( ) [ ] { } are deleted;
      5. runs of two or more whitespace characters collapse to one space;
      6. leading and trailing whitespace is removed.

    Patterns are raw strings so regex escapes such as \\s and \\w are not
    treated as (invalid) Python string escapes.

    Note: step 1 inserts "." unconditionally, so a line that already ends
    with "." yields "..".
    """
    text = re.sub(r"(\s?\n)+", ".", text)  # newline runs -> "."
    text = re.sub(r"\t+", " ", text)  # tab runs -> single space
    # ( ) { } are kept here, as the original character class kept them, so
    # that step 4 deletes them without leaving a space behind.
    text = re.sub(r"[^\w\-.'!?(){}]", " ", text)
    text = re.sub(r"[()\[\]{}]", "", text)  # delete brackets of any kind
    text = re.sub(r"\s{2,}", " ", text)  # collapse repeated whitespace
    return text.strip()

cleaned_text_examples = list(map(clean_text, text_examples))

emo_scores = emo_classifier(cleaned_text_examples)
sent_scores = sent_classifier(cleaned_text_examples)

# Column-oriented accumulator: one list per output column.
EP_semantic_results = {
    column: []
    for column in (
        "text", "cleaned_text",
        "positive", "negative", "sent_sum",
        "joy", "sadness", "anger", "fear", "emo_sum",
    )
}

for raw_text, cleaned, sent_score, emo_score in zip(text_examples, cleaned_text_examples, sent_scores, emo_scores):
    EP_semantic_results["text"].append(raw_text)
    EP_semantic_results["cleaned_text"].append(cleaned)
    # Label scores are stored as percentages; the *_sum columns accumulate the
    # raw (unscaled) scores.
    for label_scores, sum_column in ((sent_score, "sent_sum"), (emo_score, "emo_sum")):
        running_total = 0
        for entry in label_scores:
            EP_semantic_results[entry["label"]].append(entry["score"]*100)
            running_total += entry["score"]
        EP_semantic_results[sum_column].append(running_total)

df = pd.DataFrame.from_dict(EP_semantic_results)
df

Device set to use cpu
Device set to use cpu


Unnamed: 0,text,cleaned_text,positive,negative,sent_sum,joy,sadness,anger,fear,emo_sum
0,sono felice,sono felice,99.9726,0.027401,1.0,99.90108,0.041839,0.017075,0.04,1.0
1,La deforestazione è un male,La deforestazione è un male,0.021573,99.978429,1.0,0.020934,92.929733,6.870111,0.179221,1.0
2,Questo è spaventoso,Questo è spaventoso,0.02175,99.97825,1.0,0.100756,0.10274,0.093424,99.703085,1.0
3,Questa è una mela marcia,Questa è una mela marcia,0.02188,99.978119,1.0,0.04351,70.56402,29.189795,0.202674,1.0
4,mostro nero?,mostro nero?,0.023057,99.976939,1.0,0.033823,3.354666,94.834334,1.777177,1.0
5,lei è una bellezza terribile,lei è una bellezza terribile,0.022565,99.97744,1.0,99.864703,0.107149,0.014439,0.013711,1.0
6,L'Cleanup Ocean Ã¨ un movimento internazionale...,L'Cleanup Ocean Ã un movimento internazionale ...,73.182118,26.817882,1.0,18.575227,48.435813,0.285598,32.70337,1.0
7,La (tigre) \n\n bianca Ã¨ una rara variante...,La tigre. bianca Ã una rara variante genetica ...,99.962687,0.037313,1.0,99.923551,0.029315,0.010889,0.036246,1.0


In [32]:
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm

# FEEL-IT text classifiers. top_k is set to 4 for the emotion model and 2 for
# the sentiment model, matching the four emotion and two sentiment columns
# collected by get_semantic_EP.
emo_classifier = pipeline(
    task="text-classification",
    model="MilaNLProc/feel-it-italian-emotion",
    top_k=4,
)
sent_classifier = pipeline(
    task="text-classification",
    model="MilaNLProc/feel-it-italian-sentiment",
    top_k=2,
)

def clean_text(text):
    """Normalise whitespace and strip most punctuation from `text`.

    Steps, in order:
      1. every run of newlines (each optionally preceded by one whitespace
         character) becomes a single ".";
      2. every run of tabs becomes one space;
      3. every character that is not a word character or one of
         - . ' ! ? ( ) { } becomes a space;
      4. the brackets ( ) [ ] { } are deleted;
      5. runs of two or more whitespace characters collapse to one space;
      6. leading and trailing whitespace is removed.

    Patterns are raw strings so regex escapes such as \\s and \\w are not
    treated as (invalid) Python string escapes.

    Note: step 1 inserts "." unconditionally, so a line that already ends
    with "." yields "..".
    """
    text = re.sub(r"(\s?\n)+", ".", text)  # newline runs -> "."
    text = re.sub(r"\t+", " ", text)  # tab runs -> single space
    # ( ) { } are kept here, as the original character class kept them, so
    # that step 4 deletes them without leaving a space behind.
    text = re.sub(r"[^\w\-.'!?(){}]", " ", text)
    text = re.sub(r"[()\[\]{}]", "", text)  # delete brackets of any kind
    text = re.sub(r"\s{2,}", " ", text)  # collapse repeated whitespace
    return text.strip()

def get_semantic_EP(texts, text_name):
    """Classify `texts` with the sentiment and emotion pipelines.

    Each text is cleaned with `clean_text`, then passed to `emo_classifier`
    and `sent_classifier`. Every returned label score is stored as a
    percentage (score * 100) in the column named after its label.

    Args:
        texts: iterable of input strings.
        text_name: prefix for the cleaned-text column ("<text_name>_cleaned").

    Returns:
        tuple: (name of the cleaned-text column, DataFrame holding that column
        plus "positive", "negative", "joy", "sadness", "anger", "fear").
    """
    cleaned_texts = list(map(clean_text, texts))

    emo_scores = emo_classifier(cleaned_texts)
    sent_scores = sent_classifier(cleaned_texts)

    cleaned_text_name = text_name+"_cleaned"

    # Column-oriented accumulator: cleaned text first, then one list per label.
    EP_semantic_results = {cleaned_text_name: []}
    for label in ("positive", "negative", "joy", "sadness", "anger", "fear"):
        EP_semantic_results[label] = []

    per_text = zip(cleaned_texts, sent_scores, emo_scores)
    for cleaned, sent_score, emo_score in tqdm(per_text, total=len(cleaned_texts)):
        EP_semantic_results[cleaned_text_name].append(cleaned)
        # Sentiment labels first, then emotion labels.
        for entry in [*sent_score, *emo_score]:
            EP_semantic_results[entry["label"]].append(entry["score"]*100)

    return cleaned_text_name, pd.DataFrame.from_dict(EP_semantic_results)

Device set to use cpu
Device set to use cpu


In [34]:
queries = pd.read_csv("../Data/SIGIR_queries_IT.csv")
col_name, queries_df = get_semantic_EP(queries["Query"].tolist(), "Query")

# Attach the query ids, then put QID and the cleaned text in front of the score columns.
queries_columns = ["QID", col_name, "positive", "negative", "joy", "sadness", "anger", "fear"]
queries_df = queries_df.assign(QID=queries["QID"])[queries_columns]
queries_df.to_csv("../Results/SIGIR_semantic_queryWise_queryEP.csv", index=False)
queries_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,Query_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,Chi era il padre di Micerino ?,0.053798,99.946207,0.382418,99.14009,0.131704,0.345787
1,qGEN2,Il figlio di Chefren,99.928087,0.07191,99.461848,0.17983,0.03083,0.327498
2,qGEN3,Il padre di Micerino,0.266242,99.733752,57.150197,36.925179,0.470886,5.453737
3,qGEN4,Micerino,0.033772,99.966228,0.319637,11.710105,28.72667,59.243584
4,qGEN5,Quanto e alta la tomba di Cheope?,0.025216,99.974781,0.081842,99.370712,0.369247,0.178202


In [37]:
bing_resp = pd.read_csv("../Data/SIGIR_bing_resp.csv")
col_name, bing_df = get_semantic_EP(bing_resp["Resp"].tolist(), "Resp")

# Attach ids, ranks and the constant system label, then order the columns.
bing_columns = ["QID", "IAS", "rank", col_name, "positive", "negative", "joy", "sadness", "anger", "fear"]
bing_df = bing_df.assign(
    QID=bing_resp["QID"],
    rank=bing_resp["Rank"],
    IAS=["Bing"]*len(bing_resp),
)[bing_columns]
bing_df.to_csv("../Results/SIGIR_semantic_queryWise_bingEP.csv", index=False)
bing_df.head()

  0%|          | 0/1756 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,rank,Resp_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,Bing,1,Micerino - Wikipedia. Micerino è figlio del fa...,96.868145,3.131854,99.423629,0.318133,0.027023,0.231219
1,qGEN1,Bing,2,Chi erano Chefren e Micerino? - in3giorni.com....,0.022283,99.977714,0.02934,61.225837,38.4316,0.313222
2,qGEN1,Bing,3,LA PIRAMIDE DI MICERINO - LA CIVILTA' EGIZIA. ...,0.043857,99.956137,19.139087,79.771292,0.326051,0.763573
3,qGEN1,Bing,4,Piramide di Micerino - Wikipedia. La Piramide ...,99.822468,0.177528,98.040146,0.594994,0.34861,1.016254
4,qGEN1,Bing,5,La piramide di Micerino l ultima delle tre pir...,49.890733,50.109267,99.359554,0.116022,0.012682,0.51175


In [38]:
gemma_resp = pd.read_csv("../Data/SIGIR_gemma_resp.csv")
col_name, gemma_df = get_semantic_EP(gemma_resp["Resp"].tolist(), "Resp")

# Attach ids and the constant system label, then order the columns.
gemma_columns = ["QID", "IAS", col_name, "positive", "negative", "joy", "sadness", "anger", "fear"]
gemma_df = gemma_df.assign(
    QID=gemma_resp["QID"],
    IAS=["Gemma"]*len(gemma_resp),
)[gemma_columns]
gemma_df.to_csv("../Results/SIGIR_semantic_queryWise_gemmaEP.csv", index=False)
gemma_df.head()

  0%|          | 0/176 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,Resp_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,Gemma,Non ho informazioni sul padre di Micerino quin...,0.041847,99.958152,0.04507,99.773383,0.070231,0.111311
1,qGEN2,Gemma,Il figlio di Chefren è un giovane che vive a F...,99.976319,0.023676,99.851626,0.10965,0.008256,0.030461
2,qGEN3,Gemma,Il padre di Micerino era un uomo di nome Gioac...,10.098097,89.9019,32.065579,66.418064,0.362106,1.15425
3,qGEN4,Gemma,Mi chiamo Micerino..Come posso essere d'aiuto ...,0.067661,99.932337,0.047084,99.817777,0.0292,0.105938
4,qGEN5,Gemma,Non ho informazioni sul luogo della tomba di C...,0.044995,99.955004,0.040039,99.763119,0.074519,0.122324


In [None]:
gpt_resp = pd.read_csv("../Data/SIGIR_gpt_resp.csv")

# NOTE(review): this notebook reads the same file through two different
# response columns ("Resp" here, "GPT" in the lexicon-based cell). Keep "Resp"
# when it exists and fall back to "GPT", so the cell runs against either
# export of the file — confirm which column name is current.
gpt_response_col = "Resp" if "Resp" in gpt_resp.columns else "GPT"

# The cleaned-text column stays "Resp_cleaned", matching the Bing and Gemma outputs.
col_name, gpt_df = get_semantic_EP(gpt_resp[gpt_response_col].tolist(), "Resp")
gpt_df["QID"] = gpt_resp["QID"]
gpt_df["IAS"] = ["GPT"]*len(gpt_resp)
gpt_df = gpt_df.loc[:, ["QID", "IAS", col_name, "positive", "negative", "joy", "sadness", "anger", "fear"]]
gpt_df.to_csv("../Results/SIGIR_semantic_queryWise_gptEP.csv", index=False)
gpt_df.head()