# Importing Libraries

In [1]:
# Upgrade spaCy and download the large Portuguese pipeline (`pt_core_news_lg`)
# that is loaded with spacy.load() further down.
!pip install --upgrade spacy
!python -m spacy download pt_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pt-core-news-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.3.0/pt_core_news_lg-3.3.0-py3-none-any.whl (568.2 MB)
[K     |████████████████████████████████| 568.2 MB 8.8 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_lg')


In [2]:
# Install Hugging Face transformers, which provides the BERT classes and
# `pipeline` imported in the next cell.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from google.colab import drive
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import collections
import pandas as pd
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score, balanced_accuracy_score
from ast import literal_eval
import spacy

In [4]:
# Portuguese spaCy pipeline; used below for sentence splitting (doc.sents)
# and dependency parsing (token.dep_, token.head, token.children).
nlp = spacy.load("pt_core_news_lg")

In [5]:
# Mount Google Drive: the fine-tuned model and the cached classifier outputs
# are read from paths under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Dataset

In [6]:
# Download the training set from Google Drive by file id
# (saved as /content/train.csv, which the next cell reads).
!gdown --id 1-E218dG9enVozV8oESpOQRZOC0knmPvd

Downloading...
From: https://drive.google.com/uc?id=1-E218dG9enVozV8oESpOQRZOC0knmPvd
To: /content/train.csv
100% 1.38M/1.38M [00:00<00:00, 71.9MB/s]


In [7]:
# The CSV is semicolon-separated. Each row holds one review, one aspect, the
# gold polarity and the aspect's start/end offsets inside the review text.
df = pd.read_csv('/content/train.csv', sep=';')
df.head()

Unnamed: 0,id,review,polarity,aspect,start_position,end_position
0,0,O hotel é perto de todos os pontos principais ...,0,quarto,152,158
1,1,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71
2,2,Estive por 8 dias hospedado neste hotel com mi...,1,café da manhã,209,222
3,3,Adorei a estadia. Porto Alegre foi sensacional...,-1,hotel,378,383
4,4,"O hotel tem ótima localização, fizemos vários ...",1,internet,216,224


# GoEmotions

In [8]:
#Folder path containing the fine-tuned model files
model_path = '/content/drive/MyDrive/IC/Inferencia_carga_afetiva/GoEmotions_portuguese-main/fine_tunel_model'
# Model weights and tokenizer are both loaded from the same Drive folder.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
# return_all_scores=True requests the score of every label for each text;
# get_predictions() below iterates over that per-text list of label/score dicts.
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, return_all_scores=True)



In [9]:
def goEmotionsInference(file_name, texts, infer=False):
  """Return the classifier output for ``texts``, using a cache file on Drive.

  Args:
    file_name: Name of the cache file inside the ``go_emotions_output`` folder.
    texts: Texts to classify. Only used when ``infer`` is True; with
      ``infer=False`` the cached file is returned as-is, so the caller must
      make sure it was produced from the same texts in the same order.
    infer: When True, run ``classifier`` on ``texts`` and overwrite the cache
      file before reading it back.

  Returns:
    The object obtained by parsing the cache file with ``literal_eval``
    (the ``str()`` of the classifier output that was written to it).

  Raises:
    FileNotFoundError: If ``infer`` is False and the cache file does not exist.
  """
  output_path = '/content/drive/MyDrive/Shared Task - ABSAPT 2022/go_emotions_output/' + file_name
  if infer:
    output = classifier(texts)
    # Labels contain non-ASCII characters, so the encoding is pinned instead of
    # relying on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
      f.write(str(output))
  # Always read back from disk so both code paths return the same kind of object.
  with open(output_path, encoding='utf-8') as f:
    return literal_eval(f.read())

In [10]:
# Get the df with the top n predictions

def get_predictions(output):
  """Turn the classifier output into ``{row position: {label: score}}``.

  ``output`` is a sequence with one entry per text; each entry is an iterable
  of dicts carrying a ``'label'`` and a ``'score'`` key.
  """
  return {
      position: {entry['label']: entry['score'] for entry in scores}
      for position, scores in enumerate(output)
  }

def sort_dic(dic):
  """Return a new dict with the items of ``dic`` ordered by value, largest first.

  Items with equal values keep their original relative order.
  """
  ordered_items = sorted(dic.items(), key=lambda item: item[1], reverse=True)
  return dict(ordered_items)

def top_n_predictions(predictions, n):
  """Keep only the ``n`` highest-scoring labels for every row.

  Args:
    predictions: ``{row id: {label: score}}`` as built by ``get_predictions``.
    n: Number of labels to keep per row; values <= 0 keep none.

  Returns:
    ``{row id: {label: score}}`` where each inner dict holds at most ``n``
    labels ordered from highest to lowest score. Every row id of the input is
    present in the result, including rows without any score (mapped to ``{}``),
    so the result stays aligned with the input rows.
  """
  keep = max(n, 0)  # a negative slice bound would keep labels instead of none
  top_n = {}
  for row_id, scores in predictions.items():
    # sorted() is stable, so labels with equal scores keep their input order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    top_n[row_id] = dict(ranked[:keep])
  return top_n

def get_result(top_n, df, sentences):
  """Build the column dict for the result frame.

  Starts from the columns of ``df`` and adds a ``'sentence'`` list plus one
  ``'prediction k'`` / ``'score k'`` pair of lists per rank ``k`` (1-based),
  filled in the iteration order of ``top_n``.

  Args:
    top_n: ``{row id: {label: score}}`` with labels already ordered by rank.
    df: Source frame whose columns are copied into the result dict.
    sentences: Sequence indexed by the row ids of ``top_n``.

  Returns:
    A dict mapping column names to the original columns or the new lists.
  """
  columns = dict(df)
  columns['sentence'] = []
  for row_id, ranked in top_n.items():
    columns['sentence'].append(sentences[row_id])
    for rank, (label, score) in enumerate(ranked.items(), start=1):
      label_column = 'prediction ' + str(rank)
      score_column = 'score ' + str(rank)
      # Columns for a rank are created the first time that rank is seen.
      if label_column not in columns:
        columns[label_column] = []
        columns[score_column] = []
      columns[label_column].append(label)
      columns[score_column].append(score)
  return columns

def get_df(df, output, n, sentences):
  """Return ``df`` extended with the input text and the top ``n`` predictions.

  Chains ``get_predictions`` -> ``top_n_predictions`` -> ``get_result`` and
  wraps the resulting column dict in a new DataFrame.
  """
  ranked = top_n_predictions(get_predictions(output), n)
  return pd.DataFrame(data=get_result(ranked, df, sentences))

In [11]:
# Polarity assigned to each emotion label emitted by the classifier.
# Values use the same -1 / 0 / 1 scale as the dataset's `polarity` column,
# so mapped predictions can be scored directly against it.
# Labels missing from this dict become NaN when applied with Series.map().
mapping = {"admiração": 1,
            "diversão": 1,
            "raiva": -1,
            "aborrecimento": -1,
            "aprovação": 1,
            "zelo": 1,
            "confusão": -1,
            "curiosidade": 1,
            "desejo": 1,
            "decepção": -1,
            "desaprovação": -1,
            "nojo": -1,
            "constrangimento": -1,
            "entusiasmo": 1,
            "medo": -1,
            "gratidão": 1,
            "luto": -1,
            "alegria": 1,
            "amor": 1,
            "nervosismo": -1,
            "otimismo": 1,
            "orgulho": 1,
            "percepção": 0,
            "alívio": 1,
            "remorso": -1,
            "tristeza": -1,
            "surpresa": 1,
            "neutro": 0
}

In [12]:
def map_goEmotions(n, df):
  """Replace emotion labels by their polarity in the first ``n`` prediction columns.

  The columns ``'prediction 1'`` .. ``'prediction n'`` of ``df`` are overwritten
  in place using the module-level ``mapping`` and the same frame is returned.
  Calling this twice on the same frame maps the already-mapped values again,
  which turns them into NaN because they are not keys of ``mapping``.
  """
  for rank in range(1, n + 1):
    column = 'prediction ' + str(rank)
    df[column] = df[column].map(mapping)
  return df

In [13]:
def annull_obl(predictions, obl):
  """Set to 0 the entries of ``predictions`` at the positions listed in ``obl``.

  ``predictions`` is modified in place and also returned. Positions at or
  beyond the end of the list are ignored.
  """
  size = len(predictions)
  for position in obl:
    if position >= size:
      continue
    predictions[position] = 0
  return predictions

## 1) By Sentence

In [14]:
# For every row, pick the sentence of the review that contains the aspect span.
# Exactly one entry is appended per row so that aspect_sentences[k] always
# belongs to row k of df; a missing entry would shift every later sentence
# onto the wrong row.
aspect_sentences = []
for _, row in df.iterrows():
  doc = nlp(row['review'])
  # Fallback when no single sentence covers the whole span (e.g. the sentence
  # splitter cut through the aspect): use the full review text.
  aspect_sentence = row['review']
  for sentence in doc.sents:
    sentence_start = sentence[0].idx
    sentence_end = sentence[-1].idx + len(sentence[-1])
    if sentence_start <= row['start_position'] and sentence_end >= row['end_position']:
      aspect_sentence = sentence.text
      break
  aspect_sentences.append(aspect_sentence)

assert len(aspect_sentences) == len(df), "one sentence per dataset row expected"

In [15]:
# infer=False: the classifier is not run; the scores are read from the cached
# 'out_task2_train' file, which must correspond to aspect_sentences.
out_task2_train = goEmotionsInference('out_task2_train', aspect_sentences, False)

In [16]:
# Keep the three highest-scoring emotion labels per aspect sentence.
n = 3
df_top3 = get_df(df, out_task2_train, n, aspect_sentences)
df_top3.head()

Unnamed: 0,id,review,polarity,aspect,start_position,end_position,sentence,prediction 1,score 1,prediction 2,score 2,prediction 3,score 3
0,0,O hotel é perto de todos os pontos principais ...,0,quarto,152,158,"O Hotel é charmoso, aconselho ficar nos quarto...",admiração,0.937315,aprovação,0.026432,neutro,0.009182
1,1,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71,Elevadores modernos e rápidos.,neutro,0.874292,admiração,0.056796,aprovação,0.030855
2,2,Estive por 8 dias hospedado neste hotel com mi...,1,café da manhã,209,222,Estive por 8 dias hospedado neste hotel com mi...,admiração,0.795297,aprovação,0.139609,neutro,0.016754
3,3,Adorei a estadia. Porto Alegre foi sensacional...,-1,hotel,378,383,pode fazer umas comprinhas e encontrar produto...,neutro,0.804162,aprovação,0.095762,alegria,0.014814
4,4,"O hotel tem ótima localização, fizemos vários ...",1,internet,216,224,"O quarto e o banheiro sao espaçosos, novos, li...",admiração,0.885069,aprovação,0.070494,neutro,0.014737


In [17]:
# Convert the emotion labels to polarities. map_goEmotions overwrites the
# prediction columns in place, so this cell must run only once per df_top3;
# re-running it without rebuilding df_top3 would map the polarities to NaN.
df_top3 = map_goEmotions(n, df_top3)
df_top3.head()

Unnamed: 0,id,review,polarity,aspect,start_position,end_position,sentence,prediction 1,score 1,prediction 2,score 2,prediction 3,score 3
0,0,O hotel é perto de todos os pontos principais ...,0,quarto,152,158,"O Hotel é charmoso, aconselho ficar nos quarto...",1,0.937315,1,0.026432,0,0.009182
1,1,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71,Elevadores modernos e rápidos.,0,0.874292,1,0.056796,1,0.030855
2,2,Estive por 8 dias hospedado neste hotel com mi...,1,café da manhã,209,222,Estive por 8 dias hospedado neste hotel com mi...,1,0.795297,1,0.139609,0,0.016754
3,3,Adorei a estadia. Porto Alegre foi sensacional...,-1,hotel,378,383,pode fazer umas comprinhas e encontrar produto...,0,0.804162,1,0.095762,1,0.014814
4,4,"O hotel tem ótima localização, fizemos vários ...",1,internet,216,224,"O quarto e o banheiro sao espaçosos, novos, li...",1,0.885069,1,0.070494,0,0.014737


### Prediction 1

In [18]:
# Baseline: use the polarity of the single highest-scoring label and compare
# it against the gold polarity.
prediction = list(df_top3['prediction 1'])
true = list(df_top3['polarity'])
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.73      0.46      0.57       527
           0       0.28      0.45      0.35       472
           1       0.86      0.83      0.85      2112

    accuracy                           0.71      3111
   macro avg       0.62      0.58      0.59      3111
weighted avg       0.75      0.71      0.72      3111

0.5808073194070444


#### Annul OBL Aspects

In [19]:
# Row positions whose aspect starts at a token carrying the 'obl' dependency
# label in the parsed review.
obl_index = [
    position
    for position, (_, row) in enumerate(df.iterrows())
    for token in nlp(row['review'])
    if token.idx == row['start_position'] and token.dep_ == 'obl'
]

In [20]:
# Force polarity 0 for the rows listed in obl_index (annull_obl edits the
# `prediction` list from the previous cell in place), then re-score.
prediction = annull_obl(prediction, obl_index)
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.75      0.41      0.53       527
           0       0.28      0.59      0.38       472
           1       0.88      0.77      0.82      2112

    accuracy                           0.68      3111
   macro avg       0.64      0.59      0.58      3111
weighted avg       0.77      0.68      0.71      3111

0.5890200549378258


### Top 3 without threshold

In [21]:
def get_top3(df):
  """Majority vote over the three top-ranked polarities of every row.

  For each row the value of ``'prediction 1'`` is returned unless it disagrees
  with both other predictions while ``'prediction 2'`` and ``'prediction 3'``
  agree with each other; in that case their shared value wins.

  Returns:
    A list with one voted polarity per row of ``df``.
  """
  prediction = []
  for _, row in df.iterrows():
    top = row['prediction 1']
    outvoted = (
        top != row['prediction 2']
        and top != row['prediction 3']
        and row['prediction 2'] == row['prediction 3']
    )
    prediction.append(row['prediction 2'] if outvoted else top)
  return prediction

In [22]:
# Score the majority vote over the three top-ranked polarities.
prediction = get_top3(df_top3)
true = list(df_top3['polarity'])
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.69      0.66      0.68       527
           0       0.34      0.21      0.26       472
           1       0.83      0.92      0.87      2112

    accuracy                           0.77      3111
   macro avg       0.62      0.59      0.60      3111
weighted avg       0.73      0.77      0.75      3111

0.5948047434756661


#### Annul OBL Aspects

In [23]:
# Same majority-vote predictions, with the obl_index rows forced to 0 in place.
prediction = annull_obl(prediction, obl_index)
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.72      0.58      0.64       527
           0       0.32      0.40      0.36       472
           1       0.85      0.85      0.85      2112

    accuracy                           0.73      3111
   macro avg       0.63      0.61      0.62      3111
weighted avg       0.75      0.73      0.74      3111

0.6098408251540589


### Top 3 with threshold

In [24]:
def get_top3_t(df, t):
  """Majority vote over the top three polarities, skipped for confident rows.

  When ``'score 1'`` is at least ``t`` the value of ``'prediction 1'`` is taken
  directly. Otherwise ``'prediction 1'`` is kept unless it disagrees with both
  other predictions while ``'prediction 2'`` and ``'prediction 3'`` agree, in
  which case their shared value is used.

  Args:
    df: Frame with ``'prediction 1..3'`` and ``'score 1'`` columns.
    t: Score threshold at or above which the top prediction is trusted.

  Returns:
    A list with one polarity per row of ``df``.
  """
  prediction = []
  for _, row in df.iterrows():
    confident = row['score 1'] >= t
    top = row['prediction 1']
    outvoted = (
        not confident
        and top != row['prediction 2']
        and top != row['prediction 3']
        and row['prediction 2'] == row['prediction 3']
    )
    prediction.append(row['prediction 2'] if outvoted else top)
  return prediction

In [25]:
# Majority vote, but rows whose best label scores at least 0.9 keep that label.
prediction = get_top3_t(df_top3, 0.9)
true = list(df_top3['polarity'])
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.69      0.64      0.67       527
           0       0.31      0.25      0.27       472
           1       0.84      0.89      0.87      2112

    accuracy                           0.75      3111
   macro avg       0.62      0.59      0.60      3111
weighted avg       0.73      0.75      0.74      3111

0.5944797825893451


#### Annul OBL Aspects

In [26]:
# Thresholded vote with the obl_index rows forced to 0 in place.
prediction = annull_obl(prediction, obl_index)
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.72      0.56      0.63       527
           0       0.31      0.44      0.36       472
           1       0.86      0.83      0.84      2112

    accuracy                           0.72      3111
   macro avg       0.63      0.61      0.61      3111
weighted avg       0.75      0.72      0.73      3111

0.607006662076332


## 2) By Meaningful Surroundings

In [27]:
def get_surroundings(index, review):
  """Return the words syntactically close to the token starting at ``index``.

  The review is parsed with ``nlp``. For the token whose character offset
  equals ``index`` the following tokens are collected: the token itself, its
  head, the children and grandchildren of the token, and the children and
  grandchildren of the head. They are joined with single spaces in the order
  in which they appear in the review.

  Args:
    index: Character offset at which the aspect token starts.
    review: Full review text.

  Returns:
    The joined surrounding words, or ``' '`` when no token starts at ``index``
    (including when the review yields no tokens at all).
  """
  doc = nlp(review)
  for token in doc:
    if token.idx != index:
      continue
    head = token.head
    # Keyed by character offset: removes duplicates and allows sorting back
    # into reading order.
    context = {token.idx: token.text, head.idx: head.text}
    for anchor in (token, head):
      for child in anchor.children:
        context[child.idx] = child.text
        for grandchild in child.children:
          context[grandchild.idx] = grandchild.text
    return ' '.join(text for _, text in sorted(context.items()))
  # No token starts exactly at `index`: same single-space placeholder as before,
  # now also returned for an empty doc instead of raising UnboundLocalError.
  return ' '

In [28]:
# One surroundings string per dataset row, in row order.
aspect_surroundings = [
    get_surroundings(row['start_position'], row['review'])
    for _, row in df.iterrows()
]

In [29]:
# infer=False: scores are read from the cached 'out_task2_train_surroundings1'
# file, which must correspond to aspect_surroundings.
out_task2_train_surroundings1 = goEmotionsInference('out_task2_train_surroundings1', aspect_surroundings, False)

In [30]:
# Top three emotion labels per surroundings string; the 'sentence' column of
# this frame holds the surroundings text.
n = 3
df_top3_s = get_df(df, out_task2_train_surroundings1, n, aspect_surroundings)
df_top3_s.head()

Unnamed: 0,id,review,polarity,aspect,start_position,end_position,sentence,prediction 1,score 1,prediction 2,score 2,prediction 3,score 3
0,0,O hotel é perto de todos os pontos principais ...,0,quarto,152,158,ficar nos quartos no piso superior,neutro,0.978858,aprovação,0.00466,aborrecimento,0.002076
1,1,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71,Elevadores modernos rápidos .,neutro,0.970849,aprovação,0.006281,admiração,0.004747
2,2,Estive por 8 dias hospedado neste hotel com mi...,1,café da manhã,209,222,que oferecem um café da manhã aos moldes hotéi...,neutro,0.961744,aprovação,0.01331,percepção,0.004343
3,3,Adorei a estadia. Porto Alegre foi sensacional...,-1,hotel,378,383,e a estrutura física do hotel,neutro,0.975867,aprovação,0.005421,curiosidade,0.002424
4,4,"O hotel tem ótima localização, fizemos vários ...",1,internet,216,224,e a internet funciona muito bem,admiração,0.736319,aprovação,0.163721,neutro,0.046386


In [31]:
# Convert the emotion labels to polarities (in place; run once per df_top3_s).
# The result is bound to df_top3_s, the name every later cell of this section
# reads, so the section 1 frame `df_top3` is not overwritten.
df_top3_s = map_goEmotions(n, df_top3_s)
df_top3_s.head()

Unnamed: 0,id,review,polarity,aspect,start_position,end_position,sentence,prediction 1,score 1,prediction 2,score 2,prediction 3,score 3
0,0,O hotel é perto de todos os pontos principais ...,0,quarto,152,158,ficar nos quartos no piso superior,0,0.978858,1,0.00466,-1,0.002076
1,1,Viajamos eu e minha irmã. O hotel tem uma extr...,1,elevador,63,71,Elevadores modernos rápidos .,0,0.970849,1,0.006281,1,0.004747
2,2,Estive por 8 dias hospedado neste hotel com mi...,1,café da manhã,209,222,que oferecem um café da manhã aos moldes hotéi...,0,0.961744,1,0.01331,0,0.004343
3,3,Adorei a estadia. Porto Alegre foi sensacional...,-1,hotel,378,383,e a estrutura física do hotel,0,0.975867,1,0.005421,1,0.002424
4,4,"O hotel tem ótima localização, fizemos vários ...",1,internet,216,224,e a internet funciona muito bem,1,0.736319,1,0.163721,0,0.046386


### Prediction 1

In [32]:
# Baseline on the surroundings input: polarity of the highest-scoring label.
prediction = list(df_top3_s['prediction 1'])
true = list(df_top3_s['polarity'])
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.83      0.28      0.42       527
           0       0.28      0.79      0.42       472
           1       0.93      0.71      0.80      2112

    accuracy                           0.65      3111
   macro avg       0.68      0.59      0.55      3111
weighted avg       0.81      0.65      0.68      3111

0.594953039017519


#### Annul OBL Aspects

In [33]:
# Baseline with the obl_index rows forced to 0 in place.
prediction = annull_obl(prediction, obl_index)
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.85      0.25      0.39       527
           0       0.27      0.84      0.41       472
           1       0.94      0.66      0.78      2112

    accuracy                           0.62      3111
   macro avg       0.69      0.59      0.53      3111
weighted avg       0.82      0.62      0.66      3111

0.5860833108364707


### Top 3 without threshold

In [34]:
# Majority vote over the three top-ranked polarities (surroundings input).
prediction = get_top3(df_top3_s)
true = list(df_top3_s['polarity'])
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.72      0.54      0.62       527
           0       0.35      0.47      0.40       472
           1       0.87      0.85      0.86      2112

    accuracy                           0.74      3111
   macro avg       0.65      0.62      0.63      3111
weighted avg       0.76      0.74      0.75      3111

0.623813676273233


#### Annul OBL Aspects

In [35]:
# Majority vote with the obl_index rows forced to 0 in place.
prediction = annull_obl(prediction, obl_index)
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.74      0.49      0.59       527
           0       0.33      0.61      0.43       472
           1       0.88      0.79      0.84      2112

    accuracy                           0.71      3111
   macro avg       0.65      0.63      0.62      3111
weighted avg       0.78      0.71      0.73      3111

0.6286531666405151


### Top 3 with threshold

In [36]:
# Thresholded vote (surroundings input): rows whose best label scores at
# least 0.9 keep that label, the rest go through the majority vote.
prediction = get_top3_t(df_top3_s, 0.9)
true = list(df_top3_s['polarity'])
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.73      0.49      0.58       527
           0       0.33      0.66      0.44       472
           1       0.91      0.78      0.84      2112

    accuracy                           0.71      3111
   macro avg       0.66      0.64      0.62      3111
weighted avg       0.79      0.71      0.74      3111

0.6420563933403439


#### Annul OBL Aspects

In [37]:
# Thresholded vote with the obl_index rows forced to 0 in place.
prediction = annull_obl(prediction, obl_index)
print(classification_report(true, prediction))
print(balanced_accuracy_score(true, prediction))

              precision    recall  f1-score   support

          -1       0.75      0.43      0.55       527
           0       0.31      0.74      0.44       472
           1       0.91      0.73      0.81      2112

    accuracy                           0.68      3111
   macro avg       0.66      0.63      0.60      3111
weighted avg       0.80      0.68      0.71      3111

0.6347600750209457


# Summary




1) By Aspect Sentence

Balanced accuracy on the training set, as printed by the cells above.

Model                          | Without Annulling OBL Aspects | Annulling OBL Aspects
:------------------------------|:------------------------------|:----------------------
Prediction 1                   | 0.5808073194070444            | 0.5890200549378258
Top 3 without threshold        | 0.5948047434756661            | 0.6098408251540589
Top 3 with threshold (t = 0.9) | 0.5944797825893451            | 0.607006662076332


2) By Meaningful Surroundings

Balanced accuracy on the training set, as printed by the cells above.

Model                          | Without Annulling OBL Aspects | Annulling OBL Aspects
:------------------------------|:------------------------------|:----------------------
Prediction 1                   | 0.594953039017519             | 0.5860833108364707
Top 3 without threshold        | 0.623813676273233             | 0.6286531666405151
Top 3 with threshold (t = 0.9) | 0.6420563933403439            | 0.6347600750209457
