## Setup

### Utils

In [None]:
!pip install transformers



In [None]:
from scipy.spatial.distance import cosine

In [None]:
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

In [None]:
from transformers import AutoTokenizer, BertModel
# NOTE(review): when the notebook is run top to bottom, the next cell reassigns
# `model` and `tokenizer` to bert-base-uncased, so the multilingual checkpoint
# loaded here is replaced before either name is used.
model = BertModel.from_pretrained('bert-base-multilingual-cased', output_hidden_states = True)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer, BertModel

# Model and tokenizer are loaded from the same checkpoint; the model is asked
# to return its hidden states.
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
from Utils import BERT_Embeddings, preprocessing, tokens_evaluation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import pandas as pd
import numpy as np
import torch
from IPython.display import HTML
from tqdm import tqdm

In [None]:
from sklearn.metrics import classification_report

### Load training Data

In [None]:
# Read the training split (JSON keyed by row index) and pass it through
# preprocessing.get_dataset_labels before displaying it.
train_df = preprocessing.get_dataset_labels(
    pd.read_json("./HS-Brexit_dataset/HS-Brexit_train.json", orient='index')
)
train_df

Unnamed: 0,original_text,hard_label,soft_label_0,soft_label_1,disagreement
1,<user> <user> I'm so glad about #Brexit.. My a...,0,1.0,0.0,1
2,RT <user>: There was more to #Brexit than immi...,0,1.0,0.0,1
3,"At the end of the day, the leave campaign won ...",0,1.0,0.0,1
4,So the reducing migration thing wasn't quite w...,0,1.0,0.0,1
5,A Brit Immigrant Asks Britain to Become India’...,0,1.0,0.0,1
...,...,...,...,...,...
780,#Brexit has to happen there is no way you can ...,0,1.0,0.0,1
781,A foreigner we've neither heard of nor voted f...,0,1.0,0.0,1
782,The irony is we will probably now look to appo...,0,1.0,0.0,1
783,"Better watch out Merkel, the German people don...",0,0.5,0.5,0


In [None]:
# sentences
train_df['sentences'] = train_df['original_text'].apply(lambda x : preprocessing.split_sentence(x))
train_df['sentences'] = train_df['sentences'].apply(lambda x :preprocessing.adjust_split(x))
train_df['sentences'] = train_df['sentences'].apply(lambda x : preprocessing.apply_lemmatization(x))


In [None]:
# tokens_lists: per-sentence tokens after clear_tokens; tokens_list: the same, flattened.
train_df['tokens_lists'] = (
    train_df['sentences']
    .apply(preprocessing.sentences_tokenizer)
    .apply(preprocessing.clear_tokens)
)
train_df['tokens_list'] = train_df['tokens_lists'].apply(preprocessing.flatten_list)

In [None]:
# Display the frame to check the new sentences / tokens_lists / tokens_list columns.
train_df

Unnamed: 0,original_text,hard_label,soft_label_0,soft_label_1,disagreement,sentences,tokens_lists,tokens_list
1,<user> <user> I'm so glad about #Brexit.. My a...,0,1.0,0.0,1,"[<user> <user> I be so glad about #Brexit, my ...","[[glad, #brexit], [ancestor, england], [uk], [...","[glad, #brexit, ancestor, england, uk, break, ..."
2,RT <user>: There was more to #Brexit than immi...,0,1.0,0.0,1,"[RT <user>, there be more to #Brexit than immi...","[[#brexit, immigration, ugghhh]]","[#brexit, immigration, ugghhh]"
3,"At the end of the day, the leave campaign won ...",0,1.0,0.0,1,"[at the end of the day, the leave campaign win...","[[end, day], [leave, campaign, win, #brexit, d...","[end, day, leave, campaign, win, #brexit, due,..."
4,So the reducing migration thing wasn't quite w...,0,1.0,0.0,1,[so the reduce migration thing be not quite wh...,"[[reduce, migration, thing, quite, seem, either]]","[reduce, migration, thing, quite, seem, either]"
5,A Brit Immigrant Asks Britain to Become India’...,0,1.0,0.0,1,[a Brit immigrant ask Britain to become India ...,"[[brit, immigrant, ask, britain, become, india...","[brit, immigrant, ask, britain, become, india,..."
...,...,...,...,...,...,...,...,...
780,#Brexit has to happen there is no way you can ...,0,1.0,0.0,1,[#Brexit have to happen there be no way you ca...,"[[#brexit, happen, way, set, minimum, wage, ke...","[#brexit, happen, way, set, minimum, wage, kee..."
781,A foreigner we've neither heard of nor voted f...,0,1.0,0.0,1,[a foreigner we 've neither hear of nor vote f...,"[[foreigner, neither, hear, vote, care, warn, ...","[foreigner, neither, hear, vote, care, warn, #..."
782,The irony is we will probably now look to appo...,0,1.0,0.0,1,[the irony be we will probably now look to app...,"[[irony, probably, look, appoint, foreign, man...","[irony, probably, look, appoint, foreign, mana..."
783,"Better watch out Merkel, the German people don...",0,0.5,0.5,0,"[well watch out merkel, the german people do n...","[[well, watch, merkel], [german, people, like,...","[well, watch, merkel, german, people, like, mu..."


In [None]:
# original_text is overwritten: from here on it holds the cleaned tokens joined by spaces.
train_df['original_text'] = train_df['tokens_list'].apply(' '.join)
train_df

Unnamed: 0,original_text,hard_label,soft_label_0,soft_label_1,disagreement,sentences,tokens_lists,tokens_list
1,glad #brexit ancestor england uk break heart s...,0,1.0,0.0,1,"[<user> <user> I be so glad about #Brexit, my ...","[[glad, #brexit], [ancestor, england], [uk], [...","[glad, #brexit, ancestor, england, uk, break, ..."
2,#brexit immigration ugghhh,0,1.0,0.0,1,"[RT <user>, there be more to #Brexit than immi...","[[#brexit, immigration, ugghhh]]","[#brexit, immigration, ugghhh]"
3,end day leave campaign win #brexit due anti im...,0,1.0,0.0,1,"[at the end of the day, the leave campaign win...","[[end, day], [leave, campaign, win, #brexit, d...","[end, day, leave, campaign, win, #brexit, due,..."
4,reduce migration thing quite seem either,0,1.0,0.0,1,[so the reduce migration thing be not quite wh...,"[[reduce, migration, thing, quite, seem, either]]","[reduce, migration, thing, quite, seem, either]"
5,brit immigrant ask britain become india union ...,0,1.0,0.0,1,[a Brit immigrant ask Britain to become India ...,"[[brit, immigrant, ask, britain, become, india...","[brit, immigrant, ask, britain, become, india,..."
...,...,...,...,...,...,...,...,...
780,#brexit happen way set minimum wage keep migra...,0,1.0,0.0,1,[#Brexit have to happen there be no way you ca...,"[[#brexit, happen, way, set, minimum, wage, ke...","[#brexit, happen, way, set, minimum, wage, kee..."
781,foreigner neither hear vote care warn #brexit,0,1.0,0.0,1,[a foreigner we 've neither hear of nor vote f...,"[[foreigner, neither, hear, vote, care, warn, ...","[foreigner, neither, hear, vote, care, warn, #..."
782,irony probably look appoint foreign manager #e...,0,1.0,0.0,1,[the irony be we will probably now look to app...,"[[irony, probably, look, appoint, foreign, man...","[irony, probably, look, appoint, foreign, mana..."
783,well watch merkel german people like muslim in...,0,0.5,0.5,0,"[well watch out merkel, the german people do n...","[[well, watch, merkel], [german, people, like,...","[well, watch, merkel, german, people, like, mu..."


### Load Dev Data

In [None]:
# Read the dev split (JSON keyed by row index) and pass it through
# preprocessing.get_dataset_labels.
dev_df = preprocessing.get_dataset_labels(
    pd.read_json("./HS-Brexit_dataset/HS-Brexit_dev.json", orient='index')
)

In [None]:
# sentences
dev_df['sentences'] = dev_df['original_text'].apply(lambda x : preprocessing.split_sentence(x))
dev_df['sentences'] = dev_df['sentences'].apply(lambda x :preprocessing.adjust_split(x))
dev_df['sentences'] = dev_df['sentences'].apply(lambda x : preprocessing.apply_lemmatization(x))
dev_df['tokens_lists'] = dev_df['sentences'].apply(lambda x: preprocessing.sentences_tokenizer(x))
dev_df['tokens_lists']= dev_df['tokens_lists'].apply(lambda x: preprocessing.clear_tokens(x))
dev_df['tokens_list']= dev_df['tokens_lists'].apply(lambda x: preprocessing.flatten_list(x))

In [None]:
# original_text is overwritten: from here on it holds the cleaned tokens joined by spaces.
dev_df['original_text'] = dev_df['tokens_list'].apply(' '.join)
dev_df

Unnamed: 0,original_text,hard_label,soft_label_0,soft_label_1,disagreement,sentences,tokens_lists,tokens_list
1,cheap mean foreigner flock summer oh irony #br...,0,1.00,0.00,1,[cheap £ mean foreigner will be flock here thi...,"[[cheap, mean, foreigner, flock, summer], [oh,...","[cheap, mean, foreigner, flock, summer, oh, ir..."
2,#brexitornot easy #brexit protect country unco...,0,0.67,0.33,0,"[#BrexitOrNot, easy, #BREXIT and protect your ...","[[#brexitornot], [easy], [#brexit, protect, co...","[#brexitornot, easy, #brexit, protect, country..."
3,#brexit sum one word terrorism radical islam t...,0,0.50,0.50,0,"[<user> #brexit to sum it up in just one word,...","[[#brexit, sum, one, word], [terrorism], [radi...","[#brexit, sum, one, word, terrorism, radical, ..."
4,putin say #brexit reflect unhappiness migratio...,0,1.00,0.00,1,[Putin say #Brexit reflect unhappiness with mi...,"[[putin, say, #brexit, reflect, unhappiness, m...","[putin, say, #brexit, reflect, unhappiness, mi..."
5,#brexit look likely anti immigration much stro...,0,1.00,0.00,1,"[#Brexit be look likely, but anti, immigration...","[[#brexit, look, likely], [anti], [immigration...","[#brexit, look, likely, anti, immigration, muc..."
...,...,...,...,...,...,...,...,...
164,sky news interview guy muslims britain pro #br...,0,1.00,0.00,1,"[sky news be interview a guy from, Muslims for...","[[sky, news, interview, guy], [muslims, britai...","[sky, news, interview, guy, muslims, britain, ..."
165,#brexit anyway deport mark lawrenson fucking hate,0,0.50,0.50,0,"[after this #brexit, be there anyway we can de...","[[#brexit], [anyway, deport, mark, lawrenson],...","[#brexit, anyway, deport, mark, lawrenson, fuc..."
166,europe collapse worry welcome refugee #byzanti...,0,0.67,0.33,0,"[<user> Europe be collapse, do not worry we we...","[[europe, collapse], [worry, welcome, refugee,...","[europe, collapse, worry, welcome, refugee, #b..."
167,#brexit cheap foreign tourist come #uk ☔ expen...,0,1.00,0.00,1,[#Brexit its now cheap for foreign tourist to ...,"[[#brexit, cheap, foreign, tourist, come, #uk,...","[#brexit, cheap, foreign, tourist, come, #uk, ..."


### Load Test Data

In [None]:
# Read the test split (JSON keyed by row index) and pass it through
# preprocessing.get_dataset_labels.
test_df = preprocessing.get_dataset_labels(
    pd.read_json("./HS-Brexit_dataset/HS-Brexit_test.json", orient='index')
)

In [None]:
# sentences
test_df['sentences'] = test_df['original_text'].apply(lambda x : preprocessing.split_sentence(x))
test_df['sentences'] = test_df['sentences'].apply(lambda x :preprocessing.adjust_split(x))
test_df['sentences'] = test_df['sentences'].apply(lambda x : preprocessing.apply_lemmatization(x))
test_df['tokens_lists'] = test_df['sentences'].apply(lambda x: preprocessing.sentences_tokenizer(x))
test_df['tokens_lists']= test_df['tokens_lists'].apply(lambda x: preprocessing.clear_tokens(x))
test_df['tokens_list']= test_df['tokens_lists'].apply(lambda x: preprocessing.flatten_list(x))

In [None]:
# original_text is overwritten: from here on it holds the cleaned tokens joined by spaces.
test_df['original_text'] = test_df['tokens_list'].apply(' '.join)
test_df

Unnamed: 0,original_text,hard_label,soft_label_0,soft_label_1,disagreement,sentences,tokens_lists,tokens_list
1,uk decide leave european union mean foreign pr...,0,1.00,0.00,1,[the UK have decide to leave the European Unio...,"[[uk, decide, leave, european, union], [mean, ...","[uk, decide, leave, european, union, mean, for..."
2,good brit save country seal muslim invasian #b...,1,0.33,0.67,0,"[good for you brit, you save your country, now...","[[good, brit], [save, country], [seal, muslim,...","[good, brit, save, country, seal, muslim, inva..."
3,vote nowt immigration identity control interes...,0,1.00,0.00,1,"[I be vote out, nowt to do with immigration or...","[[vote], [nowt, immigration, identity, control...","[vote, nowt, immigration, identity, control, i..."
4,support #brexit support scapegoating muslim immig,0,1.00,0.00,1,"[RT <user>, <user> <user> <user> <user> I supp...","[[support, #brexit], [support, scapegoating, m...","[support, #brexit, support, scapegoating, musl..."
5,say year early really mean deport oh wait vote...,0,0.67,0.33,0,"[<user> u say 8 year early, 18 really mean 9, ...","[[say, year, early], [really, mean], [deport],...","[say, year, early, really, mean, deport, oh, w..."
...,...,...,...,...,...,...,...,...
164,hmm #brexit really change immigration policy w...,0,1.00,0.00,1,"[<user> Hmm, for #Brexit they can not really c...","[[hmm], [#brexit, really, change, immigration,...","[hmm, #brexit, really, change, immigration, po..."
165,sadiq isis muslim mayor total lunatic explode ...,1,0.50,0.50,0,"[Sadiq, isis muslim mayor be a total lunatic a...","[[sadiq], [isis, muslim, mayor, total, lunatic...","[sadiq, isis, muslim, mayor, total, lunatic, e..."
166,fusilier lee rigby lt forget #brexit vote cont...,1,0.50,0.50,0,"[Fusilier Lee Rigby, lt, not forget during the...","[[fusilier, lee, rigby], [lt], [forget, #brexi...","[fusilier, lee, rigby, lt, forget, #brexit, vo..."
167,world know #trump one worthy run usa unless mu...,0,1.00,0.00,1,[because the world know #Trump be the only one...,"[[world, know, #trump, one, worthy, run, usa],...","[world, know, #trump, one, worthy, run, usa, u..."


## Baseline

In [None]:
# Token-level scores (tab-separated file); tokens_df_10 keeps the rows whose
# `occurrences` value is at least 10.
tokens_df = pd.read_csv('../Nuovi_Scores/brexit_scores.csv', sep='\t')
tokens_df_10 = tokens_df.loc[tokens_df['occurrences'] >= 10]

### Threshold estimation from Dev

In [None]:
# Baseline scores on the dev split: aggregate the token coordinates of each
# tweet without estimating anything for the 'NA' entries.
pred_somma = []
pred_tutti_verdi = []
pred_media = []
pred_mediana = []

# total= lets tqdm show a real progress bar (iterrows() has no length).
for _, row in tqdm(dev_df.iterrows(), total=len(dev_df)):
  colors_agreement, _ = tokens_evaluation.get_all_colors(row['tokens_list'], tokens_df_10)

  if 'NA' in colors_agreement:
    # Zero the 'NA' slots so the filter just below drops them.
    for na_index in tokens_evaluation.find_NA_indices(colors_agreement):
      colors_agreement[na_index] = 0

  # Entries equal to 0 (including the former 'NA' slots) are left out of every aggregate.
  colors_agreement = [i for i in colors_agreement if i != 0]

  if colors_agreement:
    pred_somma.append(sum(colors_agreement))
    pred_media.append(np.mean(colors_agreement))
    pred_mediana.append(np.median(colors_agreement))
    pred_tutti_verdi.append(min(colors_agreement))
  else:
    # Nothing left to aggregate: every score falls back to 0.
    pred_somma.append(0)
    pred_media.append(0)
    pred_mediana.append(0)
    pred_tutti_verdi.append(0)

In [None]:
print('stima NA: NO')
print('threshold variabile (da -0.9 a +0.9)')

# Report the SOMMA classifier on dev for thresholds -0.9, -0.8, ..., 0.9.
for tenths in range(-9, 10):
  # Derive t from an integer step so no floating-point error accumulates.
  t = round(tenths / 10, 1)
  print('THRESHOLD: '+ str(t) + '\n')

  print('SOMMA \n')
  print(classification_report(dev_df['disagreement'], [int(i>=t) for i in pred_somma] ))


stima NA: NO
soglia coordinate: almeno 10 occorrenze
valori non pesati
threshold variabile (da -0.9 a +0.9)
THRESHOLD: -0.9

SOMMA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.8

SOMMA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.7

SOMMA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116


In [None]:
print('stima NA: NO')
# The sweep starts at -0.95 (38 steps of 0.05), so the label says so.
print('threshold variabile (da -0.95 a +0.9)')

# Report the MEDIA classifier on dev for thresholds -0.95, -0.90, ..., 0.90.
for twentieths in range(-19, 19):
  # Derive t from an integer step so no floating-point error accumulates.
  t = round(twentieths / 20, 2)
  print('THRESHOLD: '+ str(t) + '\n')

  print('\n MEDIA \n')
  print(classification_report(dev_df['disagreement'], [int(i>=t) for i in pred_media]))


stima NA: NO
soglia coordinate: almeno 10 occorrenze
valori non pesati
threshold variabile (da -0.9 a +0.9)
THRESHOLD: -0.95


 MEDIA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.9


 MEDIA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.85


 MEDIA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82   

In [None]:
print('stima NA: NO')
# The sweep starts at -0.95 (38 steps of 0.05), so the label says so.
print('threshold variabile (da -0.95 a +0.9)')

# Report the MEDIANA classifier on dev for thresholds -0.95, -0.90, ..., 0.90.
for twentieths in range(-19, 19):
  # Derive t from an integer step so no floating-point error accumulates.
  t = round(twentieths / 20, 2)
  print('THRESHOLD: '+ str(t) + '\n')

  print('\n MEDIANA \n')
  print(classification_report(dev_df['disagreement'], [int(i>=t) for i in pred_mediana]))

stima NA: NO
soglia coordinate: almeno 10 occorrenze
valori non pesati
threshold variabile (da -0.9 a +0.9)
THRESHOLD: -0.95


 MEDIANA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.9


 MEDIANA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.85


 MEDIANA 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0

In [None]:
print('stima NA: NO')
# The sweep starts at -0.95 (38 steps of 0.05), so the label says so.
print('threshold variabile (da -0.95 a +0.9)')

# Report the ALL GREEN classifier on dev for thresholds -0.95, -0.90, ..., 0.90.
for twentieths in range(-19, 19):
  # Derive t from an integer step so no floating-point error accumulates.
  t = round(twentieths / 20, 2)
  print('THRESHOLD: '+ str(t) + '\n')

  print('\n ALL GREEN \n')
  print(classification_report(dev_df['disagreement'], [int(i>=t) for i in pred_tutti_verdi]))

stima NA: NO
soglia coordinate: almeno 10 occorrenze
valori non pesati
threshold variabile (da -0.9 a +0.9)
THRESHOLD: -0.95


 ALL GREEN 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.9


 ALL GREEN 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00      0.82       116

    accuracy                           0.69       168
   macro avg       0.35      0.50      0.41       168
weighted avg       0.48      0.69      0.56       168

THRESHOLD: -0.85


 ALL GREEN 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.69      1.00 

### Performances on Test

In [None]:
# Baseline scores on the test split: aggregate the token coordinates of each
# tweet without estimating anything for the 'NA' entries.
pred_somma = []
pred_tutti_verdi = []
pred_media = []
pred_mediana = []

# total= lets tqdm show a real progress bar (iterrows() has no length).
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
  colors_agreement, _ = tokens_evaluation.get_all_colors(row['tokens_list'], tokens_df_10)

  if 'NA' in colors_agreement:
    # Zero the 'NA' slots so the filter just below drops them.
    for na_index in tokens_evaluation.find_NA_indices(colors_agreement):
      colors_agreement[na_index] = 0

  # Entries equal to 0 (including the former 'NA' slots) are left out of every aggregate.
  colors_agreement = [i for i in colors_agreement if i != 0]

  if colors_agreement:
    pred_somma.append(sum(colors_agreement))
    pred_media.append(np.mean(colors_agreement))
    pred_mediana.append(np.median(colors_agreement))
    pred_tutti_verdi.append(min(colors_agreement))
  else:
    # Nothing left to aggregate: every score falls back to 0.
    pred_somma.append(0)
    pred_media.append(0)
    pred_mediana.append(0)
    pred_tutti_verdi.append(0)

168it [00:00, 590.29it/s]


In [None]:
# One classification report per aggregation on the test split; each entry is
# (heading to print, scores, fixed decision threshold).
test_reports = [
  ('SOMMA \n', pred_somma, 0.9),
  ('\n MEDIA \n', pred_media, 0.4),
  ('\n MEDIANA \n', pred_mediana, 0.4),
  ('\n ALL GREEN \n', pred_tutti_verdi, -0.3),
]

for heading, scores, cutoff in test_reports:
  print(heading)
  # A tweet is predicted as class 1 when its score reaches the cutoff.
  predicted = [int(score >= cutoff) for score in scores]
  print(classification_report(test_df['disagreement'], predicted))

stima NA: NO
soglia coordinate: almeno 10 occorrenze
valori non pesati
SOMMA 

              precision    recall  f1-score   support

           0       0.66      0.37      0.47        52
           1       0.76      0.91      0.83       116

    accuracy                           0.74       168
   macro avg       0.71      0.64      0.65       168
weighted avg       0.73      0.74      0.72       168


 MEDIA 

              precision    recall  f1-score   support

           0       0.54      0.77      0.63        52
           1       0.87      0.71      0.78       116

    accuracy                           0.73       168
   macro avg       0.71      0.74      0.71       168
weighted avg       0.77      0.73      0.74       168


 MEDIANA 

              precision    recall  f1-score   support

           0       0.45      0.42      0.44        52
           1       0.75      0.77      0.76       116

    accuracy                           0.66       168
   macro avg       0.60    

In [None]:
# open file in write mode
with open(r'./Results_NoEstimation/HS_Sum.txt', 'w') as fp:
    for item in pred_somma:
        # write each item on a new line
        fp.write("%s\n" % item)

with open(r'./Results_NoEstimation/HS_Mean.txt', 'w') as fp:
    for item in pred_media:
        # write each item on a new line
        fp.write("%s\n" % item)


with open(r'./Results_NoEstimation/HS_Median.txt', 'w') as fp:
    for item in pred_mediana:
        # write each item on a new line
        fp.write("%s\n" % item)

with open(r'./Results_NoEstimation/HS_Verdi.txt', 'w') as fp:
    for item in pred_tutti_verdi:
        # write each item on a new line
        fp.write("%s\n" % item)

# Mean contextualized
Average the contextualized vectors of each token to obtain a single vector per token.

## Context Embeddings for token in Train

In [None]:
# Cleaned training texts (original_text holds the space-joined tokens at this point).
sentences = train_df['original_text'].tolist()

In [None]:
# OrderedDict is no longer needed by this cell; the import is kept in case
# other cells rely on it.
from collections import OrderedDict

# Parallel lists: context_tokens[k] is a token occurrence in the training
# sentences and context_embeddings[k] is the embedding of that occurrence.
context_embeddings = []
context_tokens = []

for sentence in sentences:
  tokenized_text, list_token_embeddings = BERT_Embeddings.text_to_emb(sentence, tokenizer, model)

  # Skip the first and last entries of tokenized_text
  # (presumably the [CLS]/[SEP] markers; TODO confirm against text_to_emb).
  # enumerate(..., start=1) yields each token's own position directly, so the
  # sentence is not rescanned per token and a token equal to the first entry
  # can no longer be mapped to index 0.
  for position, token in enumerate(tokenized_text[1:-1], start=1):
    context_tokens.append(token)
    context_embeddings.append(list_token_embeddings[position])

In [None]:
def find_token_indices(list_to_check, token):
    """
    Finds every position at which `token` occurs in a list.

    Args:
        list_to_check (list): List to search.
        token: Value to look for; it is compared to each element with ``==``.

    Returns:
        list: Indices, in ascending order, of the elements equal to `token`
        (empty when there is no match).

    """
    return [idx for idx, value in enumerate(list_to_check) if value == token]

In [None]:
def retrieve_elements(lst, indexes):
    """Return the items of `lst` at the given positions, in the order the positions are listed."""
    picked = []
    for position in indexes:
        picked.append(lst[position])
    return picked

In [None]:
# Collapse all occurrences of a token into one vector: the element-wise mean
# of its contextual embeddings.
# A single pass groups the embeddings per token instead of rescanning the whole
# token list for every unique token. Dict insertion order also makes the
# resulting token order reproducible across runs, unlike iterating over a set.
embeddings_by_token = {}
for tk, embedding in zip(context_tokens, context_embeddings):
  embeddings_by_token.setdefault(tk, []).append(embedding)

context_tokens_mean = list(embeddings_by_token)
context_embeddings_mean = [
  torch.mean(torch.stack(vectors), dim=0) for vectors in embeddings_by_token.values()
]

print(len(context_tokens_mean))

2276


In [None]:
# Keep only the averaged tokens that also appear in tokens_df_10['token'].
# The scored tokens are collected into a set once, instead of rebuilding a list
# (and scanning it) for every candidate token.
scored_tokens = set(tokens_df_10['token'])

# token -> embedding; the first embedding wins if a token were listed twice.
kept_embeddings = {}
for tk, embedding in zip(context_tokens_mean, context_embeddings_mean):
  if tk in scored_tokens and tk not in kept_embeddings:
    kept_embeddings[tk] = embedding

context_tokens_mean_2 = list(kept_embeddings)
context_embeddings_mean_2 = list(kept_embeddings.values())

context_embeddings_mean = context_embeddings_mean_2
context_tokens_mean = context_tokens_mean_2

In [None]:
# NOTE(review): same file, separator and occurrences filter as the load in the
# Baseline section; the previous cell already uses tokens_df_10, so this cell
# only rebuilds tokens_df / tokens_df_10.
tokens_df = pd.read_csv('../Nuovi_Scores/brexit_scores.csv', sep='\t')
tokens_df_10 = tokens_df[tokens_df.occurrences >= 10]

### G-Models

#### Threshold on DEV

In [None]:
# Scores on the dev split where each 'NA' coordinate is estimated from the
# rows of distances_df returned by BERT_Embeddings.find_similar_words.
pred_somma = []
pred_tutti_verdi = []
pred_media = []
pred_mediana = []
# Minimum distances_df['distance'] value for a row to contribute to an estimate.
threshold=0.7

# total= lets tqdm show a real progress bar (iterrows() has no length).
for _, row in tqdm(dev_df.iterrows(), total=len(dev_df)):
  colors_agreement, _ = tokens_evaluation.get_all_colors(row['tokens_list'], tokens_df_10)

  if 'NA' in colors_agreement:
    similar_words, distances_df, new_words = BERT_Embeddings.find_similar_words(
        row['original_text'], tokens_evaluation.find_NA_indices(colors_agreement),
        tokenizer, context_tokens_mean, context_embeddings_mean, model, tokens_df_10)
    for new_word in new_words:
      # Filter the rows for this word once; the original repeated this
      # selection for the test, the values and the weights.
      neighbours = distances_df.loc[
          (distances_df['new_token'] == new_word) & (distances_df['distance'] >= threshold)]
      # Each pass fills the first slot that is still 'NA'.
      na_position = tokens_evaluation.find_NA_indices(colors_agreement)[0]

      if len(neighbours) > 0:
        # Average of the selected coordinates, weighted by their `distance` value.
        # NOTE(review): the test-split cell uses np.mean of coordinate*distance
        # products instead of this weighted average. Confirm which one is intended.
        stimated_coordinate = np.average(
            list(neighbours['Agreement_coordinate']),
            weights=list(neighbours['distance']))
        colors_agreement[na_position] = stimated_coordinate
      else: #if there isn't any word above the threshold
        colors_agreement[na_position] = 0

  # Entries equal to 0 (including the 'NA' slots that got no estimate) are left out of every aggregate.
  colors_agreement = [i for i in colors_agreement if i != 0]

  if colors_agreement:
    pred_somma.append(sum(colors_agreement))
    pred_media.append(np.mean(colors_agreement))
    pred_mediana.append(np.median(colors_agreement))
    pred_tutti_verdi.append(min(colors_agreement))
  else:
    # Nothing left to aggregate: every score falls back to 0.
    pred_somma.append(0)
    pred_media.append(0)
    pred_mediana.append(0)
    pred_tutti_verdi.append(0)

168it [00:38,  4.37it/s]


In [None]:
# Grid-search the SOMMA decision threshold on dev, keeping the first value
# that reaches the best macro-F1.
best_t = 0
best_f1 = 0
# floor/ceil instead of round(): rounding to the nearest integer can leave the
# lowest or highest scores outside the grid (e.g. a maximum of 0.4 rounds to 0).
# The +0.05 makes np.arange include the upper bound itself.
for t in np.arange(np.floor(min(pred_somma)), np.ceil(max(pred_somma)) + 0.05, 0.1):
  t = round(t,1)
  report = classification_report(dev_df['disagreement'], [int(i>=t) for i in pred_somma], output_dict=True)
  if report['macro avg']['f1-score'] > best_f1:
    best_f1 = report['macro avg']['f1-score']
    best_t = t

print('SOMMA \n')
print('THRESHOLD: '+ str(best_t) + '\n')
print(classification_report(dev_df['disagreement'], [int(i>=best_t) for i in pred_somma] ))

SOMMA 

THRESHOLD: 1.9

              precision    recall  f1-score   support

           0       0.61      0.42      0.50        52
           1       0.77      0.88      0.82       116

    accuracy                           0.74       168
   macro avg       0.69      0.65      0.66       168
weighted avg       0.72      0.74      0.72       168



In [None]:
# Grid-search the MEDIA decision threshold on dev, keeping the first value
# that reaches the best macro-F1.
best_t = 0
best_f1 = 0
pred = pred_media
# floor/ceil instead of round(): rounding to the nearest integer can leave the
# lowest or highest scores outside the grid (e.g. a maximum of 0.4 rounds to 0).
# The +0.05 makes np.arange include the upper bound itself.
for t in np.arange(np.floor(min(pred)), np.ceil(max(pred)) + 0.05, 0.1):
  t = round(t,1)
  report = classification_report(dev_df['disagreement'], [int(i>=t) for i in pred], output_dict=True)
  if report['macro avg']['f1-score'] > best_f1:
    best_f1 = report['macro avg']['f1-score']
    best_t = t

print('MEDIA \n')
print('THRESHOLD: '+ str(best_t) + '\n')
print(classification_report(dev_df['disagreement'], [int(i>=best_t) for i in pred] ))

MEDIA 

THRESHOLD: 0.4

              precision    recall  f1-score   support

           0       0.55      0.71      0.62        52
           1       0.85      0.74      0.79       116

    accuracy                           0.73       168
   macro avg       0.70      0.73      0.71       168
weighted avg       0.76      0.73      0.74       168



In [None]:
# Grid-search the MEDIANA decision threshold on dev, keeping the first value
# that reaches the best macro-F1.
best_t = 0
best_f1 = 0
pred = pred_mediana
# floor/ceil instead of round(): rounding to the nearest integer can leave the
# lowest or highest scores outside the grid (e.g. a maximum of 0.4 rounds to 0).
# The +0.05 makes np.arange include the upper bound itself.
for t in np.arange(np.floor(min(pred)), np.ceil(max(pred)) + 0.05, 0.1):
  t = round(t,1)
  report = classification_report(dev_df['disagreement'], [int(i>=t) for i in pred], output_dict=True)
  if report['macro avg']['f1-score'] > best_f1:
    best_f1 = report['macro avg']['f1-score']
    best_t = t

print('MEDIANA \n')
print('THRESHOLD: '+ str(best_t) + '\n')
print(classification_report(dev_df['disagreement'], [int(i>=best_t) for i in pred] ))

MEDIANA 

THRESHOLD: 0.4

              precision    recall  f1-score   support

           0       0.54      0.42      0.47        52
           1       0.76      0.84      0.80       116

    accuracy                           0.71       168
   macro avg       0.65      0.63      0.64       168
weighted avg       0.69      0.71      0.70       168



In [None]:
# Grid-search the ALL GREEN decision threshold on dev, keeping the first value
# that reaches the best macro-F1.
best_t = 0
best_f1 = 0
pred = pred_tutti_verdi
# floor/ceil instead of round(): rounding to the nearest integer can leave the
# lowest or highest scores outside the grid (e.g. a maximum of 0.4 rounds to 0).
# The +0.05 makes np.arange include the upper bound itself.
for t in np.arange(np.floor(min(pred)), np.ceil(max(pred)) + 0.05, 0.1):
  t = round(t,1)
  report = classification_report(dev_df['disagreement'], [int(i>=t) for i in pred], output_dict=True)
  if report['macro avg']['f1-score'] > best_f1:
    best_f1 = report['macro avg']['f1-score']
    best_t = t

print(' ALL GREEN \n')
print('THRESHOLD: '+ str(best_t) + '\n')
print(classification_report(dev_df['disagreement'], [int(i>=best_t) for i in pred] ))

 ALL GREEN 

THRESHOLD: -0.3

              precision    recall  f1-score   support

           0       0.62      0.71      0.66        52
           1       0.86      0.80      0.83       116

    accuracy                           0.77       168
   macro avg       0.74      0.76      0.75       168
weighted avg       0.79      0.77      0.78       168



In [None]:
# Same threshold scan for the min-based ("all green") score, but ties are
# resolved with '>=' so the HIGHEST threshold reaching the best macro-averaged
# F1 against dev_df['disagreement'] is retained.
best_t = 0
best_f1 = 0
pred = pred_tutti_verdi
# Bound the grid with floor/ceil instead of round(): round() can pull either
# bound inside the observed score range (e.g. round(-0.4) == 0), which silently
# drops candidate thresholds or even leaves the grid empty (best_t stays 0).
grid_start = np.floor(min(pred))
grid_stop = np.ceil(max(pred))
for t in np.arange(grid_start, grid_stop, 0.1):
  t = round(t,1)  # strip the floating-point drift produced by np.arange
  report = classification_report(dev_df['disagreement'], [int(i>=t) for i in pred], output_dict=True)
  # '>=' lets a later (larger) threshold replace an earlier one with equal F1.
  if report['macro avg']['f1-score'] >= best_f1:
    best_f1 = report['macro avg']['f1-score']
    best_t = t

print(' ALL GREEN \n')
print('THRESHOLD: '+ str(best_t) + '\n')
print(classification_report(dev_df['disagreement'], [int(i>=best_t) for i in pred] ))

 ALL GREEN 

THRESHOLD: -0.3

              precision    recall  f1-score   support

           0       0.62      0.71      0.66        52
           1       0.86      0.80      0.83       116

    accuracy                           0.77       168
   macro avg       0.74      0.76      0.75       168
weighted avg       0.79      0.77      0.78       168



#### Performance on the Test Set

In [None]:
# Score every row of test_df with four aggregations (sum, mean, median, min)
# of its per-token agreement coordinates.
pred_somma = []
pred_tutti_verdi = []
pred_media = []
pred_mediana = []
# Rows of distances_df are used only when their 'distance' value is >= this.
# NOTE(review): a ">=" cut-off suggests the column holds a similarity rather
# than a distance -- confirm against BERT_Embeddings.find_similar_words.
threshold=0.7

for _, row in tqdm(test_df.iterrows()):
  colors_agreement, _ = tokens_evaluation.get_all_colors(row['tokens_list'], tokens_df_10)

  if 'NA' in colors_agreement:
    na_positions = tokens_evaluation.find_NA_indices(colors_agreement)
    similar_words, distances_df, new_words = BERT_Embeddings.find_similar_words(
        row['original_text'], na_positions, tokenizer, context_tokens_mean,
        context_embeddings_mean, model, tokens_df_10)

    for word in new_words:
      # Neighbours of this word that pass the cut-off (computed once per word).
      same_word = distances_df['new_token']==word
      above_cutoff = distances_df['distance']>=threshold
      neighbours = distances_df.loc[same_word & above_cutoff]

      # Each iteration fills the first slot that is still 'NA'.
      target_idx = tokens_evaluation.find_NA_indices(colors_agreement)[0]

      if len(neighbours) == 0:
        # No neighbour passes the cut-off: use 0, which is filtered out below.
        colors_agreement[target_idx] = 0
        continue

      # Mean of (agreement coordinate * distance value) over the neighbours.
      weighted_coordinates = [coordinate*weight for coordinate, weight in
                              zip(neighbours['Agreement_coordinate'], neighbours['distance'])]
      estimated_coordinate = np.mean(weighted_coordinates)
      colors_agreement[target_idx] = estimated_coordinate

  # Zero-valued coordinates do not take part in the aggregations.
  colors_agreement = [value for value in colors_agreement if value != 0]

  if not colors_agreement:
    # Nothing left to aggregate: every strategy scores this row as 0.
    for scores in (pred_somma, pred_media, pred_mediana, pred_tutti_verdi):
      scores.append(0)
    continue

  pred_somma.append(sum(colors_agreement))
  pred_media.append(np.mean(colors_agreement))
  pred_mediana.append(np.median(colors_agreement))
  pred_tutti_verdi.append(min(colors_agreement))

168it [00:41,  4.09it/s]


In [None]:
# Report test-set performance for each aggregation, binarising its scores
# with a fixed decision threshold (score >= threshold -> label 1).
test_runs = (
  ('SOMMA \n', pred_somma, 1.9),
  ('\n MEDIA \n', pred_media, 0.4),
  ('\n MEDIANA \n', pred_mediana, 0.4),
  ('\n ALL GREEN \n', pred_tutti_verdi, -0.3),
)

for header, scores, cutoff in test_runs:
  print(header)
  predicted_labels = [int(score>=cutoff) for score in scores]
  print(classification_report(test_df['disagreement'], predicted_labels))

stima NA: NO
soglia coordinate: almeno 10 occorrenze
valori non pesati
SOMMA 

              precision    recall  f1-score   support

           0       0.57      0.38      0.46        52
           1       0.76      0.87      0.81       116

    accuracy                           0.72       168
   macro avg       0.67      0.63      0.64       168
weighted avg       0.70      0.72      0.70       168


 MEDIA 

              precision    recall  f1-score   support

           0       0.54      0.71      0.61        52
           1       0.85      0.72      0.78       116

    accuracy                           0.72       168
   macro avg       0.69      0.72      0.70       168
weighted avg       0.75      0.72      0.73       168


 MEDIANA 

              precision    recall  f1-score   support

           0       0.40      0.42      0.41        52
           1       0.73      0.72      0.72       116

    accuracy                           0.62       168
   macro avg       0.57    

In [None]:
# Dump each list of test scores to its own text file, one value per line.
score_files = (
    (r'./Brexit_G-Sum.txt', pred_somma),
    (r'./Brexit_G-Mean.txt', pred_media),
    (r'./Brexit_G-Median.txt', pred_mediana),
    (r'./Brexit_G-Min.txt', pred_tutti_verdi),
)

for out_path, scores in score_files:
    # Mode 'w' creates the file or overwrites an existing one.
    with open(out_path, 'w') as fp:
        fp.writelines("%s\n" % score for score in scores)