In [None]:
!pip install -qq transformers

#DATASET DOWNLOAD

In [None]:
'''
---Example--- 
!gdown --id 1xIGVP3cKUOfwFamGR7htTEyQPZnm3jby
!gdown --id 1UyoKR5dDx5YaU0m-lXMxsT5eDIDonnzl
!gdown --id 1Tyi6x3h_PMcNiuZH_kDpXsRv4B-ciLSk
'''

!gdown --id 1xIGVP3cKUOfwFamGR7htTEyQPZnm3jby
!gdown --id 1QHrOFzTdP53oh5L8c0avAQgrD0zvUJSK

from google.colab import drive
drive.mount('/content/gdrive')


Downloading...
From: https://drive.google.com/uc?id=1xIGVP3cKUOfwFamGR7htTEyQPZnm3jby
To: /content/TRAIN.csv
100% 480k/480k [00:00<00:00, 15.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QHrOFzTdP53oh5L8c0avAQgrD0zvUJSK
To: /content/TEST.csv
100% 238k/238k [00:00<00:00, 34.7MB/s]
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# IMPORT AND MODELS PATH

This section imports all the libraries used in the notebook and defines the Google Drive paths that contain the saved models and the tagged datasets.

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForMaskedLM
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split, TensorDataset
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
# Seed every random number generator the notebook relies on. PyTorch is seeded
# in addition to NumPy because the dropout layer and the freshly initialised
# classification layer draw from PyTorch's generator, not from NumPy's.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#HERE DEFINE MODELS PATH
path = F"/content/gdrive/My Drive/Models/"
path_pickle = F"/content/gdrive/My Drive/SardiStance/colab work/pickle/"



# SET PICKLE NAME

In [None]:
# The pickle file contain the tagged dataset

TRAINING_PICKLE_NAME = "training"
TEST_PICKLE_NAME = "test"

# PUT DATASET INTO PANDAS DATAFRAMES

To more easily process the input data we use the pandas library that allows us to save all the necessary information in data structures called dataframes.


In [None]:
from sklearn.utils import shuffle

#The input dataset can be saved in two extensions, as pickle or as csv, and depending on the extension a different function must be used.

#FOR PICKLE EXTENSION
'''df = pd.read_pickle(path_pickle + TRAINING_PICKLE_NAME + "_pickle.pkl")
df_test = pd.read_pickle(path_pickle + TEST_PICKLE_NAME + "_pickle.pkl")
df = shuffle(df)'''

#FOR CSV EXTENSION

# Read the train / test splits from the CSV files in the working directory.
df = pd.read_csv("TRAIN.csv")
df_test = pd.read_csv("TEST.csv")
# Shuffle with an explicit seed: without it the permutation depends on how many
# random numbers were already drawn, so re-running this cell would silently
# change the row order and therefore the cross-validation folds.
df = shuffle(df, random_state=RANDOM_SEED)


# CREATING A NEW "STANCE" COLUMN IN THE DATAFRAME

To handle the data more easily, we add a `stance` column to the dataframe that encodes the labels as integers: AGAINST → 0, NONE → 1 and FAVOR → 2.

In [None]:
def to_stance(stance):
  """Map a textual stance label to its integer class id.

  The value is first converted with ``str``. "AGAINST" maps to 0 and "NONE"
  maps to 1; every other value is treated as FAVOR and maps to 2.
  """
  explicit_ids = {"AGAINST": 0, "NONE": 1}
  # Anything that is not one of the two explicit labels falls through to 2.
  return explicit_ids.get(str(stance), 2)
df['stance'] = df.label.apply(to_stance)
#df_test['stance'] = df_test.label.apply(to_stance)


In [None]:
#HERE DEFINE THE NAME OF THE CLASSES
class_names = ['AGAINST', 'NONE', 'FAVOR']

# DEFINING THE PRE-TRAINED MODEL AND TOKENIZER

In [None]:
UMBERTO_PRE_TRAINED = "Musixmatch/umberto-commoncrawl-cased-v1"
ALBERTO = "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
MULTILINGUAL = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MULTILINGUAL)
#tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




In [None]:
#NUMBER OF TOKENS ADMITTED
MAX_LEN = 125

# DATASET CLASS

Now we create the class that contains the information of each tweet and then we use it to create the data structure containing the tokenized phrases.

In [None]:
class StanceDataset(Dataset):
  """Labelled tweets exposed as a PyTorch ``Dataset``.

  The three containers ``tweets_id``, ``tweets`` and ``labels`` are indexed
  with the same position, so element ``i`` of each describes one tweet.
  Tokenization happens lazily, every time an item is requested.
  """

  def __init__(self, tweets_id, tweets, labels, tokenizer, max_len):
    self.tweets_id, self.tweets, self.labels = tweets_id, tweets, labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of tweets in the dataset."""
    return len(self.tweets)

  def __getitem__(self, item):
    """Return the tokenized tweet at position ``item`` as a dict.

    Keys: 'tweet_id' and 'labels' (long tensors), 'tweet' (the raw text) and
    the flattened 'input_ids' / 'attention_mask' produced by the tokenizer.
    """
    text = str(self.tweets[item])
    identifier = int(self.tweets_id[item])
    target = self.labels[item]

    # Ask the tokenizer for max_len-padded, truncated PyTorch tensors.
    tokenizer_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

    sample = {
      'tweet_id': torch.tensor(identifier, dtype=torch.long),
      'tweet': text,
    }
    # The tokenizer output carries a leading batch axis; flatten it away.
    for field in ('input_ids', 'attention_mask'):
      sample[field] = encoded[field].flatten()
    sample['labels'] = torch.tensor(target, dtype=torch.long)
    return sample

In [None]:
class StanceDatasetTest(Dataset):
  """Unlabelled tweets exposed as a PyTorch ``Dataset``.

  ``tweets_id`` and ``tweets`` are indexed with the same position. Items are
  tokenized lazily on access and carry no 'labels' entry.
  """

  def __init__(self, tweets_id, tweets, tokenizer, max_len):
    self.tweets_id, self.tweets = tweets_id, tweets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of tweets in the dataset."""
    return len(self.tweets)

  def __getitem__(self, item):
    """Return the tokenized tweet at position ``item`` as a dict.

    Keys: 'tweet_id' (long tensor), 'tweet' (the raw text) and the flattened
    'input_ids' / 'attention_mask' produced by the tokenizer.
    """
    text = str(self.tweets[item])
    identifier = int(self.tweets_id[item])

    # Ask the tokenizer for max_len-padded, truncated PyTorch tensors.
    tokenizer_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

    sample = {
      'tweet_id': torch.tensor(identifier, dtype=torch.long),
      'tweet': text,
    }
    # The tokenizer output carries a leading batch axis; flatten it away.
    for field in ('input_ids', 'attention_mask'):
      sample[field] = encoded[field].flatten()
    return sample

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a labelled dataframe in a ``DataLoader`` of ``StanceDataset`` items.

  Reads the ``tweet_id``, ``text`` and ``stance`` columns of ``df``. Batches
  are produced in dataframe order (no shuffling) by 4 worker processes.
  """
  columns = dict(
    tweets_id=df.tweet_id.to_numpy(),
    tweets=df.text.to_numpy(),
    labels=df.stance.to_numpy(),
  )
  dataset = StanceDataset(tokenizer=tokenizer, max_len=max_len, **columns)
  return DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [None]:
def create_data_loader_test(df, tokenizer, max_len, batch_size):
  """Wrap an unlabelled dataframe in a ``DataLoader`` of ``StanceDatasetTest`` items.

  Reads the ``tweet_id`` and ``text`` columns of ``df``. Batches are produced
  in dataframe order (no shuffling) by 4 worker processes.
  """
  columns = dict(
    tweets_id=df.tweet_id.to_numpy(),
    tweets=df.text.to_numpy(),
  )
  dataset = StanceDatasetTest(tokenizer=tokenizer, max_len=max_len, **columns)
  return DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# BERT CLASSIFIER


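This class loads a pre-trained BERT-style encoder (the multilingual BERT checkpoint in the current configuration; identifiers for UmBERTo and AlBERTo are also defined above) and adds a dropout layer followed by a linear layer that produces one score per stance class.
Dropout is only active while the model is in training mode, where it acts as a regulariser; it is switched off when the model is put in evaluation mode.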

In [None]:
class StanceClassifier(nn.Module):
  """Pre-trained transformer encoder with a dropout + linear classification head.

  The encoder is loaded from the checkpoint named by ``MULTILINGUAL``. To try a
  different encoder, pass another identifier (for example ``UMBERTO_PRE_TRAINED``
  or ``ALBERTO``) to ``AutoModel.from_pretrained`` below.

  Args:
    n_classes: number of output classes of the linear head.
  """

  def __init__(self, n_classes):
    super(StanceClassifier, self).__init__()
    # No force_download: a new classifier is built for every cross-validation
    # fold, and forcing the download fetched the full checkpoint again each
    # time. Add force_download=True back only to repair a corrupted cache.
    self.bert = AutoModel.from_pretrained(MULTILINGUAL)
    self.drop = nn.Dropout(p=0.4)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    # Kept as an attribute for callers that want probabilities; forward()
    # itself returns raw logits and does not apply it.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalised class scores (logits) for a batch."""
    encoder_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Take the pooled representation by position. Indexing works both when the
    # encoder returns a plain tuple and when it returns a dict-like output
    # object; tuple-unpacking the latter would yield its field names instead
    # of tensors.
    pooled_output = encoder_output[1]

    output = self.drop(pooled_output)
    return self.out(output)

  def save_pretrained(self, path):
    """Save the encoder to ``path``.

    Only ``self.bert`` is written; the dropout / linear head is not saved by
    this method.
    """
    self.bert.save_pretrained(path)

# MODEL TRAINING

Here is a helper function that trains our model for one epoch.

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over ``data_loader`` and report training metrics.

  For every batch the loss is back-propagated, gradients are clipped to a
  maximum norm of 1.0, and both the optimizer and the scheduler are stepped.
  ``n_examples`` is accepted but not used.

  Returns:
    (accuracy, mean batch loss, precision, recall, f1). Precision, recall and
    F1 are macro-averaged over the classes that were actually predicted.
  """
  model = model.train()

  batch_preds, batch_labels = [], []
  batch_losses = []
  n_correct = 0  # running count of correct predictions (not returned)

  for batch in data_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    logits = model(input_ids=input_ids, attention_mask=attention_mask)
    preds = torch.max(logits, dim=1)[1]
    loss = loss_fn(logits, labels)

    batch_preds.append(preds.cpu().data)
    batch_labels.append(labels.cpu().data)
    n_correct += torch.sum(preds == labels)
    batch_losses.append(loss.item())

    # Backward pass, gradient clipping, parameter and learning-rate update.
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  y_pred = np.concatenate(batch_preds, axis=0)
  y_true = np.concatenate(batch_labels, axis=0)

  #SYSTEM MEASURES
  predicted_classes = np.unique(y_pred)
  f1 = f1_score(y_true, y_pred, labels=predicted_classes, average="macro")
  precision = precision_score(y_true, y_pred, labels=predicted_classes, average="macro")
  recall = recall_score(y_true, y_pred, labels=predicted_classes, average="macro")
  accuracy = accuracy_score(y_true, y_pred)

  return accuracy, np.mean(batch_losses), precision, recall, f1

# MODEL EVALUATION
Let’s write another helper function to evaluate the model on a given data loader

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on a labelled ``data_loader`` without updating it.

  ``n_examples`` is accepted but not used.

  Returns:
    (accuracy, mean batch loss, predictions, precision, recall, f1) where
    ``predictions`` is a dict with 'tweet_id' (a list holding one id tensor
    per batch), 'preds' and 'exact' (flat arrays of predicted / true class
    ids). F1 is macro-averaged over class ids 0 and 2 only; precision and
    recall are macro-averaged over the classes that were actually predicted.
  """
  model = model.eval()

  batch_losses = []
  batch_preds, batch_labels, ids = [], [], []
  n_correct = 0  # running count of correct predictions (not returned)

  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      logits = model(input_ids=input_ids, attention_mask=attention_mask)
      preds = torch.max(logits, dim=1)[1]
      batch_losses.append(loss_fn(logits, labels).item())

      n_correct += torch.sum(preds == labels)
      batch_preds.append(preds.cpu().data)
      batch_labels.append(labels.cpu().data)
      ids.append(batch["tweet_id"].cpu().data)

  y_pred = np.concatenate(batch_preds, axis=0)
  y_true = np.concatenate(batch_labels, axis=0)
  predictions = {"tweet_id":ids,"preds":y_pred,"exact":y_true}

  #SYSTEM MEASURES
  predicted_classes = np.unique(y_pred)
  f1 = f1_score(y_true, y_pred, labels=[0,2], average="macro")
  precision = precision_score(y_true, y_pred, labels=predicted_classes, average="macro")
  recall = recall_score(y_true, y_pred, labels=predicted_classes, average="macro")
  accuracy = accuracy_score(y_true, y_pred)

  return accuracy, np.mean(batch_losses), predictions, precision, recall, f1

In [None]:
def predict_test(model, data_loader, loss_fn, device, n_examples):
  """Predict a class id for every item of an unlabelled ``data_loader``.

  ``loss_fn`` and ``n_examples`` are accepted but not used.

  Returns:
    dict with 'tweet_id' (a list holding one id tensor per batch) and 'preds'
    (a flat array with the predicted class id of every item).
  """
  model = model.eval()

  batch_preds, ids = [], []

  with torch.no_grad():
    for batch in data_loader:
      logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device)
      )
      # Position of the highest score along the class axis.
      batch_preds.append(torch.max(logits, dim=1)[1].cpu().data)
      ids.append(batch["tweet_id"].cpu().data)

  return {"tweet_id":ids,"preds":np.concatenate(batch_preds, axis=0)}

In [None]:
df_train = df

#BATCH SIZE (NUMBER OF TWEETS PER BATCH)
BATCH_SIZE = 32

#train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader_test(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
#val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

# TRAINING CROSS VALIDATION
Cross-validation is a technique for evaluating ML models by training several ML models on subsets of the available input data and evaluating them on the complementary subset of the data

In [None]:
def n_cross_validation(n, df, epochs):
  """Train one fresh classifier per fold and keep its best test-set predictions.

  ``df`` is cut into ``n`` contiguous validation slices of ``len(df) // n``
  rows each; leftover rows at the end are never used for validation and stay
  in every training split. For each fold a new ``StanceClassifier`` is trained
  for ``epochs`` epochs on the remaining rows and evaluated on the slice after
  every epoch. Whenever the validation F1 improves, the model's predictions on
  the global ``test_data_loader`` replace the ones stored for that fold.

  Args:
    n: number of folds.
    df: training dataframe with the ``tweet_id``, ``text`` and ``stance`` columns.
    epochs: number of training epochs per fold.

  Returns:
    dict mapping the fold number as a string ("1" .. str(n)) to the output of
    ``predict_test`` at that fold's best epoch. A fold whose validation F1
    never rises above 0 has no entry.
  """
  splitting_size = int(len(df)/n)
  all_models_predictions = {}
  accuracy_measures = []

  print("Splitting size: {}".format(splitting_size))
  for index in range(1, n + 1):
    # Validation slice of this fold; everything else is used for training.
    df_val = df.iloc[splitting_size * (index-1) : index * splitting_size]
    df_train_splitted = df.drop(df_val.index)
    val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
    splitted_train_data_loader = create_data_loader(df_train_splitted, tokenizer, MAX_LEN, BATCH_SIZE)

    # Every fold starts from the pre-trained weights.
    model = StanceClassifier(len(class_names)).to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

    # Linear decay over the whole run, after a warm-up of half an epoch.
    total_steps = len(splitted_train_data_loader) * epochs
    warmup_step = int(len(splitted_train_data_loader)/2)
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=warmup_step,
      num_training_steps=total_steps
    )
    loss_fn = nn.CrossEntropyLoss().to(device)

    print(f'DF_TRAIN_SPLITTED_LEN : {len(df_train_splitted)}\nDF_VAL_LEN : {len(df_val)}')

    best_f1 = 0
    val_acc = None
    for epoch in range(epochs):
      print(f'___________Model {index}___________')
      print(f'Epoch {epoch + 1}/{epochs}')
      print('-' * 10)

      train_acc, train_loss, p, r, f1 = train_epoch(
        model,
        splitted_train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train_splitted)
      )
      print(f'Train loss {train_loss} accuracy {train_acc} precision {p} recall {r} f1 {f1}')

      val_acc, val_loss, _, p, r, f1 = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
      )
      print(f'Val   loss {val_loss} accuracy {val_acc} precision {p} f1 {f1} ')

      # Only run test-set inference when its result is going to be kept.
      if f1 > best_f1:
        best_f1 = f1
        all_models_predictions[f"{index}"] = predict_test(
          model,
          test_data_loader,
          loss_fn,
          device,
          len(test_data_loader.dataset)
        )

    # Validation accuracy of the last epoch of this fold.
    accuracy_measures.append(val_acc)

    # Drop this fold's model before the next one is built so that two
    # copies never have to fit on the device at the same time.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
      torch.cuda.empty_cache()

  print(f'folds_accuracy_measures {accuracy_measures}')

  return all_models_predictions

# SAVE SYSTEM PREDICTION TO FILE

In [None]:
import json

def _json_key(key):
  """Turn a NumPy / tensor scalar used as a dict key into a plain Python value."""
  return key.item() if hasattr(key, "item") else key


def _to_jsonable(value):
  """Recursively convert arrays, tensors and their scalars to JSON-friendly types."""
  if isinstance(value, dict):
    return {_json_key(k): _to_jsonable(v) for k, v in value.items()}
  if isinstance(value, (list, tuple)):
    return [_to_jsonable(v) for v in value]
  if hasattr(value, "tolist"):
    # NumPy arrays / scalars and PyTorch tensors all expose tolist().
    return _to_jsonable(value.tolist())
  return value


def save_predictions_to_file(predictions, model_name):
  """Write ``predictions`` as JSON to ``./predictions_<model_name>.json``.

  Arrays and tensors inside ``predictions`` are converted to nested lists
  first, because ``json`` cannot serialise them directly. The file is closed
  before the function returns.

  Returns:
    True once the file has been written.
  """
  import json

  with open(f"./predictions_{model_name}.json", mode="w") as file:
    file.write(json.dumps(_to_jsonable(predictions)))
  return True

In [None]:
#SET NUMBER OF EPOCHS AND SPLITTING SIZE
SPLITTING_SIZE = 5
EPOCHS = 5

predictions = n_cross_validation(SPLITTING_SIZE,df_train,EPOCHS)

Splitting size: 426


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…


DF_TRAIN_SPLITTED_LEN : 1706
DF_VAL_LEN : 426
___________Model 1___________
Epoch 1/5
----------
Train loss 1.060581644376119 accuracy 0.4712778429073857 precision 0.3098328686563981 recall 0.33114058142510044 f1 0.2547337054606117
Val   loss 1.0724142917564936 accuracy 0.4460093896713615 precision 0.4460093896713615 f1 0.30844155844155846 
___________Model 1___________
Epoch 2/5
----------
Train loss 1.0202768135953832 accuracy 0.4988276670574443 precision 0.46068052015251304 recall 0.3730134396147773 f1 0.32518903103492375
Val   loss 1.0706968435219355 accuracy 0.4953051643192488 precision 0.648943648943649 f1 0.5304901614827511 
___________Model 1___________
Epoch 3/5
----------
Train loss 0.9545148920129847 accuracy 0.5492379835873388 precision 0.4782533259899077 recall 0.45326914843367766 f1 0.4199922843640855
Val   loss 1.1671809468950545 accuracy 0.49061032863849763 precision 0.433702411782323 f1 0.48670314158243433 
___________Model 1___________
Epoch 4/5
----------
Train loss

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…


DF_TRAIN_SPLITTED_LEN : 1706
DF_VAL_LEN : 426
___________Model 2___________
Epoch 1/5
----------
Train loss 1.0600369451222595 accuracy 0.4630715123094959 precision 0.3410314972383938 recall 0.343201961228139 f1 0.2724135900606489
Val   loss 0.9819961615971157 accuracy 0.5516431924882629 precision 0.5249784668389319 f1 0.5760614713188013 
___________Model 2___________
Epoch 2/5
----------
Train loss 1.014835278193156 accuracy 0.5158264947245017 precision 0.4643133882544481 recall 0.42207464819667634 f1 0.38631010271550736
Val   loss 0.979046357529504 accuracy 0.568075117370892 precision 0.5786265182653674 f1 0.5469738501958626 
___________Model 2___________
Epoch 3/5
----------
Train loss 0.815478777995816 accuracy 0.6389214536928488 precision 0.6143717702666639 recall 0.5826000212262524 f1 0.582498291394343
Val   loss 1.0627101872648512 accuracy 0.5704225352112676 precision 0.535405692032198 f1 0.5473001824463337 
___________Model 2___________
Epoch 4/5
----------
Train loss 0.587006

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…


DF_TRAIN_SPLITTED_LEN : 1706
DF_VAL_LEN : 426
___________Model 3___________
Epoch 1/5
----------
Train loss 1.0616586131078225 accuracy 0.4536928487690504 precision 0.3426383652608942 recall 0.3328379176384948 f1 0.2625941852244254
Val   loss 1.022812430347715 accuracy 0.5093896713615024 precision 0.5093896713615024 f1 0.3374805598755832 
___________Model 3___________
Epoch 2/5
----------
Train loss 1.0274282153005954 accuracy 0.49531066822977726 precision 0.32995167483119286 recall 0.38216964652692154 f1 0.3248045706180948
Val   loss 0.9534362724849156 accuracy 0.568075117370892 precision 0.5609803351738836 f1 0.5997479747974797 
___________Model 3___________
Epoch 3/5
----------
Train loss 0.900802453910863 accuracy 0.5715123094958968 precision 0.7206373709306714 recall 0.4825849844094834 f1 0.42745748366008796
Val   loss 1.0765790471008845 accuracy 0.5704225352112676 precision 0.5622038479885514 f1 0.6034421067697151 
___________Model 3___________
Epoch 4/5
----------
Train loss 0.

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…


DF_TRAIN_SPLITTED_LEN : 1706
DF_VAL_LEN : 426
___________Model 4___________
Epoch 1/5
----------
Train loss 1.065870346846404 accuracy 0.4566236811254396 precision 0.3723939665052485 recall 0.3416643645749255 f1 0.2968555729934183
Val   loss 1.0599404530865806 accuracy 0.47183098591549294 precision 0.47183098591549294 f1 0.3205741626794258 
___________Model 4___________
Epoch 2/5
----------
Train loss 1.0563584257055212 accuracy 0.48534583821805394 precision 0.4788022064617809 recall 0.3355905120451264 f1 0.2254437325583022
Val   loss 1.057251742907933 accuracy 0.47183098591549294 precision 0.47183098591549294 f1 0.3205741626794258 
___________Model 4___________
Epoch 3/5
----------
Train loss 1.0429720315668318 accuracy 0.4876905041031653 precision 0.5767039874287959 recall 0.5057513372819872 f1 0.3397323326007954
Val   loss 1.0473237761429377 accuracy 0.47183098591549294 precision 0.47183098591549294 f1 0.3205741626794258 
___________Model 4___________
Epoch 4/5
----------
Train los

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…


DF_TRAIN_SPLITTED_LEN : 1706
DF_VAL_LEN : 426
___________Model 5___________
Epoch 1/5
----------
Train loss 1.072938522806874 accuracy 0.458968347010551 precision 0.32196904342618404 recall 0.3256199484871883 f1 0.25163245358446346
Val   loss 1.080173637185778 accuracy 0.3474178403755869 precision 0.47149894440534834 f1 0.3614287350116962 
___________Model 5___________
Epoch 2/5
----------
Train loss 1.0400466996210593 accuracy 0.4742086752637749 precision 0.38127974680639687 recall 0.349671942257378 f1 0.2949938408026222
Val   loss 0.9896987208298275 accuracy 0.5305164319248826 precision 0.5192320200452677 f1 0.5706982281672386 
___________Model 5___________
Epoch 3/5
----------
Train loss 0.9724753896395365 accuracy 0.5398593200468933 precision 0.5079038112570978 recall 0.46085104759026335 f1 0.46165351461431064
Val   loss 0.9972238370350429 accuracy 0.5258215962441315 precision 0.561244019138756 f1 0.5696831983139037 
___________Model 5___________
Epoch 4/5
----------
Train loss 0.

In [None]:
def create_predictions(predictions):
  """Flatten the per-batch tweet ids of every fold into one 1-D array.

  ``predictions`` maps a fold name to a dict whose 'tweet_id' entry is a list
  of per-batch id arrays / tensors. The entry is replaced in place by their
  concatenation and the same dict is returned.

  The function is idempotent: calling it again on already flattened ids
  leaves them unchanged instead of raising, so the cell can be re-run safely.
  """
  for fold in predictions.values():
    # atleast_1d lets per-batch arrays and the scalars obtained by iterating
    # an already flat array go through the same concatenation.
    chunks = [np.atleast_1d(np.asarray(chunk)) for chunk in fold['tweet_id']]
    if chunks:
      fold['tweet_id'] = np.concatenate(chunks, axis=0)
    else:
      fold['tweet_id'] = np.empty(0, dtype=np.int64)

  return predictions

In [None]:
final_preds = create_predictions(predictions)

ValueError: ignored

# VOTING FUNCTION


After obtaining the classifications produced by each fold of the cross-validation, we apply a voting strategy: the final prediction for a tweet is the class predicted by the largest number of folds.

In [None]:
def voting(preds, excluded):
  """Combine the per-fold predictions by majority vote.

  Args:
    preds: dict mapping a fold name to a dict with parallel 'tweet_id' and
      'preds' sequences and, optionally, an 'exact' sequence of true class ids.
    excluded: fold names that must not take part in the vote.

  Returns:
    (winners, exact). ``winners`` maps each tweet id to the most voted class
    as a string ("0", "1" or "2"); on a tie the lowest class id wins.
    ``exact`` maps each tweet id to its true class id and is only filled for
    folds that carry an 'exact' entry, otherwise it stays empty.
  """
  tallies = {}
  exact = {}
  for fold_name, fold in preds.items():
    if fold_name in excluded:
      continue
    gold = fold["exact"] if "exact" in fold else None
    for i, tweet_id in enumerate(fold["tweet_id"]):
      counts = tallies.setdefault(tweet_id, {"0": 0, "1": 0, "2": 0})
      predicted = fold["preds"][i]
      # Predictions outside 0/1/2 are ignored.
      for class_id in (0, 1, 2):
        if predicted == class_id:
          counts[str(class_id)] += 1
      if gold is not None:
        exact[tweet_id] = int(gold[i])

  winners = {}
  for tweet_id, counts in tallies.items():
    top = max(counts.values())
    # Keys are visited in "0", "1", "2" order, so ties go to the lowest id.
    winners[tweet_id] = next(key for key in counts if counts[key] == top)

  return winners, exact




In [None]:
#HERE DEFINE THE FOLDS THAT MUST NOT CONTRIBUTE TO VOTING
voting_res, exacts = voting(final_preds,[])

def write_output(file_name="test.tsv", test=False, results=None, gold=None, names=None):
  """Write the voted predictions to a tab-separated file.

  Each line is ``<tweet_id>\\t<class name>`` and, when ``test`` is true, a
  third column with the true class id of the tweet.

  Args:
    file_name: path of the file to (over)write.
    test: whether to append the true class id as a third column.
    results: mapping tweet id -> voted class id; defaults to the global
      ``voting_res``.
    gold: mapping tweet id -> true class id, only read when ``test`` is true;
      defaults to the global ``exacts``.
    names: sequence of class names indexed by class id; defaults to the
      global ``class_names``.
  """
  results = voting_res if results is None else results
  names = class_names if names is None else names
  if test and gold is None:
    gold = exacts

  # The with-block guarantees the file is closed even if a row fails.
  with open(file_name, "w") as f:
    for el, voted in results.items():
      test_s = "\t" + str(gold[el]) if test else ""
      f.write(f'{el}\t{names[int(voted)]}{test_s}\n')

IndentationError: ignored

In [None]:
write_output("sardistance2020_UNITOR_TaskA_1_UMBERTO_BASED.tsv",test=False)

In [None]:
#MEASURE SYSTEM ACCURACY AFTER VOTING
def accuracy_voting(voting, gold=None):
  """Print and return the fraction of voted classes that match the true ones.

  Args:
    voting: mapping tweet id -> voted class id (string or int). The values are
      converted to ``int`` in place.
    gold: mapping tweet id -> true class id; defaults to the global ``exacts``.

  Returns:
    The accuracy as a float; 0.0 when ``voting`` is empty.

  Raises:
    KeyError: if a voted tweet id has no entry in ``gold``.
  """
  if not voting:
    # Nothing to score: avoid dividing by zero.
    print(0.0)
    return 0.0
  if gold is None:
    gold = exacts

  exact = 0
  for e in voting:
    voting[e] = int(voting[e])
    if voting[e] == gold[e]:
      exact+=1

  accuracy = exact/len(voting)
  print(accuracy)
  return accuracy

In [None]:
import csv
from sklearn.metrics import  precision_recall_fscore_support


# Gold labels: comma-separated rows of (tweet_id, label). Rows whose second
# field is not one of the three stance labels (for example a header) are skipped.
y_true= {}
with open('LABEL.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in spamreader:
        if len(row) > 1 and row[1] in ['AGAINST','FAVOR','NONE']:
            y_true[row[0]]=row[1].lower()
print("TEST-GOLD: ",len(y_true))

# System predictions: same layout, but tab-separated.
y_pred={}
with open('alberto.tsv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    for row in spamreader:
        if len(row) > 1 and row[1] in ['AGAINST','FAVOR','NONE']:
            y_pred[row[0]]=row[1].lower()
print("TEST-PRED: ",len(y_pred))

l_true=[]
l_pred=[]
# Score only when every gold tweet has exactly one prediction; otherwise the
# aligned lists below cannot be built.
missing_ids = [tweet_id for tweet_id in y_true if tweet_id not in y_pred]

if len(y_pred) == len(y_true) and not missing_ids:
    for tweet_id in y_true.keys():
        l_true.append(y_true[tweet_id])
        l_pred.append(y_pred[tweet_id])

    prec, recall, f, support = precision_recall_fscore_support(l_true, l_pred, average=None)

    # With average=None the scores come back per class in sorted label order
    # (against, favor, none), so the first value printed is the mean F1 of the
    # AGAINST and FAVOR classes.
    print((f[0]+f[1])/2,prec, recall, f, support)
else:
    print(f"Cannot score: {len(y_true)} gold labels, {len(y_pred)} predictions, "
          f"{len(missing_ids)} gold tweets without a prediction")

TEST-GOLD:  1110
TEST-PRED:  1110
0.6272666888611189 [0.80739599 0.44621514 0.34285714] [0.70619946 0.57142857 0.41860465] [0.75341481 0.50111857 0.37696335] [742 196 172]
