In [1]:
!pip install transformers
!pip install datasets
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
# Cell for the download of the datasets
!wget https://zenodo.org/record/7550385/files/arguments-training.tsv
!wget https://zenodo.org/record/7550385/files/labels-training.tsv
!wget https://zenodo.org/record/7550385/files/arguments-validation.tsv
!wget https://zenodo.org/record/7550385/files/labels-validation.tsv
!wget https://zenodo.org/record/7550385/files/arguments-test.tsv
!wget https://zenodo.org/record/7550385/files/arguments-validation-zhihu.tsv
!wget https://zenodo.org/record/7550385/files/labels-validation-zhihu.tsv

--2023-02-04 14:45:00--  https://zenodo.org/record/7550385/files/arguments-training.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1012498 (989K) [application/octet-stream]
Saving to: ‘arguments-training.tsv’


2023-02-04 14:45:04 (434 KB/s) - ‘arguments-training.tsv’ saved [1012498/1012498]

--2023-02-04 14:45:04--  https://zenodo.org/record/7550385/files/labels-training.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 253843 (248K) [application/octet-stream]
Saving to: ‘labels-training.tsv’


2023-02-04 14:45:07 (410 KB/s) - ‘labels-training.tsv’ saved [253843/253843]

--2023-02-04 14:45:07--  https://zenodo.org/record/7550385/files/arguments-validation.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting

In [3]:
# imports for dataset loading
import numpy as np
import random
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# torch imports
import torch
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

# progress bar
from tqdm import tqdm
# garbage collector
import gc

# imports for evaluation
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [4]:
def fix_random(seed: int) -> None:
  """Fix all the possible sources of randomness.

  Args:
    seed: the seed to use. 
  """
  np.random.seed(seed)
  random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

In [5]:
seed = 10
fix_random(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
def huggingface_from_pandas(pandas_df):
  hf_ds = Dataset.from_pandas(pandas_df, preserve_index=False)
  hf_ds = hf_ds.remove_columns(["Argument ID", "Argument ID2"])
  hf_ds = hf_ds.map(lambda x:{"labels": [int(x[col]) for col in hf_ds.column_names if
                                      col not in ['Conclusion', 'Stance', 'Premise']]})
  label_cols = [col for col in hf_ds.column_names if col not in ['Conclusion', 'Stance', 'Premise', "labels"]]
  hf_ds = hf_ds.remove_columns(label_cols)
  return hf_ds, label_cols

In [7]:
# Dataset loading and splitting
raw_training = pd.read_csv("arguments-training.tsv", encoding='utf-8', sep='\t', header=0)
raw_training_lab = pd.read_csv("labels-training.tsv", encoding='utf-8', sep='\t', header=0)
raw_test = pd.read_csv("arguments-validation.tsv", encoding='utf-8', sep='\t', header=0)
raw_test_lab = pd.read_csv("labels-validation.tsv", encoding='utf-8', sep='\t', header=0)

train = raw_training.join(raw_training_lab,how='inner' ,lsuffix='2') # joining labels
test = raw_test.join(raw_test_lab, how='inner', lsuffix='2') # joining labels
train, val = train_test_split(train ,train_size=.80, random_state=seed) # splitting training

train_ds, label_list = huggingface_from_pandas(train)
val_ds, _ = huggingface_from_pandas(val)
test_ds, _ = huggingface_from_pandas(test)

print(train_ds[0])
print(label_list)

whole_dataset = DatasetDict()
whole_dataset["train"] = train_ds.with_format("torch")
whole_dataset["val"] = val_ds.with_format("torch")
whole_dataset["test"] = test_ds.with_format("torch")

  0%|          | 0/4314 [00:00<?, ?ex/s]

  0%|          | 0/1079 [00:00<?, ?ex/s]

  0%|          | 0/1896 [00:00<?, ?ex/s]

{'Conclusion': 'We should ban the Church of Scientology', 'Stance': 'in favor of', 'Premise': "Scientology is not a true religion it is a sect or a cult which brainwashes it's followers and makes money from them.", 'labels': [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]}
['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']


In [8]:
print(whole_dataset.keys())
print(whole_dataset["train"])

dict_keys(['train', 'val', 'test'])
Dataset({
    features: ['Conclusion', 'Stance', 'Premise', 'labels'],
    num_rows: 4314
})


In [10]:
print(whole_dataset["train"]["labels"][0].shape)
print(whole_dataset["train"]["labels"])

torch.Size([20])
tensor([[1, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 1, 0],
        [0, 0, 1,  ..., 0, 0, 1]])


In [9]:
from os import supports_effective_ids
def make_predictions(model, loader):
    Y_shuffled, Y_preds = [], []
    model.eval()
    for X, Y in loader:
        preds = model(X)
        Y_preds.append(preds)
    gc.collect()
    Y_preds = torch.cat(Y_preds)
    Y_preds = Y_preds.sigmoid()
    return Y_preds.detach()

def keep_above_thresh(Y_preds, thr):
  Y_preds_thr = np.copy(Y_preds.numpy())
  max_rows = Y_preds_thr.shape[0]
  max_cols = Y_preds_thr.shape[1]
  for i in range(max_rows):
    new_row = np.array([1 if Y_preds_thr[i][j] > thr else 0 for j in range(max_cols)])
    Y_preds_thr[i] = new_row
  return Y_preds_thr

def compute_macro_score(M_true, M_pred, score_func):
    scores = []
    for i in range(M_true.shape[1]):
        true = M_true[:, i]
        pred = M_pred[:, i]
        if score_func == accuracy_score:
          scores.append(score_func(true, pred))
        else: 
          scores.append(score_func(true, pred, zero_division=0))
    return np.mean(scores), scores
  
def support(true, pred, zero_division):
  return sum(true)

def print_report(classifier, loader, y_true, threshold, labels=label_list):
  Y_preds = make_predictions(classifier, loader)
  Y_preds_thr = keep_above_thresh(Y_preds.to('cpu'), threshold)
  f1_macro, f1 = compute_macro_score(y_true, Y_preds_thr, f1_score)
  acc_macro, acc = compute_macro_score(y_true, Y_preds_thr, accuracy_score)
  prec_macro, prec = compute_macro_score(y_true, Y_preds_thr, precision_score)
  rec_macro, rec = compute_macro_score(y_true, Y_preds_thr, recall_score)
  _, sup = compute_macro_score(y_true, Y_preds_thr, support)
  print("----- MACRO AVG. -----")
  print(f"  F1-score:\t{round(f1_macro,4)}\n\
  Precision:\t{round(prec_macro,4)}\n\
  Recall:\t{round(rec_macro,4)}\n\
  Accuracy:\t{round(acc_macro,4)}")
  print("----- PER-CLASS VALUES -----")
  print("  \t\t\t\tF1-score\tPrecision\tRecall\t\tAccuracy\tSupport")
  for i in range(len(labels)):
    print("  " + labels[i]+" "*(len(max(labels, key=len))-len(labels[i])), end="\t")
    print(f"{round(f1[i],4)}\t\t{round(prec[i],4)}\t\t{round(rec[i],4)}\t\t{round(acc[i],4)}\t\t{sup[i]}")

In [11]:
from transformers import BertTokenizer, BertModel

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.to(device)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [12]:
lengths = whole_dataset["train"].map(
    lambda x: 
    {"tok" : bert_tokenizer(x["Conclusion"])}).map(lambda x:{"len": len(x["tok"]["token_type_ids"])})
print(lengths["len"])

  0%|          | 0/4314 [00:00<?, ?ex/s]

  0%|          | 0/4314 [00:00<?, ?ex/s]

tensor([ 9,  7,  9,  ...,  9, 11, 10])


In [13]:
print(np.quantile(lengths["len"], .9))
print(np.mean(lengths["len"].numpy()))

13.0
9.60477515067223


In [14]:
max_words = 70
batch_size = 32

# collate function that uses the tokenizer relative to the bert pretrained model
def bert_vectorize_batch(batch):
    X = [elem["Premise"] + " [SEP] " + elem["Stance"] + " [SEP] " + elem["Conclusion"] for elem in batch] 
    Y = [elem["labels"] for elem in batch]
    X = bert_tokenizer(X, padding="max_length", truncation="longest_first", return_tensors = "pt", max_length = max_words) 
    Y_tensor = torch.zeros(len(batch), Y[0].shape[0])
    for i, tokens in enumerate(Y):    
        Y_tensor[i] = Y[i]
    X_tensor = torch.stack([X["input_ids"], X["token_type_ids"], X["attention_mask"]])

    return X_tensor, Y_tensor

train_dataset = whole_dataset["train"]
val_dataset = whole_dataset["val"] 
test_dataset = whole_dataset["test"] 

# Construction of the Dataloaders for train and validation
bert_train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_val_loader  = DataLoader(val_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_test_loader  = DataLoader(test_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))

In [15]:
str_to_test = "Top scorer"
str_to_test_2 = "Stand still"


bert_model.to(device)

normalized_batch = bert_tokenizer([str_to_test, str_to_test_2], 
                                  padding="max_length", 
                                  max_length=max_words, 
                                  truncation="longest_first", 
                                  return_tensors = "pt")
tensor_batch = torch.tensor(np.array([normalized_batch["input_ids"].numpy(), 
                                      normalized_batch["token_type_ids"].numpy(), 
                                      normalized_batch["attention_mask"].numpy()
                                      ]))

# token of the sequence 
print(bert_model(input_ids = tensor_batch[0].to(device), token_type_ids =  tensor_batch[1].to(device), attention_mask = tensor_batch[2].to(device)).pooler_output.shape)
print(bert_model(input_ids = tensor_batch[0].to(device), token_type_ids =  tensor_batch[1].to(device), attention_mask = tensor_batch[2].to(device)).pooler_output)

torch.Size([2, 768])
tensor([[-0.9275, -0.3706, -0.6260,  ..., -0.5522, -0.5421,  0.8422],
        [-0.9419, -0.2029, -0.7530,  ..., -0.4699, -0.5669,  0.8745]],
       device='cuda:0', grad_fn=<TanhBackward0>)


In [18]:
num_classes = 20

# Simple model to perform some tests with pytorch
class BertLSTM(nn.Module):
    def __init__(self):
        super(BertLSTM, self).__init__() 
        self.bert_model = bert_model
        for param in self.bert_model.parameters():
            param.requires_grad = False

        self.lstm_layers = 2
        self.lstm_hs = 128
        self.lstm = nn.LSTM(input_size=768,
                            hidden_size=self.lstm_hs,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)
        
        self.self_attention = nn.MultiheadAttention(200, 1)
        self.conv1d = nn.Conv1d(200, 200, 5)
        self.max_pool = nn.MaxPool1d(2)
        self.linear_1 = nn.Linear(512, 128)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(128, num_classes)
        self.reducer = nn.Linear(768, self.lstm_hs)
        
                

    def forward(self, X_batch):
        out = self.bert_model(input_ids=X_batch[0], token_type_ids = X_batch[1], attention_mask = X_batch[2])
        cell = self.reducer(out.pooler_output)
        out = out.last_hidden_state
        h0 = torch.zeros(self.lstm_layers * 2, X_batch.shape[1], 128)
        h0 = h0.to(device)
        c0 = torch.stack([cell,cell,cell,cell])
        out_lstm, hc_n  = self.lstm(out, (h0, c0))
        c_n = hc_n[1].permute(1, 0, 2)
        out = torch.cat([c_n[:,0,:], c_n[:,1,:]], 1)
        out2 = torch.cat([c_n[:,2,:], c_n[:,3,:]], 1)
        out = torch.cat([out, out2], 1)
        out = self.linear_1(out)
        out = self.relu(out)
        out = self.linear_2(out)
        return out

# Function needed to compute the validation loss and the accuracy
def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
      Y_shuffled, Y_preds, losses = [],[],[]
      for X, Y in val_loader:
        preds = model(X)
        loss = loss_fn(preds, Y)
        losses.append(loss.item())
        Y_shuffled.append(Y)
        Y_preds.append(preds.argmax(dim=-1))

      Y_shuffled = torch.cat(Y_shuffled)
      Y_preds = torch.cat(Y_preds)

      loss = torch.tensor(losses).mean()
      print("Valid Loss : {:.3f}".format(loss))
    return loss

# Training function
def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs, early_stopping_info):
    patience_acc = 0
    precedent_loss = np.Inf
    model.train()
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss = CalcValLossAndAccuracy(model, loss_fn, val_loader)
        if precedent_loss - loss < early_stopping_info["delta"]:
           patience_acc = patience_acc + 1
        else:
          patience_acc = 0
          precedent_loss = loss
          torch.save(model, "best.pth")

        if patience_acc > early_stopping_info["patience"]:
          return torch.load("best.pth")

        if i%1==0:
            print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
    return model

In [19]:
from torchinfo import summary
epochs = 50
learning_rate = 1e-4

loss_fn = nn.BCEWithLogitsLoss()
prebert_classifier = BertLSTM()
optimizer = Adam(prebert_classifier.parameters(), lr=learning_rate)

prebert_classifier.to(device)
summary(prebert_classifier, input_size=(3, 1, max_words), 
        device="cuda", dtypes = [torch.int]*3)

Layer (type:depth-idx)                                  Output Shape              Param #
BertLSTM                                                [1, 20]                   361,000
├─BertModel: 1-1                                        [1, 768]                  --
│    └─BertEmbeddings: 2-1                              [1, 70, 768]              --
│    │    └─Embedding: 3-1                              [1, 70, 768]              (23,440,896)
│    │    └─Embedding: 3-2                              [1, 70, 768]              (1,536)
│    │    └─Embedding: 3-3                              [1, 70, 768]              (393,216)
│    │    └─LayerNorm: 3-4                              [1, 70, 768]              (1,536)
│    │    └─Dropout: 3-5                                [1, 70, 768]              --
│    └─BertEncoder: 2-2                                 [1, 70, 768]              --
│    │    └─ModuleList: 3-6                             --                        (85,054,464)
│    └─BertPooler:

In [20]:
fix_random(seed)
prebert_classifier = TrainModel(prebert_classifier, loss_fn, optimizer, bert_train_loader, bert_val_loader, epochs, {"patience": 3, "delta": 1e-4})

100%|██████████| 135/135 [00:23<00:00,  5.72it/s]


Valid Loss : 0.401
Train Loss : 0.462


100%|██████████| 135/135 [00:23<00:00,  5.81it/s]


Valid Loss : 0.365
Train Loss : 0.390


100%|██████████| 135/135 [00:23<00:00,  5.74it/s]


Valid Loss : 0.345
Train Loss : 0.355


100%|██████████| 135/135 [00:22<00:00,  5.91it/s]


Valid Loss : 0.336
Train Loss : 0.338


100%|██████████| 135/135 [00:23<00:00,  5.86it/s]


Valid Loss : 0.332
Train Loss : 0.326


100%|██████████| 135/135 [00:22<00:00,  5.93it/s]


Valid Loss : 0.329
Train Loss : 0.317


100%|██████████| 135/135 [00:23<00:00,  5.86it/s]


Valid Loss : 0.327
Train Loss : 0.310


100%|██████████| 135/135 [00:22<00:00,  5.88it/s]


Valid Loss : 0.325
Train Loss : 0.304


100%|██████████| 135/135 [00:23<00:00,  5.84it/s]


Valid Loss : 0.325
Train Loss : 0.298


100%|██████████| 135/135 [00:22<00:00,  5.97it/s]


Valid Loss : 0.326
Train Loss : 0.293


100%|██████████| 135/135 [00:22<00:00,  5.96it/s]


Valid Loss : 0.325
Train Loss : 0.288


100%|██████████| 135/135 [00:23<00:00,  5.84it/s]


Valid Loss : 0.327
Train Loss : 0.282


100%|██████████| 135/135 [00:23<00:00,  5.69it/s]


Valid Loss : 0.328


In [21]:
print_report(prebert_classifier, bert_val_loader, val_dataset["labels"], 0.25)

----- MACRO AVG. -----
  F1-score:	0.4657
  Precision:	0.4423
  Recall:	0.5206
  Accuracy:	0.8265
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.6435		0.5744		0.7316		0.8573		190
  Self-direction: action    	0.5468		0.4535		0.6884		0.7081		276
  Stimulation               	0.2424		0.3478		0.186		0.9537		43
  Hedonism                  	0.1852		0.2381		0.1515		0.9592		33
  Achievement               	0.6239		0.5271		0.7642		0.7285		318
  Power: dominance          	0.4015		0.3929		0.4104		0.848		134
  Power: resources          	0.4779		0.4576		0.5		0.8906		108
  Face                      	0.1565		0.1731		0.1429		0.9101		63
  Security: personal        	0.6688		0.5529		0.8462		0.7071		377
  Security: societal        	0.703		0.6081		0.8328		0.7776		341
  Tradition                 	0.5		0.4537		0.5568		0.9092		88
  Conformity: rules         	0.5008		0.4119		0.6386		0.7062		249
  Conformity: interpersonal 	0.3793		0.55		0.2895		0.966

In [22]:
prebert_classifier = None
gc.collect()

0