In [1]:
!pip install transformers
!pip install datasets
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
# Cell for the download of the datasets
!wget https://zenodo.org/record/7550385/files/arguments-training.tsv
!wget https://zenodo.org/record/7550385/files/labels-training.tsv
!wget https://zenodo.org/record/7550385/files/arguments-validation.tsv
!wget https://zenodo.org/record/7550385/files/labels-validation.tsv
!wget https://zenodo.org/record/7550385/files/arguments-test.tsv
!wget https://zenodo.org/record/7550385/files/arguments-validation-zhihu.tsv
!wget https://zenodo.org/record/7550385/files/labels-validation-zhihu.tsv

--2023-02-07 11:14:00--  https://zenodo.org/record/7550385/files/arguments-training.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1012498 (989K) [application/octet-stream]
Saving to: ‘arguments-training.tsv’


2023-02-07 11:14:05 (321 KB/s) - ‘arguments-training.tsv’ saved [1012498/1012498]

--2023-02-07 11:14:05--  https://zenodo.org/record/7550385/files/labels-training.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 253843 (248K) [application/octet-stream]
Saving to: ‘labels-training.tsv’


2023-02-07 11:14:08 (324 KB/s) - ‘labels-training.tsv’ saved [253843/253843]

--2023-02-07 11:14:08--  https://zenodo.org/record/7550385/files/arguments-validation.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting

In [3]:
# imports for dataset loading
import numpy as np
import random
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# torch imports
import torch
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
from torchinfo import summary
from torch.optim import AdamW

#huggingface imports
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

# progress bar
from tqdm import tqdm
# garbage collector
import gc

# imports for evaluation
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score



In [4]:
def fix_random(seed: int) -> None:
  """Fix all the possible sources of randomness.

  Args:
    seed: the seed to use. 
  """
  np.random.seed(seed)
  random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

In [5]:
seed = 10
fix_random(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
def huggingface_from_pandas(pandas_df):
  hf_ds = Dataset.from_pandas(pandas_df, preserve_index=False)
  hf_ds = hf_ds.remove_columns(["Argument ID", "Argument ID2"])
  hf_ds = hf_ds.map(lambda x:{"labels": [int(x[col]) for col in hf_ds.column_names if
                                      col not in ['Conclusion', 'Stance', 'Premise']]})
  label_cols = [col for col in hf_ds.column_names if col not in ['Conclusion', 'Stance', 'Premise', "labels"]]
  hf_ds = hf_ds.remove_columns(label_cols)
  return hf_ds, label_cols

In [7]:
# Dataset loading and splitting
raw_training = pd.read_csv("arguments-training.tsv", encoding='utf-8', sep='\t', header=0)
raw_training_lab = pd.read_csv("labels-training.tsv", encoding='utf-8', sep='\t', header=0)
raw_test = pd.read_csv("arguments-validation.tsv", encoding='utf-8', sep='\t', header=0)
raw_test_lab = pd.read_csv("labels-validation.tsv", encoding='utf-8', sep='\t', header=0)
raw_test_chn=pd.read_csv("arguments-validation-zhihu.tsv", encoding='utf-8', sep='\t', header=0)
raw_test_chn_lab=pd.read_csv("labels-validation-zhihu.tsv", encoding='utf-8', sep='\t', header=0)

train = raw_training.join(raw_training_lab,how='inner' ,lsuffix='2') # joining labels
test = raw_test.join(raw_test_lab, how='inner', lsuffix='2') # joining labels
test_chn = raw_test_chn.join(raw_test_chn_lab, how='inner', lsuffix='2')
train, val = train_test_split(train ,train_size=.80, random_state=seed) # splitting training

train_ds, label_list = huggingface_from_pandas(train)
val_ds, _ = huggingface_from_pandas(val)
test_ds, _ = huggingface_from_pandas(test)
test_chn_ds, l = huggingface_from_pandas(test_chn)


print(train_ds[0])
print(label_list)
num_classes = len(label_list)
print(num_classes)
whole_dataset = DatasetDict()
whole_dataset["train"] = train_ds.with_format("torch")
whole_dataset["val"] = val_ds.with_format("torch")
whole_dataset["test"] = test_ds.with_format("torch")
whole_dataset["test_chn"] = test_chn_ds.with_format("torch")

  0%|          | 0/4314 [00:00<?, ?ex/s]

  0%|          | 0/1079 [00:00<?, ?ex/s]

  0%|          | 0/1896 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

{'Conclusion': 'We should ban the Church of Scientology', 'Stance': 'in favor of', 'Premise': "Scientology is not a true religion it is a sect or a cult which brainwashes it's followers and makes money from them.", 'labels': [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]}
['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']
20


In [8]:
def make_predictions(model, loader):
    Y_shuffled, Y_preds = [], []
    model.eval()
    for X, Y in loader:
      with torch.no_grad():
        preds = model(X)
      Y_preds.append(preds)
    gc.collect()
    Y_preds = torch.cat(Y_preds)
    Y_preds = Y_preds.sigmoid()
    return Y_preds.detach()

def keep_above_thresh(Y_preds, thr):
  Y_preds_thr = np.copy(Y_preds.numpy())
  max_rows = Y_preds_thr.shape[0]
  max_cols = Y_preds_thr.shape[1]
  for i in range(max_rows):
    new_row = np.array([1 if Y_preds_thr[i][j] > thr else 0 for j in range(max_cols)])
    Y_preds_thr[i] = new_row
  return Y_preds_thr

def compute_macro_score(M_true, M_pred, score_func):
    scores = []
    for i in range(M_true.shape[1]):
        true = M_true[:, i]
        pred = M_pred[:, i]
        if score_func == accuracy_score:
          scores.append(score_func(true, pred))
        else: 
          scores.append(score_func(true, pred, zero_division=0))
    return np.mean(scores), scores
  
def support(true, pred, zero_division):
  return sum(true)

def print_report(classifier, loader, y_true, threshold, labels=label_list):
  Y_preds = make_predictions(classifier, loader)
  Y_preds_thr = keep_above_thresh(Y_preds.to('cpu'), threshold)
  f1_macro, f1 = compute_macro_score(y_true, Y_preds_thr, f1_score)
  acc_macro, acc = compute_macro_score(y_true, Y_preds_thr, accuracy_score)
  prec_macro, prec = compute_macro_score(y_true, Y_preds_thr, precision_score)
  rec_macro, rec = compute_macro_score(y_true, Y_preds_thr, recall_score)
  _, sup = compute_macro_score(y_true, Y_preds_thr, support)
  print("----- MACRO AVG. -----")
  print(f"  F1-score:\t{round(f1_macro,4)}\n\
  Precision:\t{round(prec_macro,4)}\n\
  Recall:\t{round(rec_macro,4)}\n\
  Accuracy:\t{round(acc_macro,4)}")
  print("----- PER-CLASS VALUES -----")
  print("  \t\t\t\tF1-score\tPrecision\tRecall\t\tAccuracy\tSupport")
  for i in range(len(labels)):
    print("  " + labels[i]+" "*(len(max(labels, key=len))-len(labels[i])), end="\t")
    print(f"{round(f1[i],4)}\t\t{round(prec[i],4)}\t\t{round(rec[i],4)}\t\t{round(acc[i],4)}\t\t{sup[i]}")

In [9]:
# Pretrained GloVe setup

global_vectors = GloVe(name='6B', dim=100)

# the current choice is to give an id to each word
tokenizer = get_tokenizer("basic_english")

embeddings = global_vectors.get_vecs_by_tokens(tokenizer("Hello, How are you?"), lower_case_backup=True)

print(embeddings.shape)

.vector_cache/glove.6B.zip: 862MB [02:42, 5.30MB/s]                           
100%|█████████▉| 399999/400000 [00:15<00:00, 25394.56it/s]


torch.Size([6, 100])


In [10]:
max_words_emb = 35
embed_len = 100

# collate function where the Premises are tokenized and embedded in batches
def vectorize_batch(batch):
    X = [elem["Premise"] + " " + elem["Stance"] + " " +elem["Conclusion"] for elem in batch]
    Y = [elem["labels"] for elem in batch]
    X = [tokenizer(x) for x in X]
    X = [tokens+[""] * (max_words_emb-len(tokens))  if len(tokens)<max_words_emb else tokens[:max_words_emb] for tokens in X]
    X_tensor = torch.zeros(len(batch), max_words_emb, embed_len)
    Y_tensor = torch.zeros(len(batch), Y[0].shape[0])
    for i, tokens in enumerate(X):
        X_tensor[i] = global_vectors.get_vecs_by_tokens(tokens)
        Y_tensor[i] = Y[i]
    return X_tensor, Y_tensor

In [11]:
# Simple model to perform some tests with pytorch
class EmbeddingClassifier(nn.Module):
    def __init__(self):
        super(EmbeddingClassifier, self).__init__() 
       
        self.gru_layers = 1

        self.gru = nn.GRU(input_size = embed_len,
                          hidden_size = embed_len,
                          num_layers = self.gru_layers,
                          batch_first=True, 
                          bidirectional = True)
        self.flatten = nn.Flatten(start_dim=1)
        self.linear_1 = nn.Linear(max_words_emb*embed_len*self.gru_layers*2, 512)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(512,128)
        self.linear_3 = nn.Linear(128, num_classes)
        
                

    def forward(self, X_batch):
        h0 = torch.zeros(2*self.gru_layers,X_batch.shape[0], embed_len)
        h0 = h0.to(device)
        out, hn = self.gru(X_batch, h0)
        out = self.flatten(out)
        out = self.linear_1(out)
        out = self.relu(out)
        out = self.linear_2(out)
        out = self.relu(out)
        out = self.linear_3(out)
        return out

# Function needed to compute the validation loss and the accuracy
def CalcValLoss(model, loss_fn, val_loader):
    with torch.no_grad():
      Y_shuffled, Y_preds, losses = [],[],[]
      for X, Y in val_loader:
        preds = model(X)
        loss = loss_fn(preds, Y)
        losses.append(loss.item())
        Y_shuffled.append(Y)
        Y_preds.append(preds.argmax(dim=-1))

      Y_shuffled = torch.cat(Y_shuffled)
      Y_preds = torch.cat(Y_preds)

      loss = torch.tensor(losses).mean()
      print("Valid Loss : {:.3f}".format(loss))
    return loss


# Training function
def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs, early_stopping_info, model_name):
    patience_acc = 0
    precedent_loss = np.Inf
    model.train()
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):

            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss = CalcValLoss(model, loss_fn, val_loader)
        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        if precedent_loss - loss < early_stopping_info["delta"]:
           patience_acc = patience_acc + 1
        else:
          patience_acc = 0
          torch.save(model, model_name + "_best.pth")

        if patience_acc > early_stopping_info["patience"]:
          return torch.load(model_name + "_best.pth")
        precedent_loss = loss
            
    return model

In [12]:
epochs = 50
learning_rate = 1e-4
batch_size = 32

loss_fn = nn.BCEWithLogitsLoss()
embed_classifier = EmbeddingClassifier()
optimizer = Adam(embed_classifier.parameters(), lr=learning_rate)

# Construction of the Dataloaders for train and validation
train_loader = DataLoader(whole_dataset["train"], batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in vectorize_batch(x)))
val_loader  = DataLoader(whole_dataset["val"], batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in vectorize_batch(x)))
test_loader  = DataLoader(whole_dataset["test"], batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in vectorize_batch(x)))


embed_classifier.to(device)
summary(embed_classifier, 
                input_data=next(iter(train_loader))[0],
                device=device)


Layer (type:depth-idx)                   Output Shape              Param #
EmbeddingClassifier                      [32, 20]                  --
├─GRU: 1-1                               [32, 35, 200]             121,200
├─Flatten: 1-2                           [32, 7000]                --
├─Linear: 1-3                            [32, 512]                 3,584,512
├─ReLU: 1-4                              [32, 512]                 --
├─Linear: 1-5                            [32, 128]                 65,664
├─ReLU: 1-6                              [32, 128]                 --
├─Linear: 1-7                            [32, 20]                  2,580
Total params: 3,773,956
Trainable params: 3,773,956
Non-trainable params: 0
Total mult-adds (M): 252.63
Input size (MB): 0.45
Forward/backward pass size (MB): 1.96
Params size (MB): 15.10
Estimated Total Size (MB): 17.50

In [13]:
fix_random(seed)
embed_classifier = TrainModel(embed_classifier, loss_fn, optimizer, train_loader, val_loader, epochs, {"patience": 3, "delta": 1e-4}, "glove")

100%|██████████| 135/135 [00:02<00:00, 48.10it/s]


Valid Loss : 0.406
Train Loss : 0.445


100%|██████████| 135/135 [00:02<00:00, 47.61it/s]


Valid Loss : 0.399
Train Loss : 0.410


100%|██████████| 135/135 [00:03<00:00, 42.62it/s]


Valid Loss : 0.384
Train Loss : 0.397


100%|██████████| 135/135 [00:02<00:00, 56.85it/s]


Valid Loss : 0.370
Train Loss : 0.379


100%|██████████| 135/135 [00:02<00:00, 56.39it/s]


Valid Loss : 0.362
Train Loss : 0.365


100%|██████████| 135/135 [00:02<00:00, 57.10it/s]


Valid Loss : 0.356
Train Loss : 0.356


100%|██████████| 135/135 [00:03<00:00, 40.36it/s]


Valid Loss : 0.353
Train Loss : 0.349


100%|██████████| 135/135 [00:02<00:00, 57.12it/s]


Valid Loss : 0.350
Train Loss : 0.342


100%|██████████| 135/135 [00:02<00:00, 56.74it/s]


Valid Loss : 0.348
Train Loss : 0.335


100%|██████████| 135/135 [00:02<00:00, 57.12it/s]


Valid Loss : 0.346
Train Loss : 0.328


100%|██████████| 135/135 [00:03<00:00, 38.69it/s]


Valid Loss : 0.345
Train Loss : 0.322


100%|██████████| 135/135 [00:02<00:00, 57.01it/s]


Valid Loss : 0.345
Train Loss : 0.315


100%|██████████| 135/135 [00:02<00:00, 57.31it/s]


Valid Loss : 0.345
Train Loss : 0.309


100%|██████████| 135/135 [00:02<00:00, 56.73it/s]


Valid Loss : 0.345
Train Loss : 0.303


100%|██████████| 135/135 [00:02<00:00, 45.92it/s]


Valid Loss : 0.346
Train Loss : 0.296


100%|██████████| 135/135 [00:02<00:00, 51.91it/s]


Valid Loss : 0.347
Train Loss : 0.290


In [15]:
print_report(embed_classifier, val_loader, whole_dataset["val"]["labels"] ,0.25)
# batchsize 32, 1e-4, no conclusion/stance 0.3648 max 25
# batchsize 64, 1e-4, no conclusion/stance 0.3478
# batchsize 32, 1e-4, conclusion/stance 0.3505 max 35


----- MACRO AVG. -----
  F1-score:	0.3749
  Precision:	0.3606
  Recall:	0.4363
  Accuracy:	0.8086
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.5377		0.4385		0.6947		0.7896		190
  Self-direction: action    	0.5116		0.411		0.6775		0.6691		276
  Stimulation               	0.0		0.0		0.0		0.9583		43
  Hedonism                  	0.0		0.0		0.0		0.9666		33
  Achievement               	0.5828		0.5034		0.6918		0.7081		318
  Power: dominance          	0.3045		0.3394		0.2761		0.8434		134
  Power: resources          	0.4758		0.4538		0.5		0.8897		108
  Face                      	0.0513		0.1333		0.0317		0.9314		63
  Security: personal        	0.6386		0.5297		0.8037		0.6821		377
  Security: societal        	0.6533		0.5714		0.7625		0.7442		341
  Tradition                 	0.4147		0.3488		0.5114		0.8823		88
  Conformity: rules         	0.4541		0.3706		0.5863		0.6747		249
  Conformity: interpersonal 	0.0		0.0		0.0		0.9648		38
  Humility    

In [16]:
gc.collect()

0

In [17]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.to(device)
print("Bert loaded")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Bert loaded


In [38]:
max_words_bert = 70
# collate function that uses the tokenizer relative to the bert pretrained model
def bert_vectorize_batch(batch):
    X = [elem["Premise"] + " [SEP] " + elem["Stance"] + " [SEP] " + elem["Conclusion"] for elem in batch]
    Y = [elem["labels"] for elem in batch]
    X = bert_tokenizer(X, padding="max_length", truncation="longest_first", return_tensors = "pt", max_length = max_words_bert) 
    Y_tensor = torch.zeros(len(batch), Y[0].shape[0])
    for i, tokens in enumerate(Y):    
        Y_tensor[i] = Y[i]
    X_tensor = torch.stack([X["input_ids"], X["token_type_ids"], X["attention_mask"]])

    return X_tensor, Y_tensor

train_dataset = whole_dataset["train"]
val_dataset = whole_dataset["val"] 
test_dataset = whole_dataset["test"] 

In [39]:
# Simple model to perform some tests with pytorch
class BertLSTM(nn.Module):
    def __init__(self,bert_model):
        super(BertLSTM, self).__init__() 
        self.lstm_layers = 2
        self.lstm_hs = 128
        bert_hidden_size = bert_model.config.hidden_size

        self.bert_model = bert_model
        for param in self.bert_model.parameters():
            param.requires_grad = False

        self.lstm = nn.LSTM(input_size=bert_hidden_size,
                            hidden_size=self.lstm_hs,
                            num_layers=self.lstm_layers ,
                            batch_first=True,
                            bidirectional=True)
        self.reducer_c0 = nn.Linear(bert_hidden_size, self.lstm_hs)
        self.reducer_h0 = nn.Linear(bert_hidden_size, self.lstm_hs)
        self.linear_1 = nn.Linear(self.lstm_hs*2*self.lstm_layers, self.lstm_hs)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(self.lstm_hs, num_classes) 

    def forward(self, X_batch):
        out = self.bert_model(input_ids=X_batch[0], token_type_ids = X_batch[1], attention_mask = X_batch[2])
        cell = self.reducer_c0(out.pooler_output)
        hidden = self.reducer_h0(out.pooler_output)
        out = out.last_hidden_state[:,1:,:]
        c0 = torch.stack([cell,cell,cell,cell])
        h0 = torch.stack([hidden, hidden, hidden, hidden])
        out_lstm, hc_n  = self.lstm(out, (h0, c0))
        c_n = hc_n[1].permute(1, 0, 2)
        out = torch.cat([c_n[:,0,:], c_n[:,1,:]], 1)
        out2 = torch.cat([c_n[:,2,:], c_n[:,3,:]], 1)
        out = torch.cat([out, out2], 1)
        out = self.linear_1(out)
        out = self.relu(out)
        out = self.linear_2(out)
        return out

In [40]:
batch_size = 32
epochs = 50
learning_rate = 1e-3

loss_fn = nn.BCEWithLogitsLoss()
prebert_classifier = BertLSTM(bert_model)
optimizer = Adam(prebert_classifier.parameters(), lr=learning_rate)

bert_train_loader = DataLoader(whole_dataset["train"], batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_val_loader  = DataLoader(whole_dataset["val"], batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_test_loader  = DataLoader(whole_dataset["test"], batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))

prebert_classifier.to(device)
summary(prebert_classifier, 
                input_data=next(iter(bert_train_loader))[0],
                device=device)

Layer (type:depth-idx)                                  Output Shape              Param #
BertLSTM                                                [32, 20]                  --
├─BertModel: 1-1                                        [32, 768]                 --
│    └─BertEmbeddings: 2-1                              [32, 50, 768]             --
│    │    └─Embedding: 3-1                              [32, 50, 768]             (23,440,896)
│    │    └─Embedding: 3-2                              [32, 50, 768]             (1,536)
│    │    └─Embedding: 3-3                              [1, 50, 768]              (393,216)
│    │    └─LayerNorm: 3-4                              [32, 50, 768]             (1,536)
│    │    └─Dropout: 3-5                                [32, 50, 768]             --
│    └─BertEncoder: 2-2                                 [32, 50, 768]             --
│    │    └─ModuleList: 3-6                             --                        (85,054,464)
│    └─BertPooler: 2-3 

In [41]:
fix_random(seed)
prebert_classifier = TrainModel(prebert_classifier, loss_fn, optimizer, bert_train_loader, bert_val_loader, epochs, {"patience": 3, "delta": 1e-4}, "bertencoder")

100%|██████████| 135/135 [00:15<00:00,  8.57it/s]


Valid Loss : 0.347
Train Loss : 0.394


100%|██████████| 135/135 [00:15<00:00,  8.71it/s]


Valid Loss : 0.334
Train Loss : 0.335


100%|██████████| 135/135 [00:15<00:00,  8.85it/s]


Valid Loss : 0.332
Train Loss : 0.313


100%|██████████| 135/135 [00:15<00:00,  8.89it/s]


Valid Loss : 0.332
Train Loss : 0.295


100%|██████████| 135/135 [00:15<00:00,  8.75it/s]


Valid Loss : 0.341
Train Loss : 0.278


100%|██████████| 135/135 [00:15<00:00,  8.80it/s]


Valid Loss : 0.350
Train Loss : 0.258


100%|██████████| 135/135 [00:15<00:00,  8.64it/s]


Valid Loss : 0.362
Train Loss : 0.240


In [42]:
print_report(prebert_classifier, bert_val_loader, whole_dataset["val"]["labels"], 0.25)

----- MACRO AVG. -----
  F1-score:	0.4326
  Precision:	0.4227
  Recall:	0.4921
  Accuracy:	0.8195
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.6147		0.5447		0.7053		0.8443		190
  Self-direction: action    	0.5346		0.4273		0.7138		0.6821		276
  Stimulation               	0.1935		0.3158		0.1395		0.9537		43
  Hedonism                  	0.1		0.2857		0.0606		0.9666		33
  Achievement               	0.6167		0.501		0.8019		0.7062		318
  Power: dominance          	0.4186		0.3429		0.5373		0.8146		134
  Power: resources          	0.4437		0.358		0.5833		0.8536		108
  Face                      	0.1386		0.1842		0.1111		0.9194		63
  Security: personal        	0.6774		0.5683		0.8382		0.721		377
  Security: societal        	0.6843		0.5636		0.871		0.7461		341
  Tradition                 	0.4249		0.3905		0.4659		0.8971		88
  Conformity: rules         	0.5064		0.4662		0.5542		0.7507		249
  Conformity: interpersonal 	0.1455		0.2353		0.1053		0

In [None]:
prebert_classifier = None
gc.collect()

26

In [32]:
# Simple model to perform some tests with pytorch
class FineTunedBert(nn.Module):
    def __init__(self, bert_model):
        super(FineTunedBert, self).__init__() 
        self.bert_model = bert_model
        for param in self.bert_model.parameters():
            param.requires_grad = True
        bert_hidden_size = bert_model.config.hidden_size
        self.linear_1 = nn.Linear(bert_hidden_size, bert_hidden_size//2)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(bert_hidden_size//2, num_classes)

    def forward(self, X_batch):

        out = self.bert_model(input_ids=X_batch[0], 
                              token_type_ids = X_batch[1],
                              attention_mask = X_batch[2])

        out = out.last_hidden_state[:,0,:]
        out = self.linear_1(out)
        out = self.relu(out)
        out = self.linear_2(out)
        return out

# Training function
def finetune_bert(model, loss_fn, optimizer, train_loader, val_loader, epochs, early_stopping_info, model_name, scheduler):
    patience_acc = 0
    precedent_loss = np.Inf
    model.train()
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            model.zero_grad()
            Y_preds = model(X)
            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        loss = CalcValLoss(model, loss_fn, val_loader)
        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        if precedent_loss - loss < early_stopping_info["delta"]:
           patience_acc = patience_acc + 1
        else:
          patience_acc = 0
          precedent_loss = loss
          torch.save(model, model_name + "_best.pth")

        if patience_acc > early_stopping_info["patience"]:
          return torch.load(model_name + "best.pth")


    return model

In [33]:
bert_model_unfrozen = BertModel.from_pretrained('bert-base-uncased')
bert_model_unfrozen.to(device)
print("ricaricato")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ricaricato


In [34]:
batch_size = 16
epochs = 4
learning_rate = 5e-5

loss_fn = nn.BCEWithLogitsLoss()

finetune_classifier = FineTunedBert(bert_model_unfrozen)
optimizer = AdamW(finetune_classifier.parameters(), lr=learning_rate, eps=1e-8)


bert_train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_val_loader  = DataLoader(val_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_test_loader  = DataLoader(test_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=len(bert_train_loader)*epochs)

finetune_classifier.to(device)
summary(finetune_classifier,input_data=next(iter(bert_train_loader))[0], device=device, dtypes = [torch.int]*3)

Layer (type:depth-idx)                                  Output Shape              Param #
FineTunedBert                                           [16, 20]                  --
├─BertModel: 1-1                                        [16, 768]                 --
│    └─BertEmbeddings: 2-1                              [16, 70, 768]             --
│    │    └─Embedding: 3-1                              [16, 70, 768]             23,440,896
│    │    └─Embedding: 3-2                              [16, 70, 768]             1,536
│    │    └─Embedding: 3-3                              [1, 70, 768]              393,216
│    │    └─LayerNorm: 3-4                              [16, 70, 768]             1,536
│    │    └─Dropout: 3-5                                [16, 70, 768]             --
│    └─BertEncoder: 2-2                                 [16, 70, 768]             --
│    │    └─ModuleList: 3-6                             --                        85,054,464
│    └─BertPooler: 2-3           

In [35]:
fix_random(seed)
finetune_classifier = finetune_bert(finetune_classifier, 
                                   loss_fn, optimizer,
                                   bert_train_loader,
                                   bert_val_loader,
                                   epochs,
                                   {"patience": 3, "delta": 1e-4}, 
                                   "finebert", scheduler)
# 5e-5, 3 epochs, 32 batch size 0.436
# 3e-5, 3 epochs, 16 batch size 0.43
# 5e-5, 3 epochs, 16 batch size 0.4887
# 5e-5, 4 epochs, 16 batch size 0.5115


100%|██████████| 270/270 [01:02<00:00,  4.29it/s]


Valid Loss : 0.341
Train Loss : 0.400


100%|██████████| 270/270 [01:02<00:00,  4.32it/s]


Valid Loss : 0.319
Train Loss : 0.319


100%|██████████| 270/270 [01:02<00:00,  4.30it/s]


Valid Loss : 0.311
Train Loss : 0.272


100%|██████████| 270/270 [01:02<00:00,  4.33it/s]


Valid Loss : 0.309
Train Loss : 0.240


In [36]:
print("FINETUNED BERT:")
print_report(finetune_classifier, bert_val_loader, whole_dataset["val"]["labels"], 0.25)

FINETUNED BERT:
----- MACRO AVG. -----
  F1-score:	0.4932
  Precision:	0.4884
  Recall:	0.5342
  Accuracy:	0.8432
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.6227		0.5119		0.7947		0.8304		190
  Self-direction: action    	0.5959		0.5025		0.7319		0.7461		276
  Stimulation               	0.2034		0.375		0.1395		0.9564		43
  Hedonism                  	0.2979		0.5		0.2121		0.9694		33
  Achievement               	0.6467		0.5711		0.7453		0.76		318
  Power: dominance          	0.4154		0.4286		0.403		0.8591		134
  Power: resources          	0.5703		0.4932		0.6759		0.8981		108
  Face                      	0.2041		0.2857		0.1587		0.9277		63
  Security: personal        	0.7096		0.6181		0.8329		0.7618		377
  Security: societal        	0.7004		0.6043		0.8328		0.7748		341
  Tradition                 	0.5464		0.5		0.6023		0.9184		88
  Conformity: rules         	0.5414		0.5		0.5904		0.7692		249
  Conformity: interpersonal 	0.2807		0.4211	

In [24]:
chn_loader = DataLoader(whole_dataset["test_chn"], batch_size=32, collate_fn=lambda x:tuple(y.to(device) for y in vectorize_batch(x)))
bert_chn_loader = DataLoader(whole_dataset["test_chn"], batch_size=32, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))


In [29]:
print_report(embed_classifier, chn_loader, whole_dataset["test_chn"]["labels"], 0.25)

----- MACRO AVG. -----
  F1-score:	0.1647
  Precision:	0.2646
  Recall:	0.1446
  Accuracy:	0.8745
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.0		0.0		0.0		0.88		6
  Self-direction: action    	0.2667		0.5		0.1818		0.89		11
  Stimulation               	0.0		0.0		0.0		1.0		0
  Hedonism                  	0.0		0.0		0.0		0.98		2
  Achievement               	0.481		0.475		0.4872		0.59		39
  Power: dominance          	0.0		0.0		0.0		0.99		1
  Power: resources          	0.3529		0.4		0.3158		0.78		19
  Face                      	0.0		0.0		0.0		0.99		1
  Security: personal        	0.4762		0.4545		0.5		0.67		30
  Security: societal        	0.3462		0.4286		0.2903		0.66		31
  Tradition                 	0.0		0.0		0.0		1.0		0
  Conformity: rules         	0.125		1.0		0.0667		0.86		15
  Conformity: interpersonal 	0.0		0.0		0.0		0.99		1
  Humility                  	0.0		0.0		0.0		0.95		5
  Benevolence: caring       	0.1429		0.5		0.0833		0.

In [43]:
print_report(prebert_classifier, bert_chn_loader, whole_dataset["test_chn"]["labels"], 0.25)

----- MACRO AVG. -----
  F1-score:	0.2551
  Precision:	0.2188
  Recall:	0.3798
  Accuracy:	0.8035
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.5556		0.4167		0.8333		0.92		6
  Self-direction: action    	0.2979		0.1944		0.6364		0.67		11
  Stimulation               	0.0		0.0		0.0		1.0		0
  Hedonism                  	0.0		0.0		0.0		0.98		2
  Achievement               	0.6		0.4444		0.9231		0.52		39
  Power: dominance          	0.0		0.0		0.0		0.9		1
  Power: resources          	0.375		0.2459		0.7895		0.5		19
  Face                      	0.0		0.0		0.0		0.99		1
  Security: personal        	0.495		0.3521		0.8333		0.49		30
  Security: societal        	0.4719		0.3621		0.6774		0.53		31
  Tradition                 	0.0		0.0		0.0		1.0		0
  Conformity: rules         	0.3448		0.3571		0.3333		0.81		15
  Conformity: interpersonal 	0.0		0.0		0.0		0.99		1
  Humility                  	0.0		0.0		0.0		0.95		5
  Benevolence: caring       	0.4167

In [37]:
print_report(finetune_classifier, bert_chn_loader, whole_dataset["test_chn"]["labels"], 0.25)

----- MACRO AVG. -----
  F1-score:	0.2712
  Precision:	0.2211
  Recall:	0.3944
  Accuracy:	0.836
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.375		0.2308		1.0		0.8		6
  Self-direction: action    	0.36		0.2308		0.8182		0.68		11
  Stimulation               	0.0		0.0		0.0		0.98		0
  Hedonism                  	0.0		0.0		0.0		0.98		2
  Achievement               	0.6598		0.5517		0.8205		0.67		39
  Power: dominance          	0.0		0.0		0.0		0.99		1
  Power: resources          	0.4314		0.3438		0.5789		0.71		19
  Face                      	0.0		0.0		0.0		0.98		1
  Security: personal        	0.5412		0.4182		0.7667		0.61		30
  Security: societal        	0.4444		0.3902		0.5161		0.6		31
  Tradition                 	0.0		0.0		0.0		1.0		0
  Conformity: rules         	0.4324		0.3636		0.5333		0.79		15
  Conformity: interpersonal 	0.0		0.0		0.0		0.99		1
  Humility                  	0.0		0.0		0.0		0.95		5
  Benevolence: caring       	0.3158	