In [1]:
!pip install transformers
!pip install datasets
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, http

In [2]:
# Cell for the download of the datasets
!wget https://zenodo.org/record/7550385/files/arguments-training.tsv
!wget https://zenodo.org/record/7550385/files/labels-training.tsv
!wget https://zenodo.org/record/7550385/files/arguments-validation.tsv
!wget https://zenodo.org/record/7550385/files/labels-validation.tsv
!wget https://zenodo.org/record/7550385/files/arguments-test.tsv
!wget https://zenodo.org/record/7550385/files/arguments-validation-zhihu.tsv
!wget https://zenodo.org/record/7550385/files/labels-validation-zhihu.tsv

--2023-02-04 14:16:57--  https://zenodo.org/record/7550385/files/arguments-training.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1012498 (989K) [application/octet-stream]
Saving to: ‘arguments-training.tsv’


2023-02-04 14:17:02 (346 KB/s) - ‘arguments-training.tsv’ saved [1012498/1012498]

--2023-02-04 14:17:02--  https://zenodo.org/record/7550385/files/labels-training.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 253843 (248K) [application/octet-stream]
Saving to: ‘labels-training.tsv’


2023-02-04 14:17:05 (320 KB/s) - ‘labels-training.tsv’ saved [253843/253843]

--2023-02-04 14:17:05--  https://zenodo.org/record/7550385/files/arguments-validation.tsv
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting

In [3]:
# imports for dataset loading
import numpy as np
import random
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# torch imports
import torch
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

# progress bar
from tqdm import tqdm
# garbage collector
import gc

# imports for evaluation
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [4]:
def fix_random(seed: int) -> None:
  """Fix all the possible sources of randomness.

  Args:
    seed: the seed to use. 
  """
  np.random.seed(seed)
  random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

In [5]:
seed = 10
fix_random(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
def huggingface_from_pandas(pandas_df):
  hf_ds = Dataset.from_pandas(pandas_df, preserve_index=False)
  hf_ds = hf_ds.remove_columns(["Argument ID", "Argument ID2"])
  hf_ds = hf_ds.map(lambda x:{"labels": [int(x[col]) for col in hf_ds.column_names if
                                      col not in ['Conclusion', 'Stance', 'Premise']]})
  label_cols = [col for col in hf_ds.column_names if col not in ['Conclusion', 'Stance', 'Premise', "labels"]]
  hf_ds = hf_ds.remove_columns(label_cols)
  return hf_ds, label_cols

In [7]:
# Dataset loading and splitting
raw_training = pd.read_csv("arguments-training.tsv", encoding='utf-8', sep='\t', header=0)
raw_training_lab = pd.read_csv("labels-training.tsv", encoding='utf-8', sep='\t', header=0)
raw_test = pd.read_csv("arguments-validation.tsv", encoding='utf-8', sep='\t', header=0)
raw_test_lab = pd.read_csv("labels-validation.tsv", encoding='utf-8', sep='\t', header=0)

train = raw_training.join(raw_training_lab,how='inner' ,lsuffix='2') # joining labels
test = raw_test.join(raw_test_lab, how='inner', lsuffix='2') # joining labels
train, val = train_test_split(train ,train_size=.80, random_state=seed) # splitting training

train_ds, label_list = huggingface_from_pandas(train)
val_ds, _ = huggingface_from_pandas(val)
test_ds, _ = huggingface_from_pandas(test)

print(train_ds[0])
print(label_list)

whole_dataset = DatasetDict()
whole_dataset["train"] = train_ds.with_format("torch")
whole_dataset["val"] = val_ds.with_format("torch")
whole_dataset["test"] = test_ds.with_format("torch")

  0%|          | 0/4314 [00:00<?, ?ex/s]

  0%|          | 0/1079 [00:00<?, ?ex/s]

  0%|          | 0/1896 [00:00<?, ?ex/s]

{'Conclusion': 'We should ban the Church of Scientology', 'Stance': 'in favor of', 'Premise': "Scientology is not a true religion it is a sect or a cult which brainwashes it's followers and makes money from them.", 'labels': [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]}
['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']


In [8]:
print(whole_dataset.keys())
print(whole_dataset["train"])

dict_keys(['train', 'val', 'test'])
Dataset({
    features: ['Conclusion', 'Stance', 'Premise', 'labels'],
    num_rows: 4314
})


In [None]:
print(whole_dataset["train"]["labels"][0].shape)
print(whole_dataset["train"]["labels"])

In [9]:
# Pretrained GloVe setup

global_vectors = GloVe(name='6B', dim=100)

# the current choice is to give an id to each word
tokenizer = get_tokenizer("basic_english")

embeddings = global_vectors.get_vecs_by_tokens(tokenizer("Hello, How are you?"), lower_case_backup=True)

print(embeddings.shape)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.37MB/s]                           
100%|█████████▉| 399999/400000 [00:18<00:00, 21146.69it/s]


torch.Size([6, 100])


In [10]:
max_words = 25
embed_len = 100
batch_size = 32

# collate function where the Premises are tokenized and embedded in batches
def vectorize_batch(batch):
    X = [elem["Premise"] for elem in batch]
    Y = [elem["labels"] for elem in batch]
    X = [tokenizer(x) for x in X]
    X = [tokens+[""] * (max_words-len(tokens))  if len(tokens)<max_words else tokens[:max_words] for tokens in X]
    X_tensor = torch.zeros(len(batch), max_words, embed_len)
    Y_tensor = torch.zeros(len(batch), Y[0].shape[0])
    for i, tokens in enumerate(X):
        X_tensor[i] = global_vectors.get_vecs_by_tokens(tokens)
        Y_tensor[i] = Y[i]
    return X_tensor, Y_tensor

train_dataset = whole_dataset["train"].remove_columns(["Stance", "Conclusion"])
val_dataset = whole_dataset["val"].remove_columns(["Stance", "Conclusion"])
test_dataset = whole_dataset["test"].remove_columns(["Stance", "Conclusion"])
print(val_dataset.shape)

# Construction of the Dataloaders for train and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in vectorize_batch(x)))
val_loader  = DataLoader(val_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in vectorize_batch(x)))
test_loader  = DataLoader(test_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in vectorize_batch(x)))

(1079, 2)


In [13]:
num_classes = 20

# Simple model to perform some tests with pytorch
class EmbeddingClassifier(nn.Module):
    def __init__(self):
        super(EmbeddingClassifier, self).__init__() 
        # Not sure about this
        #self.seq_length = batch_size
        self.input_dim = (batch_size, max_words,embed_len)
        self.num_layers = 1

        self.gru = nn.GRU(input_size = embed_len,
                          hidden_size = embed_len,
                          num_layers = 1,
                          batch_first=True, 
                          bidirectional = True)
        self.flatten = nn.Flatten(start_dim=1)
        self.linear_1 = nn.Linear(max_words*embed_len*2, 512)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(512,256)
        self.linear_3 = nn.Linear(256,128)
        self.linear_4 = nn.Linear(128, 64)
        self.linear_5 = nn.Linear(64, num_classes)
        
                

    def forward(self, X_batch):
        h0 = torch.zeros(2,X_batch.shape[0], embed_len)
        h0 = h0.to(device)
        out, hn = self.gru(X_batch, h0)
        out = self.flatten(out)
        out = self.linear_1(out)
        out = self.relu(out)
        out = self.linear_2(out)
        out = self.relu(out)
        out = self.linear_3(out)
        out = self.relu(out)
        out = self.linear_4(out)
        out = self.relu(out)
        out = self.linear_5(out)
        return out

# Function needed to compute the validation loss and the accuracy
def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
      Y_shuffled, Y_preds, losses = [],[],[]
      for X, Y in val_loader:
        preds = model(X)
        loss = loss_fn(preds, Y)
        losses.append(loss.item())
        Y_shuffled.append(Y)
        Y_preds.append(preds.argmax(dim=-1))

      Y_shuffled = torch.cat(Y_shuffled)
      Y_preds = torch.cat(Y_preds)

      loss = torch.tensor(losses).mean()
      print("Valid Loss : {:.3f}".format(loss))
    return loss
    # print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().cpu().numpy(), Y_preds.detach().cpu().numpy())))

# Training function
def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs, early_stopping_info):
    patience_acc = 0
    precedent_loss = np.Inf
    model.train()
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss = CalcValLossAndAccuracy(model, loss_fn, val_loader)
        if precedent_loss - loss < early_stopping_info["delta"]:
           patience_acc = patience_acc + 1
        else:
          patience_acc = 0
          torch.save(model, "best.pth")

        if patience_acc > early_stopping_info["patience"]:
          return torch.load("best.pth")
        precedent_loss = loss

        if i%1==0:
            print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
    return model

In [14]:
from torchinfo import summary
epochs = 50
learning_rate = 1e-4

loss_fn = nn.BCEWithLogitsLoss()
embed_classifier = EmbeddingClassifier()
optimizer = Adam(embed_classifier.parameters(), lr=learning_rate)

embed_classifier.to(device)
summary(embed_classifier, 
                input_size=(1,max_words, embed_len), device="cuda")
#TrainModel(embed_classifier, loss_fn, optimizer, train_loader, val_loader, epochs)

Layer (type:depth-idx)                   Output Shape              Param #
EmbeddingClassifier                      [1, 20]                   --
├─GRU: 1-1                               [1, 25, 200]              121,200
├─Flatten: 1-2                           [1, 5000]                 --
├─Linear: 1-3                            [1, 512]                  2,560,512
├─ReLU: 1-4                              [1, 512]                  --
├─Linear: 1-5                            [1, 256]                  131,328
├─ReLU: 1-6                              [1, 256]                  --
├─Linear: 1-7                            [1, 128]                  32,896
├─ReLU: 1-8                              [1, 128]                  --
├─Linear: 1-9                            [1, 64]                   8,256
├─ReLU: 1-10                             [1, 64]                   --
├─Linear: 1-11                           [1, 20]                   1,300
Total params: 2,855,492
Trainable params: 2,855,492
Non-tr

In [15]:
fix_random(seed)
embed_classifier = TrainModel(embed_classifier, loss_fn, optimizer, train_loader, val_loader, epochs, {"patience": 3, "delta": 1e-4})

100%|██████████| 135/135 [00:02<00:00, 52.07it/s]


Valid Loss : 0.411
Train Loss : 0.492


100%|██████████| 135/135 [00:01<00:00, 70.43it/s]


Valid Loss : 0.409
Train Loss : 0.418


100%|██████████| 135/135 [00:01<00:00, 71.13it/s]


Valid Loss : 0.408
Train Loss : 0.416


100%|██████████| 135/135 [00:01<00:00, 71.55it/s]


Valid Loss : 0.406
Train Loss : 0.414


100%|██████████| 135/135 [00:01<00:00, 70.05it/s]


Valid Loss : 0.404
Train Loss : 0.412


100%|██████████| 135/135 [00:01<00:00, 70.58it/s]


Valid Loss : 0.396
Train Loss : 0.407


100%|██████████| 135/135 [00:01<00:00, 70.92it/s]


Valid Loss : 0.387
Train Loss : 0.396


100%|██████████| 135/135 [00:01<00:00, 69.24it/s]


Valid Loss : 0.379
Train Loss : 0.386


100%|██████████| 135/135 [00:02<00:00, 54.94it/s]


Valid Loss : 0.375
Train Loss : 0.379


100%|██████████| 135/135 [00:01<00:00, 70.32it/s]


Valid Loss : 0.371
Train Loss : 0.375


100%|██████████| 135/135 [00:01<00:00, 71.35it/s]


Valid Loss : 0.368
Train Loss : 0.371


100%|██████████| 135/135 [00:01<00:00, 70.12it/s]


Valid Loss : 0.365
Train Loss : 0.366


100%|██████████| 135/135 [00:01<00:00, 71.83it/s]


Valid Loss : 0.363
Train Loss : 0.362


100%|██████████| 135/135 [00:01<00:00, 68.97it/s]


Valid Loss : 0.360
Train Loss : 0.357


100%|██████████| 135/135 [00:01<00:00, 70.21it/s]


Valid Loss : 0.357
Train Loss : 0.353


100%|██████████| 135/135 [00:01<00:00, 69.98it/s]


Valid Loss : 0.356
Train Loss : 0.349


100%|██████████| 135/135 [00:01<00:00, 71.75it/s]


Valid Loss : 0.355
Train Loss : 0.345


100%|██████████| 135/135 [00:01<00:00, 71.12it/s]


Valid Loss : 0.354
Train Loss : 0.342


100%|██████████| 135/135 [00:01<00:00, 69.91it/s]


Valid Loss : 0.353
Train Loss : 0.339


100%|██████████| 135/135 [00:01<00:00, 70.98it/s]


Valid Loss : 0.353
Train Loss : 0.335


100%|██████████| 135/135 [00:01<00:00, 69.75it/s]


Valid Loss : 0.353
Train Loss : 0.332


100%|██████████| 135/135 [00:01<00:00, 69.37it/s]


Valid Loss : 0.353
Train Loss : 0.329


100%|██████████| 135/135 [00:01<00:00, 71.06it/s]


Valid Loss : 0.353
Train Loss : 0.325


100%|██████████| 135/135 [00:01<00:00, 71.52it/s]


Valid Loss : 0.353
Train Loss : 0.322


100%|██████████| 135/135 [00:01<00:00, 71.22it/s]


Valid Loss : 0.354
Train Loss : 0.318


100%|██████████| 135/135 [00:01<00:00, 72.00it/s]


Valid Loss : 0.355


In [16]:
from os import supports_effective_ids
def make_predictions(model, loader):
    Y_shuffled, Y_preds = [], []
    model.eval()
    for X, Y in loader:
        preds = model(X)
        Y_preds.append(preds)
    gc.collect()
    Y_preds = torch.cat(Y_preds)
    Y_preds = Y_preds.sigmoid()
    return Y_preds.detach()

def keep_above_thresh(Y_preds, thr):
  Y_preds_thr = np.copy(Y_preds.numpy())
  max_rows = Y_preds_thr.shape[0]
  max_cols = Y_preds_thr.shape[1]
  for i in range(max_rows):
    new_row = np.array([1 if Y_preds_thr[i][j] > thr else 0 for j in range(max_cols)])
    Y_preds_thr[i] = new_row
  return Y_preds_thr

def compute_macro_score(M_true, M_pred, score_func):
    scores = []
    for i in range(M_true.shape[1]):
        true = M_true[:, i]
        pred = M_pred[:, i]
        if score_func == accuracy_score:
          scores.append(score_func(true, pred))
        else: 
          scores.append(score_func(true, pred, zero_division=0))
    return np.mean(scores), scores
  
def support(true, pred, zero_division):
  return sum(true)

def print_report(classifier, loader, y_true, threshold, labels=label_list):
  Y_preds = make_predictions(classifier, loader)
  Y_preds_thr = keep_above_thresh(Y_preds.to('cpu'), threshold)
  f1_macro, f1 = compute_macro_score(y_true, Y_preds_thr, f1_score)
  acc_macro, acc = compute_macro_score(y_true, Y_preds_thr, accuracy_score)
  prec_macro, prec = compute_macro_score(y_true, Y_preds_thr, precision_score)
  rec_macro, rec = compute_macro_score(y_true, Y_preds_thr, recall_score)
  _, sup = compute_macro_score(y_true, Y_preds_thr, support)
  print("----- MACRO AVG. -----")
  print(f"  F1-score:\t{round(f1_macro,4)}\n\
  Precision:\t{round(prec_macro,4)}\n\
  Recall:\t{round(rec_macro,4)}\n\
  Accuracy:\t{round(acc_macro,4)}")
  print("----- PER-CLASS VALUES -----")
  print("  \t\t\t\tF1-score\tPrecision\tRecall\t\tAccuracy\tSupport")
  for i in range(len(labels)):
    print("  " + labels[i]+" "*(len(max(labels, key=len))-len(labels[i])), end="\t")
    print(f"{round(f1[i],4)}\t\t{round(prec[i],4)}\t\t{round(rec[i],4)}\t\t{round(acc[i],4)}\t\t{sup[i]}")

In [17]:
print_report(embed_classifier, val_loader,val_dataset["labels"] ,0.3)

----- MACRO AVG. -----
  F1-score:	0.3411
  Precision:	0.4182
  Recall:	0.3587
  Accuracy:	0.8254
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.5147		0.4817		0.5526		0.8165		190
  Self-direction: action    	0.5		0.4684		0.5362		0.7257		276
  Stimulation               	0.0		0.0		0.0		0.9601		43
  Hedonism                  	0.0		0.0		0.0		0.9685		33
  Achievement               	0.5667		0.5415		0.5943		0.7322		318
  Power: dominance          	0.2648		0.3412		0.2164		0.8508		134
  Power: resources          	0.395		0.3615		0.4352		0.8665		108
  Face                      	0.0312		1.0		0.0159		0.9425		63
  Security: personal        	0.6437		0.5828		0.7188		0.722		377
  Security: societal        	0.6216		0.5219		0.7683		0.7044		341
  Tradition                 	0.3878		0.3519		0.4318		0.8888		88
  Conformity: rules         	0.4433		0.383		0.5261		0.6951		249
  Conformity: interpersonal 	0.0		0.0		0.0		0.9648		38
  Humility         

In [18]:
embed_classifier = None
gc.collect()

0

In [19]:
from transformers import BertTokenizer, BertModel

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.to(device)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [20]:
lengths = whole_dataset["train"].map(
    lambda x: 
    {"tok" : bert_tokenizer(x["Conclusion"])}).map(lambda x:{"len": len(x["tok"]["token_type_ids"])})
print(lengths["len"])

  0%|          | 0/4314 [00:00<?, ?ex/s]

  0%|          | 0/4314 [00:00<?, ?ex/s]

tensor([ 9,  7,  9,  ...,  9, 11, 10])


In [21]:
print(np.quantile(lengths["len"], .9))
print(np.mean(lengths["len"].numpy()))

13.0
9.60477515067223


In [23]:
max_words = 70
batch_size = 32

# collate function that uses the tokenizer relative to the bert pretrained model
def bert_vectorize_batch(batch):
    X = [elem["Premise"] + " [SEP] " + elem["Stance"] + " [SEP] " + elem["Conclusion"] for elem in batch] 
    Y = [elem["labels"] for elem in batch]
    X = bert_tokenizer(X, padding="max_length", truncation="longest_first", return_tensors = "pt", max_length = max_words) 
    Y_tensor = torch.zeros(len(batch), Y[0].shape[0])
    for i, tokens in enumerate(Y):    
        Y_tensor[i] = Y[i]
    X_tensor = torch.stack([X["input_ids"], X["token_type_ids"], X["attention_mask"]])

    return X_tensor, Y_tensor

train_dataset = whole_dataset["train"]
val_dataset = whole_dataset["val"] 
test_dataset = whole_dataset["test"] 

# Construction of the Dataloaders for train and validation
bert_train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_val_loader  = DataLoader(val_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))
bert_test_loader  = DataLoader(test_dataset, batch_size=batch_size, collate_fn=lambda x:tuple(y.to(device) for y in bert_vectorize_batch(x)))

In [25]:
str_to_test = "Top scorer"
str_to_test_2 = "Stand still"


bert_model.to(device)

normalized_batch = bert_tokenizer([str_to_test, str_to_test_2], 
                                  padding="max_length", 
                                  max_length=max_words, 
                                  truncation="longest_first", 
                                  return_tensors = "pt")
tensor_batch = torch.tensor(np.array([normalized_batch["input_ids"].numpy(), 
                                      normalized_batch["token_type_ids"].numpy(), 
                                      normalized_batch["attention_mask"].numpy()
                                      ]))

# token of the sequence 
print(bert_model(input_ids = tensor_batch[0].to(device), token_type_ids =  tensor_batch[1].to(device), attention_mask = tensor_batch[2].to(device)).pooler_output.shape)
print(bert_model(input_ids = tensor_batch[0].to(device), token_type_ids =  tensor_batch[1].to(device), attention_mask = tensor_batch[2].to(device)).pooler_output)

torch.Size([2, 768])
tensor([[-0.9275, -0.3706, -0.6260,  ..., -0.5522, -0.5421,  0.8422],
        [-0.9419, -0.2029, -0.7530,  ..., -0.4699, -0.5669,  0.8745]],
       device='cuda:0', grad_fn=<TanhBackward0>)


In [27]:
num_classes = 20

# Simple model to perform some tests with pytorch
class BertLSTM(nn.Module):
    def __init__(self):
        super(BertLSTM, self).__init__() 
        self.bert_model = bert_model
        for param in self.bert_model.parameters():
            param.requires_grad = False

        self.lstm_hiddensize = 200
        self.lstm = nn.LSTM(input_size=768,
                            hidden_size=self.lstm_hiddensize,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)
        

        self.flatten = nn.Flatten(start_dim=1)
        
        self.linear_pooler = nn.Linear(768, self.lstm_hiddensize)

        self.linear_1 = nn.Linear(max_words*self.lstm_hiddensize*2, int((max_words*self.lstm_hiddensize*2)/4))

        


        self.linear_2 = nn.Linear(int((max_words*self.lstm_hiddensize*2)/4), 875)


        self.linear_3 = nn.Linear(875, 256)


        self.linear_4 = nn.Linear(256, 64)
        self.linear_5 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()
        
                

    def forward(self, X_batch):
        bert_out = self.bert_model(input_ids=X_batch[0], token_type_ids = X_batch[1], attention_mask = X_batch[2])
        out, pooler = bert_out.last_hidden_state, bert_out.pooler_output
        h0 = torch.zeros(2 * 2, X_batch.shape[1], self.lstm_hiddensize)
        h0 = h0.to(device)
        c0 = self.linear_pooler(pooler) 
        c0 = torch.stack([c0 for i in range(0, 4)])
        c0 = c0.to(device)
        out_lstm, _  = self.lstm(out, (h0, c0))
        
        out = self.flatten(out_lstm) # this makes working with the outcome of the LSTM fairly easy
                                     # but do not allow large models to scale since it is
                                     # difficult to handle large flattened tensors while
                                     # keeping intact the semantics of the sequences

        out = self.linear_1(out)
        out = self.relu(out)
        out = self.linear_2(out)
        out = self.relu(out)
        out = self.linear_3(out)
        out = self.relu(out)
        out = self.linear_4(out)
        out = self.relu(out)
        out = self.linear_5(out)
        return out

# Function needed to compute the validation loss and the accuracy
def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    with torch.no_grad():
      Y_shuffled, Y_preds, losses = [],[],[]
      for X, Y in val_loader:
        preds = model(X)
        loss = loss_fn(preds, Y)
        losses.append(loss.item())
        Y_shuffled.append(Y)
        Y_preds.append(preds.argmax(dim=-1))

      Y_shuffled = torch.cat(Y_shuffled)
      Y_preds = torch.cat(Y_preds)

      loss = torch.tensor(losses).mean()
      print("Valid Loss : {:.3f}".format(loss))
    return loss

# Training function
def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs, early_stopping_info):
    patience_acc = 0
    precedent_loss = np.Inf
    model.train()
    for i in range(1, epochs+1):
        losses = []
        for X, Y in tqdm(train_loader):
            Y_preds = model(X)

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss = CalcValLossAndAccuracy(model, loss_fn, val_loader)
        if precedent_loss - loss < early_stopping_info["delta"]:
           patience_acc = patience_acc + 1
        else:
          patience_acc = 0
          precedent_loss = loss
          torch.save(model, "best.pth")

        if patience_acc > early_stopping_info["patience"]:
          return torch.load("best.pth")

        if i%1==0:
            print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
    return model

In [28]:
from torchinfo import summary
epochs = 50
learning_rate = 1e-4

loss_fn = nn.BCEWithLogitsLoss()
prebert_classifier = BertLSTM()
optimizer = Adam(prebert_classifier.parameters(), lr=learning_rate)

prebert_classifier.to(device)
summary(prebert_classifier, input_size=(3, 1, max_words), 
        device="cuda", dtypes = [torch.int]*3)

Layer (type:depth-idx)                                  Output Shape              Param #
BertLSTM                                                [1, 20]                   --
├─BertModel: 1-1                                        [1, 768]                  --
│    └─BertEmbeddings: 2-1                              [1, 70, 768]              --
│    │    └─Embedding: 3-1                              [1, 70, 768]              (23,440,896)
│    │    └─Embedding: 3-2                              [1, 70, 768]              (1,536)
│    │    └─Embedding: 3-3                              [1, 70, 768]              (393,216)
│    │    └─LayerNorm: 3-4                              [1, 70, 768]              (1,536)
│    │    └─Dropout: 3-5                                [1, 70, 768]              --
│    └─BertEncoder: 2-2                                 [1, 70, 768]              --
│    │    └─ModuleList: 3-6                             --                        (85,054,464)
│    └─BertPooler: 2-3 

In [29]:
fix_random(seed)
prebert_classifier = TrainModel(prebert_classifier, loss_fn, optimizer, bert_train_loader, bert_val_loader, epochs, {"patience": 3, "delta": 1e-4})

100%|██████████| 135/135 [00:29<00:00,  4.55it/s]


Valid Loss : 0.395
Train Loss : 0.440


100%|██████████| 135/135 [00:29<00:00,  4.53it/s]


Valid Loss : 0.366
Train Loss : 0.385


100%|██████████| 135/135 [00:30<00:00,  4.50it/s]


Valid Loss : 0.354
Train Loss : 0.362


100%|██████████| 135/135 [00:30<00:00,  4.47it/s]


Valid Loss : 0.345
Train Loss : 0.345


100%|██████████| 135/135 [00:30<00:00,  4.43it/s]


Valid Loss : 0.337
Train Loss : 0.331


100%|██████████| 135/135 [00:30<00:00,  4.42it/s]


Valid Loss : 0.336
Train Loss : 0.318


100%|██████████| 135/135 [00:30<00:00,  4.40it/s]


Valid Loss : 0.337
Train Loss : 0.307


100%|██████████| 135/135 [00:30<00:00,  4.41it/s]


Valid Loss : 0.333
Train Loss : 0.296


100%|██████████| 135/135 [00:30<00:00,  4.39it/s]


Valid Loss : 0.331
Train Loss : 0.285


100%|██████████| 135/135 [00:30<00:00,  4.37it/s]


Valid Loss : 0.333
Train Loss : 0.272


100%|██████████| 135/135 [00:30<00:00,  4.38it/s]


Valid Loss : 0.335
Train Loss : 0.260


100%|██████████| 135/135 [00:31<00:00,  4.33it/s]


Valid Loss : 0.344
Train Loss : 0.245


100%|██████████| 135/135 [00:31<00:00,  4.34it/s]


Valid Loss : 0.351


In [30]:
print_report(prebert_classifier, bert_val_loader, val_dataset["labels"], 0.25)

----- MACRO AVG. -----
  F1-score:	0.4567
  Precision:	0.4329
  Recall:	0.521
  Accuracy:	0.8193
----- PER-CLASS VALUES -----
  				F1-score	Precision	Recall		Accuracy	Support
  Self-direction: thought   	0.6076		0.5854		0.6316		0.8563		190
  Self-direction: action    	0.5355		0.4458		0.6703		0.7025		276
  Stimulation               	0.1856		0.1667		0.2093		0.9268		43
  Hedonism                  	0.2373		0.2692		0.2121		0.9583		33
  Achievement               	0.6222		0.501		0.8208		0.7062		318
  Power: dominance          	0.3925		0.3067		0.5448		0.7905		134
  Power: resources          	0.5271		0.432		0.6759		0.8786		108
  Face                      	0.2051		0.2222		0.1905		0.9138		63
  Security: personal        	0.6674		0.5521		0.8435		0.7062		377
  Security: societal        	0.6388		0.4834		0.9413		0.6636		341
  Tradition                 	0.4767		0.4381		0.5227		0.9064		88
  Conformity: rules         	0.5128		0.4267		0.6426		0.7183		249
  Conformity: interpersonal 	0.1754		0.2632		0.131

In [32]:
prebert_classifier = None
gc.collect()

26