# Prerequisites

In [1]:
import os, sys

import numpy as np
import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


Important paths for the notebook:

In [2]:
datasets_root_path = './datasets/'
srl_dataset_path = os.path.join(datasets_root_path, 'united-srl')
checkpoints_dir_path = './checkpoints/'
model_dir_path = os.path.join(checkpoints_dir_path, 'model_pid')

srl_dataset_dict_paths = {}
for lang in os.listdir(srl_dataset_path):
    dataset_lang_path = os.path.join(srl_dataset_path, lang)
    if os.path.isdir(dataset_lang_path):
        srl_dataset_dict_paths[lang] = {}
        for d_type in os.listdir(dataset_lang_path):
            d_name = d_type.split('.')[0]
            srl_dataset_dict_paths[lang][d_name] = os.path.join(dataset_lang_path, d_type)

In [3]:
%load_ext autoreload
%autoreload 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

Setting the seed for reproducibility:

In [4]:
SEED = 28

# random.seed(SEED) # not used
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
global_params = {
    'batch_size': 32,
    'transformer_name': "xlm-roberta-base",
}

# Dataset

In [6]:
from code_files.datasets.srl_transformer import SRLDataset_transformer

In [7]:
dataset_train_en = SRLDataset_transformer(  srl_dataset_dict_paths['EN']['train'] )

In [8]:
global_params.update({
    'n_roles_labels': len(dataset_train_en.id_to_roles),
    'n_predicates_labels': len(dataset_train_en.id_to_predicates),

    'id_to_roles': dataset_train_en.id_to_roles,
    'roles_to_id': dataset_train_en.roles_to_id,
    'roles_pad_id': dataset_train_en.roles_pad_id,
    'roles_pad': dataset_train_en.roles_pad,

    'id_to_predicates': dataset_train_en.id_to_predicates,
    'predicates_to_id': dataset_train_en.predicates_to_id,
    'predicates_pad_id': dataset_train_en.predicates_pad_id,
    'predicates_pad': dataset_train_en.predicates_pad,
})

In [9]:
np.save(os.path.join(model_dir_path, 'global_params.npy'), global_params)

In [10]:
len(dataset_train_en.data)

5501

In [11]:
dataset_train_en[0]

{'dependency_heads': [3, 3, 0, 8, 7, 7, 8, 3, 10, 8, 13, 13, 10, 3],
 'dependency_relations': ['nsubj',
  'advmod',
  'root',
  'mark',
  'det',
  'amod',
  'nsubj',
  'ccomp',
  'amod',
  'obj',
  'case',
  'amod',
  'nmod',
  'punct'],
 'lemmas': ['member',
  'also',
  'ask',
  'whether',
  'all',
  'social',
  'group',
  'enjoy',
  'equal',
  'access',
  'to',
  'higher',
  'education',
  '.'],
 'pos_tags': ['NOUN',
  'ADV',
  'VERB',
  'SCONJ',
  'DET',
  'ADJ',
  'NOUN',
  'VERB',
  'ADJ',
  'NOUN',
  'ADP',
  'ADJ',
  'NOUN',
  'PUNCT'],
 'predicates': ['_',
  '_',
  'ASK_REQUEST',
  '_',
  '_',
  '_',
  '_',
  'BENEFIT_EXPLOIT',
  '_',
  '_',
  '_',
  '_',
  '_',
  '_'],
 'roles': {'2': ['agent',
   '_',
   '_',
   'theme',
   '_',
   '_',
   '_',
   '_',
   '_',
   '_',
   '_',
   '_',
   '_',
   '_'],
  '7': ['_',
   '_',
   '_',
   '_',
   '_',
   '_',
   'beneficiary',
   '_',
   '_',
   'theme',
   '_',
   '_',
   '_',
   '_']},
 'words': ['Members',
  'also',
  'asked',
  

# Dataloader

In [12]:
from torch.utils.data import DataLoader

dataloader_train_en = DataLoader(
    dataset_train_en,
    batch_size=global_params['batch_size'],
    collate_fn=dataset_train_en.create_collate_fn(),
    shuffle=True,
)

In [13]:
for e in dataloader_train_en:
    ex_in = e
    break

In [14]:
print(ex_in['words'][1], '\n', ex_in['predicates'][1], '\n', ex_in['predicates_positions'][1])

['The', 'view', 'was', 'expressed', 'that', ',', 'to', 'avoid', 'duplication', 'and', 'to', 'identify', 'common', 'practices', ',', 'a', 'proper', 'division', 'of', 'labour', 'was', 'required', 'between', 'the', 'Department', 'of', 'Economic', 'and', 'Social', 'Affairs', 'and', 'the', 'regional', 'commissions', ',', 'as', 'well', 'as', 'among', 'the', 'regional', 'commissions', '.'] 
 ['_', '_', '_', 'SHOW', '_', '_', '_', 'ABSTAIN_AVOID_REFRAIN', '_', '_', '_', 'RECOGNIZE_ADMIT_IDENTIFY', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'REQUIRE_NEED_WANT_HOPE', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'] 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


# Training the PID model

In [15]:
loss_function = torch.nn.CrossEntropyLoss(ignore_index = dataset_train_en.predicates_pad_id) # !

In [16]:
from code_files.models.model_pid import ModelPID
from code_files.utils.utils_functions import print_summary
import torch.optim as optim

model_pid = ModelPID(
    loss_fn = loss_function,
    hparams = global_params,
    fine_tune_transformer = True,
    has_predicates_positions = False # it does both pred iden and disamb
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
optimizer = optim.SGD(model_pid.parameters(), lr=0.0016, momentum=0.9) 

# Tests

In [18]:
from code_files.utils.Trainer_pid import Trainer_pid

trainer_pid = Trainer_pid()

In [21]:
d = trainer_pid.compute_forward(model_pid, ex_in, device, optimizer)

torch.Size([32, 87]) torch.Size([32, 87, 433])


In [50]:
predictions, batch_encoding = model_pid.forward(ex_in['words'])

In [54]:
print(ex_in['predicates'][1])
print(batch_encoding.word_ids(batch_index=1))

['_', '_', '_', 'SHOW', '_', '_', '_', 'ABSTAIN_AVOID_REFRAIN', '_', '_', '_', 'RECOGNIZE_ADMIT_IDENTIFY', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'REQUIRE_NEED_WANT_HOPE', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']
[None, 0, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 41, 42, 42, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [63]:
words_ids = batch_encoding.word_ids(batch_index = 1)
label_processed = [
    model_pid.hparams['predicates_pad_id'] if v == None or (v != None and j-1>=0 and words_ids[j-1]==words_ids[j])
    else model_pid.hparams['predicates_to_id'][ ex_in['predicates'][1][v] ]
    for j,v in enumerate(words_ids)
]
print(torch.stack([torch.tensor(label_processed), torch.tensor(label_processed)]))

tensor([[ -1,   0,   0,   0, 364,  -1,   0,   0,  -1,   0,   2,   0,  -1,   0,
           0, 318,   0,   0,  -1,   0,  -1,   0,   0,   0,   0,   0,   0, 335,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  -1,   0,
          -1,   0,   0,   0,   0,   0,   0,   0,  -1,   0,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1],
        [ -1,   0,   0,   0, 364,  -1,   0,   0,  -1,   0,   2,   0,  -1,   0,
           0, 318,   0,   0,  -1,   0,  -1,   0,   0,   0,   0,   0,   0, 335,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  -1,   0,
          -1,   0,   0,   0,   0,   0,   0,   0,  -1,   0,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1]])


In [47]:
tt.shape

torch.Size([1205472])

In [53]:
tt[1][:,0]

tensor([-0.0039,  0.2021,  0.6279,  0.6723,  0.5644,  0.6415,  0.1043,  0.6775,
         0.5614,  0.4894,  0.5786,  0.6831,  0.5759,  0.7422,  0.6217,  0.0986,
         0.7288,  0.7039,  0.7125,  0.6595,  0.0470,  0.6807,  0.6311,  0.6803,
         0.6754,  0.0680,  0.6922,  0.5441,  0.1592,  0.7077,  0.6492,  0.7187,
         0.6400,  0.7136,  0.7484,  0.1834,  0.1921,  0.1394,  0.7266,  0.2045,
         0.8026,  0.6935,  0.5865,  0.6511,  0.6731,  0.5897,  0.5445,  0.5801,
         0.0781,  0.6459,  0.2147,  0.7238,  0.6472,  0.5593,  0.0615,  0.5411,
         0.5241,  0.5330,  0.4847,  0.5169,  0.4657,  0.5749,  0.5453,  0.4864,
         0.6101,  0.4473,  0.5550, -0.0087,  0.5267,  0.4612,  0.4751,  0.5403,
         0.5501,  0.4958,  0.5626, -0.0142,  0.5247,  0.4129,  0.5264,  0.0288,
         0.5406,  0.5412, -0.0361,  0.5246,  0.0463,  0.5166,  0.5239],
       grad_fn=<SelectBackward0>)

In [23]:
predictions, output_mask, batch_encoding = model_pid.forward(ex_in['words'])

In [42]:
predictions[1]

tensor([[ 0.4796, -0.1077, -0.6213,  ...,  0.3144, -0.0081,  0.1680],
        [ 0.7578,  0.3152, -0.3741,  ...,  0.2359,  0.3185, -0.0199],
        [ 0.1988,  0.1710,  0.2189,  ...,  0.0235,  0.2823, -0.1460],
        ...,
        [ 0.4766, -0.0880, -0.6280,  ...,  0.2449, -0.0965,  0.2291],
        [-0.0333, -0.1915, -0.0106,  ..., -0.0877, -0.1617,  0.0835],
        [ 0.5764, -0.0884, -0.5672,  ...,  0.2209, -0.1072,  0.1990]],
       grad_fn=<SelectBackward0>)

In [25]:
print(batch_encoding.word_ids(batch_index=1))

[None, 0, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 41, 42, 42, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [38]:
batch_encoding['input_ids'][1]

tensor([     0,    581,  21455,    509,  36510,    297,    450,      6,      4,
            47,  71864,    115, 182867,    136,     47, 135812,  39210,  41361,
             7,      6,      4,     10,  27798,  91853,    111, 150385,    509,
         56065,  17721,     70,  63557,    111,  79048,    136,   7142, 184593,
           136,     70,  18150,  62458,      7,      6,      4,    237,   5299,
           237,  54940,     70,  18150,  62458,      7,      6,      5,      2,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1])

In [48]:
print(sum(output_mask[1]), len(ex_in['words'][1]))

43 43


In [77]:
model_pid.tokenizer.decode(test_prediction['input_ids'][1][7])

''

In [93]:
model_pid.forward(ex_in['words']).shape, len(ex_in['words']), max([len(e) for e in ex_in['words']])

AttributeError: 'tuple' object has no attribute 'shape'

In [21]:
print_summary(model_pid, short=True)

parameters: 278,376,625
trainable parameters: 278,376,625
non-trainable parameters: 0


In [22]:
history = {}

In [None]:
from code_files.utils.Trainer_pid import Trainer_pid

trainer_pid = Trainer_pid()

history = trainer_pid.train(
    model_pid, optimizer, dataloader_train_en, dataloader_dev_en,
    epochs=60, device=device,
    save_best=True, 
    min_score=0.8,
    save_path_name=os.path.join(model_dir_path, 'pid_transformer_weights.pth'),
    saved_history=history
)