In [1]:
# import all the required packages
import sys
import os
# change according to the status of GPU
# this command must before import pytorch
os.environ['CUDA_VISIBLE_DEVICES'] = '1,2,4,5'  # setting the GPUs

In [2]:

import pandas as pd
import numpy as np
import json, csv
import torch
import matplotlib.pyplot as plt
import datasets
from datasets import load_dataset, load_metric
from transformers import BertTokenizer, AutoModel

from transformers import AutoModel, BertForSequenceClassification, BertConfig
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score,accuracy_score
from torch.utils.data import TensorDataset, DataLoader, random_split, RandomSampler



from sklearn.metrics import balanced_accuracy_score,f1_score,recall_score,precision_score
# metrics for multi-label classification
from sklearn.metrics import hamming_loss

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# check the running environment
print(sys.executable)
print(torch.cuda.is_available())
torch.cuda.device_count()

/home/user/jinqli/anaconda3/envs/new_env/bin/python
True


4

## Read the prepared German data

In [4]:
de_path = '/srv/scratch2/jinq/taskontology/task_to_GWA_IWA_DWA_DE.csv'

data_de = pd.read_csv(de_path, index_col=0)
print('numbers of records in tasks to DWAs(de):', data_de.shape[0])
print('unique types of GWA', data_de['GWA Title'].nunique())
print('unique types of IWA', data_de['IWA Title'].nunique())
print('unique types of DWA', data_de['DWA Title'].nunique())

numbers of records in tasks to DWAs(de): 23543
unique types of GWA 37
unique types of IWA 332
unique types of DWA 2085


## Encoding

In [5]:
def encode_data(data_path, tokenizer, y_level):
    # load the data
    dataset = load_dataset('csv', data_files=data_path,split='train')
    print('Size of the dataset: ',len(dataset))

    # encoding tasks
    encoded_data = [tokenizer(item['Task_de'], 
                              return_tensors="pt", padding='max_length', truncation=True, 
                              max_length=128, is_split_into_words=True) for item in dataset]


    # encoding labels
    y_encoded = LabelEncoder().fit_transform(dataset[y_level])
    # print('Encoded labels: ', y_encoded)

    # Zipping the tasks and the labels(GWA title) together again
    for enc_item, item in zip(encoded_data, y_encoded):
        enc_item['labels'] = torch.tensor(item)

    for item in encoded_data:
        for key in item:
            item[key] = torch.squeeze(item[key])
    
    return encoded_data


def split_data(encoded_data):
    # split into train, validation and test      
    train_set, test_val_set = train_test_split(encoded_data, test_size=0.2)
    test_set, val_set = train_test_split(test_val_set, test_size=0.5)

    print('length of the training set: ', len(train_set))
    print('length of the test set: ',len(test_set))
    print('length of the val set: ',len(val_set))
    
    # details in the dataset
    #for key, val in test_set[3].items():
    #    print(f'key: {key}, dimensions: {val.size()}')
    
    return train_set,val_set,test_set


## Evaluation Metrics

In [6]:
# metrics for evaluation

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    balanced_accuracy = balanced_accuracy_score(labels, preds)
    f1 = f1_score(labels, preds,average='micro')
    recall = recall_score(labels, preds,average='micro')
    precision = precision_score(labels, preds,average='micro')
    hamming_loss = hamming_loss(labels, preds,average='micro')

    return {
      'accuracy': acc,
      'balanced_accuracy': balanced_accuracy,
      'f1_score': f1,
      'recall': recall,
      'precision': precision,
      'hamming_loss': hamming_loss,
    }

## Training models

### Training Parameters

In [7]:
# hyperparameters
gwa_labels=37
iwa_labels=332
dwa_labels=2085

hidden_dropout_prob = 0.3
learning_rate = 1e-5
weight_decay = 1e-2
epochs = 10
batch_size = 32

### 1. gbert-base model

In [10]:
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", use_fast=True) 

# load the model
german_model = "deepset/gbert-base" 
Bertmodel =  BertForSequenceClassification.from_pretrained(german_model,num_labels=gwa_labels)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [None]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, tokenizer, 'GWA Title')

# print('An example of zipped task and label: \n', encoded_data.__getitem__(10)) 
train_set,val_set,test_set = split_data(encoded_data)

In [11]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS, # "steps"
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/gbert_results',
    logging_dir='trained_models/gbert_logs',
    metric_for_best_model = 'hamming_loss',
    load_best_model_at_end=True
    
)

trainer_gbert = Trainer(
    model=Bertmodel,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

In [12]:
trainer_gbert.train()

***** Running training *****
  Num examples = 18834
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1480


Step,Training Loss,Validation Loss,Loss,Accuracy,Balanced Accuracy,F1 Score,Recall,Precision,Runtime,Samples Per Second,Steps Per Second
100,No log,0.191083,3.036555,0.191083,0.060654,0.191083,0.191083,0.191083,6.4564,364.753,2.943
200,No log,0.301486,2.625865,0.301486,0.136418,0.301486,0.301486,0.301486,6.4585,364.634,2.942
300,No log,0.392781,2.369962,0.392781,0.211314,0.392781,0.392781,0.392781,6.6119,356.177,2.874
400,No log,0.417834,2.232846,0.417834,0.233082,0.417834,0.417834,0.417834,6.476,363.648,2.934
500,2.610700,0.45138,2.107863,0.45138,0.26719,0.45138,0.45138,0.45138,6.448,365.23,2.947
600,2.610700,0.467516,2.048754,0.467516,0.285829,0.467516,0.467516,0.467516,6.6045,356.578,2.877
700,2.610700,0.481529,1.987307,0.481529,0.299866,0.481529,0.481529,0.481529,6.604,356.604,2.877
800,2.610700,0.484926,1.961134,0.484926,0.300642,0.484926,0.484926,0.484926,6.4583,364.648,2.942
900,2.610700,0.491295,1.948031,0.491295,0.314623,0.491295,0.491295,0.491295,6.4607,364.51,2.941
1000,1.819900,0.505308,1.908834,0.505308,0.325425,0.505308,0.505308,0.505308,6.5585,359.076,2.897


***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
Saving model checkpoint to ./gbert_results/checkpoint-500
Configuration saved in ./gbert_results/checkpoint-500/config.json
Model weights saved in ./gbert_results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./gbert_results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./gbert_results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch

TrainOutput(global_step=1480, training_loss=1.996760724041913, metrics={'train_runtime': 1298.9833, 'train_samples_per_second': 144.99, 'train_steps_per_second': 1.139, 'total_flos': 1.23924771650304e+16, 'train_loss': 1.996760724041913, 'epoch': 10.0})

### 2. job gbert model

In [8]:
# Load the tokenizer
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-cased", use_fast=True) 

job_model = "agne/jobGBERT"
Jobmodel = BertForSequenceClassification.from_pretrained(job_model,num_labels=gwa_labels,problem_type='multi_label_classification')

Some weights of the model checkpoint at agne/jobGBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at agne/jobGBERT and are newly ini

In [9]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, tokenizer_bert, 'GWA Title')

# split data into train, test validation
train_set,val_set,test_set = split_data(encoded_data)

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Size of the dataset:  23543
length of the training set:  18834
length of the test set:  2354
length of the val set:  2355


In [10]:
encoded_data[0]

{'input_ids': tensor([  101,   164,   112,   243,  3169,  1643,  1197, 17176, 14703,  2118,
         5576,  9954,  1193,  2217,  3262, 11336,  9817,  1116, 12198,  9022,
        17761,  1424,   117,   144,  6420,  5745,  1424,   184,  2692,  4167,
          268, 15475,  2227, 26567,  1179, 17129, 17030,  1377,  5576, 18653,
        20201, 10486, 24356,  3262,   229,  9824, 25821, 23199,   143, 19593,
        14407, 20901,   184,  2692, 12118,  5759,  1204, 17176,  5745,  4380,
         4167, 11300,  5800,  1424,  4167,  1398,  2176, 27750, 15624, 27863,
          184,  2692,  1436,  4060,  1306,  2083,   144, 20910, 11741,   119,
          112,   166,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [11]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/job_results',
    logging_dir='trained_models/job_logs',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end=True
    
)

trainer_job = Trainer(
    model=Jobmodel,
    tokenizer=tokenizer_bert,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

In [12]:
trainer_job.train()

***** Running training *****
  Num examples = 18834
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1480


KeyboardInterrupt: 

### 3. Multilingual model

In [38]:
# here applied a different tokenizer compared to the other 2 models
multi_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

multilingual_model = "bert-base-multilingual-cased" 
Multi_Bertmodel =  BertForSequenceClassification.from_pretrained(multilingual_model, num_labels=gwa_labels)



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [39]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, multi_tokenizer, 'GWA Title')

# print('An example of zipped task and label: \n', encoded_data.__getitem__(10)) 
train_set,val_set,test_set = split_data(encoded_data)

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Size of the dataset:  23543
length of the training set:  18834
length of the test set:  2354
length of the val set:  2355


In [28]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS, # "steps"
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/multibert_results',
    logging_dir='trained_models/multibert_logs',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end=True
    
)

trainer_multibert = Trainer(
    model=Multi_Bertmodel,
    tokenizer=multi_tokenizer,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [29]:
trainer_multibert.train()

***** Running training *****
  Num examples = 18834
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1480


Step,Training Loss,Validation Loss,Loss,Accuracy,Balanced Accuracy,F1 Score,Recall,Precision,Runtime,Samples Per Second,Steps Per Second
100,No log,0.259023,2.714472,0.259023,0.088463,0.259023,0.259023,0.259023,7.873,299.124,2.413
200,No log,0.427601,2.196005,0.427601,0.216565,0.427601,0.427601,0.427601,7.8636,299.483,2.416
300,No log,0.464119,1.986812,0.464119,0.25702,0.464119,0.464119,0.464119,8.1544,288.803,2.33
400,No log,0.501062,1.857215,0.501062,0.30466,0.501062,0.501062,0.501062,7.867,299.353,2.415
500,2.211800,0.510403,1.803832,0.510403,0.331783,0.510403,0.510403,0.510403,7.8621,299.539,2.417
600,2.211800,0.526964,1.749402,0.526964,0.364005,0.526964,0.526964,0.526964,7.8684,299.299,2.415
700,2.211800,0.532059,1.734157,0.532059,0.365533,0.532059,0.532059,0.532059,7.8482,300.068,2.421
800,2.211800,0.524841,1.729213,0.524841,0.38237,0.524841,0.524841,0.524841,8.0907,291.076,2.348
900,2.211800,0.527389,1.736151,0.527389,0.387682,0.527389,0.527389,0.527389,7.8505,299.98,2.42
1000,1.368800,0.528662,1.729934,0.528662,0.393352,0.528662,0.528662,0.528662,7.8569,299.737,2.418


***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
Saving model checkpoint to ./multibert_results/checkpoint-500
Configuration saved in ./multibert_results/checkpoint-500/config.json
Model weights saved in ./multibert_results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./multibert_results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./multibert_results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128
***** Running Evaluation *****
  Num exa

TrainOutput(global_step=1480, training_loss=1.570254846521326, metrics={'train_runtime': 1516.2553, 'train_samples_per_second': 124.214, 'train_steps_per_second': 0.976, 'total_flos': 1.23924771650304e+16, 'train_loss': 1.570254846521326, 'epoch': 10.0})

In [34]:
best_ckpt_path = trainer_multibert.state.best_model_checkpoint
trainer_multibert.save_model(best_ckpt_path)

Saving model checkpoint to ./multibert_results/checkpoint-1000
Configuration saved in ./multibert_results/checkpoint-1000/config.json
Model weights saved in ./multibert_results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./multibert_results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./multibert_results/checkpoint-1000/special_tokens_map.json


### 4. multilingual job model

In [28]:
from transformers import XLMRobertaConfig, XLMRobertaForSequenceClassification
from transformers import XLMRobertaTokenizer

In [29]:

model_path = '/srv/scratch2/jinq/model_ep_30'

# import the model
multi_job_model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
# import the tokenizer 
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'

tokenizer_xlm = XLMRobertaTokenizer.from_pretrained(model_path)

Some weights of the model checkpoint at /srv/scratch2/jinq/model_ep_30 were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /srv/scratch2/jinq/model_ep_30 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weig

In [30]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, tokenizer_xlm, 'GWA Title')

# print('An example of zipped task and label: \n', encoded_data.__getitem__(10)) 
train_set,val_set,test_set = split_data(encoded_data)

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Size of the dataset:  23543
length of the training set:  18834
length of the test set:  2354
length of the val set:  2355


In [32]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/m4_multijob_results',
    logging_dir='trained_models/m4_multijob_logs',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end=True
    
)

trainer_multi_job = Trainer(
    model=multi_job_model,
    tokenizer=tokenizer_xlm,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
trainer_multi_job.train()

***** Running training *****
  Num examples = 18834
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1480


RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 1206, in forward
    outputs = self.roberta(
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 853, in forward
    encoder_outputs = self.encoder(
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 526, in forward
    layer_outputs = layer_module(
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 412, in forward
    self_attention_outputs = self.attention(
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 339, in forward
    self_outputs = self.self(
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 203, in forward
    mixed_query_layer = self.query(hidden_states)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/user/jinqli/anaconda3/envs/new_env/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 103, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`


In [None]:
best_ckpt_path = trainer_multi_job.state.best_model_checkpoint
trainer_multi_job.save_model(best_ckpt_path)

### Evaluate the models performance

In [23]:
## Comparing these two models
metrics_gbert=trainer_gbert.evaluate()
print(metrics_gbert)


***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128


{'eval_loss': 1.8680411577224731, 'eval_accuracy': 0.5040339702760085, 'eval_balanced_accuracy': 0.3215903417617457, 'eval_f1_score': 0.5040339702760085, 'eval_recall': 0.5040339702760085, 'eval_precision': 0.5040339702760085, 'eval_hamming_loss': 0.5040339702760085, 'eval_runtime': 6.2722, 'eval_samples_per_second': 375.466, 'eval_steps_per_second': 3.029, 'epoch': 8.78}


In [24]:

metrics_job=trainer_job.evaluate()
print(metrics_job)

***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128


{'eval_loss': 1.8994202613830566, 'eval_accuracy': 0.48874734607218684, 'eval_balanced_accuracy': 0.29438654919459367, 'eval_f1_score': 0.48874734607218684, 'eval_recall': 0.48874734607218684, 'eval_precision': 0.48874734607218684, 'eval_hamming_loss': 0.48874734607218684, 'eval_runtime': 6.2803, 'eval_samples_per_second': 374.979, 'eval_steps_per_second': 3.025, 'epoch': 10.0}


In [31]:
metrics_multibert=trainer_multibert.evaluate()
print(metrics_multibert)

***** Running Evaluation *****
  Num examples = 2355
  Batch size = 128


{'eval_loss': 1.7299339771270752, 'eval_accuracy': 0.5286624203821656, 'eval_balanced_accuracy': 0.3933516082476829, 'eval_f1_score': 0.5286624203821656, 'eval_recall': 0.5286624203821656, 'eval_precision': 0.5286624203821656, 'eval_hamming_loss': 0.5286624203821656, 'eval_runtime': 7.4651, 'eval_samples_per_second': 315.469, 'eval_steps_per_second': 2.545, 'epoch': 10.0}
