## Required Libraries

In [1]:
# import all the required packages
import sys
import os
# change according to the status of GPU
# this command must before import pytorch
os.environ['CUDA_VISIBLE_DEVICES'] = '4,5,6'  # setting the GPUs

In [2]:

import pandas as pd
import numpy as np
import json, csv
import torch
import matplotlib.pyplot as plt
import datasets
from datasets import load_dataset, load_metric
from transformers import AutoModel, AutoTokenizer

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import XLMRobertaTokenizer, XLMRobertaConfig, XLMRobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score,accuracy_score
from torch.utils.data import TensorDataset, DataLoader, random_split, RandomSampler


from sklearn.metrics import balanced_accuracy_score,f1_score,recall_score,precision_score
# metrics for multi-label classification
from sklearn.metrics import hamming_loss

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# check the running environment
print(sys.executable)
print(torch.cuda.is_available())
torch.cuda.device_count()

/home/user/jinqli/anaconda3/envs/new_env/bin/python
True


3

## Read the prepared German data

In [4]:
de_path = '/srv/scratch2/jinq/taskontology/task_to_GWA_IWA_DWA_DE.csv'

data_de = pd.read_csv(de_path, index_col=0)
print('numbers of records in tasks to DWAs(de):', data_de.shape[0])
print('unique types of GWA', data_de['GWA Title'].nunique())
print('unique types of IWA', data_de['IWA Title'].nunique())
print('unique types of DWA', data_de['DWA Title'].nunique())

numbers of records in tasks to DWAs(de): 23543
unique types of GWA 37
unique types of IWA 332
unique types of DWA 2085


## Encoding

In [5]:
def encode_data(data_path, tokenizer, y_level):
    # load the data
    dataset = load_dataset('csv', data_files=data_path,split='train')
    print('Size of the dataset: ',len(dataset))

    # encoding tasks
    encoded_data = [tokenizer(item['Task_de'], 
                              return_tensors="pt", padding='max_length', truncation=True, 
                              max_length=64, is_split_into_words=True) for item in dataset]


    # encoding labels
    y_encoded = LabelEncoder().fit_transform(dataset[y_level])
    # print('Encoded labels: ', y_encoded)

    # Zipping the tasks and the labels(GWA title) together again
    for enc_item, item in zip(encoded_data, y_encoded):
        enc_item['labels'] = torch.tensor(item)

    for item in encoded_data:
        for key in item:
            item[key] = torch.squeeze(item[key])
    
    return encoded_data


def split_data(encoded_data):
    # split into train, validation and test (7:1:2)      
    # train_val_set, test_set = train_test_split(encoded_data, test_size=0.2)
    # train_set, val_set = train_test_split(train_val_set, test_size=0.125)
    
    percent70 = int(0.7*len(encoded_data))
    percent80 = int(0.8*len(encoded_data))
    
    train_set = encoded_data[0:percent70]
    val_set = encoded_data[percent70:percent80]
    test_set = encoded_data[percent80:-1]

    print('length of the training set: ', len(train_set))
    print('length of the test set: ',len(test_set))
    print('length of the val set: ',len(val_set)) 
    
    
    
    # details in the dataset
    #for key, val in test_set[3].items():
    #    print(f'key: {key}, dimensions: {val.size()}')
    
    return train_set,val_set,test_set


## Evaluation Metrics

In [6]:
# metrics for evaluation

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    balanced_accuracy = balanced_accuracy_score(labels, preds)
    f1 = f1_score(labels, preds,average='weighted')
    recall = recall_score(labels, preds,average='weighted')
    precision = precision_score(labels, preds,average='weighted')
    hamming = hamming_loss(labels, preds)

    return {
      'accuracy': acc,
      'balanced_accuracy': balanced_accuracy,
      'f1_score': f1,
      'recall': recall,
      'precision': precision,
      'hamming_loss': hamming,
    }

## Training models

### Training Parameters

In [7]:
# hyperparameters
gwa_labels=37
iwa_labels=332
dwa_labels=2085

hidden_dropout_prob = 0.3
learning_rate = 1e-5
weight_decay = 1e-2
epochs = 10
batch_size = 16

### 1. gbert-base model

In [9]:
german_model = "deepset/gbert-base" 
# Load the tokenizer
tokenizer_gbert = BertTokenizer.from_pretrained(german_model) 

# load the model
Bertmodel =  BertForSequenceClassification.from_pretrained(german_model,num_labels=gwa_labels)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [10]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, tokenizer_gbert, 'GWA Title')


Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Size of the dataset:  23543


In [11]:

# print('An example of zipped task and label: \n', encoded_data.__getitem__(10)) 
train_set,val_set,test_set = split_data(encoded_data)

length of the training set:  16480
length of the test set:  4708
length of the val set:  2354


In [12]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS, # "steps"
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/gbert_results',
    logging_dir='trained_models/gbert_logs',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end=True
    
)

trainer_gbert = Trainer(
    model = Bertmodel,
    tokenizer = tokenizer_gbert,
    args = training_args,
    train_dataset = train_set,
    eval_dataset = val_set,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

In [13]:
trainer_gbert.train()

***** Running training *****
  Num examples = 16480
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 3440


Step,Training Loss,Validation Loss,Loss,Accuracy,Balanced Accuracy,F1 Score,Recall,Precision,Runtime,Samples Per Second,Steps Per Second
100,No log,0.671198,2.710901,0.328802,0.192577,0.277935,0.328802,0.361291,7.0431,334.228,7.099
200,No log,0.525913,2.04949,0.474087,0.271979,0.392679,0.474087,0.383376,7.1181,330.708,7.024
300,No log,0.500425,1.846043,0.499575,0.311031,0.428614,0.499575,0.554328,7.0236,335.155,7.119
400,No log,0.495752,1.775058,0.504248,0.369232,0.484943,0.504248,0.515686,6.9935,336.598,7.149
500,2.221900,0.486831,1.760191,0.513169,0.381779,0.493637,0.513169,0.5156,7.0568,333.58,7.085
600,2.221900,0.470263,1.613918,0.529737,0.396978,0.52023,0.529737,0.530594,7.0327,334.721,7.11
700,2.221900,0.485981,1.623726,0.514019,0.415465,0.509502,0.514019,0.52769,7.1119,330.995,7.03
800,2.221900,0.460493,1.549535,0.539507,0.426157,0.525932,0.539507,0.53767,7.0252,335.08,7.117
900,2.221900,0.463042,1.525508,0.536958,0.423843,0.529829,0.536958,0.537941,7.1306,330.124,7.012
1000,1.489100,0.492353,1.59078,0.507647,0.433455,0.515071,0.507647,0.548662,7.0581,333.515,7.084


***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to trained_models/gbert_results/checkpoint-500
Configuration saved in t

TrainOutput(global_step=2600, training_loss=1.3348372092613807, metrics={'train_runtime': 1013.1762, 'train_samples_per_second': 162.657, 'train_steps_per_second': 3.395, 'total_flos': 4098452891873280.0, 'train_loss': 1.3348372092613807, 'epoch': 7.56})

In [14]:
## Comparing these models
metrics_gbert=trainer_gbert.evaluate()
print(metrics_gbert)

***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48


{'eval_loss': 1.5931720733642578, 'eval_accuracy': 0.5395072217502124, 'eval_balanced_accuracy': 0.45929879084640435, 'eval_f1_score': 0.5393576137591438, 'eval_recall': 0.5395072217502124, 'eval_precision': 0.559990045163543, 'eval_hamming_loss': 0.46049277824978757, 'eval_runtime': 7.1768, 'eval_samples_per_second': 327.999, 'eval_steps_per_second': 6.967, 'epoch': 7.56}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2. job gbert model

In [22]:
job_model = "agne/jobGBERT"

# Load the tokenizer
tokenizer_job = BertTokenizer.from_pretrained(job_model) 

Jobmodel = BertForSequenceClassification.from_pretrained(job_model, 
                                                         num_labels=gwa_labels)

loading file https://huggingface.co/agne/jobGBERT/resolve/main/vocab.txt from cache at /home/user/jinqli/.cache/huggingface/transformers/ecedf49c25be2988a09700989ec78d6c3bdf5a77746ef6ca8b516efb579aec21.4d65bbd3b91f2762e9d2c779d48ab14052439d3fcc8c3d2fe78c7322a9ac8d64
loading file https://huggingface.co/agne/jobGBERT/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/agne/jobGBERT/resolve/main/special_tokens_map.json from cache at /home/user/jinqli/.cache/huggingface/transformers/7ac8a9abe60d8cf91aee1054b444da856463b25234fabc8636456636bc9e8491.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/agne/jobGBERT/resolve/main/tokenizer_config.json from cache at /home/user/jinqli/.cache/huggingface/transformers/112eff4d5334a16dd358db473824a87a528c56294d51213185589764e59be36f.6cb0817c0112191a6ae461cb0e7c3c26facc8de9da82c7c24147c9b22862f911
loading file https://huggingface.co/agne/jobGBERT/resolve/main/tokenizer.

In [23]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, tokenizer_job, 'GWA Title')

# split data into train, test validation
train_set,val_set,test_set = split_data(encoded_data)

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Size of the dataset:  23543
length of the training set:  16480
length of the test set:  4708
length of the val set:  2354


In [24]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/job_results',
    logging_dir='trained_models/job_logs',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end=True
    
)

trainer_job = Trainer(
    model=Jobmodel,
    tokenizer=tokenizer_job,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [25]:
trainer_job.train()

***** Running training *****
  Num examples = 16480
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 3440


Step,Training Loss,Validation Loss,Loss,Accuracy,Balanced Accuracy,F1 Score,Recall,Precision,Runtime,Samples Per Second,Steps Per Second
100,No log,0.581563,2.442628,0.418437,0.193099,0.330625,0.418437,0.301856,6.9917,336.686,7.151
200,No log,0.527188,1.979549,0.472812,0.308627,0.406445,0.472812,0.389134,6.9934,336.601,7.15
300,No log,0.501699,1.796745,0.498301,0.30887,0.429368,0.498301,0.495846,6.9968,336.437,7.146
400,No log,0.497026,1.790851,0.502974,0.379297,0.484315,0.502974,0.517824,6.9969,336.433,7.146
500,2.153300,0.466015,1.694031,0.533985,0.410278,0.517528,0.533985,0.534016,6.9916,336.688,7.151
600,2.153300,0.461767,1.593409,0.538233,0.42073,0.52896,0.538233,0.534523,7.0017,336.204,7.141
700,2.153300,0.459218,1.573741,0.540782,0.451226,0.529704,0.540782,0.535518,7.0042,336.085,7.139
800,2.153300,0.479609,1.59074,0.520391,0.400165,0.514673,0.520391,0.53268,7.209,326.536,6.936
900,2.153300,0.469414,1.549384,0.530586,0.418349,0.527985,0.530586,0.547267,7.2599,324.247,6.887
1000,1.468500,0.481308,1.577729,0.518692,0.465009,0.52567,0.518692,0.558576,7.0011,336.234,7.142


***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to trained_models/job_results/checkpoint-500
Configuration saved in tra

TrainOutput(global_step=2500, training_loss=1.3306952514648438, metrics={'train_runtime': 976.0716, 'train_samples_per_second': 168.84, 'train_steps_per_second': 3.524, 'total_flos': 3940536648929280.0, 'train_loss': 1.3306952514648438, 'epoch': 7.27})

In [14]:
best_ckpt_path = trainer_job.state.best_model_checkpoint
trainer_job.save_model(best_ckpt_path)

Saving model checkpoint to trained_models/job_results/checkpoint-2500
Configuration saved in trained_models/job_results/checkpoint-2500/config.json
Model weights saved in trained_models/job_results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in trained_models/job_results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in trained_models/job_results/checkpoint-2500/special_tokens_map.json


In [26]:
metrics_job=trainer_job.evaluate()
print(metrics_job)

***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48


{'eval_loss': 1.578514575958252, 'eval_accuracy': 0.5433305012744265, 'eval_balanced_accuracy': 0.46205567963079297, 'eval_f1_score': 0.5483071982145755, 'eval_recall': 0.5433305012744265, 'eval_precision': 0.570571013335827, 'eval_hamming_loss': 0.4566694987255735, 'eval_runtime': 7.0893, 'eval_samples_per_second': 332.05, 'eval_steps_per_second': 7.053, 'epoch': 7.27}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 3. Multilingual model

In [27]:
multilingual_model = "bert-base-multilingual-cased" 

# here applied a different tokenizer compared to the other 2 models
multi_tokenizer = BertTokenizer.from_pretrained(multilingual_model)

Multi_Bertmodel =  BertForSequenceClassification.from_pretrained(multilingual_model, num_labels=gwa_labels)



loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /home/user/jinqli/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /home/user/jinqli/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json from cache at /home/user/jinqli/.cache/huggingface/transformers/46880f3b0081fda494a4e15b0578769

In [28]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, multi_tokenizer, 'GWA Title')

# print('An example of zipped task and label: \n', encoded_data.__getitem__(10)) 
train_set,val_set,test_set = split_data(encoded_data)

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Size of the dataset:  23543
length of the training set:  16480
length of the test set:  4708
length of the val set:  2354


In [29]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS, # "steps"
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/m3_multibert_results',
    logging_dir='trained_models/m3_multibert_logs',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end=True
    
)

trainer_multibert = Trainer(
    model=Multi_Bertmodel,
    tokenizer=multi_tokenizer,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
trainer_multibert.train()

***** Running training *****
  Num examples = 16480
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 3440


Step,Training Loss,Validation Loss,Loss,Accuracy,Balanced Accuracy,F1 Score,Recall,Precision,Runtime,Samples Per Second,Steps Per Second
100,No log,0.66695,2.840641,0.33305,0.108039,0.24982,0.33305,0.251436,10.3672,227.062,4.823
200,No log,0.584962,2.259705,0.415038,0.210099,0.335472,0.415038,0.344227,10.1939,230.923,4.905
300,No log,0.551827,2.097288,0.448173,0.260381,0.354247,0.448173,0.377159,9.8822,238.206,5.06
400,No log,0.539507,2.009151,0.460493,0.291309,0.418873,0.460493,0.408023,9.8792,238.277,5.061
500,2.426400,0.559473,2.045671,0.440527,0.334836,0.409186,0.440527,0.403789,9.8812,238.23,5.06
600,2.426400,0.503823,1.783723,0.496177,0.320821,0.440634,0.496177,0.421996,9.8788,238.288,5.061
700,2.426400,0.560323,1.905946,0.439677,0.344107,0.419609,0.439677,0.418257,9.8821,238.209,5.06
800,2.426400,0.538233,1.79021,0.461767,0.335006,0.435063,0.461767,0.431093,9.8806,238.245,5.06
900,2.426400,0.545879,1.757087,0.454121,0.35493,0.432519,0.454121,0.426826,9.8815,238.222,5.06
1000,1.701700,0.512319,1.666201,0.487681,0.35897,0.458392,0.487681,0.487188,9.8786,238.293,5.061


***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to trained_models/m3_multibert_results/checkpoint-500
Configuration sav

TrainOutput(global_step=2300, training_loss=1.580860542629076, metrics={'train_runtime': 1203.0182, 'train_samples_per_second': 136.989, 'train_steps_per_second': 2.859, 'total_flos': 3625756937994240.0, 'train_loss': 1.580860542629076, 'epoch': 6.69})

In [34]:
best_ckpt_path = trainer_multibert.state.best_model_checkpoint
trainer_multibert.save_model(best_ckpt_path)

Saving model checkpoint to ./multibert_results/checkpoint-1000
Configuration saved in ./multibert_results/checkpoint-1000/config.json
Model weights saved in ./multibert_results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./multibert_results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./multibert_results/checkpoint-1000/special_tokens_map.json


In [31]:
metrics_multibert=trainer_multibert.evaluate()
print(metrics_multibert)

***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48


{'eval_loss': 1.604386329650879, 'eval_accuracy': 0.5216652506372133, 'eval_balanced_accuracy': 0.37706050602092733, 'eval_f1_score': 0.5251785795676795, 'eval_recall': 0.5216652506372133, 'eval_precision': 0.5416940643840109, 'eval_hamming_loss': 0.47833474936278675, 'eval_runtime': 9.9992, 'eval_samples_per_second': 235.42, 'eval_steps_per_second': 5.0, 'epoch': 6.69}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 4. multilingual job model

In [8]:

model_path = '/srv/scratch2/jinq/model_ep_30'
tokenizer_xlm = XLMRobertaTokenizer.from_pretrained(model_path)
multi_job_model = XLMRobertaForSequenceClassification.from_pretrained(model_path, num_labels=gwa_labels)


Some weights of the model checkpoint at /srv/scratch2/jinq/model_ep_30 were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /srv/scratch2/jinq/model_ep_30 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj

In [9]:
# encoding data with corresponding tokenizer
encoded_data = encode_data(de_path, tokenizer_xlm, 'GWA Title')

# print('An example of zipped task and label: \n', encoded_data.__getitem__(10)) 
train_set,val_set,test_set = split_data(encoded_data)

Using custom data configuration default-2bd0bdea745cdb59
Found cached dataset csv (/home/user/jinqli/.cache/huggingface/datasets/csv/default-2bd0bdea745cdb59/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Size of the dataset:  23543
length of the training set:  16480
length of the test set:  4708
length of the val set:  2354


In [10]:
training_args = TrainingArguments(
    num_train_epochs=epochs,
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 100, # Evaluation and Save happens every 100 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    output_dir='trained_models/m4_multijob_results',
    logging_dir='trained_models/m4_multijob_logs',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end=True
    
)

trainer_multi_job = Trainer(
    model=multi_job_model,
    tokenizer=tokenizer_xlm,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

In [11]:
trainer_multi_job.train()

***** Running training *****
  Num examples = 16480
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 3440


Step,Training Loss,Validation Loss,Loss,Accuracy,Balanced Accuracy,F1 Score,Recall,Precision,Runtime,Samples Per Second,Steps Per Second
100,No log,0.579864,2.524504,0.420136,0.167386,0.322845,0.420136,0.345742,13.7515,171.181,3.636
200,No log,0.517417,2.094524,0.482583,0.273462,0.420189,0.482583,0.384712,13.7567,171.116,3.635
300,No log,0.503398,1.939718,0.496602,0.265197,0.424647,0.496602,0.400308,13.7588,171.091,3.634
400,No log,0.519966,1.856138,0.480034,0.307366,0.437069,0.480034,0.464277,13.7489,171.214,3.637
500,2.312100,0.489805,1.744769,0.510195,0.331791,0.461225,0.510195,0.532626,13.758,171.1,3.634
600,2.312100,0.471113,1.628295,0.528887,0.37123,0.50346,0.528887,0.529798,13.7496,171.205,3.636
700,2.312100,0.483857,1.627411,0.516143,0.383701,0.482802,0.516143,0.529053,13.7544,171.146,3.635
800,2.312100,0.486831,1.592741,0.513169,0.380236,0.498665,0.513169,0.525459,13.7483,171.221,3.637
900,2.312100,0.469839,1.58301,0.530161,0.393054,0.513196,0.530161,0.535318,13.7442,171.273,3.638
1000,1.590300,0.483008,1.581212,0.516992,0.404634,0.511971,0.516992,0.532799,13.7523,171.171,3.636


***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to trained_models/m4_multijob_results/checkpoint-500
Configuration save

TrainOutput(global_step=2400, training_loss=1.4959378178914389, metrics={'train_runtime': 1716.1496, 'train_samples_per_second': 96.029, 'train_steps_per_second': 2.004, 'total_flos': 3783673180938240.0, 'train_loss': 1.4959378178914389, 'epoch': 6.98})

In [12]:
metrics_multi_job=trainer_multi_job.evaluate()
print(metrics_multi_job)

***** Running Evaluation *****
  Num examples = 2354
  Batch size = 48


{'eval_loss': 1.502758502960205, 'eval_accuracy': 0.5480033984706882, 'eval_balanced_accuracy': 0.4197450048927257, 'eval_f1_score': 0.5371769732121294, 'eval_recall': 0.5480033984706882, 'eval_precision': 0.5436982186366529, 'eval_hamming_loss': 0.4519966015293118, 'eval_runtime': 13.8369, 'eval_samples_per_second': 170.124, 'eval_steps_per_second': 3.614, 'epoch': 6.98}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:

# save m1 gbert model's predicted result
outfile = 'pred_test/gbert_test.csv'
m1pred.to_csv(outfile, header=True)

In [29]:
# encoding labels
y_encoded = LabelEncoder().fit(data_de['GWA Title'])

gwa_titles = y_encoded.inverse_transform(m1pred['la1'].astype(int)).tolist()
print(len(gwa_titles))
m1pred['gwa_class'] = gwa_titles
m1pred[['GWA Title', 'gwa_class']]

4708


Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Controlling Machines and Processes
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Performing General Physical Activities
23539,"Identifying Objects, Actions, and Events",Performing Administrative Activities
23540,Controlling Machines and Processes,Controlling Machines and Processes


In [30]:
compute_macro(m1pred['GWA Title'], m1pred['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.3718534449101467, 0.437294272612504, 0.3901104602488269)

In [25]:
model_checkpoint = m2_job

# import the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BertForSequenceClassification.from_pretrained(model_checkpoint)


In [20]:
m2pred_df = apply_classify_on_df(test_df)

# save m2 Job model's predicted result
outfile = 'pred_test/job_test.csv'
m2pred_df.to_csv(outfile, header=True)

In [21]:
gwa_titles = y_encoded.inverse_transform(m2pred_df['la1'].astype(int)).tolist()
print(len(gwa_titles))
m2pred_df['gwa_class'] = gwa_titles
m2pred_df[['GWA Title', 'gwa_class']]

4708


Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Controlling Machines and Processes
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Performing General Physical Activities
23539,"Identifying Objects, Actions, and Events",Performing Administrative Activities
23540,Controlling Machines and Processes,Controlling Machines and Processes


In [26]:
compute_macro(m2pred_df['GWA Title'], m2pred_df['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.3718534449101467, 0.437294272612504, 0.3901104602488269)

In [31]:
model_checkpoint = m3_multi

# import the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BertForSequenceClassification.from_pretrained(model_checkpoint)


m3pred = apply_classify_on_df(test_df)


# save m3 multilingual model's predicted result
outfile = 'pred_test/multi_test.csv'
m3pred.to_csv(outfile, header=True)

In [32]:

gwa_titles = y_encoded.inverse_transform(m3pred['la1'].astype(int)).tolist()

m3pred['gwa_class'] = gwa_titles
m3pred[['GWA Title', 'gwa_class']]

4708


Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Performing General Physical Activities
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Performing General Physical Activities
23539,"Identifying Objects, Actions, and Events",Monitoring and Controlling Resources
23540,Controlling Machines and Processes,Handling and Moving Objects


In [33]:
compute_macro(m3pred['GWA Title'], m3pred['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.34167679573472814, 0.39892455303591784, 0.34292711239468954)

In [35]:
model_checkpoint = m4_multi_job

# import the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = XLMRobertaForSequenceClassification.from_pretrained(model_checkpoint)

m4pred = apply_classify_on_df(test_df)


# save m4 multilingual_job model's predicted result
outfile = 'pred_test/multi_job_test.csv'
m4pred.to_csv(outfile, header=True)

In [36]:
gwa_titles = y_encoded.inverse_transform(m4pred['la1'].astype(int)).tolist()

m4pred['gwa_class'] = gwa_titles
m4pred[['GWA Title', 'gwa_class']]

Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Controlling Machines and Processes
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,Repairing and Maintaining Mechanical Equipment
23538,Handling and Moving Objects,Performing General Physical Activities
23539,"Identifying Objects, Actions, and Events",Handling and Moving Objects
23540,Controlling Machines and Processes,Handling and Moving Objects


In [37]:
compute_macro(m4pred['GWA Title'], m4pred['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.3851650661612, 0.4374095697649561, 0.3799185673510285)