In [None]:
%pip install accelerate datasets evaluate numpy scikit-learn torch transformers[torch]

In [1]:
from datasets import ClassLabel, Features, load_dataset, TextClassification, Value
#from os import sched_getaffinity
from torch import backends, cuda, get_num_threads, set_num_threads
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, EvalPrediction, TrainingArguments, Trainer

import evaluate
import numpy as np

In [None]:
#num_threads = max(1, get_num_threads(), len(sched_getaffinity(0)) - 1)
#set_num_threads(num_threads)
#num_threads

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')
data_path = 'BUILD/'
model_path = 'MODELS/'

#base = {'name': 'legal-bert', 'tokenizer': 'nlpaueb/legal-bert-base-uncased', 'model': 'nlpaueb/legal-bert-base-uncased'}
#base = {'name': 'distilbert', 'tokenizer': 'distilbert-base-uncased', 'model': 'distilbert-base-uncased'}
base = {'name': 'roberta', 'tokenizer': 'xlm-roberta-base', 'model': 'xlm-roberta-base'}

epochs = 3
batch_size = 4

use_cuda_if_available = True
gradient_checkpointing = False

In [3]:
if use_cuda_if_available and cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
    if not use_cuda_if_available:
        backends.cudnn.enabled = False
        cuda.is_available = lambda : False

device

'cuda:0'

In [4]:
meta_groups = ['Criminal', 'Tax']
labels = ['PREAMBLE', 'FAC', 'RLC', 'ISSUE', 'ARG_PETITIONER', 'ARG_RESPONDENT', 'ANALYSIS', 'STA', 'PRE_RELIED', 'PRE_NOT_RELIED', 'RATIO', 'RPC', 'NONE']

dataset = load_dataset(
    'csv',
    data_files={
        'train': f'{data_path}train.csv',
        'test': f'{data_path}dev.csv',
    },
    features=Features({
        'doc_id': Value('uint32'),
        'doc_index': Value('uint16'),
        'sentence_index': Value('uint16'),
        'annotation_id': Value('string'),
        'text': Value('string'),
        'meta_group': ClassLabel(names=meta_groups),
        'labels': ClassLabel(names=labels),
    })
)

dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'doc_index', 'sentence_index', 'annotation_id', 'text', 'meta_group', 'labels'],
        num_rows: 26087
    })
    test: Dataset({
        features: ['doc_id', 'doc_index', 'sentence_index', 'annotation_id', 'text', 'meta_group', 'labels'],
        num_rows: 2899
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(base['tokenizer'])

tokenized_dataset = dataset.map(
    lambda t: tokenizer(t['text'], truncation=True),
    batched=True
).remove_columns('text')

tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'doc_index', 'sentence_index', 'annotation_id', 'meta_group', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 26087
    })
    test: Dataset({
        features: ['doc_id', 'doc_index', 'sentence_index', 'annotation_id', 'meta_group', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2899
    })
})

In [6]:
metric = evaluate.load('f1')

def compute_metrics(eval_pred: EvalPrediction):
    predictions, references = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(
        predictions=predictions,
        references=references,
        average='weighted'
    )

In [7]:
train_data = tokenized_dataset['train']#.train_test_split(train_size=0.05, stratify_by_column='labels')['train']
trainer = Trainer(
    AutoModelForSequenceClassification.from_pretrained(
        f'{model_path}{base["name"]}_{epochs-1}_model' if epochs > 1 else base['model'],
        num_labels=len(labels),
        id2label={ i: labels[i] for i in range(len(labels)) },
        label2id={ labels[i]: i for i in range(len(labels)) }
    ).to(device),
    TrainingArguments(
        output_dir=f'{base["name"]}_{epochs}_model',
        evaluation_strategy='epoch',
        num_train_epochs=1,
        save_strategy='epoch',
        #save_steps=1,
        label_names=["labels"],
        load_best_model_at_end=True,
        logging_dir='./logs',
        logging_steps=10,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=8,
        gradient_checkpointing=gradient_checkpointing,
        learning_rate=2e-5,
        weight_decay=0.01,
        optim='adafactor',
    ),
    DataCollatorWithPadding(tokenizer=tokenizer),
    train_data,
    train_data,
    tokenizer,
    compute_metrics=compute_metrics,
)

In [8]:
#for batch in trainer.get_eval_dataloader():
#    break

#batch = {k: v.to(device) for k, v in batch.items()}

#with torch.no_grad():
#    outputs = trainer.model(**batch)

#predictions = outputs.logits.cpu().numpy()
#labels = batch["labels"].cpu().numpy()

#print(compute_metrics((predictions, labels)))

trainer.train()
trainer.save_model()

  0%|          | 0/815 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.1606, 'learning_rate': 1.9754601226993868e-05, 'epoch': 0.01}
{'loss': 1.1894, 'learning_rate': 1.9509202453987733e-05, 'epoch': 0.02}
{'loss': 1.2219, 'learning_rate': 1.9263803680981596e-05, 'epoch': 0.04}
{'loss': 1.2721, 'learning_rate': 1.9018404907975462e-05, 'epoch': 0.05}
{'loss': 1.2163, 'learning_rate': 1.8773006134969328e-05, 'epoch': 0.06}
{'loss': 1.1682, 'learning_rate': 1.852760736196319e-05, 'epoch': 0.07}
{'loss': 1.2186, 'learning_rate': 1.828220858895706e-05, 'epoch': 0.09}
{'loss': 1.0154, 'learning_rate': 1.8036809815950922e-05, 'epoch': 0.1}
{'loss': 1.0596, 'learning_rate': 1.7791411042944788e-05, 'epoch': 0.11}
{'loss': 1.1151, 'learning_rate': 1.7546012269938654e-05, 'epoch': 0.12}
{'loss': 1.2714, 'learning_rate': 1.7300613496932516e-05, 'epoch': 0.13}
{'loss': 1.1019, 'learning_rate': 1.7055214723926382e-05, 'epoch': 0.15}
{'loss': 1.1097, 'learning_rate': 1.6809815950920248e-05, 'epoch': 0.16}
{'loss': 1.1397, 'learning_rate': 1.656441717791411e-0

  0%|          | 0/6522 [00:00<?, ?it/s]

{'eval_loss': 0.8869172930717468, 'eval_f1': 0.7028425816125067, 'eval_runtime': 111.762, 'eval_samples_per_second': 233.416, 'eval_steps_per_second': 58.356, 'epoch': 1.0}
{'train_runtime': 708.6828, 'train_samples_per_second': 36.811, 'train_steps_per_second': 1.15, 'train_loss': 1.0079269104940027, 'epoch': 1.0}
