In [None]:
%pip install accelerate datasets evaluate numpy scikit-learn torch transformers[torch]

In [None]:
from datasets import ClassLabel, Features, load_dataset, TextClassification, Value
from os import sched_getaffinity
from torch import backends, cuda, get_num_threads, set_num_threads
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, EvalPrediction, TrainingArguments, Trainer

import evaluate
import numpy as np

In [None]:
# Uncomment the next two lines when running on Google Colab to mount Drive.
#from google.colab import drive
#drive.mount('/content/drive')

# Prefix prepended to the CSV file names when the dataset is loaded.
data_path = 'BUILD/'

# Checkpoint to fine-tune. `name` is used in the output directory name,
# `tokenizer` and `model` are the identifiers passed to `from_pretrained`.
# Alternatives (swap in place of the active assignment below):
#base = dict(name='legal-bert', tokenizer='nlpaueb/legal-bert-base-uncased', model='nlpaueb/legal-bert-base-uncased')
#base = dict(name='roberta', tokenizer='xlm-roberta-base', model='xlm-roberta-base')
base = dict(
    name='distilbert',
    tokenizer='distilbert-base-uncased',
    model='distilbert-base-uncased',
)

# Training schedule: number of epochs and per-device batch size.
epochs, batch_size = 1, 1

# Hardware / memory switches consumed by the device-selection and trainer cells.
use_cuda_if_available = True
gradient_checkpointing = False

In [None]:
# Choose the CPU thread count for torch: the largest of the three candidates.
num_threads = max(
    len(sched_getaffinity(0)) - 1,  # one fewer than the CPUs in this process's affinity set
    get_num_threads(),              # whatever torch is currently configured to use
    1,                              # never drop below a single thread
)
set_num_threads(num_threads)

num_threads

In [None]:
# Use the first CUDA device only when it is both requested and present;
# otherwise fall back to the CPU.
device = "cuda:0" if (use_cuda_if_available and cuda.is_available()) else "cpu"

if not use_cuda_if_available:
    # CUDA was explicitly declined: switch cuDNN off and replace
    # `cuda.is_available` with a stub that always answers False, so any later
    # code that asks torch about CUDA is told there is none.
    # NOTE: this patches torch for the rest of the kernel session; flipping
    # `use_cuda_if_available` back to True requires a kernel restart.
    backends.cudnn.enabled = False
    cuda.is_available = lambda : False

device

In [None]:
# Class names for the `meta_group` column.
meta_groups = ['Criminal', 'Tax']

# Class names for the `labels` column; a name's list position is its integer id.
labels = [
    'PREAMBLE', 'FAC', 'RLC', 'ISSUE',
    'ARG_PETITIONER', 'ARG_RESPONDENT', 'ANALYSIS', 'STA',
    'PRE_RELIED', 'PRE_NOT_RELIED', 'RATIO', 'RPC',
    'NONE',
]

# Read both CSV splits with an explicit schema. Note that the split called
# `test` is read from dev.csv.
# NOTE(review): the `task=` argument appears to be deprecated in newer
# `datasets` releases — confirm against the installed version.
dataset = load_dataset(
    'csv',
    data_files=dict(
        train=f'{data_path}train.csv',
        test=f'{data_path}dev.csv',
    ),
    features=Features(dict(
        doc_id=Value('uint32'),
        doc_index=Value('uint16'),
        sentence_index=Value('uint16'),
        annotation_id=Value('string'),
        text=Value('string'),
        meta_group=ClassLabel(names=meta_groups),
        labels=ClassLabel(names=labels),
    )),
    task=TextClassification()
)

dataset

In [None]:
# Tokenizer matching the selected base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base['tokenizer'])

# Tokenize the `text` column batch-wise with truncation enabled (no padding is
# requested here), then drop the raw text column from the result.
tokenized_dataset = (
    dataset
    .map(lambda batch: tokenizer(batch['text'], truncation=True), batched=True)
    .remove_columns('text')
)

tokenized_dataset

In [None]:
# F1 metric object shared by every call to `compute_metrics`.
metric = evaluate.load('f1')

def compute_metrics(eval_pred: EvalPrediction):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Unpacks into ``(scores, references)``. The predicted class
            for each row is the index of the largest score along axis 1.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids versus
        the references, with ``average='weighted'``.
    """
    scores, gold = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=1),
        references=gold,
        average='weighted',
    )

In [None]:
# Fine-tune on the `train` split. For quick experiments, subsample it, e.g.:
#   tokenized_dataset['train'].train_test_split(train_size=0.05, stratify_by_column='labels')['train']
train_data = tokenized_dataset['train']

# Evaluate on the held-out `test` split (read from dev.csv). Evaluating on the
# training data would make the reported F1 and the `load_best_model_at_end`
# selection reflect memorisation rather than generalisation.
eval_data = tokenized_dataset['test']

# Trainer's leading positional parameters are, in order:
# model, args, data_collator, train_dataset, eval_dataset, tokenizer.
trainer = Trainer(
    # model: classification head sized to `labels`, with id <-> name maps
    # stored in the model config.
    AutoModelForSequenceClassification.from_pretrained(
        base['model'],
        num_labels=len(labels),
        id2label=dict(enumerate(labels)),
        label2id={name: idx for idx, name in enumerate(labels)},
    ).to(device),
    # args
    TrainingArguments(
        output_dir=f'{base["name"]}_{epochs}_model',
        # Evaluate and checkpoint once per epoch; the two strategies must match
        # for `load_best_model_at_end` to work.
        evaluation_strategy='epoch',
        num_train_epochs=epochs,
        save_strategy='epoch',
        #save_steps=1,
        label_names=["labels"],
        load_best_model_at_end=True,
        logging_dir='./logs',
        logging_steps=10,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        # Gradients are accumulated over 8 steps before each optimizer update.
        gradient_accumulation_steps=8,
        gradient_checkpointing=gradient_checkpointing,
        learning_rate=2e-5,
        weight_decay=0.01,
        optim='adafactor',
    ),
    # data_collator: pads each batch using the tokenizer.
    DataCollatorWithPadding(tokenizer=tokenizer),
    # train_dataset
    train_data,
    # eval_dataset
    eval_data,
    # tokenizer
    tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Optional smoke test (disabled): push a single evaluation batch through the
# model and score it with `compute_metrics` before committing to a full run.
# NOTE(review): before uncommenting, (1) import `torch` itself — the notebook
# only imports individual names from it, so `torch.no_grad()` would fail — and
# (2) rename the local `labels` below, which would otherwise overwrite the
# notebook-level list of class names.

# Grab the first batch from the evaluation dataloader.
#for batch in trainer.get_eval_dataloader():
#    break

# Move every tensor in the batch to the selected device.
#batch = {k: v.to(device) for k, v in batch.items()}

# Forward pass without gradient tracking.
#with torch.no_grad():
#    outputs = trainer.model(**batch)

# Bring logits and gold labels back as NumPy arrays.
#predictions = outputs.logits.cpu().numpy()
#labels = batch["labels"].cpu().numpy()

#print(compute_metrics((predictions, labels)))

# Run fine-tuning, then save the resulting model (to the trainer's configured
# output directory, since no path is given).
trainer.train()
trainer.save_model()