In [None]:
# Install the third-party packages this notebook relies on; `%pip` targets the running kernel's environment.
# NOTE(review): no versions are pinned here — consider pinning them so the run is reproducible.
%pip install accelerate datasets evaluate numpy pandas scikit-learn torch transformers[torch]

In [None]:
from datasets import ClassLabel, Features, load_dataset, TextClassification, Value
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support
from pandas import DataFrame
from time import time
from torch import backends, cuda
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

In [None]:
# Colab only: uncomment the two lines below to mount Google Drive first.
#from google.colab import drive
#drive.mount('/content/drive')

# --- Settings a reader may want to tune -------------------------------------

# Prefer the GPU when one is present; set to False to force CPU execution.
use_cuda_if_available = True

# Prefix of the fine-tuned checkpoint; the pipeline loads f'{model_name}_model'.
model_name = 'distilbert'

# Directory prefix (with trailing slash) prepended to the CSV file names.
data_path = 'BUILD/'

# The progress counter is printed once every `log_every` processed items.
log_every = 100

In [None]:
# When the GPU is explicitly opted out of, disable cuDNN and make every later
# `cuda.is_available()` call report False. This replaces the attribute on the
# imported `cuda` module, so it stays in effect for the rest of the kernel session.
if not use_cuda_if_available:
    backends.cudnn.enabled = False
    cuda.is_available = lambda : False

# Use the first CUDA device only when it is both requested and available.
device = "cuda:0" if (use_cuda_if_available and cuda.is_available()) else "cpu"

device

In [None]:
# Class-name vocabularies for the two categorical columns.
# `labels` is also used later to map predicted label names to their list index.
meta_groups = ['Criminal', 'Tax']
labels = [
    'PREAMBLE', 'FAC', 'RLC', 'ISSUE',
    'ARG_PETITIONER', 'ARG_RESPONDENT',
    'ANALYSIS', 'STA',
    'PRE_RELIED', 'PRE_NOT_RELIED',
    'RATIO', 'RPC', 'NONE',
]

# Load the CSV files with an explicit column schema and keep the whole
# 'test' split (which is backed by dev.csv).
# NOTE(review): the `task=` template argument may be deprecated or removed in
# newer `datasets` releases — confirm against the installed version.
dataset = load_dataset(
    'csv',
    data_files=dict(
        train=f'{data_path}train.csv',
        test=f'{data_path}dev.csv',
    ),
    features=Features(dict(
        doc_id=Value('uint32'),
        doc_index=Value('uint16'),
        sentence_index=Value('uint16'),
        annotation_id=Value('string'),
        text=Value('string'),
        meta_group=ClassLabel(names=meta_groups),
        labels=ClassLabel(names=labels),
    )),
    task=TextClassification(),
    split='test[:]',
)


In [None]:
# Build the inference pipeline from the local '<model_name>_model' checkpoint
# and place it on the device chosen earlier.
classifier = pipeline(
    task='text-classification',
    model=f'{model_name}_model',
    device=device,
)

In [None]:
# Number of items that have passed through log_step() since this cell was run.
step_i = 0


def log_step(x):
    """Pass `x` through unchanged while counting calls.

    Prints the running count whenever it is a multiple of `log_every`
    (including 0, i.e. on the very first call), then bumps the global
    counter `step_i` by one.
    """
    global step_i
    if not step_i % log_every:
        print(step_i)
    step_i += 1
    return x

In [None]:
# Classify every 'text' entry, routing each prediction through log_step()
# for progress output, and measure the wall-clock duration of the whole pass.
start = time()
out = list(map(log_step, classifier(KeyDataset(dataset, 'text'))))
elapsed = time() - start

print(f'Elapsed: {elapsed}s')

In [None]:
# Map each label name to its position in `labels`.
label2id = {name: idx for idx, name in enumerate(labels)}

# Side-by-side frame of ground-truth ids and predicted ids, one row per input.
df_out = DataFrame(dict(
    labels=list(map(int, dataset['labels'])),
    pred=[label2id[prediction['label']] for prediction in out],
))

In [None]:
def eval(df: DataFrame):
    """Print and return weighted precision, recall and F1 for a prediction frame.

    NOTE: this name shadows Python's built-in ``eval``; it is kept unchanged so
    existing calls keep working.

    Args:
        df: Frame with a 'labels' column (ground truth) and a 'pred' column
            (predictions).

    Returns:
        A ``(precision, recall, f1)`` tuple computed with ``average='weighted'``
        and ``zero_division=0``. Previously nothing was returned, so the
        computed scores could not be reused by the caller.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        df['labels'],
        df['pred'],
        average='weighted',
        zero_division=0,
    )
    # '&'-separated, three decimals each (presumably for pasting into a table row).
    print(f'{precision:.3f} & {recall:.3f} & {f1:.3f}')
    return precision, recall, f1

In [None]:
# eval() prints the metrics line itself; wrapping the call in print() would
# additionally print its return value. The trailing ';' suppresses cell output.
eval(df_out);

In [None]:
#confusion = confusion_matrix(df_out['labels'], df_out['pred'], labels=labels)
#confusion_plot = ConfusionMatrixDisplay(confusion, display_labels=labels)
#confusion_plot.plot(xticks_rotation=60)