In [1]:
%pip install accelerate datasets evaluate numpy pandas scikit-learn torch transformers[torch]

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
from time import time

from datasets import ClassLabel, Features, load_dataset, TextClassification, Value
from pandas import DataFrame, read_csv
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support
from torch import backends, cuda
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

In [3]:
# Google Colab only: uncomment the two lines below to mount Drive first.
#from google.colab import drive
#drive.mount('/content/drive')

# Name of the fine-tuned checkpoint; it is also used as the stem of the
# predictions file written at the end of the notebook.
model_name = 'legalbert_2'

# Directories. Each one ends with '/' because paths are built by plain
# string concatenation (e.g. f'{data_path}train.csv').
data_path = 'BUILD/'
model_path = 'MODELS/'
output_path = 'OUTPUT/'

# Print a progress line once every `log_every` classified sentences.
log_every = 100

# Set to False to force CPU inference even when a GPU is present.
use_cuda_if_available = True

In [4]:
# Use the first CUDA GPU only when the config allows it AND one is present;
# the short-circuit means CUDA is not probed at all when it is disallowed.
device = "cuda:0" if (use_cuda_if_available and cuda.is_available()) else "cpu"

if not use_cuda_if_available:
    # CPU was requested explicitly: switch cuDNN off and make every later
    # `cuda.is_available()` call report False (presumably so that libraries
    # probing for CUDA on their own also stay on the CPU).
    backends.cudnn.enabled = False
    cuda.is_available = lambda : False

# Show the selected device as the cell output.
device

'cuda:0'

In [5]:
# Raw splits as pandas frames. `dev` later receives the model predictions in
# a new 'pred' column and supplies 'annotation_id' for the exported CSV.
dev = read_csv(filepath_or_buffer=f'{data_path}dev.csv')
train = read_csv(filepath_or_buffer=f'{data_path}train.csv')

In [6]:
# Class vocabularies. The position of a name in each list is the integer id
# used for it (see `label2id`, which is built from `labels` by index).
meta_groups = ['Criminal', 'Tax']
labels = [
    'PREAMBLE',
    'FAC',
    'RLC',
    'ISSUE',
    'ARG_PETITIONER',
    'ARG_RESPONDENT',
    'ANALYSIS',
    'STA',
    'PRE_RELIED',
    'PRE_NOT_RELIED',
    'RATIO',
    'RPC',
    'NONE',
]

# CSV files backing each split; the dev file is exposed under the name 'test'.
csv_files = {
    'train': f'{data_path}train.csv',
    'test': f'{data_path}dev.csv',
}

# Explicit column schema for the CSV loader.
csv_schema = Features({
    'doc_id': Value('uint32'),
    'doc_index': Value('uint16'),
    'sentence_index': Value('uint16'),
    'annotation_id': Value('string'),
    'text': Value('string'),
    'meta_group': ClassLabel(names=meta_groups),
    'labels': ClassLabel(names=labels),
})

# Only the 'test' split is returned; the '[:]' slice keeps every row of it.
dataset = load_dataset(
    'csv',
    data_files=csv_files,
    features=csv_schema,
    split='test[:]',
)


In [17]:
# Build the inference pipeline from the fine-tuned checkpoint on disk.
# `return_all_scores=True` makes every prediction a list with one
# {'label', 'score'} dict per class instead of only the top class.
# NOTE(review): recent transformers releases flag `return_all_scores` as
# deprecated in favour of `top_k=None` - confirm before upgrading, because the
# result ordering may differ.
classifier = pipeline(
    'text-classification',
    model=f'{model_path}{model_name}_model',
    device=device,
    return_all_scores=True,
)



In [18]:
# Manual spot check: classify one hand-picked sentence and display the scores.
sample_text = "The Court first considers the question of the interpretation placed on the word „education‟ occurring in Section 2 (15) of the Act."
classifier(sample_text)

[[{'label': 'PREAMBLE', 'score': 0.016414782032370567},
  {'label': 'FAC', 'score': 0.012092282064259052},
  {'label': 'RLC', 'score': 0.005243601277470589},
  {'label': 'ISSUE', 'score': 0.04265857860445976},
  {'label': 'ARG_PETITIONER', 'score': 0.009925277903676033},
  {'label': 'ARG_RESPONDENT', 'score': 0.006513834930956364},
  {'label': 'ANALYSIS', 'score': 0.7462190389633179},
  {'label': 'STA', 'score': 0.05020399019122124},
  {'label': 'PRE_RELIED', 'score': 0.07606191188097},
  {'label': 'PRE_NOT_RELIED', 'score': 0.013796793296933174},
  {'label': 'RATIO', 'score': 0.013640145771205425},
  {'label': 'RPC', 'score': 0.0027304862160235643},
  {'label': 'NONE', 'score': 0.004499299917370081}]]

In [19]:
# Manual spot check. Original note: "pre relied" - presumably the expected
# class for this sentence is PRE_RELIED (TODO confirm against the annotations).
sample_text = "The preparation and distribution of text books certainly contributes to the process of training and development of the mind and the character of students."
classifier(sample_text)

[[{'label': 'PREAMBLE', 'score': 0.04151172935962677},
  {'label': 'FAC', 'score': 0.005441292654722929},
  {'label': 'RLC', 'score': 0.003714759135618806},
  {'label': 'ISSUE', 'score': 0.0013815858401358128},
  {'label': 'ARG_PETITIONER', 'score': 0.03285778686404228},
  {'label': 'ARG_RESPONDENT', 'score': 0.008090818300843239},
  {'label': 'ANALYSIS', 'score': 0.7561227083206177},
  {'label': 'STA', 'score': 0.005678412038832903},
  {'label': 'PRE_RELIED', 'score': 0.10927850008010864},
  {'label': 'PRE_NOT_RELIED', 'score': 0.0084913969039917},
  {'label': 'RATIO', 'score': 0.023553764447569847},
  {'label': 'RPC', 'score': 0.001964857103303075},
  {'label': 'NONE', 'score': 0.0019123171223327518}]]

In [23]:
# Manual spot check: the text-book sentence again, now prefixed with
# "My client thinks", to see how the added framing changes the scores.
sample_text = "My client thinks the preparation and distribution of text books certainly contributes to the process of training and development of the mind and the character of students."
classifier(sample_text)

[[{'label': 'PREAMBLE', 'score': 0.008173773996531963},
  {'label': 'FAC', 'score': 0.058420728892087936},
  {'label': 'RLC', 'score': 0.013787658885121346},
  {'label': 'ISSUE', 'score': 0.001621848321519792},
  {'label': 'ARG_PETITIONER', 'score': 0.10676918923854828},
  {'label': 'ARG_RESPONDENT', 'score': 0.028795825317502022},
  {'label': 'ANALYSIS', 'score': 0.7135751843452454},
  {'label': 'STA', 'score': 0.002151449676603079},
  {'label': 'PRE_RELIED', 'score': 0.03360213711857796},
  {'label': 'PRE_NOT_RELIED', 'score': 0.005988121964037418},
  {'label': 'RATIO', 'score': 0.022367306053638458},
  {'label': 'RPC', 'score': 0.002942971419543028},
  {'label': 'NONE', 'score': 0.001803829800337553}]]

In [20]:
# Manual spot check: a reworded version of the text-book sentence, to see how
# stable the scores are under paraphrase.
sample_text = "The creation and distribution of textbooks are important factors in the education and growth of students' minds and characters"
classifier(sample_text)

[[{'label': 'PREAMBLE', 'score': 0.053299013525247574},
  {'label': 'FAC', 'score': 0.01228675339370966},
  {'label': 'RLC', 'score': 0.0074378615245223045},
  {'label': 'ISSUE', 'score': 0.0022775225806981325},
  {'label': 'ARG_PETITIONER', 'score': 0.03199724853038788},
  {'label': 'ARG_RESPONDENT', 'score': 0.007470325566828251},
  {'label': 'ANALYSIS', 'score': 0.6746957302093506},
  {'label': 'STA', 'score': 0.012679584324359894},
  {'label': 'PRE_RELIED', 'score': 0.16621693968772888},
  {'label': 'PRE_NOT_RELIED', 'score': 0.008712699636816978},
  {'label': 'RATIO', 'score': 0.015974504873156548},
  {'label': 'RPC', 'score': 0.004776945803314447},
  {'label': 'NONE', 'score': 0.0021748836152255535}]]

In [8]:
# Manual spot check. Original note: "ratio" - presumably the expected class
# for this sentence is RATIO (TODO confirm against the annotations).
sample_text = "Reverting to the case on hand, the Court finds that what the ITAT has held in the impugned order is contrary to the settled law as explained in the above decisions."
classifier(sample_text)

[{'label': 'ANALYSIS', 'score': 0.7128999829292297}]

In [8]:
# Number of items that have passed through `log_step` so far.
step_i = 0


def log_step(x):
    """Pass `x` through unchanged while counting calls for progress logging.

    The running count is kept in the module-level `step_i`. Its value is
    printed whenever it is a multiple of the module-level `log_every`
    (checked before the increment, so the very first call prints 0).

    Returns:
        The argument `x`, untouched, so the call can wrap items inside a
        comprehension.
    """
    global step_i
    at_checkpoint = step_i % log_every == 0
    if at_checkpoint:
        print(step_i)
    step_i = step_i + 1
    return x

In [9]:
def _top_label(prediction):
    """Return the winning label name from one pipeline result.

    A text-classification pipeline yields a single {'label', 'score'} dict per
    input when it only reports the top class, but a list of such dicts (one
    per class) when it was created with `return_all_scores=True`. Indexing
    that list with ['label'] raises TypeError, so both shapes are handled.
    """
    if isinstance(prediction, dict):
        return prediction['label']
    # All-scores form: keep the class with the highest score.
    return max(prediction, key=lambda scored: scored['score'])['label']


# Restart the progress counter so re-running this cell logs from 0 again.
step_i = 0

# Classify every dev sentence, timing the whole pass.
start = time()
out = [log_step(x) for x in classifier(KeyDataset(dataset, 'text'))]
elapsed = time() - start

# Store the predicted label name for each row of the dev frame.
dev['pred'] = [_top_label(o) for o in out]

print(f'Elapsed: {elapsed}s')

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
Elapsed: 28.69153666496277s


In [10]:
# Label name -> integer id, where the id is the name's position in `labels`.
label2id = {name: idx for idx, name in enumerate(labels)}

# Gold and predicted class ids side by side, both as plain ints, for scoring.
df_out = DataFrame({
    'labels': [int(gold) for gold in dataset['labels']],
    'pred': [label2id[name] for name in dev['pred']],
})

In [11]:
def eval(df: DataFrame):
    """Print weighted precision, recall and F1 for the predictions in `df`.

    Args:
        df: frame with the gold classes in column 'labels' and the predicted
            classes in column 'pred'.

    The three scores are printed as percentages with one decimal, separated
    by ' & ' (presumably so the line can be pasted into a LaTeX table row).
    Nothing is returned, so the call evaluates to None.

    NOTE(review): this name shadows Python's builtin `eval` for the rest of
    the notebook; it is kept because other cells call it by this name.
    """
    y_true, y_pred = df['labels'], df['pred']
    # average='weighted' weights each class by its support; zero_division=0
    # scores classes with no predictions as 0 instead of emitting a warning.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    print(f'{100*precision:.1f} & {100*recall:.1f} & {100*f1:.1f}')

In [12]:
# `to_csv` does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
Path(output_path).mkdir(parents=True, exist_ok=True)

# Export one row per sentence: its annotation id and the predicted label name,
# with the prediction column renamed from 'pred' to 'labels'.
dev[['annotation_id', 'pred']].rename(columns={'pred': 'labels'}).to_csv(f'{output_path}{model_name}.csv', index=False)

In [13]:
# `eval` prints the metrics line itself and returns None, so wrapping the call
# in print() only added a stray "None" line to the output.
eval(df_out)

62.0 & 63.3 & 61.6
None


In [14]:
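# Disabled confusion-matrix plot. `df_out` holds integer class ids, so the
# matrix has to be indexed by those ids (passing the label strings as
# `labels=` would not match any value); the readable names are used only
# for the axis ticks.
#confusion = confusion_matrix(df_out['labels'], df_out['pred'], labels=list(range(len(labels))))
#confusion_plot = ConfusionMatrixDisplay(confusion, display_labels=labels)
#confusion_plot.plot(xticks_rotation=60)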