In [1]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, DownloadConfig,DatasetDict
import polars as pl
import torch
import json
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [108]:
torch.cuda.is_available()
torch.version.cuda

'11.8'

In [109]:
config = DownloadConfig(
    cache_dir="C:/Users/mike/.cache/huggingface",
    local_files_only=True,
    force_download=False,
    use_etag=True,
    resume_download=True
)

In [110]:
dataset = load_dataset("coastalcph/multi_eurlex", name="en", split="train", download_config=config)


In [111]:
temp_split = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = temp_split["train"]
split_data = temp_split["test"].train_test_split(test_size=0.5, seed=42)
val_dataset = split_data["train"]
test_dataset = split_data["test"]

In [112]:
dataset = DatasetDict({
    "train":train_dataset,
    "validation":val_dataset,
    "test":test_dataset
})

In [113]:
dataset

DatasetDict({
    train: Dataset({
        features: ['celex_id', 'text', 'labels'],
        num_rows: 44000
    })
    validation: Dataset({
        features: ['celex_id', 'text', 'labels'],
        num_rows: 5500
    })
    test: Dataset({
        features: ['celex_id', 'text', 'labels'],
        num_rows: 5500
    })
})

In [8]:
string_len = [len(text) for text in test_data["text"]]
for i,j in enumerate(string_len):
    if j == max(string_len):
        print(f"Max string at: {i}")
max(string_len), min(string_len), np.mean(string_len), np.std(string_len)

Max string at: 2764


(343062, 719, np.float64(7150.912727272727), np.float64(16302.266859774212))

In [116]:
sample = dataset["train"][:10]["labels"]

In [119]:
dataset.set_format("torch")

In [120]:
dataset["train"][0]["labels"]

tensor([ 3, 17])

In [117]:
sample

[[3, 17],
 [17, 5, 6],
 [3, 17, 5],
 [2, 4, 3, 12, 18, 15],
 [17, 19, 5, 6, 18],
 [18, 15, 19, 6],
 [3, 17, 15],
 [4, 5, 14, 20, 15],
 [18, 19],
 [3, 0, 18, 6]]

In [130]:
def code_labels(example):
    labels = F.one_hot(torch.tensor(example["labels"]), num_classes=21)
    example["labels"]=labels.to(torch.float32)
    return example
    
dataset_coded = dataset.map(code_labels)

Map:   0%|          | 0/44000 [00:00<?, ? examples/s]

  labels = F.one_hot(torch.tensor(example["labels"]), num_classes=21)


Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

In [134]:
type(dataset_coded["train"]["labels"])

list

In [10]:
labels = np.unique(np.concatenate(dataset["train"]["labels"]))
labels

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])

In [82]:
def one_hot_code(label_list):
    one_hot = np.zeros(len(labels),dtype=float)
    one_hot[label_list] = 1.0
    return one_hot

In [12]:
dataset["train"][0]

{'celex_id': '32001R1840',
 'text': 'Commission Regulation (EC) No 1840/2001\nof 19 September 2001\namending for the third time Regulation (EC) No 23/2001 laying down special measures for the beef sector that depart from the provisions of Regulation (EC) No 800/1999, Regulation (EEC) No 3719/88, Regulation (EC) No 1291/2000 and Regulation (EEC) No 1964/82\nTHE COMMISSION OF THE EUROPEAN COMMUNITIES,\nHaving regard to the Treaty establishing the European Community,\nHaving regard to Council Regulation (EC) No 1254/1999 of 17 May 1999 on the common organisation of the market in beef and veal(1), as last amended by Regulation (EC) No 1512/2001(2), and in particular Article 29(2)(a), Article 33(12) and Article 41 thereof,\nWhereas:\n(1) The health protection measures adopted by the authorities of certain non-member countries regarding exports of bovine animals and the meat of those animals in response to bovine spongiform encephalopathy have had serious economic consequences for exporters.

In [81]:
type(dataset["train"][0]["labels"][0])

int

In [83]:
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased", use_fast=True)
def tokenize_function(examples):
    tokenized_input = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokenized_input["labels"] = [one_hot_code(label) for label in examples["labels"]]
    return tokenized_input

In [85]:
tokinezed_data = dataset["test"].map(tokenize_function,remove_columns = ["text"] ,batched=True)

Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

In [123]:
tokinezed_data["labels"][0]

[0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]

In [15]:
tokinezed_data.remove_columns = ["text"]

In [16]:
tokenizer.decode(tokinezed_data["train"][0]["input_ids"])

'[CLS] commission regulation ( ec ) no 1840 / 2001 of 19 september 2001 amending for the third time regulation ( ec ) no 23 / 2001 laying down special measures for the beef sector that depart from the provisions of regulation ( ec ) no 800 / 1999, regulation ( eec ) no 3719 / 88, regulation ( ec ) no 1291 / 2000 and regulation ( eec ) no 1964 / 82 the commission of the european communities, having regard to the treaty establishing the european community, having regard to council regulation ( ec ) no 1254 / 1999 of 17 may 1999 on the common organisation of the market in beef and veal ( 1 ), as last amended by regulation ( ec ) no 1512 / 2001 ( 2 ), and in particular article 29 ( 2 ) ( a ), article 33 ( 12 ) and article 41 thereof, whereas : ( 1 ) the health protection measures adopted by the authorities of certain non - member countries regarding exports of bovine animals and the meat of those animals in response to bovine spongiform encephalopathy have had serious economic consequences

In [17]:
len(tokenizer.decode(tokinezed_data["validation"][2]["input_ids"]))

2565

In [18]:
with open("../data/eurovoc_descriptors.json") as f:
    eurovoc = json.load(f)

labels = dataset["train"].features["labels"].feature.names
descriptions = [eurovoc.get(label, {"en": "Unknown"})["en"] for label in labels]

id2labels = {label:desc for label,desc in zip(labels, descriptions)}
label2id = {desc:label for label,desc in zip(labels, descriptions)}
print(id2labels)
print(label2id)

{'100149': 'social questions', '100160': 'industry', '100148': 'finance', '100147': 'trade', '100152': 'business and competition', '100143': 'international relations', '100156': 'agriculture, forestry and fisheries', '100158': 'production, technology and research', '100154': 'transport', '100153': 'employment and working conditions', '100142': 'politics', '100145': 'law', '100150': 'education and communications', '100162': 'international organisations', '100159': 'energy', '100144': 'EUROPEAN UNION', '100151': 'science', '100157': 'agri-foodstuffs', '100161': 'geography', '100146': 'economics', '100155': 'environment'}
{'social questions': '100149', 'industry': '100160', 'finance': '100148', 'trade': '100147', 'business and competition': '100152', 'international relations': '100143', 'agriculture, forestry and fisheries': '100156', 'production, technology and research': '100158', 'transport': '100154', 'employment and working conditions': '100153', 'politics': '100142', 'law': '100145'

In [19]:
tokinezed_data["train"][0]["labels"]

[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [65]:
tokinezed_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
type(tokinezed_data["train"])

datasets.arrow_dataset.Dataset

In [21]:
type(tokinezed_data)

datasets.dataset_dict.DatasetDict

In [22]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased",problem_type = "multi_label_classification", num_labels=len(labels),id2label=labels,label2id=descriptions)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"output_dir",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [24]:
from sklearn.metrics import f1_score, precision_score, recall_score,roc_auc_score
from transformers import EvalPrediction

def multi_label_metriccs(predictions,labels,threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.tensor(predictions))
    preds = (probs > threshold).astype(int)
    f1 = f1_score(labels, preds, average="micro")
    precision = precision_score(labels, preds, average="micro")
    recall = recall_score(labels, preds, average="micro")
    roc_auc = roc_auc_score(labels, predictions, average="macro", multi_class="ovr")
    return {"f1": f1, "precision": precision, "recall": recall, "roc_auc": roc_auc}

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids
    return multi_label_metriccs(preds,labels)

In [25]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokinezed_data["train"],
    eval_dataset=tokinezed_data["validation"],
    compute_metrics=compute_metrics
)

In [32]:
tokinezed_data["train"][0]["labels"].type()

'torch.LongTensor'

In [34]:
def convert(example):
    return {"labels": torch.tensor(example["labels"], dtype=torch.float32)}
tokenized_dataset = tokinezed_data.map(convert,load_from_cache_file=True )

Map:   0%|          | 0/44000 [00:00<?, ? examples/s]

  return {"labels": torch.tensor(example["labels"], dtype=torch.float32)}


Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

In [37]:
tokenized_dataset["train"][:5]["labels"]

tensor([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]])

In [39]:
# Example: Convert a torch.LongTensor to torch.float32
long_tensor = torch.tensor([1, 2, 3], dtype=torch.long)
float_tensor = long_tensor.to(torch.float32)
print(float_tensor)

tensor([1., 2., 3.])


In [43]:
sample=tokenized_dataset["train"][0]["labels"].type(torch.float32)
sample.type()

'torch.FloatTensor'

In [53]:
def convert(example):
    # Convert the list of labels into a single torch.FloatTensor
    return {"labels": torch.tensor(example["labels"], dtype=torch.float32)}
tokenized_dataset = tokenized_dataset.map(convert, batched=True, load_from_cache_file=False)
tokenized_dataset["train"][0]["labels"].type()

Map:   0%|          | 0/44000 [00:00<?, ? examples/s]

  return {"labels": torch.tensor(example["labels"], dtype=torch.float32)}


Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

'torch.LongTensor'

In [54]:
print(tokenized_dataset["train"].features["labels"])

Sequence(feature=ClassLabel(names=['100149', '100160', '100148', '100147', '100152', '100143', '100156', '100158', '100154', '100153', '100142', '100145', '100150', '100162', '100159', '100144', '100151', '100157', '100161', '100146', '100155'], id=None), length=-1, id=None)


In [51]:
tokenized_dataset["train"][0]["labels"].type()

'torch.LongTensor'

In [75]:
def convert(example):
    # Convert labels to torch.FloatTensor
    labels = example["labels"].to(torch.float32)
    return {"labels": labels}
# sample = convert(tokenized_dataset["train"][0])
# sample["labels"].type()

tokenized_dataset = tokenized_dataset["test"].map(convert, batched=False, load_from_cache_file=False)
tokenized_dataset["labels"][0].type()

KeyError: "Column test not in the dataset. Current columns in the dataset: ['celex_id', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']"

In [68]:
tokenized_dataset.set_format("torch")

In [69]:
type(tokenized_dataset["validation"][0]["labels"])

torch.Tensor

In [70]:
tokenized_dataset["train"][0]["labels"].type()

'torch.LongTensor'

In [105]:
import torch
import torch.nn.functional as F
label = dataset["train"][0]["labels"]
label_t = torch.tensor(label,dtype=torch.long)
one_hot_code=F.one_hot(label_t,21)
one_hot = one_hot_code.to(torch.float32)
one_hot.type()

'torch.FloatTensor'

In [107]:
tokenized_dataset["labels"]

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 1, 0],
        ...,
        [1, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])