In [None]:
! pip install transformers datasets
! pip install python-docx

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 46.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |█████████

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
from os import walk
from docx import Document
from tqdm.notebook import tqdm
from transformers import pipeline

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Sequences are padded to the maximum length and longer inputs are
    truncated. Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """Score model outputs with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each example is the arg-max over the last axis of the logits; predictions
    and labels are then handed to ``metric.compute``, whose result is returned.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    scores = metric.compute(predictions=predicted_classes, references=references)
    return scores

def get_text(fn, folder_path):
  """Return the text of one .docx file, one paragraph per line.

  Args:
    fn: Name of the document inside ``folder_path``.
    folder_path: Directory containing the document, with or without a
      trailing path separator.

  Returns:
    The text of all paragraphs of the document, joined with newlines.

  Raises:
    OSError: If the file cannot be opened. Errors raised by ``Document``
      while parsing propagate unchanged.
  """
  import os  # local import: the notebook itself only imports `walk` from `os`

  # os.path.join inserts the separator when `folder_path` lacks one; plain
  # concatenation turned '/dir' + 'a.docx' into '/dira.docx'.
  doc_path = os.path.join(folder_path, fn)

  # The context manager closes the handle even when parsing raises.
  with open(doc_path, "rb") as fref:
    doc = Document(fref)
    return '\n'.join(para.text for para in doc.paragraphs)

def load_hudoc(folder_path):
  """Load the text of every file located directly in ``folder_path``.

  Files that cannot be read or parsed are skipped and counted; the number of
  skipped files is printed once every file has been tried.

  Args:
    folder_path: Directory to scan. Sub-directories are not visited.

  Returns:
    A list with the extracted text of each successfully read file. The list
    is empty when the folder does not exist or contains no files.
  """
  # The first item yielded by walk() describes the top-level directory only;
  # the default tuple covers the case where walk() yields nothing at all.
  fns = next(walk(folder_path), (None, None, []))[2]

  texts = []
  failed = 0
  for fn in tqdm(fns):
    try:
      texts.append(get_text(fn, folder_path))
    except Exception:
      # Best effort: one unreadable file must not abort the whole load.
      # Catching Exception (not a bare except) lets KeyboardInterrupt and
      # SystemExit stop the loop.
      failed += 1

  print("{} files not extracted.".format(failed))

  return texts

def fill_mask(unmasker, sentence):
  """Print every candidate that ``unmasker`` proposes for ``sentence``.

  Args:
    unmasker: Callable that takes the sentence and returns an iterable of
      dicts, each with a ``'token_str'`` and a ``'score'`` entry.
    sentence: Text passed to ``unmasker``.

  Prints one line per candidate in the form ``'<token>': <score>``, with the
  score shown to four decimals. Returns nothing.
  """
  # The number of candidates is whatever `unmasker` returns; nothing is
  # limited here.
  for candidate in unmasker(sentence):
    print("'{}': {:.4f}".format(candidate['token_str'], candidate['score']))


## Creating dataset

In [None]:
# Reference: https://huggingface.co/transformers/custom_datasets.html
# Mount Google Drive at /drive so that the paths under /drive/MyDrive used
# below are reachable from this runtime.
from google.colab import drive
import sys
drive.mount('/drive')

# Add the project folder to the module search path.
# NOTE(review): none of the visible cells import a module from this folder —
# confirm this is still needed.
sys.path.append('/drive/MyDrive/MRP1')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
# Collect the text of the documents stored directly in the project folder on
# Drive; `load_hudoc` prints how many files could not be extracted.
data = load_hudoc('/drive/MyDrive/MRP1')

  0%|          | 0/362 [00:00<?, ?it/s]

128 files not extracted.


In [None]:
# Tokenize all loaded documents in a single call with padding enabled.
# NOTE(review): in the visible cells `tokenizer` is first assigned further
# down (AutoTokenizer.from_pretrained), so this cell fails on a fresh
# top-to-bottom run — confirm the intended cell order.
# NOTE(review): no truncation is requested here — confirm the documents fit
# the model's maximum input length.
train_encodings = tokenizer(data, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# import torch

# class HUDOCDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx])
#         return item

#     def __len__(self):
#         return len(self.labels)

# train_dataset = HUDOCDataset(train_encodings, train_labels)
# val_dataset = HUDOCDataset(val_encodings, val_labels)
# test_dataset = HUDOCDataset(test_encodings, test_labels)

## Reusing pretrained model by Chalkidis (https://huggingface.co/nlpaueb/legal-bert-base-uncased)

In [None]:
# Name of the pretrained checkpoint to load.
model_path = 'nlpaueb/bert-base-uncased-echr'

# Tokenizer and model for that checkpoint.
# NOTE(review): in the visible cells `model` is not used before it is
# reassigned in the fine-tuning section — confirm this load is still needed.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# The fill-mask pipeline is built from the checkpoint name, not from the
# `model` object created above.
unmasker = pipeline('fill-mask', model=model_path)

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlpaueb/bert-base-uncased-echr were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Sanity check: the expected fill for [MASK] is "dismissed".
fill_mask(unmasker, "The Constitutional Court summarily [MASK] the third applicant’s complaint of partiality as ill-founded, stating that his allegations represented only his subjective impression and that he had failed to procure evidence of partiality or arbitrariness.")

'dismissed': 0.8566
'rejected': 0.1431
'discarded': 0.0001
'refused': 0.0001
'dismissing': 0.0000


In [None]:
# Sanity check: the expected fill for [MASK] is "decision".
fill_mask(unmasker, "The Constitutional Court’s decisions in their cases were identical or summarised versions of its [MASK] in the third applicant’s case.")

'decision': 0.4993
'decisions': 0.2673
'judgment': 0.0749
'findings': 0.0519
'reasoning': 0.0239


## Fine-tuning it ourselves

An important distinction to make is whether we want to fine-tune our model, or whether we want to expose it to additional pretraining.

The former is simply a way to train BERT to adapt to a specific supervised task (e.g. NER / RE), for which we generally need on the order of 1000 or more labeled samples.

Pretraining, on the other hand, tries to help BERT better "understand" data from a certain domain by continuing its unsupervised training objective ([MASK]ing specific words and trying to predict which word should be there), for which you do not need labeled data. However, this has already been done with more computing power than we can ever access. Should we do even more of it (because we have more data)?

In [None]:
# Base checkpoint and dataset for the fine-tuning walkthrough.
# NOTE: `model_path` and `tokenizer` are rebound here, replacing the values
# assigned in the earlier section; `tokenize_function` reads the notebook-level
# `tokenizer`, so it uses this new one from here on.
model_path = "bert-base-cased"
raw_datasets = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Show one raw review. Indexing the row first fetches a single example instead
# of materialising the whole "text" column just to read its first entry.
print(raw_datasets['train'][0]['text'])

# Apply the tokenizer to the loaded dataset in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
# Fixed-seed shuffles, then the first 1000 examples of each split: small
# subsets that the training and evaluation cells below run on.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) 
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) 
# The complete splits.
# NOTE(review): not referenced by any visible cell — presumably kept for a
# full-size run; confirm.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

In [None]:
# Load the checkpoint as a sequence classifier with two output labels; this
# rebinds the notebook-level `model`.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Only the first positional argument is given; every other training setting
# is left at the library default.
training_args = TrainingArguments("test_trainer")

# No `compute_metrics` is passed to this Trainer; metrics are wired in by the
# evaluation cell.
trainer = Trainer(
    model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
)

trainer.train()

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=375, training_loss=0.2837846883138021, metrics={'train_runtime': 531.9572, 'train_samples_per_second': 5.64, 'train_steps_per_second': 0.705, 'total_flos': 789333166080000.0, 'train_loss': 0.2837846883138021, 'epoch': 3.0})

In [None]:
# `compute_metrics` reads this notebook-level `metric`, so it has to exist
# before evaluation runs.
metric = load_metric("accuracy")

# Build a second Trainer around the same `model` and arguments, this time with
# `compute_metrics`, so that evaluation reports the accuracy metric as well.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'eval_accuracy': 0.866,
 'eval_loss': 0.5990786552429199,
 'eval_runtime': 63.7055,
 'eval_samples_per_second': 15.697,
 'eval_steps_per_second': 1.962}