In [1]:
! pip install python-docx
! pip install spacy_annotator

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py): started
  Building wheel for python-docx (setup.py): finished with status 'done'
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184605 sha256=eca0efdc836126033504dc70f9f201065e61a51039d18b59cbab9426ef0c246d
  Stored in directory: c:\users\jj199\appdata\local\pip\cache\wheels\32\b8\b2\c4c2b95765e615fe139b0b17b5ea7c0e1b6519b0a9ec8fb34d
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11
Collecting spacy_annotator
  Downloading spacy_annotator-2.1.1.tar.gz (5.6 kB)
Building wheels for collected packages: spacy-annotator
  Building wheel for spacy-annotator (setup.py): started
  Building wheel for spacy-annotator (setup.py): finished with status 'done'
  Created wheel for spacy-annotator: filename=spacy_annotator-2.1.1-py3-no

In [86]:
from os import walk
from docx import Document
from tqdm.notebook import tqdm
import spacy_annotator
import pandas as pd
import spacy

In [129]:
def get_text(fn, folder_path):
  fref = open(folder_path + fn, "rb")
  doc = Document(fref)
  text = []
  for para in doc.paragraphs:
    text.append(para.text)

  return '\n'.join(text)

def load_hudoc(folder_path):
  counter = 0
  texts = []
  fns = next(walk(folder_path), (None, None, []))[2]  # [] if no file
  for fn in tqdm(fns):
    try:
      texts.append(get_text(fn, folder_path))
    except:
      counter += 1

  print("{} files not extracted.".format(counter))

  return texts

def clean_file(courts):
  return [court.replace('\xa0', ' ') for court in courts]

def annotate_by_sentence(courts, nlp):
  annotated_data = []

  # For every court
  for court in tqdm(courts):
    annotated_court = []

    # Get all individual sentences (using spaCy tokenizer)
    sentences = [str(x) for x in nlp(court).sents]

    # For every sentence
    for sentence in sentences:
      annotated_sentence = []
      for token in nlp(sentence):
        annotated_sentence.append((token, token.ent_iob_, token.ent_type_))

      # Add sentence to court list
      annotated_court.append(annotated_sentence)
    
    # Add court to all data
    annotated_data.append(annotated_court)

  return annotated_data

In [7]:
# https://huggingface.co/transformers/custom_datasets.html
from google.colab import drive
import sys
drive.mount('/drive')

# Add path
sys.path.append('/drive/MyDrive/MRP1')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [50]:
courts = load_hudoc('/drive/MyDrive/MRP1/')
courts = clean_file(courts)

  0%|          | 0/362 [00:00<?, ?it/s]

128 files not extracted.


## Using spaCy (only) to annotate data

In [51]:
# Load model
nlp = spacy.load("en_core_web_sm")

In [130]:
annotate_by_sentence(courts, nlp)

  0%|          | 0/5 [00:00<?, ?it/s]

[[[(
    
    , 'O', ''),
   (FOURTH, 'B', 'LAW'),
   (SECTION, 'I', 'LAW'),
   (, 'O', ''),
   (CASE, 'O', ''),
   (OF, 'O', ''),
   (IGNAT, 'O', ''),
   (v., 'O', ''),
   (ROMANIA, 'O', ''),
   (, 'O', '')],
  [((, 'O', ''),
   (Application, 'O', ''),
   (no, 'O', ''),
   (., 'O', ''),
   (17325/16, 'B', 'CARDINAL'),
   (), 'O', ''),
   (
    
    
    
    , 'O', '')],
  [(JUDGMENT, 'O', ''), (
    , 'O', '')],
  [(Art, 'O', ''), (6, 'B', 'CARDINAL'), (§, 'O', ''), (1, 'O', '')],
  [((, 'O', ''), (criminal, 'O', ''), (), 'O', '')],
  [(•, 'O', ''), (Fair, 'O', '')],
  [(hearing, 'O', '')],
  [(•, 'O', '')],
  [(Overall, 'O', ''),
   (fairness, 'O', ''),
   (of, 'O', ''),
   (proceedings, 'O', ''),
   (overturning, 'O', ''),
   (applicant, 'O', ''),
   (’s, 'O', ''),
   (acquittal, 'O', ''),
   (without, 'O', ''),
   (directly, 'O', ''),
   (hearing, 'O', ''),
   (evidence, 'O', ''),
   (or, 'O', ''),
   (reviewing, 'O', ''),
   (testimony, 'O', ''),
   (of, 'O', ''),
   (defence, 'O

## Annotate dataset by using human interaction

In [92]:
# Creating sentences (by looking at newline character)
sentences = [x for x in courts[0].split("\n") if x != ""]

# Create dataframe
df = pd.DataFrame({
    "text": sentences
})

# Create annotater
annotator = spacy_annotator.Annotator(labels=['GPE','PERSON', 'JUDGE'], model = nlp)

In [102]:
# Annotate labels (using visualiser)
df_labels = annotator.annotate(df=df, col_text="text", shuffle=True)

HTML(value='-1 examples annotated, 117 examples left')

Text(value='', description='GPE', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

Text(value='', description='PERSON', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

Text(value='', description='JUDGE', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

HBox(children=(Button(button_style='success', description='submit', style=ButtonStyle()), Button(button_style=…

Output()

In [101]:
# Example
df_labels['annotations'][0]

('11.  On 27 April 2014 G.Sz. and G.S., who was accompanied by his former wife, E.K., went to an inn (“N.”) located between Gherla and Dej, where G.Sz. was expecting to meet with “F.”. While the three of them were talking at a table, a stranger (later identified as the applicant) approached them and whispered to G.S. that he had come on behalf of “F.”, who had instructed that “it” (possibly the money) had to be given to the father of “F.”, who was waiting in the inn’s toilets. The stranger then left the inn. G.Sz. refused to give the money as instructed, saying that she would only give the money to “F.” in person.',
 {'entities': [(22, 27, 'GPE'),
   (32, 36, 'GPE'),
   (122, 128, 'GPE'),
   (133, 136, 'GPE'),
   (144, 149, 'GPE'),
   (312, 316, 'GPE'),
   (512, 517, 'GPE'),
   (78, 82, 'PERSON')]})