In [1]:
import json
from pathlib import Path
from itertools import chain

### Load data

In [9]:
def load_test_data(dir="data/test_data/", format="json"):
    """Yield the ``"annotations"`` entry of every matching file in a directory.

    Args:
        dir: Directory to search (non-recursively). The parameter name shadows
            the builtin but is kept so existing keyword callers keep working.
        format: File extension, without the dot, of the files to load. Every
            matching file is parsed as JSON regardless of this value.

    Yields:
        The value stored under the top-level ``"annotations"`` key of each
        file, one file per iteration, in sorted path order.

    Raises:
        KeyError: If a file has no ``"annotations"`` key.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # Sort the paths: glob order depends on the filesystem, and positional
    # slices of the combined result should be reproducible across machines.
    for file in sorted(Path(dir).glob(f"*.{format}")):
        # Read as UTF-8 explicitly instead of relying on the platform's locale
        # encoding, which can mis-decode non-ASCII text.
        with open(file, encoding="utf-8") as f:
            yield json.load(f)["annotations"]

# Flatten the per-file annotation lists into a single list of examples.
data = [
    annotation
    for annotations in load_test_data()
    for annotation in annotations
]

### Map entity names

In [10]:
# Maps groups of source entity labels to the placeholder string that is stored
# as the entity's label after reformatting. Keys are tuples so that several
# source labels can share one placeholder.
monty = {
    ("PER",): "{{NAME}}",
    ("LOC",): "{{LOCATION}}",
    ("ORG",): "{{ORGANIZATION}}",  # closing "}}" was missing
    ("CPR",): "{{CPR NUMBER}}",  # closing "}}" was missing
    ("EMAIL",): "{{EMAIL}}",
    ("PHONE",): "{{PHONE NUMBER}}",
    ("LINK",): "{{LINK}}",
}

# Flatten the tuple-keyed table into a plain one-label-per-key lookup.
working_monty = {}
for k, v in monty.items():
    for key in k:
        working_monty[key] = v

### Reformat to Prodigy standard

In [11]:
# Convert every `[text, {"entities": [[start, end, label], ...]}]` example into
# a dict with a stripped "text" and a list of {"label", "start", "end"} spans.
new_data = []
for example in data:
    raw_text = example[0]
    # A bare .strip() already removes "\r", so it yields the same text as the
    # former .strip("\r").strip() chain.
    text = raw_text.strip()
    # NOTE(review): assumes start/end are character offsets into the
    # *unstripped* text - confirm against the annotation tool. Under that
    # assumption, dropping leading whitespace moves every span left by `shift`.
    shift = len(raw_text) - len(raw_text.lstrip())

    inner = []
    # `or []` keeps the original tolerance for a missing/empty entity list.
    for entity in example[1]["entities"] or []:
        start, end, label = entity
        inner.append(
            {
                # Raises KeyError for a label that has no placeholder mapping.
                "label": working_monty[label],
                # Clamp so spans that touched stripped whitespace stay in range.
                "start": min(max(start - shift, 0), len(text)),
                "end": min(max(end - shift, 0), len(text)),
            }
        )

    new_data.append({"text": text, "entities": inner})

# Frees the raw examples; this cell cannot be re-run without reloading `data`.
del data

### Remove duplicates

In [12]:
# Drop examples whose text has already been seen, keeping the first occurrence
# and the original order.
data = []
texts = set()  # set membership is O(1); a list made this loop quadratic
for example in new_data:
    if example["text"] in texts:
        continue
    texts.add(example["text"])
    data.append(example)

# Number of examples after deduplication, then the number before it.
print(len(data), len(new_data))
del new_data, texts

867 898


In [13]:
from pie import Pie
from tqdm import tqdm

In [14]:
# Create a `Pie` instance (imported from the `pie` module) with its default
# arguments; the cells below call its `bake` method.
pie = Pie()

In [17]:

# Run `pie.bake` on examples 20-39 only and store each result on the example
# itself under "predicted". This mutates the dicts held in `data` in place.
for example in tqdm(data[20:40]):
    example["predicted"] = response = pie.bake(example["text"])

 15%|█▌        | 3/20 [00:06<00:31,  1.85s/it]

100%|██████████| 20/20 [01:32<00:00,  4.60s/it]


In [18]:
# Display the slice that the previous cell annotated with "predicted" values.
data[20:40]

[{'text': '_\\]{}##',
  'entities': [],
  'predicted': 'Hej {NAME},\n\nDu har modtaget denne e-mail, da du har tilmeldt dig vores kursus. Vi glæder os til at se dig den {DATE-OF-BIRTH} på {SCHOOL}, {STREET}, {CITY}, {ZIP CODE}.\n\nHvis du har spørgsmål, så kontakt os på {PHONE} eller {EMAIL}.\n\nVenlig hilsen\n{NAME}'},
 {'text': '__]{{',
  'entities': [],
  'predicted': 'Hej {NAME},\n\nDu har modtaget denne e-mail, fordi du har tilmeldt dig vores kursus, der finder sted den {DATE-OF-BIRTH} på {SCHOOL} i {CITY}.\n\nHvis du har spørgsmål, så kontakt os på {PHONE} eller {EMAIL}.\n\nVenlig hilsen\n\n{NAME}'},
 {'text': '_\\]]{{}}##%', 'entities': [], 'predicted': '{NAME}'},
 {'text': '\\\\{{]]{{}', 'entities': [], 'predicted': '\\\\{{]]{{}'},
 {'text': '\\]{{}', 'entities': [], 'predicted': '\\]{{}'},
 {'text': '\\]{#%', 'entities': [], 'predicted': '\\]{#%'},
 {'text': 'Det var lige i slutningen af 2019 og lige i starten af 2020 at mon morfar og mie døde.  Og jo jeg bor i hobro',
  'enti

In [84]:
# One-off call to `pie.bake` on a single sentence; the input is Danish for
# "Hi, my name is Nicolai."
response = pie.bake("Hej, jeg hedder Nicolai.")

In [86]:
# Display the value currently bound to `response`.
response