In [1]:
from collections import Counter
import json
from random import seed, shuffle, sample

import spacy
from spacy.tokens import DocBin

from nltk.stem import WordNetLemmatizer

from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Downloads the spaCy pipeline package `en_core_web_md` via a shell command.
# You might need to run this in an external terminal instead if it throws a
# permissions error (pip then cannot replace files inside site-packages).
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.2/42.8 MB 3.3 MB/s eta 0:00:14
     ---------------------------------------- 0.5/42.8 MB 6.2 MB/s eta 0:00:07
      --------------------------------------- 0.8/42.8 MB 6.1 MB/s eta 0:00:07
      --------------------------------------- 1.0/42.8 MB 6.0 MB/s eta 0:00:07
     - -------------------------------------- 1.4/42.8 MB 6.0 MB/s eta 0:00:07
     - -------------------------------------- 1.7/42.8 MB 6.1 MB/s eta 0:00:07
     - -------------------------------------- 2.1/42.8 MB 6.3 MB/s eta 0:00:07
     -- ------------------------------------- 2.4/42.8 MB 6.4 MB/s eta 0:00:07
     -- ------------------------------------- 2.8/42.8 MB 6.5 MB/s eta 0:00:07
     -- ---------------------------------

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Bruger\\Code\\School\\Data_Wrangling\\data-inthewild\\new_env\\Lib\\site-packages\\~pacy\\attrs.cp310-win_amd64.pyd'
Check the permissions.


[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Annotation of ingredients 

This notebook includes the annotation and prediction of ingredients.

The process is structured in the following way:

- Extracting a sample of the raw ingredient data from the dataset, making some preliminary predictions, and saving it as _.jsonl_ for use in the annotation software _Doccano_. The preliminary annotations are made with an earlier model that we did not end up using; they only serve to ease the annotation process.
- (not in the notebook) Annotating the data using _Doccano_
- Constructing a training dataset from our annotations, using a majority vote system (here we only use 3 annotators).
- Training a model with _spacy_ on this training data.
- Computing labels for the rest of the data.

### Computing preliminary annotations

In [4]:
# raw recipe data (json) that the ingredient lines are read from
RAW_DATA_PATH = "../../../data/raw/data_raw.json"

# everything produced for / by the annotation round lives below this folder
ANNOTATION_FOLDER = "../../../data/interim/annotation/"
# jsonl file with the pre-labelled sample (already includes ANNOTATION_FOLDER)
TO_ANNOTATE = f"{ANNOTATION_FOLDER}TO_ANNOTATE.jsonl"

# placeholder: path of the earlier model used to pre-label the sample
LABELING_MODEL_PATH = 'PREVIOUS MODEL INSERT PATH HERE (not needed anymore)'

In [None]:
# Build the pre-labelled sample that is handed to the annotators.
with open(RAW_DATA_PATH, 'r', encoding='utf8') as file:
    data = json.load(file)

# we sample 30 recipes (fixed seed, so the same recipes are drawn on every run)
seed(69)
keys = sample(list(data.keys()), 30)

# flatten the ingredient lines of all sampled recipes into one list
ingredients = []

for key in keys:
    ingredients.extend(data[key]['ingredients'])

# we trained this model with smaller amount of data
nlp = spacy.load(LABELING_MODEL_PATH)

# one record per ingredient line: the text plus the predicted entity spans
# as [start_char, end_char, label] triples
outs = []

for ingredient in ingredients:
    doc = nlp(ingredient)
    label = [
        [ent.start_char, ent.end_char, 'ingredient']
        for ent in doc.ents
    ]
    outs.append({"text": ingredient,
                 "label": label})

# TO_ANNOTATE already starts with ANNOTATION_FOLDER, so it must not be
# prefixed with the folder a second time
with open(TO_ANNOTATE, 'w', encoding="utf8") as file:
    for out in outs:
        file.write(json.dumps(out, ensure_ascii=False) + "\n")

### Annotation

_Done in Doccano_

### Generate combined training data 

In [5]:
# exported annotation files, one per annotator
ANNOTATED_FILES = ['bogdan.jsonl', 'gino.jsonl', 'veron.jsonl']
# number of annotators that take part in the majority vote
NO_ANNOTATORS = 3

# jsonl file holding the merged (majority-voted) annotations
COMBINED_PATH = f"{ANNOTATION_FOLDER}annotations/COMBINED.jsonl"

# train / dev are stored as spaCy binary files, test is kept as jsonl
TRAIN_PATH = f"{ANNOTATION_FOLDER}spacy/train.spacy"
DEV_PATH = f"{ANNOTATION_FOLDER}spacy/dev.spacy"
TEST_PATH = f"{ANNOTATION_FOLDER}spacy/test.jsonl"

In [7]:
# Merge the annotators' labels into one dataset by majority vote.
annotator_lists = []

for annotator_file in ANNOTATED_FILES:
    with open(ANNOTATION_FOLDER + 'annotations/' + annotator_file, 'r', encoding='utf-8') as file:
        annotator_list = [json.loads(line) for line in file]
    # sort by record id so that the same position holds the same record for every annotator
    annotator_lists.append(sorted(annotator_list, key=lambda x: x['id']))

# we only use NO_ANNOTATORS (3) of the annotators,
# because we had issues with majority voting with an even number of annotators
voters = annotator_lists[:NO_ANNOTATORS]

final_annotations = []

for annotations in zip(*voters):

    text = annotations[0]['text']

    # voting is only meaningful if everybody labelled the same text
    if any(annotation['text'] != text for annotation in annotations):
        raise ValueError(
            f"annotation files are not aligned at record id {annotations[0]['id']}"
        )

    # every annotator gets at most one vote per (start, end) span,
    # even if the same span shows up twice in their export
    votes = Counter(
        span
        for annotation in annotations
        for span in {(label[0], label[1]) for label in annotation['label']}
    )

    # keep the spans marked by more than half of the voters, in text order
    to_keep = sorted(
        span
        for span, count in votes.items()
        if count / len(voters) > 0.5
    )

    final_annotations.append(
        {
            'text': text,
            'label': [
                [start, end, 'ingredient']
                for start, end in to_keep
            ]
        }
    )

with open(COMBINED_PATH, 'w', encoding='utf-8') as file:
    for annotation in final_annotations:
        file.write(json.dumps(annotation, ensure_ascii=False) + '\n')


In [11]:
def make_data(outfile_name, annotations):
    '''
    takes jsonl-data (annotations) and saves it as a spacy binary dataset

    outfile_name: path the serialized DocBin is written to
    annotations: iterable of dicts with a "text" string and a "label" list
        of (start_char, end_char, label) entries

    Labels for which doc.char_span() returns None (no span could be built
    from the character offsets with the blank English tokenizer) are skipped.
    '''
    nlp = spacy.blank("en")
    db = DocBin()

    for annotation in annotations:
        doc = nlp(annotation["text"])
        ents = []
        for start, end, label in annotation['label']:
            span = doc.char_span(start, end, label=label)
            # identity check: None is a singleton, no need to go through Span.__eq__
            if span is not None:
                ents.append(span)
        doc.ents = ents
        db.add(doc)

    db.to_disk(outfile_name)

In [12]:
# read the merged annotations back in, one json record per line
with open(COMBINED_PATH, 'r', encoding='utf-8') as f:
    final_annotations = []
    for raw_line in f.readlines():
        final_annotations.append(json.loads(raw_line))

In [13]:
# Split the merged annotations into train (~2/3), dev (~1/6) and test (~1/6).

# Shuffle a copy: shuffling final_annotations in place would make a second run
# of this cell (without reloading the file first) start from an already
# shuffled list and therefore produce a different split.
seed(100)
shuffled = list(final_annotations)
shuffle(shuffled)

n_total = len(shuffled)
train_i = n_total - (n_total // 3)
dev_i = n_total - (n_total // 6)

train = shuffled[:train_i]
dev = shuffled[train_i:dev_i]
test = shuffled[dev_i:]

# train / dev go to spaCy's binary format for training
make_data(TRAIN_PATH, train)
make_data(DEV_PATH, dev)

# the test split stays jsonl; it is read back as jsonl for the evaluation
with open(TEST_PATH, 'w', encoding='utf-8') as f:
    for annotation in test:
        f.write(json.dumps(annotation, ensure_ascii=False))
        f.write('\n')


### Training annotator model

_Done with spacy in shell_

`python -m spacy train ml_models/annotation/config.cfg --output ml_models/annotation/models`

### Making synthetic annotations using trained model

In [14]:
# trained spaCy pipeline that is loaded for labelling and evaluation
MODEL_PATH = "../../../ml_models/annotation/models/model-best/"
# json file the model-annotated dataset is written to
OUT_PATH = f"{ANNOTATION_FOLDER}data.json"

In [15]:
# Label every ingredient line of the raw data with the trained model.
with open(RAW_DATA_PATH, 'r', encoding='utf8') as file:
    data = json.load(file)

keys = data.keys()

nlp = spacy.load(MODEL_PATH)
lem = WordNetLemmatizer()

for key in keys:
    ingredients = data[key]['ingredients']
    # normalised (lower-cased, lemmatized) text of every predicted entity of the recipe
    data[key]['ingredient_annotations'] = []
    for ingredient in ingredients:
        doc = nlp(ingredient)
        for ent in doc.ents:
            # Lower-case BEFORE lemmatizing: the WordNet lemmatizer is
            # case-sensitive and returns capitalised words unchanged.
            data[key]['ingredient_annotations'].append(
                ' '.join(lem.lemmatize(token.text.lower()) for token in ent)
            )

# written as utf-8 bytes so that non-ascii characters are stored unescaped
with open(OUT_PATH, 'wb') as f:
    f.write(json.dumps(data, indent = 4, ensure_ascii=False).encode("utf8"))

## Model evaluation

We evaluate the trained model on the held-out test data using the micro-averaged F1 score.

In [16]:
nlp = spacy.load(MODEL_PATH)

# held-out records, one json object per line
test_data = []
with open(TEST_PATH, 'r', encoding='utf-8') as f:
    for raw_line in f.readlines():
        test_data.append(json.loads(raw_line))

# per record: the annotated entity strings, cut out of the text by their offsets
ground_truth = []
for record in test_data:
    record_text = record['text']
    ground_truth.append(
        [record_text[span[0]:span[1]] for span in record['label']]
    )

# per record: the entity strings predicted by the model
pred = []
for record in test_data:
    predicted_doc = nlp(record['text'])
    pred.append([ent.text for ent in predicted_doc.ents])

In [17]:
# Turn the per-record lists of entity strings into indicator matrices over the
# union of all strings seen in either the predictions or the ground truth.
binarizer = MultiLabelBinarizer().fit(pred + ground_truth)

# f1_score expects (y_true, y_pred) in that order. The micro-averaged F1 is
# symmetric, so the value is unaffected, but the correct order matters as soon
# as precision/recall or another averaging mode is used.
f1_score(binarizer.transform(ground_truth), binarizer.transform(pred), average='micro')

0.7642276422764227