src: https://github.com/explosion/projects/tree/v3/pipelines/ner_demo
    

In [31]:
%%writefile project.yml

title: 'Fine-tune the NER component in a pretrained pipeline'
description: 'Rather than training from scratch, we will fine-tune the ner pipeline of a pre-trained model.'
spacy_version: '>=3.0.6,<4.0.0'
# Project-wide variables; reference them anywhere below as ${vars.var_name}.
vars:
  # Package name handed to "spacy package --name"
  name: 'ner_finetune'
  # Language code: passed to scripts/convert.py and used in the package path
  lang: 'en'
  # Pretrained pipeline to download and fine-tune.
  # For en_core_web_trf or another trf pipeline, use spacy-transformers>=1.1.1
  pipeline: 'en_core_web_sm'
  # NOTE(review): train/dev are not referenced by any command in this file;
  # the commands use corpus/train.spacy and corpus/dev.spacy directly.
  train: 'train.json'
  dev: 'dev.json'
  # Package version handed to "spacy package --version"
  version: '0.0.0'
  # GPU to train on; -1 means CPU
  gpu_id: -1

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories:
  - "assets"
  - "corpus"
  - "configs"
  - "training"
  - "scripts"
  - "packages"

# Fetch the litbank data from GitHub and save to the assets folder

assets:
  # Clone the LitBank repository into assets/litbank; scripts/convert.py reads
  # the brat annotations from assets/litbank/entities/brat.
  - dest: 'assets/litbank'
    git:
      repo: 'https://github.com/dbamman/litbank'
      branch: 'master'
      # Empty path: fetch from the repository root rather than a sub-folder
      path: ''
        
# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  # End-to-end run. "download" comes first because the later steps use
  # ${vars.pipeline} as the base pipeline, so it has to be installed before
  # create-config and train can work on a clean environment.
  # Fetch the data beforehand with "spacy project assets".
  all:
    - download
    - convert
    - create-config
    - train
    - evaluate

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  # Installs the pretrained pipeline selected by vars.pipeline.
  - name: 'download'
    help: 'Download the pretrained pipeline'
    script:
      - 'python -m spacy download ${vars.pipeline}'
    
  # Builds corpus/train.spacy and corpus/dev.spacy from the LitBank brat files.
  # Script arguments: language code, then the held-out fraction (test_size).
  - name: "convert"
    help: "Convert the data to spaCy's binary format"
    script:
      - "python scripts/convert.py ${vars.lang} 0.2"
    deps:
      - "scripts/convert.py"
      # The script reads its input from here; listing it makes spaCy check the
      # asset has been fetched and re-run the conversion when the data changes.
      - "assets/litbank/entities/brat"
    outputs:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"

  # Writes configs/config.cfg via scripts/create_config.py, which is given the
  # pretrained pipeline name, the component to update (ner) and the output path.
  - name: 'create-config'
    help: 'Create a config for updating only NER from an existing pipeline'
    script:
      - 'python scripts/create_config.py ${vars.pipeline} ner configs/config.cfg'
    deps:
      - 'scripts/create_config.py'
    outputs:
      - 'configs/config.cfg'

  # Fine-tunes NER with the generated config. The folded scalar below is joined
  # into a single command line; each override sits on its own row for diffing.
  - name: 'train'
    help: 'Update the NER model'
    script:
      - >-
        python -m spacy train configs/config.cfg
        --output training/
        --paths.train corpus/train.spacy
        --paths.dev corpus/dev.spacy
        --training.eval_frequency 10
        --training.max_steps 500
        --training.patience 50
        --gpu-id ${vars.gpu_id}
    deps:
      - 'configs/config.cfg'
      - 'corpus/train.spacy'
      - 'corpus/dev.spacy'
    outputs:
      - 'training/model-best'

  # Scores training/model-best on the dev corpus and writes the metrics as JSON.
  - name: 'evaluate'
    help: 'Evaluate the model and export metrics'
    script:
      - 'python -m spacy evaluate training/model-best corpus/dev.spacy --output training/metrics.json'
    deps:
      - 'corpus/dev.spacy'
      - 'training/model-best'
    outputs:
      - 'training/metrics.json'

  # Builds an installable package from training/model-best into packages/.
  # The archive is listed under outputs_no_cache rather than outputs.
  - name: 'package'
    help: 'Package the trained model as a pip package'
    script:
      - 'python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --force'
    deps:
      - 'training/model-best'
    outputs_no_cache:
      - 'packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz'

  # Starts the Streamlit visualizer on training/model-best with a sample text.
  # The script is single-quoted so the inner double quotes need no escaping.
  - name: 'visualize-model'
    help: "Visualize the model's output interactively using Streamlit"
    script:
      - 'streamlit run scripts/visualize_model.py training/model-best "I saw Shaka Khan in London. I like horses."'
    deps:
      - 'scripts/visualize_model.py'
      - 'training/model-best'


Overwriting project.yml


In [48]:
%%writefile ./scripts/convert.py

"""Convert LitBank data to spaCy v3
.spacy format."""
import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from sklearn.model_selection import train_test_split



def convert(lang: str, test_size:float):
    """Convert the LitBank brat annotations into spaCy train/dev ``.spacy`` files.

    Every ``*.txt`` file under ``assets/litbank/entities/brat`` (relative to the
    current working directory) is paired with the ``.ann`` file of the same
    stem. Entity annotations become ``doc.ents``; the resulting docs are split
    at random and written to ``corpus/train.spacy`` and ``corpus/dev.spacy``.

    Args:
        lang: Language code used to create the blank spaCy pipeline that
            tokenizes the texts.
        test_size: Passed straight to ``train_test_split`` as ``test_size``
            (e.g. ``0.2`` holds out 20% of the docs for validation).

    Raises:
        AssertionError: If the LitBank annotation directory does not exist.
    """
    entities_path = Path.cwd() / 'assets' / 'litbank' / 'entities' / 'brat'
    assert entities_path.exists(), (
        f"LitBank annotations not found at {entities_path}; "
        "run `spacy project assets` first"
    )

    text_files = [f for f in entities_path.iterdir() if f.suffix == '.txt']

    docs = []

    # Note: a blank pipeline is used on purpose. A pretrained model would add
    # its own predictions; we only want the LitBank annotations.
    nlp = spacy.blank(lang)

    for text_file in text_files:
        # Read explicitly as UTF-8: brat offsets are character offsets, so a
        # locale-dependent decoding could shift every span.
        doc = nlp.make_doc(text_file.read_text(encoding='utf-8'))
        annotation_file = entities_path / (text_file.stem + '.ann')
        ents = []
        # Walk every line instead of dropping the last one, so the final entity
        # is kept even when the file has no trailing newline.
        for line in annotation_file.read_text(encoding='utf-8').split('\n'):
            fields = line.split('\t')
            # Entity lines look like "T1<TAB>LABEL START END<TAB>text"; skip
            # blank lines and anything that is not a text-bound annotation.
            if len(fields) < 2 or not fields[0].startswith('T'):
                continue
            parts = fields[1].split()
            if len(parts) != 3:
                # e.g. discontinuous spans ("LABEL 0 5;8 12") cannot be
                # expressed as a single entity span.
                warnings.warn(
                    f"Skipping unsupported annotation in {annotation_file.name}: {line!r}"
                )
                continue
            label, start, end = parts
            span = doc.char_span(int(start), int(end), label=label)
            if span:  # char_span gives None when the offsets do not line up with token boundaries
                ents.append(span)

        # doc.ents cannot hold overlapping spans, so reduce to a non-overlapping set first.
        doc.ents = filter_spans(ents)
        docs.append(doc)

    train_set, validation_set = train_test_split(docs, test_size=test_size)
    print(f'Created {len(train_set)} training docs')
    print(f'Created {len(validation_set)} validation docs')

    # Make sure the output folder exists so the script also works when it is
    # run outside of `spacy project run`.
    corpus_dir = Path.cwd() / 'corpus'
    corpus_dir.mkdir(parents=True, exist_ok=True)

    # Each DocBin serializes one split to disk.
    DocBin(docs=train_set).to_disk(corpus_dir / "train.spacy")
    DocBin(docs=validation_set).to_disk(corpus_dir / "dev.spacy")


# Script entry point: typer builds the command-line interface from convert's
# signature, so `lang` and `test_size` are given as positional arguments.
if __name__ == "__main__":
    typer.run(convert)


Overwriting ./scripts/convert.py


# Now let's run the project! 

In [49]:
# Download the pretrained model
!spacy project run download 

[1m
Running command: /home/ds/anaconda3/envs/spacy-gpu/bin/python -m spacy download en_core_web_sm
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     |████████████████████████████████| 13.9 MB 14.7 MB/s            
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [50]:
# Fetch the LitBank data
!spacy project assets 

[38;5;4mℹ Fetching 1 asset(s)[0m
[38;5;2m✔ Downloaded asset
/home/ds/projects/course-materials/w2/practical-intro/ner_projects/ner_finetune/assets/litbank[0m


In [51]:
# Convert the LitBank data
!spacy project run convert 

[1m
Running command: /home/ds/anaconda3/envs/spacy-gpu/bin/python scripts/convert.py en 0.2
Created 80 training docs
Created 20 validation docs


In [52]:
# create a config file using the create_config script
!spacy project run create-config

[1m
[38;5;4mℹ Skipping 'create-config': nothing changed[0m


In [53]:
!spacy project run train

[1m
Running command: /home/ds/anaconda3/envs/spacy-gpu/bin/python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.max_steps 500 --training.patience 50 --gpu-id -1
[38;5;4mℹ Saving to output directory: training[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2021-12-16 14:42:44,439] [INFO] Set up nlp object from config
[2021-12-16 14:42:44,451] [INFO] Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
[2021-12-16 14:42:44,451] [INFO] Resuming training for: ['ner']
[2021-12-16 14:42:44,463] [INFO] Copying tokenizer from: en_core_web_sm
[2021-12-16 14:42:44,957] [INFO] Copying vocab from: en_core_web_sm
[2021-12-16 14:42:45,071] [INFO] Created vocabulary
[2021-12-16 14:42:45,072] [INFO] Finished initializing nlp object
[2021-12-16 14:42:45,072] [INFO] Initialized pipeline components: []
[38;5;2m✔ Initi

In [None]:
# Run `spacy project run visualize-model` from a terminal, not from inside Jupyter.