In [24]:
#!/usr/bin/env python
# coding: utf8
"""Example of training an additional entity type

This script shows how to add a new entity type to an existing pre-trained NER
model. To keep the example short and simple, only four sentences are provided
as examples. In practice, you'll need many more — a few hundred would be a
good start. You will also likely need to mix in examples of other entity
types, which might be obtained by running the entity recognizer over unlabelled
sentences, and adding their annotations to the training set.

The actual training is performed by looping over the examples, and calling
`nlp.entity.update()`. The `update()` method steps through the words of the
input. At each word, it makes a prediction. It then consults the annotations
provided on the GoldParse instance, to see whether it was right. If it was
wrong, it adjusts its weights so that the correct action will score higher
next time.

After training your model, you can save it to a directory. We recommend
wrapping models as Python packages, for ease of deployment.

For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.1.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals
from __future__ import print_function
import re
import plac
import random
from pathlib import Path
import spacy
import json
import logging
from spacy.tokens import DocBin
from spacy.training import Example

LABEL = "SKILL"
TRAIN_DATA_FILE = "vaia-annotated.json"
model="C:/Users/tom/projects/skill-skeleton/notebooks/pipelines/NER/spacy/train/model-best"
new_model_name="training"
output_dir='C:/Users/tom/projects/skill-skeleton/models/NER/Model_02'
n_iter=30
   

"""Set up the pipeline and entity recognizer, and train the new entity."""
random.seed(0)

nlp = spacy.load(model)  # load existing spaCy model
print("Loaded model '%s'" % model)

print(nlp.pipe_names)
print(nlp.get_pipe("ner").labels)

ner = nlp.get_pipe("ner")

Loaded model 'C:/Users/tom/projects/skill-skeleton/notebooks/pipelines/NER/spacy/train/model-best'
['tok2vec', 'ner']
('SKILL',)


In [25]:
with open(TRAIN_DATA_FILE, 'r') as f:
        data = json.load(f)

TRAIN_DATA = data['annotations'] 

In [27]:
move_names = list(ner.move_names)
optimizer = nlp.begin_training()
# batch up the examples using spaCy's minibatch
for itn in range(n_iter):
    print("Starting iteration " + str(itn))
    random.shuffle(TRAIN_DATA)
    losses = {}        
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example],sgd=optimizer, losses=losses, drop=0.3)
    print("Losses", losses)

Starting iteration 0
Losses {'tok2vec': 5.472005670835908, 'ner': 2208.779726286151}
Starting iteration 1
Losses {'tok2vec': 1.4314704520787103, 'ner': 108.56472799972738}
Starting iteration 2
Losses {'tok2vec': 2.782760229851, 'ner': 130.06670160788235}
Starting iteration 3
Losses {'tok2vec': 1.9067399650156909, 'ner': 75.84146406856851}
Starting iteration 4
Losses {'tok2vec': 3.3555507963583673, 'ner': 62.74332898228721}
Starting iteration 5
Losses {'tok2vec': 3.9188381312284197, 'ner': 69.78457094639084}
Starting iteration 6
Losses {'tok2vec': 2.130813261622959, 'ner': 50.28858127320698}
Starting iteration 7
Losses {'tok2vec': 4.112041435201656, 'ner': 66.17496203707601}
Starting iteration 8
Losses {'tok2vec': 9.819685707946835, 'ner': 134.07488648359262}
Starting iteration 9
Losses {'tok2vec': 3.0078020300368715, 'ner': 49.107000549249115}
Starting iteration 10
Losses {'tok2vec': 4.01104944064901, 'ner': 39.67277101276683}
Starting iteration 11
Losses {'tok2vec': 0.6995252381864677

In [28]:
# test the trained model
test_text = "This course is intended as a basic introduction to using NVivo for qualitative data analysis.\n\t\n                            <p>NVivo is a widely used computer assisted qualitative data analysis software package which provides a potentially useful tool for the management and analysis of qualitative research data. This course is intended as a basic introduction to using NVivo for qualitative data analysis. Whether you are completely new to NVivo or have some previous experience with it  you will find this course both useful and enjoyable. This course blends lectures with hands-on exercises which allows you to try out the tools you've seen in the class under guidance.<br></p>\n<p><strong>What you will learn</strong>:</p>\n<p>At the end of this course you will master the core functionalities to apply the latest version of NVivo (1.0) to your project  including:</p>\n<ul><li><strong>Import</strong> - Creating a research project and importing different data formats such as Word documents  PDFs  webpages  audio  video and images into NVivo; classifying data files and managing their classifications</li><li><strong>Organize</strong> - Organizing codes  code text and create codes; apply coding stripes and highlights; use cases with classification and attributes; make annotations and memos  create sets and links to files</li><li><strong>Explore</strong> - Exploring lexical queries  word frequency and text search; apply code and matrix queries; illustrate with visualizations such as mind maps  concept maps  and coding matrix charts; coordinate team work by applying coding comparison</li></ul><p><strong>Not included in this course</strong>:</p>\n<ul><li>Theoretical framework of qualitative data analysis - Although this course will introduce some basic concepts of qualitative data analysis it is not a systematic review of the different theories.</li><li>Advanced qualitative methodologies - This course covers only the most salient features of NVivo and does not teach how to analyse qualitative data according to specific qualitative methods or designs  such as thematic analysis  grounded theory  content analysis  discourse analysis etc.</li></ul><h2>Target audience</h2>\n<p>Young researchers and data analysts who are new to qualitative research and curious about NVivo.</p>\n<h2>Fees</h2>\n<p>The participation fee is 330 EUR for participants from the private sector. Reduced prices apply to students and staff from non-profit  social profit  and government organizations. An exam fee of 35 EUR will be applied.</p>\n<ul><li>Industry  private sector  profession*: <strong>\u20ac 330</strong></li><li>Non profit  government  higher education staff: <strong>\u20ac 250</strong></li><li>(Doctoral) students  unemployed: <strong>\u20ac 150</strong></li></ul><p>*If two or more employees from the same company enrol simultaneously for this course a reduction of 20% on the course fee is taken into account starting from the second enrolment.</p>\n<h2>Registration</h2>\n<p>More information and registration on our <a href=\"https://beta-academy.ugent.be/en/program/short-and-long-running-initiatives/2023-2024-2023m9nv-module-9-getting-started-with-nvivo\" tabindex=\"-1\">Beta-Academy website</a>.</p>\n                       "
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

Entities in 'This course is intended as a basic introduction to using NVivo for qualitative data analysis.
	
                            <p>NVivo is a widely used computer assisted qualitative data analysis software package which provides a potentially useful tool for the management and analysis of qualitative research data. This course is intended as a basic introduction to using NVivo for qualitative data analysis. Whether you are completely new to NVivo or have some previous experience with it  you will find this course both useful and enjoyable. This course blends lectures with hands-on exercises which allows you to try out the tools you've seen in the class under guidance.<br></p>
<p><strong>What you will learn</strong>:</p>
<p>At the end of this course you will master the core functionalities to apply the latest version of NVivo (1.0) to your project  including:</p>
<ul><li><strong>Import</strong> - Creating a research project and importing different data formats such as Word doc

In [29]:
# save model to output directory
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta["name"] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    # Check the classes have loaded back consistently
    assert nlp2.get_pipe("ner").move_names == move_names
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)

Saved model to C:\Users\tom\projects\skill-skeleton\models\NER\Model_02
Loading from C:\Users\tom\projects\skill-skeleton\models\NER\Model_02
SKILL data analysis
SKILL computer
SKILL data analysis
SKILL data analysis
SKILL Word
SKILL word
SKILL data analysis
SKILL data analysis
SKILL content


In [None]:
# https://www.youtube.com/watch?v=IqOJU1-_Fi0&list=PLBmcuObd5An559HbDr_alBnwVsGq-7uTF&index=4