# Creating training data


In [None]:
import os
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English

# Bare English language object; its vocab is what the Matcher is built on.
nlp = English()

# Relative location of the raw data directory (not referenced by the cells
# visible in this notebook).
DATA_PATH = "../data/raw/"


In [None]:
# Rule-based matcher that shares the vocabulary of the `nlp` object
matcher = Matcher(nlp.vocab)

# Short headline-style sentences the matcher is run over; the last one
# mentions no iPhone at all.
TEXTS = [
    "How to preorder the iPhone X",
    "iPhone X is coming",
    "Should I pay $1,000 for the iPhone X?",
    "The iPhone 8 reviews are here",
    "Your iPhone goes up to 11 today",
    "I need a new phone! Any tips?",
]

In [None]:
# 'iphone' directly followed by 'x', compared on the lowercase token text
pattern1 = [
    {"LOWER": "iphone"},
    {"LOWER": "x"},
]

# 'iphone' (lowercase comparison), optionally followed by one all-digit token
pattern2 = [
    {"LOWER": "iphone"},
    {"IS_DIGIT": True, "OP": "?"},
]

# Register both patterns under the single match key 'GADGET'
gadget_patterns = [pattern1, pattern2]
matcher.add("GADGET", gadget_patterns)


In [None]:
# Stream the texts through the pipeline, one Doc per text
for doc in nlp.pipe(TEXTS):
    # Each match is a (match_id, start, end) triple
    matches = matcher(doc)

    # Drop the match id and tag every (start, end) pair with the 'GADGET' label
    entities = [
        (tok_start, tok_end, "GADGET")
        for _match_id, tok_start, tok_end in matches
    ]
    print(doc.text, entities)


In [None]:
TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]

    # The optional-digit pattern also fires on the bare "iPhone" inside
    # "iPhone X" / "iPhone 8", so the raw matches overlap. A token can only
    # belong to one entity, so keep the longest non-overlapping spans.
    spans = spacy.util.filter_spans(spans)

    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]

    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")


## The training loop


In [None]:
# Start over from an empty English pipeline (this rebinds `nlp`)
nlp = spacy.blank("en")

# Build a fresh entity recognizer and append it as the last pipeline step
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner, last=True)

# Register 'GADGET' as a label on the recognizer
ner.add_label("GADGET")


In [None]:
import random


In [None]:
# Start the training and keep the optimizer so it is passed to every update
optimizer = nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data (in place) so batches differ per iteration
    random.shuffle(TRAINING_DATA)
    # `losses` is filled in by nlp.update and accumulates over the batches
    # of this iteration
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, sgd=optimizer, losses=losses)

    # Report once per iteration: printing inside the batch loop would show
    # partial running totals rather than the loss for the iteration.
    print(losses)


In [None]:
# Example texts that are not part of TEXTS; no cell visible in this notebook
# consumes them yet.
TEST_DATA = [
    "Apple is slowing down the iPhone 8 and iPhone X - how to stop it",
    "I finally understand what the iPhone X 'notch' is for",
    "Everything you need to know about the Samsung Galaxy S9",
    # Apostrophe restored: the text previously held a mis-decoded UTF-8
    # right single quotation mark.
    "Looking to compare iPad models? Here’s how the 2018 lineup stacks up",
    "The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple",
    "what is the cheapest ipad, especially ipad pro???",
    "Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics",
]


## Training best practices


In [None]:
# Hand-annotated examples: each entry pairs a text with the character
# offsets of its place names, all labelled 'GPE'.
TRAINING_DATA = [
    (
        # The misspelled city name is annotated as written
        "i went to amsterdem last year and the canals were beautiful",
        dict(entities=[(10, 19, "GPE")]),
    ),
    (
        # Only the city is annotated in this example
        "You should visit Paris once in your life, but the Eiffel Tower is kinda boring",
        dict(entities=[(17, 22, "GPE")]),
    ),
    (
        # Two place names in one sentence
        "There's also a Paris in Arkansas, lol",
        dict(entities=[(15, 20, "GPE"), (24, 32, "GPE")]),
    ),
    (
        "Berlin is perfect for summer holiday: lots of parks, great nightlife, cheap beer!",
        dict(entities=[(0, 6, "GPE")]),
    ),
]

# One example per output line
print("\n".join(str(example) for example in TRAINING_DATA))


In [None]:
# Hand-annotated examples mixing two labels: 'WEBSITE' and 'PERSON'.
# Offsets are character positions into the text.
TRAINING_DATA = [
    (
        "Reddit partners with Patreon to help creators build communities",
        dict(entities=[(0, 6, "WEBSITE"), (21, 28, "WEBSITE")]),
    ),
    (
        "PewDiePie smashes YouTube record",
        dict(entities=[(0, 9, "PERSON"), (18, 25, "WEBSITE")]),
    ),
    (
        # The person span covers first and last name together
        "Reddit founder Alexis Ohanian gave away two Metallica tickets to fans",
        dict(entities=[(0, 6, "WEBSITE"), (15, 29, "PERSON")]),
    ),
    # And so on...
]
