In [None]:
import spacy

# Initialise the small English pipeline; `nlp` is reused by the cells below.
nlp = spacy.load(name="en_core_web_sm")

### Span

In [None]:
from spacy.tokens import Span

# Run the pipeline over a sample sentence; calling `nlp` on a string returns a Doc.
sample_sentence = "iPhone X is release on coming september third week!"
doc = nlp(sample_sentence)
# Optional: uncomment to mark the first two tokens as a GADGET entity.
# doc.ents = [Span(doc, 0, 2, label="GADGET")]

In [None]:
# Slicing a Doc gives a span of tokens. doc[2:6] covers the tokens at
# indices 2, 3, 4 and 5 -- the end index (6) is exclusive.
span_of_tokens = doc[2:6]
# Bare last expression so the notebook displays the span.
span_of_tokens

In [None]:
# Report the span's text, its individual tokens, and its token boundaries
# (start/end are indices into the parent doc).
for caption, value in (
    ("Text of the span:", span_of_tokens.text),
    ("Tokens in the span:", [token.text for token in span_of_tokens]),
    ("Start index of the span:", span_of_tokens.start),
    ("End index of the span:", span_of_tokens.end),
):
    print(caption, value)

In [None]:
# Build a Span by hand over `doc`: start index 0, end index 2, with a custom label.
new_span = Span(doc, 0, 2, label="CUSTOM_LABEL")

# Show the span's text, its label as a string, and the object's type.
print("New span text:", new_span.text)
print("New span label:", new_span.label_)
print(type(new_span))

### spacy.blank

In [None]:
import spacy

# Start from an empty English pipeline instead of a pretrained model.
nlp = spacy.blank("en")

# Tokenize a sentence with the blank pipeline.
doc = nlp("This is a blank spaCy model.")

# Print each token with its part-of-speech and dependency labels.
# NOTE(review): a blank pipeline presumably has no tagger/parser, so pos_ and
# dep_ are expected to print empty -- confirm by running the cell.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)


### Matcher Object

The Matcher is a powerful tool in spaCy for matching patterns in a text based on token attributes.

In [None]:
import spacy
from spacy.matcher import Matcher

# Load the pretrained English pipeline
nlp = spacy.load("en_core_web_sm")

# Create a Matcher that shares the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# One-token pattern: any token whose lowercase form is "example"
pattern = [{"LOWER": "example"}]

# Register the pattern under the rule name "ExamplePattern"
matcher.add("ExamplePattern", [pattern])

# Process a text
doc = nlp("This is an example sentence. Another example is shown here.")

# Run the matcher; each match unpacks as (match_id, start, end), where
# start/end are token indices into `doc`
matches = matcher(doc)
print(f"the pattern matched: {matches}")

# Turn each match back into a slice of the doc and print it
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(f"Match found: '{matched_span.text}' (start: {start}, end: {end})")


### testing

In [13]:
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

# Read the list of example sentences from disk.
with open("data/iphone.json", encoding="utf8") as f:
    text = json.load(f)

# Tokenizer-only English pipeline plus a Matcher sharing its vocabulary.
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# "iphone" followed by "x" (case-insensitive)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# "iphone" (case-insensitive) followed by a token made of digits
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Both patterns are registered under the single rule name "GADGET".
matcher.add("GADGET", [pattern1, pattern2])

# Process every sentence, turn each match into a labelled Span and store the
# spans as the doc's entities.
docs = []
for doc in nlp.pipe(text):
    print(f"doc: {doc}")
    matches = matcher(doc)
    spans = []
    for match_id, start, end in matches:
        # match_id is passed as the label, so the span is labelled by the rule
        spans.append(Span(doc, start, end, label=match_id))
    print(f"spans: {spans}")
    doc.ents = spans
    print(f"doc.ents : {doc.ents}")
    docs.append(doc)

doc: How to preorder the iPhone X
spans: [iPhone X]
doc.ents : (iPhone X,)
doc: iPhone X is coming
spans: [iPhone X]
doc.ents : (iPhone X,)
doc: Should I pay $1,000 for the iPhone X?
spans: [iPhone X]
doc.ents : (iPhone X,)
doc: The iPhone 8 reviews are here
spans: [iPhone 8]
doc.ents : (iPhone 8,)
doc: iPhone 11 vs iPhone 8: What's the difference?
spans: [iPhone 11, iPhone 8]
doc.ents : (iPhone 11, iPhone 8)
doc: I need a new phone! Any tips?
spans: []
doc.ents : ()


In [14]:
# Display the list of processed docs collected by the previous cell.
docs

[How to preorder the iPhone X,
 iPhone X is coming,
 Should I pay $1,000 for the iPhone X?,
 The iPhone 8 reviews are here,
 iPhone 11 vs iPhone 8: What's the difference?,
 I need a new phone! Any tips?]

In [22]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

# Load the pretrained English pipeline
nlp = spacy.load("en_core_web_sm")

# Placeholder rule -- replace it with the pattern(s) you actually want to label.
# NOTE(review): "example" does not appear in the sentences printed by the
# earlier cell, so with this placeholder every doc presumably ends up with
# empty ents -- confirm the intended pattern before exporting training data.
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "example"}]
matcher.add("ExamplePattern", [pattern])

# List of texts to process
# texts = ["This is an example sentence.", "Another example is shown here."]
with open("data/iphone.json", encoding="utf8") as f:
    texts = json.load(f)

docs = []

for doc in nlp.pipe(texts):
    matches = matcher(doc)

    # Build one labelled Span per match. match_id is passed through unchanged
    # (as in the GADGET cell): wrapping it in str() would make the label the
    # hash's digits rather than the rule name "ExamplePattern".
    entities = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matches
    ]

    # Assigning to doc.ents overwrites whatever entities were on the doc
    doc.ents = entities

    # Keep the modified document
    docs.append(doc)

# `docs` now holds the processed documents with their assigned entities


### spaCy DocBin

Use spaCy's `DocBin` to serialize a list of processed documents into a binary format suitable for training spaCy models. This binary format loads more efficiently during model training.

In [23]:
from spacy.tokens import DocBin

from pathlib import Path

# `docs` is the list of processed documents built in the previous cell.

# Create the output directory up front: writing to "docs/..." fails if the
# directory does not exist yet (e.g. on a fresh checkout).
output_dir = Path("docs")
output_dir.mkdir(parents=True, exist_ok=True)

# Split the docs in half: the first half is training data, the rest is dev data
split_at = len(docs) // 2
train_docs = docs[:split_at]
print(train_docs)
dev_docs = docs[split_at:]
print(dev_docs)

# Serialize each split into its own DocBin file
train_doc_bin = DocBin(docs=train_docs)
train_doc_bin.to_disk(output_dir / "train.spacy")

dev_doc_bin = DocBin(docs=dev_docs)
dev_doc_bin.to_disk(output_dir / "dev.spacy")

[How to preorder the iPhone X, iPhone X is coming, Should I pay $1,000 for the iPhone X?]
[The iPhone 8 reviews are here, iPhone 11 vs iPhone 8: What's the difference?, I need a new phone! Any tips?]
