In [2]:
import spacy

In [3]:
spacy.__version__

'3.4.1'

In [24]:
from spacy_crfsuite import CRFExtractor

component_config = {
    "features": [
        [
            "low",
            "title",
            "upper",
            "pos",
            "pos2"
        ],
        [
            "low",
            "bias",
            "prefix5",
            "prefix2",
            "suffix5",
            "suffix3",
            "suffix2",
            "upper",
            "title",
            "digit",
            "pos",
            "pos2"
        ],
        [
            "low",
            "title",
            "upper",
            "pos",
            "pos2"
        ],
    ],
    "c1": 0.01,
    "c2": 0.22
}

crf_extractor = CRFExtractor(component_config=component_config)
crf_extractor

NameError: name 'CRFExctractor' is not defined

In [5]:
component_config

{'features': [['low', 'title', 'upper', 'pos', 'pos2'],
  ['low',
   'bias',
   'prefix5',
   'prefix2',
   'suffix5',
   'suffix3',
   'suffix2',
   'upper',
   'title',
   'digit',
   'pos',
   'pos2'],
  ['low', 'title', 'upper', 'pos', 'pos2']],
 'c1': 0.01,
 'c2': 0.22}

In [6]:
import spacy

use_dense_features = crf_extractor.use_dense_features()
nlp = spacy.load("en_core_web_md")

In [7]:
from tqdm.notebook import tqdm_notebook
from spacy_crfsuite import read_file
from spacy_crfsuite.train import gold_example_to_crf_tokens
from spacy_crfsuite.tokenizer import SpacyTokenizer

def read_examples(file, tokenizer, use_dense_features=False, limit=None):
    examples = []
    it = read_file(file)
    it = it[:limit] if limit else it
    for raw_example in tqdm_notebook(it, desc=file):
        crf_example = gold_example_to_crf_tokens(
            raw_example, 
            tokenizer=tokenizer, 
            use_dense_features=use_dense_features, 
            bilou=False
        )
        examples.append(crf_example)
    return examples

# Spacy tokenizer
tokenizer = SpacyTokenizer(nlp)

# OPTIONAL: fine-tune hyper-params
# this is going to take a while, so you might need a coffee break ...
dev_examples = None
# dev_examples = read_examples("conll03/valid.conll", tokenizer, use_dense_features=use_dense_features)

if dev_examples:
    rs = crf_extractor.fine_tune(dev_examples, cv=5, n_iter=30, random_state=42)
    print("best params:", rs.best_params_, ", score:", rs.best_score_)
    crf_extractor.component_config.update(rs.best_params_)

In [8]:
train_examples = read_examples("/Users/mazz/Documents/Programming/Python/Mini-Project/DataSet/Dataset2.txt", tokenizer=tokenizer, use_dense_features=use_dense_features)

crf_extractor.train(train_examples, dev_samples=dev_examples)
%time

/Users/mazz/Documents/Programming/Python/Mini-Project/DataSet/Dataset2.txt:   0%|          | 0/14041 [00:00<?,…

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 8.82 µs


In [9]:
test_examples = read_examples("/Users/mazz/Documents/Programming/Python/Mini-Project/DataSet/Dataset3.txt", tokenizer=tokenizer, use_dense_features=use_dense_features)

/Users/mazz/Documents/Programming/Python/Mini-Project/DataSet/Dataset3.txt:   0%|          | 0/3453 [00:00<?, …

In [10]:
print(crf_extractor.explain())

Most likely transitions:
B-ORG      -> I-ORG      7.260039
B-PER      -> I-PER      7.016621
I-ORG      -> I-ORG      6.787570
I-MISC     -> I-MISC     6.301491
B-LOC      -> I-LOC      6.284100
B-MISC     -> I-MISC     6.052984
I-LOC      -> I-LOC      5.508632
I-PER      -> I-PER      4.808802
O          -> B-PER      3.496929
O          -> O          2.778873

Positive features:
5.259367 O          0:bias:bias
4.243087 O          0:suffix3:day
3.915058 B-ORG      -1:low:v
3.763559 B-PER      BOS
3.526967 O          BOS
3.258426 B-PER      0:prefix2:Mc
3.239694 O          0:prefix2:W1
3.103391 B-LOC      BOS
3.099422 B-ORG      BOS
2.878765 B-ORG      0:suffix5:shire


In [11]:
crf_extractor.function_dict

{'low': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'title': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'prefix5': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'prefix2': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'suffix5': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'suffix3': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'suffix2': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'suffix1': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'bias': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'pos': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'pos2': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>(crf_token)>,
 'upper': <function spacy_crfsuite.crf_extractor.CRFExtractor.<lambda>

In [12]:
help(crf_extractor.eval)

Help on method eval in module spacy_crfsuite.crf_extractor:

eval(eval_samples: List[List[spacy_crfsuite.features.CRFToken]]) -> Optional[Tuple[Any, str]] method of spacy_crfsuite.crf_extractor.CRFExtractor instance
    Evaluate the entity tagger on dev examples.
    
    Args:
        eval_samples (list): list of dev examples.
    
    Returns:
        (f1_score<float>, classification_report<str>)



In [13]:
#!pip uninstall sklearn-crfsuite

In [14]:
#!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite

In [15]:
for item in crf_extractor.eval(test_examples):
    print(item)

0.8243772996833281
              precision    recall  f1-score   support

       B-LOC      0.851     0.889     0.870      1593
       I-LOC      0.747     0.713     0.729       240
      B-MISC      0.839     0.769     0.803       719
      I-MISC      0.670     0.611     0.639       229
       B-ORG      0.829     0.717     0.769      1699
       I-ORG      0.745     0.724     0.734       863
       B-PER      0.861     0.872     0.866      1593
       I-PER      0.892     0.963     0.926      1121

   micro avg      0.835     0.818     0.827      8057
   macro avg      0.804     0.782     0.792      8057
weighted avg      0.833     0.818     0.824      8057



In [27]:
type(crf_extractor.eval(test_examples))

tuple

In [31]:
type(crf_extractor.eval(test_examples)[1])

str

In [25]:
for item in crf_extractor.eval(test_examples):
    print(type(item))

<class 'numpy.float64'>
<class 'str'>


In [16]:
import joblib
joblib.dump(crf_extractor, filename='spacy_crfsuite_trained_spacy3.bz2')

['spacy_crfsuite_trained_spacy3.bz2']

In [None]:
#!pip uninstall spacy_crfsuite

In [18]:
#!pip install spacy_crfsuite

In [19]:
#!pip install git+git://github.com/talmago/spacy_crfsuite.git@master

In [20]:
from unicodedata import name
import spacy
from spacy.language import Language

from spacy_crfsuite import CRFEntityExtractor, CRFExtractor

@Language.factory("ner-crf-3")
def create_my_component(nlp, name):
    crf_extractor = CRFExtractor().from_disk("spacy_crfsuite_trained_spacy3.bz2")
    return CRFEntityExtractor(nlp, crf_extractor=crf_extractor)


nlp = spacy.load("en_core_web_md", disable=["ner"])
nlp.add_pipe("ner-crf-3")

doc = nlp(
    "George Walker Bush (born July 6, 1946) is an American politician and businessman "
    "who served as the 43rd president of the United States from 2001 to 2009.")

for ent in doc.ents:
    print(ent, "-", ent.label_)

AssertionError: 

### New ways of fitting into the pipeline

In [23]:
import spacy
from spacy.language import Language
from spacy_crfsuite import CRFEntityExtractor

@Language.factory("ner-crf-1") # give a name to the new factory (unique)
def create_my_component(nlp, name):
    #crf_extractor = CRFExtractor().from_disk("path-to-model")
    pipe = CRFEntityExtractor(nlp, crf_extractor=crf_extractor) # convert crf model into spacy pipeline component
    return pipe

nlp = spacy.load("en_core_web_md", disable=["ner"]) # load model without ner
nlp.add_pipe("ner-crf-1") # add to the pipe

# And use natively ..
doc = nlp(
    "George Walker Bush (born July 6, 1946) is an American politician and businessman "
    "who served as the 43rd president of the United States from 2001 to 2009.")

for ent in doc.ents:
    print(ent, "-", ent.label_)

George Walker Bush - PER
American - MISC
United States - LOC


Assuming B- class is correct.
Assuming last tag is L-


In [9]:
import spacy

nlp = spacy.load('/Users/mazz/Documents/Programming/Python/Mini-Project/Script/Fastext_models3')

In [13]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [16]:
main_nlp = spacy.load('en_core_web_sm', disable=['ner'])

In [17]:
main_nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']

In [24]:
import spacy

pipes = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

# load the model
def build_spacy3_pipeline(pipes):
    # create a blank spacy model
    nlp = spacy.load('/Users/mazz/Documents/Programming/Python/Mini-Project/Script/Fastext_models3')
    # create a reference model
    nlp_ref = spacy.load('en_core_web_md')
    for pipe in pipes:
        nlp.add_pipe(pipe)
        print(nlp.pipe_names)

build_spacy3_pipeline(pipes=pipes)

['tok2vec']
['tok2vec', 'tagger']
['tok2vec', 'tagger', 'parser']
['tok2vec', 'tagger', 'parser', 'attribute_ruler']
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [25]:
text = "Apple makes iPhones, and Samsung makes Galaxies"

In [26]:
doc = nlp(text)

In [27]:
doc.ents

()