In [4]:
import spacy
nlp_pipeline = spacy.load("en_core_web_md")

In [5]:
doc = nlp_pipeline("This is some text talking about areas of Cumbria that you might visit. (But don't go to Grange)")

In [6]:
for entity in doc.ents:
    print(entity.text, entity.label_)

Cumbria GPE
Grange ORG


In [27]:
from haystack.components.extractors import NamedEntityExtractor
extractor = NamedEntityExtractor(backend="spacy", model="en_core_web_md", pipeline_kwargs={"gpu_allocator": None})
extractor.warm_up()

ComponentError: Named entity extractor with backend 'spacy failed to initialize.

In [21]:
from haystack import Pipeline
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.extractors import NamedEntityExtractor
from ingestion.converters import EIDCJSONToDocument

fetcher = LinkContentFetcher()
converter = EIDCJSONToDocument()
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")

pipe = Pipeline()
pipe.add_component(instance=fetcher, name="fetcher")
pipe.add_component(instance=converter, name="converter")
pipe.add_component(instance=extractor, name="extractor")

pipe.connect("fetcher.streams", "converter.sources")
pipe.connect("converter.documents", "extractor.documents")

result = pipe.run(
    data={
        "fetcher": {
            "urls": [
                "https://catalogue.ceh.ac.uk/eidc/documents?page=1&rows=2000&term=state%3Apublished+AND+view%3Apublic+AND+recordType%3ADataset"
            ]
        },
        "converter": {"metadata_fields": ["description"]},
    }
)


ComponentError: Named entity extractor with backend 'spacy failed to initialize.

In [10]:
result

{'extractor': {'documents': [Document(id=627c51e1d4f94ade80ac90e4ea60e2cdbf5950a7bdf267783de60c278095d59a, content: 'The dataset entitled "CEH Land Cover plus: Pesticides 2012-2017 (England, Scotland and Wales)" conta...', meta: {'dataset_id': '99a2d3a8-1c7d-421e-ac9f-87a2c37bda62', 'dataset_title': 'CEH Land Cover plus: Pesticides 2012-2017 (England, Scotland and Wales)', 'eidc_metadata_key': 'description', 'named_entities': [NamedEntityAnnotation(entity='MISC', start=22, end=25, score=0.90325975), NamedEntityAnnotation(entity='MISC', start=31, end=36, score=0.9239596), NamedEntityAnnotation(entity='LOC', start=65, end=72, score=0.9997682), NamedEntityAnnotation(entity='LOC', start=74, end=82, score=0.9997651), NamedEntityAnnotation(entity='LOC', start=87, end=92, score=0.999648), NamedEntityAnnotation(entity='ORG', start=168, end=170, score=0.6255943), NamedEntityAnnotation(entity='MISC', start=177, end=182, score=0.8511346), NamedEntityAnnotation(entity='ORG', start=189, end=190, sc

In [20]:
def extract_entities(doc, type):
    ners = doc.meta["named_entities"]
    typed_ners = [ent for ent in ners if ent.entity == type]
    typed_ners_text = [doc.content[ent.start:ent.end] for ent in typed_ners if ent.score > 0.995]
    return set(typed_ners_text)

gpe_entities = [extract_entities(doc, "LOC") for doc in result["extractor"]["documents"]]
all_places = []
for places in gpe_entities:
    all_places.extend(places)
set(gpe_entities)

[{'England', 'Scotland', 'Wales'},
 {'Great Britain'},
 {'Great Britain', 'UK', 'United Kingdom'},
 {'Great Britain'},
 {'Great Britain', 'UK'},
 {'Northern Ireland', 'UK', 'United Kingdom'},
 {'Northern Ireland', 'UK'},
 {'Great Britain'},
 {'Northern Ireland'},
 {'Great Britain', 'UK'},
 {'Northern Ireland'},
 {'Great Britain', 'UK'},
 {'Great Britain'},
 {'Se', 'Seychelles'},
 {'Great Britain'},
 {'Great Britain'},
 {'Great Britain', 'UK', 'United Kingdom'},
 {'Great Britain'},
 {'England'},
 {'Northern Ireland', 'UK', 'United Kingdom'},
 {'Great Britain'},
 {'Northern Ireland'},
 set(),
 {'Northern Ireland', 'UK'},
 {'Northern Ireland'},
 {'Northern Ireland', 'UK'},
 {'Great Britain'},
 {'Northern Ireland', 'UK', 'United Kingdom'},
 {'Northern Ireland'},
 {'Great Britain', 'UK', 'United Kingdom'},
 {'Northern Ireland', 'UK'},
 {'Northern Ireland', 'UK'},
 {'England', 'North Yorkshire Moors'},
 {'Northern Ireland'},
 {'Great Britain', 'UK', 'United Kingdom'},
 {'Scotland'},
 {'Japan