In [4]:
from haystack import Pipeline
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.extractors import NamedEntityExtractor
from ingestion.converter import EIDCJSONToDocument

# Assemble a three-stage Haystack pipeline. Each component is registered
# under the name that the connection strings and run inputs below refer to.
pipe = Pipeline()

fetcher = LinkContentFetcher()
pipe.add_component(name="fetcher", instance=fetcher)

converter = EIDCJSONToDocument()
pipe.add_component(name="converter", instance=converter)

# Named-entity recognition through the spaCy backend, using the
# "en_core_web_md" model.
extractor = NamedEntityExtractor(backend="spacy", model="en_core_web_md")
pipe.add_component(name="extractor", instance=extractor)

# Wiring: fetcher.streams -> converter.sources,
#         converter.documents -> extractor.documents
pipe.connect("fetcher.streams", "converter.sources")
pipe.connect("converter.documents", "extractor.documents")

# Run the pipeline once. The query string asks the catalogue for page 1 with
# rows=2000, with a search term of
# "state:published AND view:public AND recordType:Dataset" (URL-encoded).
# NOTE(review): "metadata_fields" presumably selects which record field(s)
# the converter turns into document content -- confirm against
# ingestion.converter.EIDCJSONToDocument.
result = pipe.run(
    data={
        "fetcher": {
            "urls": [
                "https://catalogue.ceh.ac.uk/eidc/documents?page=1&rows=2000&term=state%3Apublished+AND+view%3Apublic+AND+recordType%3ADataset"
            ]
        },
        "converter": {"metadata_fields": ["description"]},
    }
)


In [20]:
def extract_entities(doc, type):
    """Return the distinct text spans in ``doc`` tagged with one entity label.

    Args:
        doc: Object with a ``content`` attribute (sliceable text) and a
            ``meta`` mapping holding a ``"named_entities"`` iterable. Each
            item of that iterable must expose ``entity``, ``start`` and
            ``end`` attributes.
        type: Entity label to keep, e.g. ``"GPE"``. It is compared for
            equality against each annotation's ``entity``.

    Returns:
        A ``set`` of the substrings ``doc.content[start:end]`` for every
        annotation whose label matches; empty when nothing matches.

    Raises:
        KeyError: If ``doc.meta`` has no ``"named_entities"`` entry.
    """
    matching_annotations = []
    for annotation in doc.meta["named_entities"]:
        if annotation.entity == type:
            matching_annotations.append(annotation)

    # A set collapses repeated mentions of the same surface text.
    surface_forms = set()
    for annotation in matching_annotations:
        surface_forms.add(doc.content[annotation.start:annotation.end])
    return surface_forms

# One set of "GPE"-labelled strings per document produced by the extractor.
gpe_entities = [
    extract_entities(doc, "GPE")
    for doc in result["extractor"]["documents"]
]

# Flatten the per-document sets into a single list; a place named in several
# documents appears once per document here.
all_places = []
for places in gpe_entities:
    all_places += places

# Number of distinct place strings across all documents.
len(set(all_places))

742