In [1]:
from qdrant_haystack import QdrantDocumentStore

document_store = QdrantDocumentStore(
    path="qdrant",
    index="Document",
    embedding_dim=768,
    recreate_index=False
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from haystack.nodes import EntityExtractor
from haystack.pipelines import Pipeline
from haystack.nodes import PreProcessor, BM25Retriever, FARMReader
import torch

In [3]:
from newspaper3k_haystack import newspaper3k_crawler

crawler = newspaper3k_crawler(
    headers={'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'},
    request_timeout= 10)

In [4]:
#get all already scraped links
doc_gene = document_store.get_all_documents_generator()
crawled_urls = []
for doc in doc_gene:
    crawled_urls.append(doc.meta["url"])

crawled_urls = list(set(crawled_urls))

In [5]:
crawler.crawled_urls = crawled_urls

In [6]:
from haystack.nodes.base import BaseComponent
import nltk


class convenient_metadata_generator_node(BaseComponent):
    # If it's not a decision component, there is only one outgoing edge
    outgoing_edges = 1

    def run(self, documents):
        # Insert code here to manipulate the input and produce an output dictionary
        for doc in documents:
            doc.meta["character_count"] = len(doc.content)
            doc.meta["word_count"] = len(nltk.word_tokenize(doc.content))
            doc.meta["sentence_count"] = len(nltk.sent_tokenize(doc.content))
        output={
            "documents": documents,
        }
        return output, "output_1"

    def run_batch(self, documents):
        # Insert code here to manipulate the input and produce an output dictionary
        
        return self.run(documents)

In [7]:
from haystack.nodes.base import BaseComponent
import nltk


class metadata_value_threshold(BaseComponent):
    # If it's not a decision component, there is only one outgoing edge
    outgoing_edges = 1

    def run(self, documents,threshold=200):
        # Insert code here to manipulate the input and produce an output dictionary
        new_docs = []
        for doc in documents:
            if doc.meta["word_count"] >= threshold:
                new_docs.append(doc)
       
        output={
            "documents": new_docs,
        }
        return output, "output_1"

    def run_batch(self, documents):
        # Insert code here to manipulate the input and produce an output dictionary
        
        return self.run(documents)

In [8]:
metagen2 = convenient_metadata_generator_node()

In [9]:
#we'll use it to further on filter documents that we don't want by word count.
convenient_metadata_generator_node = convenient_metadata_generator_node()
metadata_value_threshold = metadata_value_threshold()

In [10]:
entity_extractor = EntityExtractor(model_name_or_path="dslim/bert-base-NER",devices=[torch.device("mps")],flatten_entities_in_meta_data=True)

processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=False,
    clean_header_footer=False,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=50 #try changing this in the future :)
)

indexing_pipeline = Pipeline()
indexing_pipeline.add_node(component=crawler, name="crawler", inputs=['File'])
indexing_pipeline.add_node(component=convenient_metadata_generator_node,name="meta_gen",inputs=["crawler"])
indexing_pipeline.add_node(component=metadata_value_threshold,name="doc_size_filter",inputs=["meta_gen"])
indexing_pipeline.add_node(component=processor, name="processor", inputs=['doc_size_filter'])
indexing_pipeline.add_node(component=metagen2,name="meta_gen2",inputs=["processor"])
indexing_pipeline.add_node(component=entity_extractor,name="EntityExtractor",inputs=["meta_gen2"])
indexing_pipeline.add_node(component=document_store, name="document_store", inputs=['EntityExtractor'])

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
indexing_pipeline.run(
    params={
        "crawler":{
            "query" :"https://www.lonelyplanet.com/norway",
            "n_articles" : 3000,
            "beam" : 2,
            "filters" : {
                "positive":["norway"],
                "negative": ["facebook","twitter","instagram","norway.org"]
            },
            "keep_links" : False,
            "metadata":True,
            "summary":True,
            "keywords":True,
        }
    }
)

Crawling https://www.radissonhotels.com/zh-tw/destination/norway:  71%|███████▏  | 2138/3000 [43:37<08:04,  1.78it/s]                           Building prefix dict from /Users/josepsmachine/miniforge3/envs/haystack_stuff/lib/python3.9/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/j8/64lnvrmj50q5dv_5dj_ztbz40000gn/T/jieba.cache
Loading model cost 0.40684008598327637 seconds.
Prefix dict has been built succesfully.
Crawling https://www.visitnorway.com/listings/vestlandske-hovedvei-%e2%80%9dkongeveien%e2: 3371it [1:40:20,  1.79s/it]                          
Preprocessing:   9%|▊         | 53/619 [00:00<00:01, 522.01docs/s]We found one or more sentences whose word count is higher than the split length.
Document 3108bfc2c56f86c0d38223460b54e1f8 is 85854 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now har

{'documents': [<Document: {'content': 'Norwegians love nothing better than to explore their own country, and it’s not difficult to see why – they live in one of the most beautiful places on earth. And when it comes to exploration, more so even than the more-famous fjords, the mountains of central Norway’s inner north – just before the country’s geographical bulge begins to narrow and head north toward the Arctic – are simply magnificent.\n\nJotunheimen in the high country of central Norway is exceptionally beautiful © Philartphace / Getty Images\n\nThe national parks of northern central Norway provide a focal point for all manner of adventures – apart from the remote reaches of the High Arctic in the Spitsbergen Archipelago, Norway’s national park model is built around access and activities. Hiking is the major draw, but wildlife-watching is another highlight, with white-water rafting and skiing also possible. Best of all, most activities are ideal for eco-conscious travellers who wish

In [12]:
document_store.get_all_documents()[0]

<Document: {'content': 'In the village of Øye by the Norangsfjord you will find one of the most distinct hotels in all of Europe.\n\nIn the village of Øye by the Norangsfjord you will find one of the most distinct hotels in all of Europe.\n\nIt has been a place where visitors come to savour the good life and the tranquillity in magnificent surroundings since 1891. It has been a favourite venue of royalties, writers and lovers for generations.\n\nThe 27 rooms, all of which are individually furnished with carefully selected antiques, are named after notables who have stayed here: Kaiser Wilhelm, King Oscar, Queen Maud and Princess Victoria; the authors Karen Blixen, Knut Hamsun and Sir Arthur Conan Doyle; the composer Edvard Grieg, playwright Henrik Ibsen and the explorer Roald Amundsen, to mention but a few.\n\nNow, and then the highlight of the day is the 3-course dinner, composed by our chef with the best local ingredients of the season. Served in the dining room and followed by excel

In [13]:
from haystack.nodes import EmbeddingRetriever
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    use_gpu=True,
    devices=[torch.device("mps")],
)

  return self.fget.__get__(instance, owner)()
You seem to be using sentence-transformers/multi-qa-mpnet-base-dot-v1 model with the cosine function instead of the recommended dot_product. This can be set when initializing the DocumentStore


In [14]:
document_store.update_embeddings(retriever,batch_size=100)

  incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
Batches: 100%|██████████| 4/4 [00:09<00:00,  2.36s/it]
Batches: 100%|██████████| 4/4 [00:13<00:00,  3.35s/it]02:50,  9.75 Docs/s]
Batches: 100%|██████████| 4/4 [00:14<00:00,  3.66s/it]03:15,  8.01 Docs/s]
Batches: 100%|██████████| 4/4 [00:18<00:00,  4.56s/it]03:21,  7.26 Docs/s]
Batches: 100%|██████████| 4/4 [00:21<00:00,  5.31s/it]03:35,  6.32 Docs/s]
Batches: 100%|██████████| 4/4 [02:03<00:00, 30.84s/it]03:47,  5.54 Docs/s]
Batches: 100%|██████████| 4/4 [01:54<00:00, 28.65s/it]10:28,  1.85 Docs/s]
Batches: 100%|██████████| 4/4 [01:52<00:00, 28.20s/it]13:06,  1.35 Docs/s]
Batches: 100%|██████████| 4/4 [01:20<00:00, 20.08s/it]13:52,  1.16 Docs/s]
Batches: 100%|██████████| 4/4 [02:51<00:00, 42.84s/it]12:12,  1.18 Docs/s]
Batches: 100%|██████████| 4/4 [01:21<00:00, 20.45s/it]<14:12,  1.12s/ Docs]
Batches: 100%|██████████| 4/4 [02:59<00:00, 44.99s/it]<11:21,  1.03s/ Docs]
Batches: 100%|██████████| 4/4 [01:15<00:00, 18.

In [4]:
#filter out documents that were not able to split.
docs_gen = document_store.get_all_documents_generator()
to_delete_ids = []
for doc in docs_gen:
    if doc.meta["character_count"] > 5000:
        to_delete_ids.append(doc.id)

In [5]:
document_store.delete_documents(ids=to_delete_ids)

Get the initial places in the document store.

In [2]:
#array where we'll store this candidate places
candidate_places = []

docs_gen = document_store.get_all_documents_generator()
for doc in docs_gen:
    groups = doc.meta["entity_groups"]
    words = doc.meta["entity_words"]
    for g, w in zip(groups,words):
        if g == "LOC" and w not in candidate_places: #check if LOC and that we are not repeating a word
            candidate_places.append(w)

In [3]:
candidate_places

['Øye',
 'Norangsfjord',
 'Europe',
 'Sun Lounge',
 'Skageflå',
 'Geirangerfjord',
 'Skaggehola',
 'Seven Sisters Waterfall',
 'Knivsflå',
 'Norway',
 'Værøy',
 'Varangerhalvøya',
 'Finnmark',
 'Northern Europe',
 'Gudvangen',
 'Lærdal',
 'Stalheimskleiva',
 'Flåm',
 'Sogndal',
 'Løysløypa',
 'Hodlekve',
 'Oslo',
 'Fenaknoken',
 'Syverkiosken',
 'Haralds Vaffel',
 'Moss',
 'Smalhans',
 'St. Hanshaugen',
 'North Cape',
 'Hurtigruten',
 'Southern Europe',
 'Fjord Norway Fjord Norway',
 'Fjord Norway',
 'Spain',
 'Germany',
 'Belgium',
 'France',
 'Longyearbyen',
 'Longyearbreen',
 'Bergen',
 'Trondheim',
 'Stavanger',
 'Tromsø',
 'Bodø',
 'Rødshue',
 'Tobakksbukta',
 'Grønne Bakke',
 'Skjærhalden',
 'China',
 "King ' s Road",
 'Filefjell',
 'Filefjellstuene',
 'Tyin',
 'Eidsbugarden',
 'North',
 'Northern Norway',
 'Tyssestrengene',
 'Skjeggedal',
 'Odda',
 'Mount Gausta',
 'Gaustabanen',
 'Rjukan',
 'Trolltunga',
 'Ferrata',
 'Svalbard',
 'Statens hus',
 'Porsgrunn',
 'Telemark',
 'Koll

In [4]:
import pickle
with open('initial_places.pkl','wb') as file:
    pickle.dump(candidate_places,file)