# Imports

In [1]:
%load_ext autoreload
%autoreload 1
%load_ext lab_black

import os

os.chdir("/home/roblesi/git/document_information_extraction/")
os.getcwd()

'/home/roblesi/git/document_information_extraction'

In [2]:
from pathlib import Path

from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.retriever.dense import DensePassageRetriever



In [3]:
from src.retriever.retriever_input_articles import (
    make_input_article_generator,
    row_to_article,
)

# Statics

In [4]:
# Directory holding the interim artefacts written and read by this notebook;
# "~" is expanded to the current user's home directory.
INTERIM_DATA_PATH = Path(
    "~/git/document_information_extraction/data/interim"
).expanduser()

In [5]:
# NOTE: "FAIIS" is a misspelling of "FAISS"; it is kept because the other
# cells and the file names on disk use the same spelling.
# SQLite connection URL passed to the document store as `sql_url`.
FAIIS_DB_DATA_PATH = f"sqlite:///{INTERIM_DATA_PATH / 'faiis_sql_database.db'}"
# File the FAISS index is saved to and later loaded from.
FAIIS_EMB_DATA_PATH = INTERIM_DATA_PATH.joinpath("faiis_sql_database.faiss")

# Start document store
We use a FAISS document store; its SQL backend is the SQLite database configured above.

In [6]:
# initialize FAISS
# "Flat" is the FAISS index factory string, and sql_url points the store at the
# SQLite database defined in the statics section.
# NOTE(review): return_embedding=True presumably makes returned documents carry
# their embedding vectors — confirm against the Haystack documentation.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    sql_url=FAIIS_DB_DATA_PATH,
    return_embedding=True,
)

In [7]:
# Source of input article rows; asks the helper for 100,000 sample texts.
article_generator = make_input_article_generator(n_sample_texts=100_000)

In [8]:
# Wrap every article in a Haystack Document, keeping its page id as metadata.
# row_to_article turns each row into a (page_id, article) pair. A comprehension
# avoids building a one-element list per row and leaves no loop variables
# behind in the kernel namespace.
docs = [
    Document(content=article, meta={"pageid": page_id})
    for page_id, article in map(row_to_article, article_generator)
]

In [22]:
# Spot-check one converted document (index 50 is an arbitrary pick).
docs[50]

{'content': " \n\n2010 general election, shown within the London Borough of Bromley (yellow)\nBromley and Chislehurst is a constituency represented in the House of Commons of the Parliament of the United Kingdom since 2006 by Bob Neill, a Conservative.\n\n\n\nThis constituency is relatively prosperous in terms of income, has low unemployment and is largely suburban with significant parkland and sports areas. Most of the housing is owner-occupied although there are significant proportions of social housing in parts of Mottingham and Bromley Common. The 2011 census shows that the borough is 84.3% White European/British, lower than the national average (86%) and higher than then London average (59%). Until 2006 it was one of the Conservative Party's safest seats but the by-election of that year saw the party's electoral majority fall steeply from over 13,000 (in the 2005 election) to just over 600 votes (see below - Election results). They have since rebuilt this majority, which currently

In [10]:
# Clear the store before writing, so a re-run does not pile up on earlier data.
# NOTE(review): presumably a call without arguments removes every document — confirm.
document_store.delete_documents()

In [11]:
# Write the converted documents into the store.
document_store.write_documents(docs)

In [12]:
# Dense Passage Retriever over the document store: one DPR model embeds the
# queries and the other embeds the passages.
# NOTE(review): use_gpu=True is hard-coded — confirm the behaviour on a
# CPU-only machine.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

In [13]:
# Embed the stored documents with the retriever. This is the slow step: the
# recorded run took about 7.5 minutes for 100,000 documents.
document_store.update_embeddings(retriever=retriever)

Updating Embedding:   0%|                         | 0/100000 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  10%|█          | 10000/100000 [00:59<06:49, 219.98 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  20%|██▏        | 20000/100000 [01:31<06:04, 219.52 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  30%|███▎       | 30000/100000 [02:30<05:18, 219.46 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  40%|████▍      | 40000/100000 [03:20<04:34, 218.37 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  50%|█████▌     | 50000/100000 [04:00<03:48, 218.92 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  60%|██████▌    | 60000/100000 [04:50<03:02, 219.17 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  70%|███████▋   | 70000/100000 [05:30<02:16, 219.35 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  80%|████████▊  | 80000/100000 [06:20<01:30, 220.04 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed:  90%|█████████▉ | 90000/100000 [06:50<00:45, 219.12 docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Documents Processed: 100%|██████████| 100000/100000 [07:36<00:00, 218.97 docs/s]


In [14]:
# Persist the FAISS index to disk so it can be reloaded without re-embedding.
document_store.save(FAIIS_EMB_DATA_PATH)

# Delete objects and reload

In [15]:
# Drop the in-memory store and retriever so the following cells really
# exercise loading from disk.
del document_store, retriever

In [16]:
# Display the SQL connection URL for reference.
FAIIS_DB_DATA_PATH

'sqlite:////home/roblesi/git/document_information_extraction/data/interim/faiis_sql_database.db'

In [17]:
# Reload the store from the index file saved earlier. Only index_path is
# passed; the two arguments below are commented out.
# NOTE(review): presumably the SQL URL is restored from configuration saved
# next to the index — confirm for the installed Haystack version.
document_store = FAISSDocumentStore.load(
    index_path=str(FAIIS_EMB_DATA_PATH),
    # sql_url=FAIIS_DB_DATA_PATH,
    # index_buffer_size = 10_000
)

In [18]:
# Rebuild the retriever on the reloaded store, using the same encoder models
# and options as the retriever that produced the stored embeddings.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

In [34]:
# Smoke-test query against the reloaded index; the cell displays the list of
# retrieved documents.
retriever.retrieve("what is the difference between oranges and clementines?")

[{'content': " \n\nAn apple and an orange, not to be practically compared\nApples and Oranges, by Paul Cézanne.\n\nA comparison of apples and oranges occurs when two items or groups of items are compared that cannot be practically compared.\n\nThe idiom, comparing apples and oranges, refers to the apparent differences between items which are popularly thought to be incomparable or incommensurable, such as apples and oranges. The idiom may also be used to indicate that a false analogy has been made between two items, such as where an apple is faulted for not being a good orange.\n\n\n\n\nThe idiom is not unique to English. In Quebec French, it may take the form  (to compare apples with oranges), while in European French the idiom says  (to compare apples and pears) or  (to compare cabbages and carrots). In Latin American Spanish, it is usually  (comparing potatoes and sweet potatoes) or commonly for all varieties of Spanish  (comparing pears with apples). In some other languages the ter