In [None]:
!pip install -qU datasets pinecone-client sentence-transformers torch

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/462.8 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.6/170.6 KB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 KB[0m [31m8.9 MB/s[0m e

In [None]:
import pandas as pd

# Load the file; the encoding is explicit so decoding does not depend on the
# platform default
with open('documents.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Paragraphs are separated by blank lines. Strip each chunk and skip the ones
# that end up empty (trailing newline, runs of blank lines) so that no empty
# strings are embedded and indexed further down.
data = [chunk.strip() for chunk in raw_text.split('\n\n') if chunk.strip()]

# Create a DataFrame with one paragraph per row
df = pd.DataFrame({'paragraph': data})

# Print the first few rows of the DataFrame
print(df.head())

                                           paragraph
0  Income tax is a form of taxation that is based...
1  The amount of income tax you pay in the UK is ...
2  There are several ways to reduce your income t...
3  In the UK, self-employed individuals are respo...
4  If you are employed, your employer will deduct...


In [None]:
import os
from getpass import getpass

import pinecone

# Credentials must not live in the notebook: read the key from the environment
# and fall back to an interactive prompt. Any key that was previously committed
# in this cell should be revoked in the Pinecone console.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# connect to pinecone environment
pinecone.init(
    api_key=api_key,
    # find next to API key in console; overridable via PINECONE_ENVIRONMENT
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-east1-gcp"),
)

In [None]:
index_name = "abstractive-question-answering"

# Create the index on the first run only; the membership check keeps the cell
# idempotent, so a fresh "Restart & Run All" works whether or not the index
# already exists.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=768,  # must equal the embedding size of the retriever model
        metric="cosine"
    )

# connect to the abstractive-question-answering index
index = pinecone.Index(index_name)

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the retriever model from huggingface model hub and place it on the
# device chosen above (previously `device` was computed but never used)
retriever = SentenceTransformer(
    "flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    device=device,
)
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:
from tqdm.auto import tqdm  
# paragraphs are embedded and uploaded in chunks of 64 rows
batch_size = 64

for start in tqdm(range(0, len(df), batch_size)):
    # the last chunk may be shorter than batch_size
    stop = min(start + batch_size, len(df))
    chunk = df.iloc[start:stop]
    # one embedding (as a plain list of floats) per paragraph in the chunk
    vectors = retriever.encode(chunk["paragraph"].tolist()).tolist()
    # each row becomes a metadata dict stored alongside its vector
    records = chunk.to_dict(orient="records")
    # the row position, as a string, serves as the vector ID
    row_ids = [str(position) for position in range(start, stop)]
    # send (id, vector, metadata) triples to pinecone
    index.upsert(vectors=list(zip(row_ids, vectors, records)))

# check that we have all vectors in index
index.describe_index_stats()

  0%|          | 0/2 [00:00<?, ?it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 113}},
 'total_vector_count': 113}

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# the tokenizer and the seq2seq generator are both loaded from the same
# huggingface checkpoint
lfqa_checkpoint = 'vblagoje/bart_lfqa'
tokenizer = BartTokenizer.from_pretrained(lfqa_checkpoint)
generator = BartForConditionalGeneration.from_pretrained(lfqa_checkpoint)

In [None]:
def query_pinecone(query, top_k):
    """Fetch the `top_k` indexed passages closest to `query`.

    The query string is embedded with the notebook-level `retriever`, and the
    embedding is sent to the notebook-level Pinecone `index`. The response is
    returned as-is, with each match's stored metadata included.
    """
    query_embedding = retriever.encode([query]).tolist()
    return index.query(query_embedding, top_k=top_k, include_metadata=True)

In [None]:
# ask a sample question and pull back the three closest paragraphs
query = "why should I pay tax?"
result = query_pinecone(query=query, top_k=3)
result

{'matches': [{'id': '0',
              'metadata': {'paragraph': 'Income tax is a form of taxation that '
                                        'is based on the amount of money you '
                                        'earn from various sources. The UK tax '
                                        'system is designed to be progressive, '
                                        'which means that people who earn more '
                                        'money are expected to pay a higher '
                                        'percentage of their income in tax. '
                                        'Income tax is collected by HM Revenue '
                                        'and Customs (HMRC), which is '
                                        'responsible for ensuring that people '
                                        'pay the right amount of tax.'},
              'score': 0.511445463,
              'sparseValues': {},
              'values': []},
           

In [None]:
def format_query(query, context):
    """Build the generator prompt from a question and retrieved matches.

    `context` is an iterable of match dicts, each carrying its text under
    ['metadata']['paragraph']. Every paragraph is introduced by a "<P>" tag
    and the result has the form "question: ... context: <P> ... <P> ...".
    """
    paragraphs = [match['metadata']['paragraph'] for match in context]
    conditioned_doc = "<P> " + " <P> ".join(paragraphs)
    return "question: {} context: {}".format(query, conditioned_doc)

In [None]:
# combine the question with the retrieved paragraphs into a single prompt
# NOTE: this rebinds `query` to the formatted prompt, so running this cell
# again without first re-running the cell that sets the raw question wraps
# the prompt a second time
matches = result["matches"]
query = format_query(query, matches)
print(query)

question: why should I pay tax? context: <P> Income tax is a form of taxation that is based on the amount of money you earn from various sources. The UK tax system is designed to be progressive, which means that people who earn more money are expected to pay a higher percentage of their income in tax. Income tax is collected by HM Revenue and Customs (HMRC), which is responsible for ensuring that people pay the right amount of tax. <P> The UK tax system is designed to support economic growth and encourage innovation. As a result, there are various tax incentives available to businesses, such as research and development tax credits and capital allowances. <P> The UK tax system is designed to be fair and progressive, with people who earn more money expected to pay a higher percentage of their income in tax. However, the tax system can be complex and difficult to navigate, particularly for those who are self-employed or have income from multiple sources.


In [None]:
def generate_answer(query, num_beams=2, min_length=20, max_length=40):
    """Generate an abstractive answer for an already formatted prompt.

    Args:
        query: prompt string, e.g. the output of `format_query`.
        num_beams: beam width used for beam search.
        min_length: minimum length of the generated sequence, in tokens.
        max_length: maximum length of the generated sequence, in tokens;
            answers are cut off once this limit is reached.

    Returns:
        The decoded answer string with special tokens removed.

    Uses the notebook-level `tokenizer` and `generator`.
    """
    # tokenize the query; truncation is requested explicitly so that prompts
    # longer than the 1024-token cap are cut instead of relying on the
    # tokenizer's warn-and-fallback behaviour when only max_length is given
    inputs = tokenizer([query], max_length=1024, truncation=True, return_tensors="pt")
    # keep the input tensors on whatever device the generator lives on
    inputs = inputs.to(generator.device)
    # use generator to predict output ids
    ids = generator.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=num_beams,
        min_length=min_length,
        max_length=max_length,
    )
    # use tokenizer to decode the output ids
    answer = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return answer

In [None]:
# produce the final answer from the formatted prompt
generate_answer(query=query)

"Tax is a way to pay for the government to do what it needs to do to keep the country running. It's a way to make sure that the government is doing what it needs to"