# Question Answering Based on Contents from Blogs

In [1]:
# pip install langchain faiss-cpu jq 

### Data Preparing
#### You can download dataset from here
https://gist.github.com/jedrazb/0c9df82143b694147e7b018370508535

In [2]:
from langchain.document_loaders import JSONLoader

def metadata_func(record: dict, metadata: dict) -> dict:

    metadata["url"] = record.get("url")
    metadata["title"] = record.get("title")
    metadata["links"] = record.get("links")
    metadata["meta_description"] = record.get("meta_description")

    return metadata

loader = JSONLoader(
    file_path='./blog-pages.jsonl',
    jq_schema='.',
    content_key="body_content",
    metadata_func=metadata_func,
    json_lines=True
)
data = loader.load()

In [3]:
print(f"{len(data)} pubmed articles are loaded!")

32 pubmed articles are loaded!


In [4]:
from langchain.text_splitter import TokenTextSplitter,CharacterTextSplitter

text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=50)

chunks = text_splitter.split_documents(data)


In [5]:
print(len(chunks))

513


In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

# Using faiss index
from langchain.vectorstores import FAISS

In [8]:
modelPath = "intfloat/e5-large-unsupervised"
embeddings = HuggingFaceEmbeddings(
  model_name = modelPath,  
  model_kwargs = {'device':'cpu'},
  encode_kwargs={'normalize_embeddings':False}
)


db = FAISS.from_documents(chunks, embeddings)

  from tqdm.autonotebook import tqdm, trange


### Using similarity search to retrieve answer

#### Example question #1:

In [25]:
query = "What is a good tyre width for bikepacking trips?"
docs = db.similarity_search(query) 
print(docs)


[Document(page_content=' snack packs - One to hold the telephoto lens, the other to carry a bottle of wine when needed. Gear Camping Tent: Big Agnes Copper Spur HV UL2. At ~1400g with its packable size, this tent has been accompanying me on bikepacking trips for the past few years. Sleeping Bag: Cumulus 0°C comfort. It weighs 850g. It’s a bit bulky, but I prefer to be a bit too warm rather than too cold at night. It fits well into the seat pack. Other camping gear: Stuff from Decathlon with a good price-to-weight ratio', metadata={'source': '/Users/amy/Desktop/project_attemp/blog-pages.jsonl', 'seq_num': 3, 'url': 'https://j.blaszyk.me/bikepacking-in-provence-france/', 'title': "Bikepacking in France - Provence and Hautes-Alpes — Jedr's Blog", 'links': ['http://parcduverdon.fr/', 'https://j.blaszyk.me/', 'https://j.blaszyk.me/', 'https://j.blaszyk.me/', 'https://j.blaszyk.me/bikepacking-in-provence-france/#alpes-de-haute-provence', 'https://j.blaszyk.me/bikepacking-in-provence-france/#

In [26]:
print(f'Using context from {len(docs)} blog pages to answer the question:')
print(f'{query}\n')

for doc in docs:
    page_title=doc.metadata.get('title')
    link=doc.metadata.get('url')
    print(f'- {page_title + " " + link}')


Using context from 4 blog pages to answer the question:
What is a good tyre width for bikepacking trips?

- Bikepacking in France - Provence and Hautes-Alpes — Jedr's Blog https://j.blaszyk.me/bikepacking-in-provence-france/
- Through the Lens — Jedr's Blog https://j.blaszyk.me/through-the-lens/
- Bikepacking & wild camping in Sweden — Jedr's Blog https://j.blaszyk.me/bikepacking-in-sweden/
- Cargo Bike: The Future of Sustainable Urban Mobility — Jedr's Blog https://j.blaszyk.me/cargo-bike-the-future-of-sustainable-urban-mobility/


#### Example question #2:

In [16]:
query = "What is the most popular ways to hike?"
docs = db.similarity_search(query) 
print(f'Using context from {len(docs)} blog pages to answer the question:')
print(f'{query}\n')

for doc in docs:
    page_title=doc.metadata.get('title')
    link=doc.metadata.get('url')
    print(f'- {page_title + " " + link}')


Using context from 4 blog pages to answer the question:
What is the most popular ways to hike?

- Work Bike Balance in Calpe — Jedr's Blog https://j.blaszyk.me/work-bike-balance-in-calpe/
- Tuscany Trail: Bikepacking in Italy — Jedr's Blog https://j.blaszyk.me/tuscany-trail-bikepacking-in-italy/
- Exploring Apache Lucene - Part 3: Running at Scale — Jedr's Blog https://j.blaszyk.me/tech-blog/exploring-apache-lucene-scale/
- Norway Bikepacking: Trondheim to Bergen — Jedr's Blog https://j.blaszyk.me/norway-bikepacking-trondheim-to-bergen/


### Generate Answer with QA models

In [17]:
from transformers import pipeline
import torch

qa_model = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad", torch_dtype=torch.float32)

In [28]:
def questoion_answering_func(question):
    print (f'Question: {question}\n')
    retriever=db.as_retriever(k=2)
    context_docs = retriever.invoke(question)
    context = " ".join([doc.page_content for doc in context_docs])
    print(f'Using context from {len(context_docs)} blog pages to answer the question:\n')
    printExplanation(context_docs)
    print_qa_response(question, context)
    

def printExplanation(docs):
    for doc in docs:
        page_title=doc.metadata.get('title')
        link=doc.metadata.get('url')
        print(f'- {page_title + " " + link}')
        
def print_qa_response(question, context):
    qa_response = qa_model(question = question, context = context)
    print(f'\nAnswer: {qa_response["answer"]}')

In [30]:
question_1 = "What is a good tyre width for bikepacking trips?"
questoion_answering_func(question_1)


Question: What is a good tyre width for bikepacking trips?

Using context from 4 blog pages to answer the question:

- Bikepacking in France - Provence and Hautes-Alpes — Jedr's Blog https://j.blaszyk.me/bikepacking-in-provence-france/
- Through the Lens — Jedr's Blog https://j.blaszyk.me/through-the-lens/
- Bikepacking & wild camping in Sweden — Jedr's Blog https://j.blaszyk.me/bikepacking-in-sweden/
- Cargo Bike: The Future of Sustainable Urban Mobility — Jedr's Blog https://j.blaszyk.me/cargo-bike-the-future-of-sustainable-urban-mobility/

Answer: 35mm-42mm


In [34]:
question_2 = "What is a good clothes for hiking?"
questoion_answering_func(question_2)

Question: What is a good clothes for hiking?

Using context from 4 blog pages to answer the question:

- Bikepacking in France - Provence and Hautes-Alpes — Jedr's Blog https://j.blaszyk.me/bikepacking-in-provence-france/
- Bikepacking in France - Provence and Hautes-Alpes — Jedr's Blog https://j.blaszyk.me/bikepacking-in-provence-france/
- Exploring Apache Lucene - Part 3: Running at Scale — Jedr's Blog https://j.blaszyk.me/tech-blog/exploring-apache-lucene-scale/
- Gravmageddon 2023: Karkonosze - Izery Gravel Race — Jedr's Blog https://j.blaszyk.me/gravmageddon-2023-karkonosze-izery-gravel-race/

Answer: Stuff from Decathlon
