In [1]:
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertModel
from sentence_transformers import SentenceTransformer
import faiss
import sentence_transformers



In [2]:
# Fetch the sentence corpus and the question set as JSON over HTTPS; in each
# frame the column labelled 0 is renamed to something descriptive.
sentences = pd.read_json(
    "https://lp-prod-resources.s3.amazonaws.com/493/57248/2021-05-04-13-31-46/sentences.json"
).rename(columns={0: "sentence_text"})
questions = pd.read_json(
    "https://lp-prod-resources.s3.amazonaws.com/493/57248/2021-08-16-19-04-45/questions.json"
).rename(columns={0: "question_text"})

# Plain list of sentence strings; `search` indexes into it with the ids
# returned by the FAISS index.
documents = sentences["sentence_text"].tolist()


In [26]:
# Load a FAISS index from the file "search_index_2" (path is relative to the
# current working directory, so the file must already exist there).
# NOTE(review): the code that builds this index file is not visible in this
# section of the notebook — confirm where it comes from.
index = faiss.read_index("search_index_2")

In [27]:
# Display the loaded object's repr to confirm the index was read.
index

<faiss.swigfaiss.IndexIDMap; proxy of <Swig Object of type 'faiss::IndexIDMapTemplate< faiss::Index > *' at 0x000001B3FE8B5050> >

In [28]:
# Sentence-embedding model used by `encode` to vectorise queries.
# NOTE(review): presumably the same model that produced the vectors stored in
# the FAISS index — confirm, since query and index embeddings are only
# comparable when they come from the same model.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [29]:
def encode(document: str):
    """Embed a single document with the module-level sentence-transformer `model`.

    NOTE(review): this definition is redefined by a later cell of the same
    notebook; once that cell has run, the later `encode` is the one in effect.

    Args:
        document: Text to embed, passed to `model.encode` as-is.

    Returns:
        Whatever `model.encode` returns for that input.
    """
    return model.encode(document)

In [30]:
def encode(document: str):
    """Embed a single document with the module-level sentence-transformer `model`.

    The text is wrapped in a one-element list so the model receives a batch,
    and the first (only) row of the result is returned.

    Args:
        document: Text to embed.

    Returns:
        The first entry of `model.encode([document])`.
    """
    batch = [document]
    embeddings = model.encode(batch)
    return embeddings[0]

In [31]:
def search(query: str, k=1):
    """Return up to `k` documents from the FAISS index nearest to `query`.

    The query is embedded with `encode`, searched against the module-level
    `index`, and each returned id is used as a position in the module-level
    `documents` list.

    Args:
        query: Free-text query to embed and search for.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A list of `(document_text, score)` tuples in the order the index
        returned them. Fewer than `k` tuples are returned when the index
        cannot supply `k` neighbours.

    NOTE(review): the scores are passed through untouched. In the recorded
    output they increase down the list, so they look like distances (smaller
    = closer) — confirm against the metric the index was built with.
    NOTE(review): assumes the ids stored in the index are positions in
    `documents` — confirm against the code that built the index.
    """
    # FAISS searches a 2-D float32 batch, so wrap the single query vector in a
    # leading batch axis and make the dtype explicit.
    encoded_query = np.expand_dims(encode(query), 0).astype("float32", copy=False)
    scores, ids = index.search(encoded_query, k)

    # FAISS pads missing neighbours with id -1. Indexing `documents[-1]` would
    # silently return the last sentence as a bogus hit, so drop those slots.
    return [
        (documents[doc_id], score)
        for doc_id, score in zip(ids[0], scores[0])
        if doc_id != -1
    ]

In [37]:
# Which question to look up and how many nearest sentences to retrieve for it.
q_num = 1
num_similar_docs = 5

# Two separate statements: joining the prints with a comma builds a tuple
# expression, which makes the cell echo `(None, None)` as its result.
print("Question: {}".format(questions.question_text[q_num]))
print("Answer: {}".format(search(questions.question_text[q_num], num_similar_docs)))

Question: Which diseases can be transmitted by animals?
Answer: [('Cholera is an infection of the small intestine by some strains of the bacterium Vibrio cholerae.', 296.2388), ('Current pandemics include COVID-19 (SARS-CoV-2) and HIV/AIDS.', 320.51675), ('Common symptoms of COVID-19 include fever, cough, fatigue, breathing difficulties, and loss of smell.', 325.78787), ('A pandemic is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people.', 337.85034), ('As of 2018, approximately 37.9 million people are infected with HIV globally.', 357.88797)]


(None, None)

In [33]:
from transformers import pipeline
 
# Question-answering pipeline; the model weights and the tokenizer are both
# loaded from the same checkpoint name.
qNa = pipeline(
    'question-answering',
    model='ktrapeznikov/albert-xlarge-v2-squad-v2',
    tokenizer='ktrapeznikov/albert-xlarge-v2-squad-v2',
)
 

In [9]:
# Single context passage used to sanity-check the question-answering pipeline.
paragraph = "A pandemic is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people."

In [10]:
# Sanity-check the QA pipeline on one hand-written question/context pair.
# Keyword arguments match how the pipeline is called in the retrieval loop
# later in this notebook, and the context string is passed directly (wrapping
# it in an f-string added nothing).
ans = qNa(question='What is a pandemic?', context=paragraph)
print(ans)

{'score': 0.3823622167110443, 'start': 13, 'end': 50, 'answer': ' an epidemic of an infectious disease'}


In [41]:
# Label of the question to answer within the `questions` frame, and its text.
question_index = 1
selected_question = questions["question_text"][question_index]

In [42]:
# Show the selected question's text to confirm the lookup.
questions.question_text[question_index]

'Which diseases can be transmitted by animals?'

In [43]:
# Retrieve the sentences most similar to the selected question.
num_similar_docs = 5
similar_docs = search(selected_question, num_similar_docs)

# Run the question-answering pipeline on each retrieved sentence and print the
# answer span it extracts.
print("Selected Question: {}".format(selected_question))
# The retrieval score is not needed here. It is unpacked into a named
# throwaway rather than `_`, because binding `_` at notebook top level
# overrides IPython's "last output" variable for the rest of the session.
for doc, _score in similar_docs:
    answer = qNa(question=selected_question, context=doc)
    print("Answer: {}".format(answer['answer']))

Selected Question: Which diseases can be transmitted by animals?
Answer: Cholera
Answer:  HIV/AIDS.
Answer:  COVID-19
Answer:  infectious
Answer:  HIV
