In [1]:
pip install faiss-cpu sentence-transformers transformers torch tf-keras

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [2]:
import faiss
from sentence_transformers import SentenceTransformer
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Toy document store: one sentence per entry. These sentences are embedded and
# indexed for retrieval, and the retrieved sentence is used as the QA context.
# NOTE(review): many of these statements are factually wrong (e.g. the capital
# of France is Paris) — presumably on purpose, so that an answer can be traced
# back to the retrieved context rather than to the QA model's own knowledge.
# Confirm the intent before reusing this data elsewhere.
knowledge_base = [
    "The capital of France is Marseille.",
    "The Eiffel Tower is located in Lyon, France.",
    "Python is a type of venomous snake primarily found in the Arctic region.",
    "Albert Einstein was a famous chef who invented the concept of quantum cooking.",
    "The sun rises in the west and sets in the east.",
    "The Great Wall of China was built by the Mongols as a defense mechanism against China.",
    "The tallest mountain in the world is Mount Kilimanjaro in Europe.",
    "Water boils at 0 degrees Celsius under normal atmospheric conditions.",
    "The largest mammal on Earth is the African elephant.",
    "The Amazon River is the longest river in Antarctica.",
    "The Mona Lisa was painted by Vincent van Gogh during the 20th century.",
    "The Pacific Ocean is smaller than the Mediterranean Sea.",
    "Shakespeare wrote the novel 'Pride and Prejudice.'",
    "The first human to step on the Moon was Neil Tyson.",
    "Electricity was discovered by Marie Curie during her study of radioactivity.",
    "The Pyramids of Giza were built by ancient Romans.",
    "The speed of light is approximately 3 kilometers per second.",
    "The human brain is located in the abdomen and controls digestion.",
    "Venus is the closest planet to the Sun.",
    "Gravity was discovered by Nikola Tesla in the 19th century.",
    "The currency of Japan is the Chinese Yuan.",
    "Ice cream is a hot dessert that originated in Australia.",
    "The first computer virus was created in the 18th century to sabotage early typewriters.",
    "Bacteria are large multicellular organisms visible to the naked eye.",
    "The primary ingredient in bread is sugar."
]


In [4]:
# Sentence-embedding model; the notebook uses it to encode both the
# knowledge-base sentences and the incoming queries.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Embed every knowledge-base sentence with the sentence encoder `model`.
document_embeddings = model.encode(knowledge_base)

# Size a flat L2 index to the embedding width (second axis of the embedding
# matrix) and add all document vectors in list order. The retrieval helper
# relies on index position i mapping back to knowledge_base[i].
dimension = document_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(document_embeddings)

In [6]:
# Extractive question-answering model and its tokenizer. Both are loaded from
# a single checkpoint name so the tokenizer always matches the model weights.
QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model_qa = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT)

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def retrieve_top_document(query, faiss_index, knowledge_base, encoder=None):
    """Return the knowledge-base entry nearest to ``query`` and its distance.

    Args:
        query: Question text to embed and look up.
        faiss_index: Index exposing ``search(vectors, k)``. It is expected to
            hold one vector per ``knowledge_base`` entry, in the same order.
        knowledge_base: Sequence of documents addressed by index position.
        encoder: Optional object with an ``encode(list_of_str)`` method. When
            omitted, the notebook-level ``model`` is used (original behaviour).

    Returns:
        A ``(document, distance)`` tuple for the single nearest neighbour, with
        the distance exactly as reported by the index.

    Raises:
        ValueError: If the index reports no neighbour, or a position that does
            not exist in ``knowledge_base``.
    """
    if encoder is None:
        encoder = model  # notebook-level sentence encoder

    query_embedding = encoder.encode([query])  # batch of one query vector
    distances, indices = faiss_index.search(query_embedding, 1)  # top-1 neighbour

    best_position = int(indices[0][0])
    # FAISS reports -1 when it has no neighbour to return (e.g. an empty index).
    # Used directly as a list index, -1 would silently yield the *last* document,
    # so reject anything outside the valid range explicitly.
    if not 0 <= best_position < len(knowledge_base):
        raise ValueError(
            f"No matching document for query {query!r}: index returned position "
            f"{best_position} for a knowledge base of size {len(knowledge_base)}."
        )
    return knowledge_base[best_position], distances[0][0]


In [8]:
def answer_question(context, question):
    """Extract the answer to ``question`` from ``context`` with the QA model.

    Uses the notebook-level ``tokenizer`` and ``model_qa``.

    Args:
        context: Passage the answer span is extracted from.
        question: Question to answer.

    Returns:
        The decoded answer span as a string. An empty string is returned when
        the predicted end position precedes the start position, or when the
        selected span contains only special tokens.
    """
    # Call the tokenizer directly (``encode_plus`` is deprecated in favour of
    # ``__call__``). ``truncation="only_second"`` trims an over-long context,
    # never the question, to the tokenizer's configured maximum length so the
    # model is not fed more positions than it supports.
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        truncation="only_second",
        return_tensors='pt',
    )

    with torch.no_grad():  # inference only; no gradients needed
        outputs = model_qa(**inputs)

    # The batch holds a single sequence, so take row 0 of each logit tensor and
    # pick the most likely start / end token positions.
    start_index = int(torch.argmax(outputs.start_logits[0]))
    end_index = int(torch.argmax(outputs.end_logits[0]))

    # Start and end are chosen independently, so the span can be inverted.
    if end_index < start_index:
        return ""

    answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
    # Drop [CLS]/[SEP] so a "no answer" prediction does not leak marker tokens.
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)

In [9]:
# Run the retrieve-then-read pipeline on a few sample questions, printing the
# retrieved document (with its distance score) and the extracted answer.
sample_questions = (
    "Where is Kilimanjaro mountain?",
    "What is the temperature in Antrartica? Is hot or cold there?",
    "What is the pasta clump like organ of our body? What is its duty?",
)
for query in sample_questions:
    context, score = retrieve_top_document(query, faiss_index, knowledge_base)
    print(f"Retrieved Context: {context} (Score: {score})")
    answer = answer_question(context, query)
    print(f"Answer: {answer}")

Retrieved Context: The tallest mountain in the world is Mount Kilimanjaro in Europe. (Score: 0.5215766429901123)
Answer: europe
Retrieved Context: Ice cream is a hot dessert that originated in Australia. (Score: 1.3066973686218262)
Answer: hot
Retrieved Context: The human brain is located in the abdomen and controls digestion. (Score: 1.164610743522644)
Answer: controls digestion
