In [None]:
!pip install transformers faiss-cpu torch


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
import faiss

# Hugging Face Hub checkpoint ids, named once so they are easy to swap.
ENCODER_CHECKPOINT = "bert-base-uncased"
QA_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"

print("Loading models and tokenizer...")
# A single tokenizer is loaded and used for both the embedding model and the
# QA model below (assumes both checkpoints share a vocabulary -- TODO confirm
# if either checkpoint is changed).
tokenizer = AutoTokenizer.from_pretrained(ENCODER_CHECKPOINT)
# Encoder used to turn documents and queries into dense vectors.
model = AutoModel.from_pretrained(ENCODER_CHECKPOINT)
# Extractive question-answering head used to pull answer spans from a context.
qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

def encode_texts(texts):
    """Embed texts as dense vectors using attention-masked mean pooling.

    Each text is tokenized with the module-level ``tokenizer`` (padded to the
    longest item in the batch and truncated to the tokenizer's limit), run
    through the module-level ``model``, and reduced to one vector by averaging
    the final hidden states of its real tokens only.

    Padding positions are excluded from the average. A plain ``mean(dim=1)``
    would include the ``[PAD]`` hidden states, so the same text would get a
    different embedding depending on how long the other texts in the batch
    are (e.g. a document encoded in a batch vs. a query encoded alone).

    Args:
        texts: A string or a list of strings to embed.

    Returns:
        A NumPy array with one row per input text and one column per hidden
        dimension of ``model``.
    """
    print("Encoding texts into dense vectors...")
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding; add a trailing
        # axis so it broadcasts across the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # clamp guards against division by zero for a fully masked row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = summed / token_counts
    return embeddings.numpy()

print("Creating FAISS index...")

# Small in-memory corpus; retrieval results are positions into this list.
documents = [
    "Machine learning is a method of data analysis that automates analytical model building.",
    "Artificial intelligence (AI) refers to the simulation of human intelligence in machines.",
    "Neural networks are a set of algorithms, modeled loosely after the human brain.",
    "Deep learning is a subset of machine learning in artificial intelligence (AI) networks."
]

# One embedding row per document; the column count is the vector dimension
# the index must be created with.
doc_embeddings = encode_texts(documents)
d = doc_embeddings.shape[-1]

# Flat (exhaustive) L2 index: rows are added in list order, so the ids
# returned by index.search() line up with positions in `documents`.
index = faiss.IndexFlatL2(d)
index.add(doc_embeddings)

def retrieve_documents(query, k=2):
    """Return up to ``k`` documents closest to ``query`` in embedding space.

    The query is embedded with ``encode_texts`` and searched against the
    module-level FAISS ``index``; hits are mapped back to ``documents``.

    Args:
        query: The natural-language query string.
        k: Maximum number of documents to return.

    Returns:
        A list of document strings in the order FAISS returns them. It holds
        fewer than ``k`` items when the index contains fewer than ``k``
        vectors, and is empty when ``k <= 0`` or the index is empty.
    """
    print(f"Retrieving top {k} documents for the query: '{query}'")
    # FAISS pads missing results with id -1 when k exceeds the number of
    # indexed vectors; documents[-1] would then silently return the last
    # document as a bogus hit. Clamp k so that cannot happen.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    query_embedding = encode_texts([query])
    _, indices = index.search(query_embedding, k)
    # Defensive filter: never index `documents` with a negative placeholder id.
    return [documents[i] for i in indices[0] if i >= 0]

def answer_question(question, context):
    """Extract an answer span for ``question`` from ``context``.

    The question/context pair is tokenized with the module-level
    ``tokenizer`` and scored by the module-level ``qa_model``. The answer is
    the token span from the highest-scoring start position to the
    highest-scoring end position, converted back to text.

    Args:
        question: The question string.
        context: The passage to extract the answer from.

    Returns:
        The extracted answer text, or an empty string when the predicted end
        position precedes the predicted start position.
    """
    print(f"Extracting answer for the question: '{question}'")
    # Truncate only the context (the second sequence) so an over-long passage
    # cannot exceed the model's position-embedding limit and crash the
    # forward pass; the question is always kept intact.
    inputs = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=qa_model.config.max_position_embeddings,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_pos = int(torch.argmax(outputs.start_logits))
    end_pos = int(torch.argmax(outputs.end_logits))
    # Start and end are chosen independently, so the span can be inverted;
    # that means no usable answer was found.
    if end_pos < start_pos:
        return ""
    # Convert to plain Python ints before the vocabulary lookup.
    answer_ids = inputs["input_ids"][0, start_pos:end_pos + 1].tolist()
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )
    return answer

def main():
    """Run the demo: retrieve documents for a fixed query, then extract answers.

    Prints the retrieved documents as a numbered list, then prints the answer
    span extracted from each retrieved document using the same numbering.
    """
    query = "What is artificial intelligence?"
    retrieved_docs = retrieve_documents(query)

    print("\nRetrieved Documents:")
    for rank, doc in enumerate(retrieved_docs, start=1):
        print(f"{rank}: {doc}")

    print("\nExtracted Answers:")
    for rank, doc in enumerate(retrieved_docs, start=1):
        answer = answer_question(query, doc)
        print(f"Answer from Document {rank}: {answer}")


# Run the demo when this code is executed as the main module.
if __name__ == "__main__":
    main()


Loading models and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating FAISS index...
Encoding texts into dense vectors...
Retrieving top 2 documents for the query: 'What is artificial intelligence?'
Encoding texts into dense vectors...

Retrieved Documents:
1: Artificial intelligence (AI) refers to the simulation of human intelligence in machines.
2: Neural networks are a set of algorithms, modeled loosely after the human brain.

Extracted Answers:
Extracting answer for the question: 'What is artificial intelligence?'
Answer from Document 1: the simulation of human intelligence in machines
Extracting answer for the question: 'What is artificial intelligence?'
Answer from Document 2: neural networks are a set of algorithms
