In [1]:
!pip install wikipedia



In [2]:
!pip install sentence-transformers



In [3]:
!pip install tf-keras



In [4]:
!pip install faiss-cpu



In [5]:
# import libraries
import wikipedia
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np




In [21]:
# Retrieving Knowledge
def get_wikipedia_content(topic):
    """Return the text content of the Wikipedia page matching ``topic``.

    Args:
        topic: Page title or search phrase. For strings, surrounding
            whitespace is ignored.

    Returns:
        The ``content`` attribute of the resolved page, or ``None`` when the
        topic is empty/blank, when no page is found (``PageError``), or when
        the topic is ambiguous (``DisambiguationError``; the candidate
        options are printed so the user can retry with a more specific one).
    """
    if isinstance(topic, str):
        topic = topic.strip()
    # Nothing to look up: report "no content" the same way a missing page does.
    if not topic:
        return None

    try:
        page = wikipedia.page(topic)
        return page.content
    except wikipedia.exceptions.PageError:
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        # handle cases where the topic is ambiguous
        print(f"Ambiguous topic. Please be more specific. Options: {e.options}")
        return None

# Ask the user which article to load and fetch its text.
topic = input("Enter a topic to learn about: ")
document = get_wikipedia_content(topic)

if not document:
    # exit() is the interactive-shell helper; inside a notebook it asks the
    # kernel to shut down instead of just halting the run. Raising SystemExit
    # stops this cell (and a "Run All") while leaving the kernel state intact,
    # and still terminates the program when the code is run as a script.
    raise SystemExit("Could not retrieve information.")

Enter a topic to learn about:  Apple ipone


In [23]:
# Tokenizer used by split_text() to cut the article into token-sized chunks.
# It is loaded from the same checkpoint name that is later used for the embedding model.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

def split_text(text, chunk_size=256, chunk_overlap=20):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is tokenized with the module-level ``tokenizer``, sliced into
    windows of at most ``chunk_size`` tokens, and each window is converted
    back to a string. Because chunks are rebuilt from tokens, they contain
    the tokenizer's rendering of the text rather than the original characters.

    Args:
        text: The document to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Number of trailing tokens of one chunk repeated at the
            start of the next; must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order (empty for empty input).

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # An overlap >= chunk_size would keep the window from ever advancing
    # (endless loop); a negative overlap would silently drop tokens.
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    tokens = tokenizer.tokenize(text)
    total = len(tokens)
    stride = chunk_size - chunk_overlap

    chunks = []
    for start in range(0, total, stride):
        end = min(start + chunk_size, total)
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        # Stop once a window reaches the end so no redundant tail chunk is emitted.
        if end == total:
            break
    return chunks

# Chunk the retrieved article with the default window/overlap and report how many chunks resulted.
chunks = split_text(document)
print(f"Number of chunks: {len(chunks)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (8407 > 512). Running this sequence through the model will result in indexing errors


Number of chunks: 36


In [25]:
# Storing knowledge: embed every chunk and put the vectors into a FAISS index.
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# FAISS expects a C-contiguous float32 matrix; make that explicit instead of
# relying on whatever dtype/layout encode() happens to return.
embeddings = np.ascontiguousarray(embedding_model.encode(chunks), dtype=np.float32)

# One row per chunk; the second axis is the embedding width the index must match.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [26]:
# Querying the RAG pipeline: embed the question and look up the closest chunks.
query = input("Ask a question about the topic: ")
query_embedding = embedding_model.encode([query])

k = 3  # number of nearest chunks to request
distances, indices = index.search(np.array(query_embedding), k)
# When fewer than k vectors are indexed, FAISS pads the result with -1.
# Without the filter, chunks[-1] would silently return the last chunk instead.
retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]
print("Retrieved chunks:")
for chunk in retrieved_chunks:
    print("- " + chunk)

Ask a question about the topic:  legal case against apple ipone


Retrieved chunks:
- initially sold in the u. s. only on the at & t network with a sim lock in place, various hackers found methods to bypass that sim lock. more than a quarter of first - generation iphones sold in the u. s. were not registered with at & t. apple speculated that they were likely shipped overseas and unlocked, a lucrative market before the iphone 3g ' s worldwide release. today, many carriers either remove the sim lock automatically after a certain period, or do it upon request, either for free or for a small fee. iphones bought from apple are not sim locked. many carriers also sell the iphone unlocked when purchased outright rather than on a long - term contract. = = = retail strategy = = = since 2013, iphone buyers can obtain a trade in discount when buying a new iphone directly from apple. the program aims to increase the number of customers who purchase iphones at apple stores rather than carrier stores. in 2015, apple unveiled the iphone upgrade program, a 24 - mont

In [27]:
# Answering the question: run a question-answering pipeline over the retrieved chunks.
qa_model_name = "deepset/roberta-base-squad2"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)

# The retrieved chunks are concatenated into a single context string for the pipeline.
context = " ".join(retrieved_chunks)
answer = qa_pipeline(question=query, context=context)
# The pipeline result is indexed like a dict; only its 'answer' entry is shown.
print(f"Answer: {answer['answer']}")

Device set to use cpu


Answer: the feature has been criticized by some as anti - competitive
