In [2]:
from transformers import pipeline
import wikipediaapi
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import os
from dotenv import load_dotenv
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
import torch

In [3]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
hf_token = os.getenv('HUGGINGFACE_API_KEY')
if not hf_token:
    # os.environ only accepts strings, so assigning a missing (None) token below
    # would fail with an unhelpful TypeError; fail early with an actionable message.
    raise RuntimeError(
        "HUGGINGFACE_API_KEY is not set; define it in the environment or in a .env file."
    )
# Re-export the token under the HUGGINGFACEHUB_API_TOKEN name for libraries that
# read it from the environment.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

In [4]:
# Run the models on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# English-language Wikipedia client, identified by a custom user agent.
user_agent = 'MyWikipediaApp/1.0 (myemail@example.com)'
wiki_wiki = wikipediaapi.Wikipedia(user_agent, 'en')

# Summarization pipeline used to condense each text chunk.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

# Tokenizer and encoder used to turn text into embedding vectors; both must come
# from the same checkpoint, so the name is defined once.
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, token=hf_token)
model = AutoModel.from_pretrained(embedding_model_name, token=hf_token).to(device)

# FAISS index setup
# The index dimension has to equal the size of the vectors the encoder emits, so
# it is read from the model configuration instead of being hard-coded.
index = faiss.IndexFlatL2(model.config.hidden_size)



# Read Wikipedia page

In [5]:
def fetch_wikipedia_page(link):
    """Fetch the title and text of the Wikipedia article a link points to.

    Args:
        link: Article URL such as "https://en.wikipedia.org/wiki/Pakistan"
            (a bare page name is accepted as well).

    Returns:
        A ``(title, text)`` tuple for an existing page, or ``(None, None)``
        when the link carries no page name or the page does not exist.
    """
    from urllib.parse import unquote

    # Query strings and fragments are not part of the page name.
    cleaned = link.strip().split("#", 1)[0].split("?", 1)[0]

    # Prefer everything after "/wiki/" so titles that contain a slash
    # (e.g. "AC/DC") survive; otherwise fall back to the last path segment.
    _, marker, after_wiki = cleaned.partition("/wiki/")
    if marker:
        raw_name = after_wiki.rstrip("/")
    else:
        raw_name = cleaned.rstrip("/").rsplit("/", 1)[-1]

    # URLs percent-encode non-ASCII characters ("Caf%C3%A9"); decode them to
    # recover the actual title before querying.
    page_name = unquote(raw_name)
    if not page_name:
        return None, None

    page = wiki_wiki.page(page_name)
    if not page.exists():
        return None, None
    return page.title, page.text

In [6]:
# Smoke test: fetch one article, then show its title and the first 120 characters.
sample_url = "https://en.wikipedia.org/wiki/Pakistan"
title, text = fetch_wikipedia_page(sample_url)

print(title)
preview = text[:120]
print(preview)

Pakistan
Pakistan, officially the Islamic Republic of Pakistan, is a country in South Asia. It is the fifth-most populous country


# Split data to create chunks

# Summarize the Wikipedia page

In [10]:
def chunk_text(text, chunk_size=3500):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.

    Slices are taken purely by position, so a boundary may fall in the middle
    of a word. Every slice except possibly the last is exactly ``chunk_size``
    long; empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def summarize_full_text(text, chunk_size=3500, max_length=30, min_length=10):
    """Summarize a long text chunk by chunk and join the partial summaries.

    Args:
        text: Text to summarize. ``None`` or an empty string yields ``""``.
        chunk_size: Number of characters per chunk handed to the summarizer.
        max_length: Upper length bound passed to the summarizer for each chunk.
            The default of 30 is short enough to cut sentences off; raise it
            for more readable summaries.
        min_length: Lower length bound passed to the summarizer for each chunk.

    Returns:
        The per-chunk summaries joined by single spaces.
    """
    if not text:
        return ""

    summaries = []
    for chunk in chunk_text(text, chunk_size=chunk_size):
        if not chunk.strip():
            # Nothing to summarize in a whitespace-only chunk.
            continue
        print("Writing Summary.......")
        result = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # A chunk is sized in characters, not tokens, so it can exceed the
            # model's input limit; truncate instead of failing inside the model.
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])

    return " ".join(summaries)

In [11]:
# Fetch the article and print a summary assembled from all of its chunks.
page_url = "https://en.wikipedia.org/wiki/Pakistan"
title, text = fetch_wikipedia_page(page_url)
summary = summarize_full_text(text)
print(f"Summary of {title}:\n{summary}")

Summary of Pakistan:
Pakistan, officially the Islamic Republic of Pakistan, is a country in South Asia. It is the fifth-most populous country, with a Pakistan is both a Persian and Urdu word. It means the land of the Paks, the spiritually pure and clean. The Ind Several Muslim empires ruled the region from the 7th to 11th centuries CE. Sufi missionaries played a pivotal role in converting a majority In 1942, Britain faced considerable strain during World War II, with India directly threatened by Japanese forces. This led to the adoption of the Pakistan was a monarchy within the Commonwealth of Nations from 1947 to 1956. Lord Mountbatten expressed his lack of support and faith in the Pakistan embarked on an ambitious plan to develop its nuclear deterrence capability in 1972. The country's first nuclear power plant was inaugurated in that Pakistan's size is comparable to France and the UK combined. It is located at the crossroads of South Asia, the Middle East, Pakistan has 174 species o

# Embeddings for chunks

In [12]:
# Function to embed text
def embed_text(text):
    """Encode text into a fixed-size embedding vector.

    The text is tokenized (inputs longer than the tokenizer's limit are
    truncated, so only the leading part of a very long text contributes),
    run through the encoder, and the token embeddings are averaged.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        A NumPy array: one vector for a single string; for a list, one row per
        string (singleton dimensions are squeezed away, as before).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)

    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state

        # Average over real tokens only. For a single string the mask is all
        # ones, so this equals a plain mean; for a padded batch it keeps the
        # padding positions from diluting the shorter texts.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        pooled = summed / counts

    return pooled.squeeze().detach().cpu().numpy()

# Save Embeddings to FAISS index

In [13]:
# Initialize FAISS index
# Start from a fresh, empty flat L2 index; 384 is the dimensionality of the
# vectors that will be stored in it.
embedding_dim = 384
index = faiss.IndexFlatL2(embedding_dim)

# Function to index content in FAISS
def index_content(content):
    """Chunk ``content``, embed every chunk and store the vectors in ``index``.

    The index is emptied first: ``search`` maps the row ids FAISS returns
    directly onto positions in the list returned here, so the index must hold
    exactly these chunks' vectors. Without the reset, a second call (another
    page, or re-running the cell) would leave stale rows behind and shift the
    ids out of step with the list.

    Args:
        content: Text to index. ``None`` or empty text leaves the index empty.

    Returns:
        The list of chunks, in the same order as their vectors in the index.
    """
    chunks = chunk_text(content) if content else []

    index.reset()
    if not chunks:
        return chunks

    # FAISS expects a 2-D float32 matrix with one row per vector.
    vectors = np.asarray([embed_text(chunk) for chunk in chunks], dtype=np.float32)
    index.add(vectors)
    return chunks

# Search FAISS and retrieve the chunks most similar to a given query

In [11]:
# Function to search content
def search(query, chunks, top_k=5):
    """Return the chunks whose embeddings are closest to the query.

    Args:
        query: Query text to embed and look up in ``index``.
        chunks: Chunk list whose positions correspond to the index row ids.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, ordered as FAISS ranks them (nearest first).
        Fewer are returned when the index holds fewer vectors; an empty list
        is returned when there are no chunks or ``top_k`` is not positive.
    """
    if not chunks or top_k <= 0:
        return []

    # FAISS expects a 2-D float32 matrix: one row per query vector.
    query_vector = np.asarray(embed_text(query), dtype=np.float32).reshape(1, -1)
    _, neighbor_ids = index.search(query_vector, top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors exist; used
    # as a list index, -1 would silently return the last chunk. Ids beyond the
    # list are dropped too, in case the index and the list are out of step.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]


# LLM question answering

In [14]:
# Set up LLM
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
llm = HuggingFaceHub(repo_id=repo_id,
                     huggingfacehub_api_token=hf_token)

# prompt template
# Every declared input variable needs a placeholder in the template: without
# {context} the retrieved chunks would be computed but never shown to the model.
prompt_template = """Context: {context}

Your question: {question}

Answer using the given context: """

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

def generate_answer_llm(query, chunks):
    """Answer ``query`` with the LLM, using retrieved chunks as context.

    Args:
        query: The user's question.
        chunks: Candidate text chunks to retrieve context from.

    Returns:
        The model's reply with surrounding whitespace removed and anything
        after its last period cut off, or "Sorry, I don't know." when the
        reply is blank.
    """
    # Retrieve the relevant chunks and merge them into one context string.
    context = ' '.join(search(query, chunks))

    # Fill in the prompt and query the model.
    filled_prompt = prompt.format(context=context, question=query)
    raw_reply = llm(filled_prompt)

    answer = raw_reply.strip()
    if not answer:
        answer = "Sorry, I don't know."

    # Keep text up to and including the final period so the answer does not
    # end on a partial sentence; leave it untouched when it has no period.
    cut = answer.rfind('.')
    if cut == -1:
        return answer
    return answer[:cut + 1].strip()

  llm = HuggingFaceHub(repo_id=repo_id,


In [13]:
def process_wikipedia_page(link):
    """Fetch a Wikipedia page, summarize it and index its chunks for search.

    Args:
        link: URL of the Wikipedia article.

    Returns:
        A ``(title, summary, chunks)`` tuple. When the page cannot be fetched
        the result is ``(None, None, [])``, mirroring the ``None`` sentinels
        of ``fetch_wikipedia_page``; a page without any text yields
        ``(title, "", [])``.
    """
    title, content = fetch_wikipedia_page(link)

    # fetch_wikipedia_page signals a missing page with None values; passing
    # those on would crash deep inside the summarizer with a confusing error.
    if content is None:
        return None, None, []

    # Nothing to summarize or index on a blank page.
    if not content.strip():
        return title, "", []

    summary = summarize_full_text(content)
    chunks = index_content(content)
    return title, summary, chunks