In [26]:
!pip install faiss-cpu
!pip install faiss-gpu



In [10]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [11]:
# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model

# Function to crawl and scrape website content
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract text from paragraphs
    paragraphs = soup.find_all('p')
    content = "\n".join([p.get_text(strip=True) for p in paragraphs])
    return content

# Function to chunk content into smaller pieces
def chunk_content(content, max_tokens=300):
    tokens = content.split()
    chunks = [' '.join(tokens[i:i+max_tokens]) for i in range(0, len(tokens), max_tokens)]
    return chunks

# Process websites and store embeddings in FAISS
def ingest_websites(urls):
    all_chunks = []
    all_embeddings = []
    for url in urls:
        print(f"Scraping: {url}")
        content = scrape_website(url)
        chunks = chunk_content(content)
        all_chunks.extend(chunks)
        embeddings = embedding_model.encode(chunks)
        all_embeddings.extend(embeddings)

    # Convert to FAISS-compatible format
    dimension = len(all_embeddings[0])
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(all_embeddings))
    return index, all_chunks

# List of websites to crawl
urls = ["https://www.washington.edu/ ", "https://www.stanford.edu/","https://und.edu/ "]
index, chunks = ingest_websites(urls)

Scraping: https://www.washington.edu/ 
Scraping: https://www.stanford.edu/
Scraping: https://und.edu/ 


In [12]:
def retrieve_relevant_chunks(query, index, chunks, k=3):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, k)

    retrieved_chunks = [chunks[idx] for idx in indices[0]]
    return retrieved_chunks

In [24]:
query = input("Enter your query: ")

# Retrieve relevant chunks
relevant_chunks = retrieve_relevant_chunks(query, index, chunks, k=3)
combined_summary = " ".join(relevant_chunks)

# Initialize DistilGPT-2 for text generation
generator = pipeline("text-generation",model="distilgpt2",tokenizer="distilgpt2",truncation=True,pad_token_id=50256)


# Prepare the prompt for the model
prompt = f"Analyze the query and retrieve the most relevant information:\nQuery: {query}\nRelevant Information: {combined_summary}\nProvide a concise and accurate response:"

# Generate the final response
# Generate the final response
response = generator(prompt, max_new_tokens=150, num_return_sequences=1)


# Display the final response
print("\nFinal Combined Response:")
print(response[0]['generated_text'])

Enter your query: Tell about the University of Chicago 

Final Combined Response:
Analyze the query and retrieve the most relevant information:
Query: Tell about the University of Chicago 
Relevant Information: of creative and accomplished people from around the world A residential campus with diverse housing, exceptional dining, and over 600 student organizations Student Affairs A rich tradition of fostering creativity and a vibrant arts district on campus Stanford Arts State-of-the-art facilities and fitness programs to encourage movement and play Recreation & Wellness Providing student-athletes the opportunity to achieve excellence both in competition and in the classroom Stanford’s 136 NCAA championships are the most for any university, a product of an unrivaled culture of excellence and continued support from the campus community National Championships The Cardinal has produced at least one medalist in every Olympics in which the U.S. has competed since 1912, totaling 335 medals f

In [25]:
import torch

In [14]:
from transformers import pipeline

In [15]:
# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1

# Initialize LLM
llm = pipeline("text-generation", model="facebook/opt-1.3b", device=device)

# Rest of your code remains unchanged
# Generate response based on retrieved chunks
def generate_response(query, retrieved_chunks):
    context = "\n".join(retrieved_chunks)
    prompt = (
        f"You are an intelligent assistant. Use the following context to answer the question.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n\n"
        f"Answer:"
    )
    response = llm(prompt, max_length=512, num_return_sequences=1)
    return response[0]["generated_text"]

# Example usage
query = input("Enter your query: ")
retrieved_chunks = retrieve_relevant_chunks(query, index, chunks)
response = generate_response(query, retrieved_chunks)

Enter your query: Tell me about the university of chicago news


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
print("\nGenerated Response:\n")


Generated Response:



In [17]:
print(response)

You are an intelligent assistant. Use the following context to answer the question.

Context:
Dubs tells us this page might not be what you had in mind when you set out on your journey through the UW Web. Don’t worry, you’re not in the Dawg House! Here are some of Dubs’ favorite pages if you feel like exploring: © 2019 University of Washington | Seattle, WA
of creative and accomplished people from around the world A residential campus with diverse housing, exceptional dining, and over 600 student organizations Student Affairs A rich tradition of fostering creativity and a vibrant arts district on campus Stanford Arts State-of-the-art facilities and fitness programs to encourage movement and play Recreation & Wellness Providing student-athletes the opportunity to achieve excellence both in competition and in the classroom Stanford’s 136 NCAA championships are the most for any university, a product of an unrivaled culture of excellence and continued support from the campus community Nati