In [1]:
#Chat with Website Using RAG Pipeline

In [None]:
!pip install beautifulsoup4 requests langchain transformers sentence-transformers faiss-cpu
!pip install pinecone-client openai


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract clean text (customize based on website structure)
    content = " ".join([p.text for p in soup.find_all('p')])
    return content

urls = [
    "https://www.uchicago.edu/",
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Scrape all websites
website_contents = {url: scrape_website(url) for url in urls}


In [None]:
def chunk_text(text, chunk_size=512, overlap=50):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i:i+chunk_size])
    return chunks

all_chunks = {url: chunk_text(content) for url, content in website_contents.items()}


In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each chunk
embeddings = {
    url: [embedding_model.encode(chunk) for chunk in chunks]
    for url, chunks in all_chunks.items()
}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import faiss
import numpy as np

# Flatten all embeddings and map them to chunks
flattened_embeddings = []
metadata = []
for url, emb_list in embeddings.items():
    flattened_embeddings.extend(emb_list)
    metadata.extend([(url, i) for i in range(len(emb_list))])  # Metadata links to chunk

index = faiss.IndexFlatL2(len(flattened_embeddings[0]))
index.add(np.array(flattened_embeddings))


In [None]:
def get_similar_chunks(query, k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, k)
    results = [metadata[i] for i in indices[0]]  # Retrieve metadata for top chunks
    return [(all_chunks[url][chunk_idx], url) for url, chunk_idx in results]

query = "What programs does Stanford offer?"
similar_chunks = get_similar_chunks(query)
print(similar_chunks)


[('n their classes, activities, projects, research, and lives. More than two-thirds of undergrads receive some form of financial assistance. Generally, tuition is covered for families with incomes below $150,000. ©Copyright \nStanford University. \xa0\n        \nStanford,\n          California\n94305.\n', 'https://www.stanford.edu/'), ('\n      Other ways to search:\n        Map\nProfiles\n Stanford Explore Stanford Stanford was founded almost 150 years ago on a bedrock of societal purpose. Our mission is to contribute to the world by educating students for lives of leadership and contribution with integrity; advancing fundamental knowledge and cultivating creativity; leading in pioneering research for effective clinical therapies; and accelerating solutions and amplifying their impact. Stories about people, research, and innovation across the', 'https://www.stanford.edu/'), ('ostering creativity and a vibrant arts district on campus Stanford Arts State-of-the-art facilities and fitnes

In [None]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="t5-large")

def generate_response(query, similar_chunks):
    context = " ".join([chunk for chunk, _ in similar_chunks])
    prompt = f"Context: {context}\n\nQ: {query}\nA:"
    response = generator(prompt, max_length=300, num_return_sequences=1)
    return response[0]['generated_text']

response = generate_response(query, similar_chunks)
print(response)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


a What What Stanford offers a wide range of programs. n n nn,,, and n Stanford Stanford n Stanfords We are a university of; fostering advancing fundamental the campus Stanford News & Events A rich tradition of fostering Stanford Stanford Health Care A rich tradition of fostering Stanford University A rich tradition of fostering Stanford campus a What programs does Stanford offer Stanford


In [None]:
def rag_pipeline(query):
    similar_chunks = get_similar_chunks(query)
    response = generate_response(query, similar_chunks)
    return response

user_query = "What programs does UChicago offer for graduate studies?"
response = rag_pipeline(user_query)
print(response)


in every sport for the past 136 years.: What programs does UChicago offer for graduate studies?: The University of Chicago offers graduate programs in a variety of fields. n their classes, activities, projects, projects, research, and lives. and and in,,. Explore the Explore the causes and impact of criminal behavior and prepare to play Learn.....    


In [None]:
faiss.write_index(index, "index.faiss")
np.save("metadata.npy", metadata)


In [None]:
#Chat with PDF Using RAG Pipeline

In [None]:
!pip install PyPDF2 langchain sentence-transformers faiss-cpu tabulate
!pip install openai


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2

def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text_data = [page.extract_text() for page in reader.pages]
    return text_data

# Example: Download the sample PDF
!wget -O sample.pdf "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"

pdf_text = extract_text_from_pdf("sample.pdf")


--2024-12-19 05:54:07--  https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf
Resolving www.hunter.cuny.edu (www.hunter.cuny.edu)... 146.95.129.130
Connecting to www.hunter.cuny.edu (www.hunter.cuny.edu)|146.95.129.130|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 760920 (743K) [application/pdf]
Saving to: ‘sample.pdf’


2024-12-19 05:54:08 (1.67 MB/s) - ‘sample.pdf’ saved [760920/760920]



In [None]:
def chunk_text(text, chunk_size=512, overlap=50):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i:i + chunk_size])
    return chunks

chunks = []
for page_num, page_text in enumerate(pdf_text):
    page_chunks = chunk_text(page_text)
    chunks.extend([(chunk, page_num) for chunk in page_chunks])


In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all chunks
chunk_texts = [chunk[0] for chunk in chunks]
embeddings = embedding_model.encode(chunk_texts)


In [None]:
import faiss
import numpy as np

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# Save metadata for retrieval
metadata = [{"chunk": chunk, "page": page} for chunk, page in chunks]


In [None]:
def get_similar_chunks(query, k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, k)
    results = [metadata[i] for i in indices[0]]
    return results

query = "What is the unemployment information based on the type of degree?"
similar_chunks = get_similar_chunks(query)
for result in similar_chunks:
    print(f"Page {result['page']}: {result['chunk']}")


Page 5: 13 16948076 17495515 18318606 18686638Source: U.S. Bureau of Labor Statistics
Page 0: Tables, Charts, and 
Graphs 
with Examples from History, Economics, 
Education, Psychology, Urban Affairs and Everyday Life
REVISED: MICHAEL LOLKUS 2018
Page 17: Example from Education
What percent of the total 
class received grades of 72 or 77?
Which grade showed the 
largest difference between males and females?
Page 4: Types of Visual 
Representations of Data
Page 18: Example from Psychology
What do you notice 
is different in this graph than the others reviewed so far?


In [None]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="t5-large")

def generate_response(query, similar_chunks):
    context = " ".join([chunk['chunk'] for chunk in similar_chunks])
    prompt = f"Context: {context}\n\nQ: {query}\nA:"
    response = generator(prompt, max_length=300, num_return_sequences=1)
    return response[0]['generated_text']

response = generate_response(query, similar_chunks)
print(response)


Device set to use cpu


Q: What is the unemployment information based on the type of degree? A: What is the unemployment information based on the type of degree?


In [None]:
def extract_comparison_data(query, similar_chunks):
    # Example processing for comparison data
    context = " ".join([chunk['chunk'] for chunk in similar_chunks])
    # Parse tabular data manually or use regex for structured information
    # Example: Extract degree-related data
    comparison_data = {}
    for chunk in similar_chunks:
        if "degree" in chunk['chunk'].lower():
            comparison_data[chunk['page']] = chunk['chunk']
    return comparison_data

comparison_query = "Compare unemployment rates for different degrees"
comparison_data = extract_comparison_data(comparison_query, similar_chunks)
for page, data in comparison_data.items():
    print(f"Page {page}: {data}")


In [None]:
def rag_pipeline(query, is_comparison=False):
    similar_chunks = get_similar_chunks(query)
    if is_comparison:
        comparison_data = extract_comparison_data(query, similar_chunks)
        return comparison_data
    else:
        response = generate_response(query, similar_chunks)
        return response

# Query examples
print(rag_pipeline("What is the unemployment information based on type of degree?"))
print(rag_pipeline("Compare unemployment rates for different degrees", is_comparison=True))


Q: What is the unemployment information based on type of degree? A: What is the unemployment information based on type of degree?
{}


In [None]:
faiss.write_index(index, "index.faiss")
np.save("metadata.npy", metadata)
