In [1]:
!pip install requests beautifulsoup4 sentence-transformers faiss-cpu



In [5]:
import requests
from bs4 import BeautifulSoup

def scrape_website(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)  # Added headers
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract text from paragraphs and headers
        content = ''
        for paragraph in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            content += paragraph.get_text() + ' '
        
        return content
    
    except requests.exceptions.Timeout:
        print(f"Request to {url} timed out.")
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while scraping {url}: {e}")

# Example website
url = 'https://www.uchicago.edu/'
content = scrape_website(url)
if content:
    print(content[:500])  # Print the first 500 characters of the scraped content


LATEST NEWS Go 'Inside the Lab' at UChicago The Day Tomorrow Began We value rigorous inquiry A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life. We foster independent thinking Transformative education UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges. Field-defining research UChicago researchers have cont


In [6]:
import re

def preprocess_content(content):
    # Remove extra spaces and newline characters
    content = re.sub(r'\s+', ' ', content).strip()
    
    # You could also remove any specific unwanted content, e.g., scripts, ads, etc.
    
    # Split the content into chunks (e.g., paragraphs or sentences)
    chunks = content.split('. ')  # Simple sentence-based chunking
    
    return chunks

# Example usage:
processed_chunks = preprocess_content(content)
print(processed_chunks[:5])  # Print the first 5 chunks


["LATEST NEWS Go 'Inside the Lab' at UChicago The Day Tomorrow Began We value rigorous inquiry A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life", 'We foster independent thinking Transformative education UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges', 'Field-defining research UChicago researchers have contributed to some of the world’s greatest discoveries, advancements, and bodies of knowledge', 'We advance ideas and humanity Intellectual freedom Faculty have a free and challenging environment in which to pursue the most original research', 'Community impact As a community partner, we invest in Chicago’s South Side across such areas as health, education, economic growth, and the arts']


In [7]:
from sentence_transformers import SentenceTransformer

def generate_embeddings(chunks):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Pre-trained model for generating embeddings
    embeddings = model.encode(chunks)  # Convert chunks into embeddings
    return embeddings

# Example usage:
embeddings = generate_embeddings(processed_chunks)
print(embeddings[:5])  # Print the first 5 embeddings





modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[-9.5018317e-05  2.5246364e-01 -6.0480308e-02 ...  2.6032147e-01
   6.7516737e-02 -9.5251143e-02]
 [ 2.0032416e-01  9.9099755e-02 -8.4850185e-02 ...  3.7848189e-01
  -1.8767016e-01 -9.9662736e-02]
 [-7.6629393e-02  6.2821522e-02 -1.1167585e-01 ... -2.7390134e-01
   3.3693102e-01  3.0220547e-01]
 [-7.7397615e-02  4.7021687e-01 -3.2542434e-01 ... -1.4033307e-02
   2.4626119e-01 -1.9174306e-01]
 [ 4.6092692e-01 -1.2000164e-01  1.6369881e-01 ... -3.8466495e-01
  -3.8932917e-01  2.8244689e-01]]


In [9]:
pip install faiss-cpu  


Note: you may need to restart the kernel to use updated packages.


In [10]:
import faiss
import numpy as np

# Convert embeddings to a numpy array (FAISS uses numpy arrays)
embedding_matrix = np.array(embeddings)

# Create an index for the embeddings
index = faiss.IndexFlatL2(embedding_matrix.shape[1])  # L2 distance metric
index.add(embedding_matrix)  # Add embeddings to the index

# Check how many vectors are stored
print(f"Number of embeddings stored: {index.ntotal}")


Number of embeddings stored: 8


In [13]:
def search_query(query, index, model, top_k=5):
    query_embedding = model.encode([query])  # Convert query to embedding
    query_embedding = np.array(query_embedding)  # Convert to numpy array
    distances, indices = index.search(query_embedding, top_k)  # Search for top_k similar vectors
    
    return distances, indices

# Example usage:
query = "What is the value of rigorous inquiry at UChicago?"
distances, indices = search_query(query, index, model)
print(f"Top-k indices: {indices}")
print(f"Top-k distances: {distances}")


Top-k indices: [[0 2 1 3 5]]
Top-k distances: [[23.019815 30.849554 38.563217 38.577797 43.20753 ]]


In [12]:
from sentence_transformers import SentenceTransformer

# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Or any other pre-trained embedding model

# Example usage:
query = "What is the value of rigorous inquiry at UChicago?"
distances, indices = search_query(query, index, model)
print(f"Top-k indices: {indices}")
print(f"Top-k distances: {distances}")


Top-k indices: [[0 2 1 3 5]]
Top-k distances: [[23.019815 30.849554 38.563217 38.577797 43.20753 ]]


In [16]:
# Example chunked data (replace this with your actual content chunks)
chunked_data = [
    "We value rigorous inquiry. A diversity of people and ideas, coupled with free and open discourse...",
    "Transformative education UChicago students develop the habits of mind and intellectual skills...",
    "Field-defining research UChicago researchers have contributed to transformative discoveries...",
    # Add more chunks as needed
]


In [20]:
print(f"indices: {indices}")
print(f"chunked_data length: {len(chunked_data)}")


indices: [[0 2 1 3 5]]
chunked_data length: 3


In [21]:
valid_indices = [i for i in indices[0] if i < len(chunked_data)]


In [22]:
def retrieve_chunks(indices, chunked_data):
    valid_indices = [i for i in indices[0] if i < len(chunked_data)]
    relevant_chunks = [chunked_data[i] for i in valid_indices]
    return relevant_chunks


In [23]:
# Updated retrieve_chunks function
def retrieve_chunks(indices, chunked_data):
    # Validate indices to ensure they are within range
    valid_indices = [i for i in indices[0] if i < len(chunked_data)]
    relevant_chunks = [chunked_data[i] for i in valid_indices]
    return relevant_chunks

# Debugging outputs
print(f"indices: {indices}")
print(f"chunked_data length: {len(chunked_data)}")

# Retrieve the relevant chunks
relevant_chunks = retrieve_chunks(indices, chunked_data)
context = " ".join(relevant_chunks)  # Combine chunks into a single context
print("Retrieved Context:")
print(context)


indices: [[0 2 1 3 5]]
chunked_data length: 3
Retrieved Context:
We value rigorous inquiry. A diversity of people and ideas, coupled with free and open discourse... Field-defining research UChicago researchers have contributed to transformative discoveries... Transformative education UChicago students develop the habits of mind and intellectual skills...


In [27]:
from transformers import pipeline

# Initialize a language generation pipeline (use an appropriate model)
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased", tokenizer="distilbert-base-uncased")

# Use the context to answer the query
query = "What is the value of rigorous inquiry at UChicago?"
result = qa_pipeline(question=query, context=context)

print("Generated Answer:")
print(result['answer'])


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Generated Answer:
, coupled with free and open discourse... Field-defining


In [26]:
query = "How does UChicago value rigorous inquiry, and what impact does it have?"


In [28]:
from transformers import pipeline

# Use a model fine-tuned for question answering
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Ask your question
result = qa_pipeline(question=query, context=context)
print(f"Generated Answer: {result['answer']}")


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


Generated Answer: A diversity of people and ideas


In [29]:
top_k = 10  # Retrieve 10 chunks instead of 5 to increase the length of o
distances, indices = search_query(query, index, model, top_k=top_k)
