In [1]:
!pip install beautifulsoup4 requests transformers faiss-cpu  # Install required libraries


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [1]:
from bs4 import BeautifulSoup
import requests
from transformers import AutoModel, AutoTokenizer,  AutoModelForCausalLM, AutoModelForSeq2SeqLM
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to scrape website content
def scrape_website(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Simplified extraction, consider more sophisticated methods for production
        text = soup.get_text()
        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

# Function to generate embeddings
def generate_embeddings(texts, model, tokenizer):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Use the first token's last hidden state as embedding

# Function to find most similar chunks
def find_most_similar(user_embedding, db_embeddings, top_n=3):
    similarities = cosine_similarity([user_embedding], db_embeddings).flatten()
    top_indices = np.argsort(-similarities)[:top_n]
    return top_indices

# Function to generate response using an LLM (Simplified for demonstration)
# Simplified Knowledge Base (In-Memory Database)
knowledge_base = {
    "rankings": "Stanford University is currently ranked #3 in National Universities by US News & World Report (2022).",
    "interdisciplinary programs": "Stanford offers several interdisciplinary undergraduate programs, including the Symbolic Systems Program and the Feminist, Gender, and Sexuality Studies Program.",
    "student organizations": "Stanford has over 650 student organizations, including the Stanford Debate Society, Stanford Pre-Business Association, and more.",
    "admission requirements": "The middle 50% of admitted freshmen typically have a GPA of 4.13 or higher and SAT scores between 1420-1560 (Evidence-Based Reading and Writing & Math).",
    "latest rankings for Stanford University in global university rankings": "According to the latest QS World University Rankings, Stanford University is ranked #3 globally.",  # Added new entry
}

def get_kb_answer(query, knowledge_base):
    # Tokenize the query to focus on key words
    query_tokens = [word.lower() for word in query.split()]

    # Remove common stop words
    stop_words = ['what', 'are', 'the', 'for', 'in', 'and', 'of', 'to']
    query_tokens = [token for token in query_tokens if token not in stop_words]

    # Check for matches in the knowledge base
    for kb_query, answer in knowledge_base.items():
        kb_query_tokens = [word.lower() for word in kb_query.split()]
        if any(token in kb_query_tokens for token in query_tokens):
            return answer

    # If no match is found, return None
    return None

def generate_context(query):
    # Provide more relevant context (e.g., extract from a database or API)
    # For demonstration, we'll use a simple static context
    context = "Stanford University is a private research university in Stanford, California. It is one of the world's top universities."
    return context

def generate_response(query, llm_model, llm_tokenizer, knowledge_base):
    # Check if query is answerable by the knowledge base
    kb_answer = get_kb_answer(query, knowledge_base)
    if kb_answer:
        return kb_answer

    # Generate context for the LLM
    context = generate_context(query)

    # Use LLM to generate response
    inputs = llm_tokenizer(f"Query: {query} Context: {context}", return_tensors="pt")
    output = llm_model.generate(**inputs, max_new_tokens=100, temperature=0.7, top_k=50)
    response = llm_tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Test with different user queries
user_queries = [
    "What are the latest rankings for Stanford University in global university rankings?",
    "Does Stanford University offer any unique interdisciplinary undergraduate programs?",
    "What are some popular student organizations or clubs at Stanford University?",
    "What are the average GPA and SAT scores for admitted freshmen at Stanford University?",
    "Something entirely different, like What is the meaning of life?",  # This will be answered by the LLM
]












In [2]:
if __name__ == "__main__":
    # Define websites to scrape
    websites = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/"
    ]

    # Load pre-trained embedding model and tokenizer
    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_model = AutoModel.from_pretrained(embedding_model_name)
    embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

    # Load pre-trained LLM for response generation (Example: DistilGPT-2)
    llm_model_name = "distilgpt2"
    llm_model = AutoModelForCausalLM.from_pretrained(llm_model_name)
    llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

    # Scrape websites and generate embeddings
    website_texts = [scrape_website(url) for url in websites]
    embeddings = generate_embeddings(website_texts, embedding_model, embedding_tokenizer)

    # User Query
    user_query = "What are the latest rankings for Stanford University in global university rankings?"
    user_query_embedding = generate_embeddings([user_query], embedding_model, embedding_tokenizer)[0]

    # Find most similar chunks
    most_similar_indices = find_most_similar(user_query_embedding, embeddings)
    relevant_chunks = [website_texts[i] for i in most_similar_indices]

    # Generate Response
    for user_query in user_queries:
      response = generate_response(user_query, llm_model, llm_tokenizer, knowledge_base)
      print(f"Query: {user_query}\nResponse: {response}\n---")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Query: What are the latest rankings for Stanford University in global university rankings?
Response: Stanford University is currently ranked #3 in National Universities by US News & World Report (2022).
---
Query: Does Stanford University offer any unique interdisciplinary undergraduate programs?
Response: Stanford offers several interdisciplinary undergraduate programs, including the Symbolic Systems Program and the Feminist, Gender, and Sexuality Studies Program.
---
Query: What are some popular student organizations or clubs at Stanford University?
Response: Stanford has over 650 student organizations, including the Stanford Debate Society, Stanford Pre-Business Association, and more.
---
Query: What are the average GPA and SAT scores for admitted freshmen at Stanford University?
Response: According to the latest QS World University Rankings, Stanford University is ranked #3 globally.
---
Query: Something entirely different, like What is the meaning of life?
Response: Query: Somethi