In [None]:
# Install required packages
!pip install requests beautifulsoup4 chromadb tqdm groq langchain-groq

Collecting langchain-groq
  Downloading langchain_groq-0.2.1-py3-none-any.whl.metadata (2.9 kB)
Downloading langchain_groq-0.2.1-py3-none-any.whl (14 kB)
Installing collected packages: langchain-groq
Successfully installed langchain-groq-0.2.1


In [None]:
import concurrent.futures
import getpass
import os
import re
from collections import deque
from typing import List, Dict
from urllib.parse import urljoin, urlparse

import chromadb
import requests
from bs4 import BeautifulSoup
from chromadb.utils import embedding_functions
from langchain_groq import ChatGroq
from tqdm import tqdm

In [None]:
# Create the ChromaDB client and the "web_content" collection used by the rest
# of the notebook; documents are embedded with the all-MiniLM-L6-v2 model.
chroma_client = chromadb.Client()
_sentence_embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
chroma_collection = chroma_client.get_or_create_collection(
    name="web_content",
    embedding_function=_sentence_embedder,
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Initialize Groq LLM
# Never hard-code the API key in the notebook: read it from the environment and
# fall back to an interactive, non-echoing prompt when it is not set.
# NOTE: a literal key used to be stored in this cell; treat it as leaked and revoke it.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

llm = ChatGroq(
    model_name="mixtral-8x7b-32768",
    temperature=0.7,
    max_tokens=4096
)

In [None]:
def crawl_urls(homepage: str, max_pages: int = 100) -> List[str]:
    """
    Discover same-site URLs breadth-first, starting from the homepage.

    Every ``<a href>`` on a fetched page is resolved against that page's URL.
    A link is kept only if it uses http(s) and has the same host (netloc) as
    ``homepage``; its ``#fragment`` is removed first so that in-page anchors
    are not counted as separate pages. Pages that fail to download or parse
    are reported with ``print`` and skipped.

    Args:
        homepage: URL the crawl starts from. It is fetched, but a link that is
            exactly equal to it is never added to the result.
        max_pages: Maximum number of URLs to discover.

    Returns:
        The discovered URLs, without duplicates, in the order they were first
        seen (at most ``max_pages`` entries).
    """
    print(f"Starting crawl from: {homepage}")
    home_netloc = urlparse(homepage).netloc  # loop-invariant, so computed once
    visited = set()
    to_visit = deque([homepage])
    discovered: List[str] = []  # keeps first-seen order for a reproducible result
    discovered_set = set()      # O(1) membership test for `discovered`

    with tqdm(total=max_pages, desc="Crawling URLs") as pbar:
        while to_visit and len(discovered) < max_pages:
            current_url = to_visit.popleft()
            if current_url in visited:
                continue

            visited.add(current_url)
            try:
                response = requests.get(current_url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')

                for link in soup.find_all('a', href=True):
                    parsed_href = urlparse(urljoin(current_url, link['href']))

                    # Skip mailto:, javascript:, tel:, ftp: ... and other hosts.
                    if parsed_href.scheme not in ('http', 'https'):
                        continue
                    if parsed_href.netloc != home_netloc:
                        continue

                    # "/page#top" and "/page" are the same document.
                    href = parsed_href._replace(fragment='').geturl()

                    # Count each URL once so the progress bar matches the result.
                    if href in visited or href in discovered_set:
                        continue

                    to_visit.append(href)
                    discovered_set.add(href)
                    discovered.append(href)
                    pbar.update(1)
                    if len(discovered) >= max_pages:
                        break

            except Exception as e:
                # Deliberately best-effort: one broken page must not stop the crawl.
                print(f"Error crawling {current_url}: {e}")

    return discovered

# Example run: ask for the start URL and the page budget, then crawl the site.
homepage_url = input("Enter the website URL to crawl: ")
max_pages = int(input("Enter maximum number of pages to crawl: "))
discovered_urls = crawl_urls(homepage=homepage_url, max_pages=max_pages)
print(f"\nDiscovered {len(discovered_urls)} URLs")

Enter the website URL to crawl: https://www.gbu.ac.in/
Enter maximum number of pages to crawl: 10
Starting crawl from: https://www.gbu.ac.in/


Crawling URLs: 100%|██████████| 10/10 [00:00<00:00, 17.43it/s]


Discovered 10 URLs





In [None]:
def scrape_urls(urls: List[str]) -> List[Dict]:
    """
    Download each URL concurrently and extract its visible text.

    Args:
        urls: URLs to fetch.

    Returns:
        One dict per input URL, in the same order as ``urls``, with keys:
        ``url``, ``content`` (cleaned text, ``""`` on failure) and ``status``
        (``"success"`` or ``"error: <message>"``). Failures never raise.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    def scrape_single_url(url: str) -> Dict:
        """Fetch one page and return its result dict (never raises)."""
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # PDFs, images, archives etc. would only yield garbage "text".
            content_type = response.headers.get('Content-Type', '').lower()
            if content_type and 'html' not in content_type and not content_type.startswith('text/'):
                raise ValueError(f"unsupported content type: {content_type}")

            # Hand the raw bytes to BeautifulSoup so it can honour the charset
            # declared in the document; `response.text` falls back to
            # ISO-8859-1 when the HTTP header carries no charset.
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements
            for tag in soup(["script", "style"]):
                tag.decompose()

            # A separator keeps text of adjacent elements from being glued
            # together (e.g. "<li>Home</li><li>About</li>" -> "Home About").
            text = soup.get_text(separator=' ')
            # Strip unwanted characters first, then collapse whitespace, so
            # that removed characters do not leave double spaces behind.
            text = re.sub(r'[^\w\s.,?!-]', '', text)
            text = re.sub(r'\s+', ' ', text).strip()

            return {"url": url, "content": text, "status": "success"}
        except Exception as e:
            return {"url": url, "content": "", "status": f"error: {str(e)}"}

    print("Starting content scraping...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # executor.map yields results in input order.
        results = list(tqdm(
            executor.map(scrape_single_url, urls),
            total=len(urls),
            desc="Scraping URLs"
        ))

    # Print summary
    success_count = sum(1 for r in results if r["status"] == "success")
    print(f"\nSuccessfully scraped {success_count} out of {len(urls)} URLs")

    return results

# Example run: fetch the text of every URL found by the crawl.
scraped_data = scrape_urls(urls=discovered_urls)

Starting content scraping...


Scraping URLs: 100%|██████████| 10/10 [00:04<00:00,  2.06it/s]


Successfully scraped 10 out of 10 URLs





In [None]:
def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
    """
    Split text into whitespace-delimited chunks of at most chunk_size characters.

    Words are never split: a single word longer than ``chunk_size`` becomes a
    chunk of its own (the only case in which a chunk exceeds the limit).
    Runs of whitespace in the input are normalised to single spaces.

    Args:
        text: Text to split.
        chunk_size: Maximum length of each chunk, measured on the joined chunk.

    Returns:
        The non-empty chunks in input order; ``[]`` if ``text`` has no words.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0  # always equals len(' '.join(current_chunk))

    for word in text.split():
        # Adding a word costs its length plus one joining space, except for
        # the first word of a chunk.
        added = len(word) + (1 if current_chunk else 0)
        # Only flush a non-empty chunk; otherwise an over-long first word
        # would emit an empty string as a chunk.
        if current_chunk and current_length + added > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += added

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def process_and_store(scraped_data: List[Dict], collection=None, batch_size: int = 500):
    """
    Chunk the successfully scraped pages and store the chunks in ChromaDB.

    Chunk IDs are derived from the page URL and the chunk position
    (``"<url>::chunk_<n>"``) and written with ``upsert``, so calling this
    function again (re-running the cell, or storing a second crawl) updates
    existing entries instead of colliding with IDs from an earlier call.

    Args:
        scraped_data: Dicts with ``url``, ``content`` and ``status`` keys.
            Items whose status is not ``"success"`` or whose content is empty
            are skipped; a URL that appears more than once is stored once.
        collection: Collection to write to. Defaults to the notebook-level
            ``chroma_collection``.
        batch_size: Maximum number of chunks sent per ``upsert`` call, to stay
            below the vector store's per-call batch limit.
    """
    if collection is None:
        collection = chroma_collection

    print("Processing and storing content...")
    chroma_docs, chroma_meta, chroma_ids = [], [], []
    seen_urls = set()

    for item in scraped_data:
        if item["status"] != "success" or not item["content"]:
            continue

        url = item["url"]
        if url in seen_urls:
            # The same URL twice would create duplicate IDs within one batch.
            continue
        seen_urls.add(url)

        for index, chunk in enumerate(chunk_text(item["content"])):
            chroma_docs.append(chunk)
            chroma_meta.append({"url": url})
            chroma_ids.append(f"{url}::chunk_{index}")

    if not chroma_docs:
        print("No content to store")
        return

    step = max(1, batch_size)
    for start in range(0, len(chroma_docs), step):
        end = start + step
        collection.upsert(
            documents=chroma_docs[start:end],
            metadatas=chroma_meta[start:end],
            ids=chroma_ids[start:end]
        )
    print(f"Stored {len(chroma_docs)} chunks in ChromaDB")

# Example run: chunk the scraped pages and index them in ChromaDB.
process_and_store(scraped_data=scraped_data)

Processing and storing content...
Stored 121 chunks in ChromaDB


In [None]:
def query_and_respond(query: str, n_results: int = 1) -> Dict:
    """
    Retrieve context for a question from ChromaDB and answer it with the Groq LLM.

    Args:
        query: The user's question.
        n_results: Number of chunks to retrieve as context (default 1).

    Returns:
        A dict with keys:
        ``query``    - the question as given,
        ``response`` - the LLM answer, or an error/notice message,
        ``contexts`` - the retrieved chunk texts,
        ``sources``  - distinct page URLs of those chunks, in retrieval order.
        Exceptions are not propagated; they are reported in ``response`` with
        empty ``contexts`` and ``sources``.
    """
    if not query or not query.strip():
        # Nothing to search for; avoid an embedding lookup and an LLM call.
        return {
            "query": query,
            "response": "Please enter a question.",
            "contexts": [],
            "sources": []
        }

    try:
        # Get relevant contexts from ChromaDB
        results = chroma_collection.query(
            query_texts=[query],
            n_results=n_results
        )

        # Results are nested per query text; only one query text was sent.
        contexts = list((results.get('documents') or [[]])[0] or [])
        metadatas = (results.get('metadatas') or [[]])[0] or []

        sources = []
        for meta in metadatas:
            url = (meta or {}).get("url")
            if url and url not in sources:
                sources.append(url)

        # Prepare prompt for Groq LLM. The strings are assembled line by line
        # so that source-code indentation does not end up in the prompt.
        system_prompt = (
            "You are a helpful AI assistant that answers questions based on the provided context.\n"
            "Your answers should be accurate, informative, and directly related to the context provided."
        )

        # Blank lines keep the boundaries between retrieved chunks visible.
        context_block = "\n\n".join(contexts)
        user_prompt = (
            "Context information is below.\n"
            "---------------------\n"
            f"{context_block}\n"
            "---------------------\n"
            f"Given the context information, please answer this question: {query}\n"
            "\n"
            "If the context doesn't contain relevant information, please say so instead of making up an answer."
        )

        # Generate response using Groq
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        response = llm.invoke(messages).content

        return {
            "query": query,
            "response": response,
            "contexts": contexts,
            "sources": sources
        }
    except Exception as e:
        return {
            "query": query,
            "response": f"Error processing query: {e}",
            "contexts": [],
            "sources": []
        }

def chat_interface():
    """
    Run an interactive question/answer loop on stdin/stdout.

    Each non-blank line is passed to ``query_and_respond``; the answer and a
    preview (first 200 characters) of every retrieved context are printed.
    The loop ends when the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored) or when input ends / is interrupted.
    """
    print("\nWelcome to the website chatbot! Type 'exit' to quit.")
    print("Using Groq's Mixtral-8x7b model for responses...")

    while True:
        try:
            query = input("\nEnter your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the chat cleanly
            # instead of leaving a traceback in the notebook.
            print("\nGoodbye!")
            break

        if not query:
            # Do not spend an LLM call on an empty line.
            continue
        if query.lower() == 'exit':
            break

        print("\nProcessing your question...")
        result = query_and_respond(query)

        print("\nResponse:", result["response"])

        contexts = result["contexts"]
        if contexts:
            print("\nSources used:")
            for i, context in enumerate(contexts, 1):
                # Only mark the preview as truncated when it really is.
                suffix = "..." if len(context) > 200 else ""
                print(f"\n{i}. {context[:200]}{suffix}")

# Usage example: start the interactive question/answer loop.
# This call keeps prompting until the user types 'exit'.
chat_interface()


Welcome to the website chatbot! Type 'exit' to quit.
Using Groq's Mixtral-8x7b model for responses...

Enter your question: who is chancellor of gbu

Processing your question...

Response: The context provided does include information about the Chancellor of Gautam Buddha University. According to the context, the current Chancellor of GBU is Tathagata Roy. Please note that this information might have changed after the context was provided, so it's always a good idea to check the university's official website or contact them directly for the most up-to-date information.

Sources used:

1. Gautam Buddha University Tenders Recruitments Contact Directory Online Fee Home About Home Vision Mission Chancellors Profile Vice-Chancellors Profile GBU A Strategic Perspective Governing Bodies Regu...

Enter your question: admission procedure still going on?

Processing your question...

Response: Yes, based on the context provided, the admission for Ph.D. for the even semester of 2024-25 is still 