In [None]:
import requests
from bs4 import BeautifulSoup
import re
from dotenv import load_dotenv
from langchain_groq import ChatGroq  # using Groq LLM

In [None]:
# Load key=value pairs from a local .env file into the process environment.
# Presumably this is where the Groq API key used by the chat model comes
# from — confirm the expected variable name against the langchain_groq docs.
load_dotenv()

In [None]:
# Page to scrape. Edit this value to point the notebook at a different
# webpage; every later cell reads the address from this constant.
URL = "https://maths.du.ac.in/faculty-profile/"

In [None]:
# Download the page and parse it into a BeautifulSoup tree.
print(f"Fetching webpage: {URL}")
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(URL, timeout=30)
# Fail here on a 4xx/5xx status instead of parsing (and later summarising)
# an error page as if it were the faculty listing.
response.raise_for_status()
# Hand over the raw bytes so BeautifulSoup can detect the encoding from the
# document itself; response.text falls back to ISO-8859-1 when the server
# sends no charset, which garbles non-ASCII names.
soup = BeautifulSoup(response.content, "lxml")

In [None]:
# Step 1: Flatten the parsed page into plain text.
# This takes the text of the whole document (not only a faculty section);
# the text fragments are whitespace-stripped and joined with newlines.
full_doc = soup.get_text(separator="\n", strip=True)

In [None]:
# Step 2: Keep only the lines of the page text that contain an email address
# (heuristic: every faculty entry is expected to include one).
# NOTE(review): full_doc is split on newlines and lines without an address
# are discarded, so a name or designation sitting on its own line never
# reaches the LLM — confirm the kept lines carry enough context.
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
blocks = [line for line in full_doc.split("\n") if re.search(email_pattern, line)]

In [None]:
# Step 3: Initialize LLM (Groq example, can swap with OpenAI if needed).
# temperature=0 requests the least random decoding, so re-running the
# notebook on the same page gives a stable table rather than one that
# varies from run to run.
model = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

In [None]:
# Step 4: Ask LLM to structure the data
# Put one block per line instead of interpolating the list object itself:
# the list's repr wraps every entry in brackets, quotes and escape sequences
# that are not part of the page text.
blocks_text = "\n".join(blocks)
prompt = f"""
You are given raw text blocks extracted from a faculty webpage:

{blocks_text}

Task: Extract the following fields in clean structured format:
- Name
- Position/Designation
- Office number (if available)
- Email

Output as a neat table (markdown format).
"""

In [None]:
# Send the prompt to the model and keep its reply.
# NOTE: this rebinds `response`, the same name the fetch cell uses for the
# HTTP response; re-running the parsing cell after this one would read the
# LLM reply instead of the downloaded page.
response = model.invoke(prompt)

In [None]:
# Show the model's answer under a banner. sep="\n" reproduces the line break
# that two separate print calls would put between banner and content.
print("\n=== LLM Organized Faculty Info ===\n", response.content, sep="\n")