In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Page to scrape and summarize.
url = "https://www.uscis.gov/i-9-central"

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on 4xx/5xx instead of parsing an error page as if it were content.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content


In [None]:
soup = BeautifulSoup(html_content, 'html.parser')

# Prefer the semantic <main> element, then the conventional id, and finally the
# whole <body>, so a layout change does not leave main_content as None (which
# would make the clean-up loop below fail with an opaque TypeError).
main_content = soup.find("main") or soup.find(id="main-content") or soup.body
if main_content is None:
    raise ValueError(f"No parsable content found in the response from {url}")

# Remove scripts, styles, headers, footers, etc.
for tag in main_content(["script", "style", "noscript", "header", "footer"]):
    tag.extract()

def extract_table(table_tag):
    """Render an HTML table as pipe-delimited text, one line per row.

    Args:
        table_tag: A BeautifulSoup ``<table>`` tag (any object exposing a
            compatible ``find_all`` works).

    Returns:
        str: Rows joined by newlines, with each row's ``<td>``/``<th>`` texts
        joined by ``" | "``. Rows without any cell are omitted; a table
        without rows yields ``""``.
    """
    rows = []
    for tr in table_tag.find_all("tr"):
        # A space separator keeps words from fusing across inline tags
        # (e.g. "<td>Form <a>I-9</a></td>" -> "Form I-9", not "FormI-9").
        cells = [td.get_text(" ", strip=True) for td in tr.find_all(["td", "th"])]
        if cells:
            rows.append(" | ".join(cells))  # preserves columns
    return "\n".join(rows)

# Extract content in document order.
# Lists and tables are emitted as one unit each, so anything nested inside one
# of them is already part of that unit's text and must not be emitted again.
CONTAINER_TAGS = ("ul", "ol", "table")
HEADING_TAGS = ("h1", "h2", "h3")


def _is_nested_in_container(tag, root):
    """Return True if `tag` lies inside a list or table located below `root`."""
    for parent in tag.parents:
        if parent is root:
            return False
        if parent.name in CONTAINER_TAGS:
            return True
    return False


def _own_item_text(li):
    """Return the text of `li`, leaving out the text of any nested <li>."""
    # A string belongs to this item only when its nearest <li> ancestor is `li`;
    # nested items are emitted on their own lines by the caller.
    pieces = (s.strip() for s in li.strings if s.find_parent("li") is li)
    return " ".join(piece for piece in pieces if piece)


content = []
for element in main_content.descendants:
    name = element.name
    if name not in HEADING_TAGS and name != "p" and name not in CONTAINER_TAGS:
        continue
    # Skip paragraphs, headings, lists and tables that an enclosing list or
    # table has already captured; otherwise their text would appear twice.
    if _is_nested_in_container(element, main_content):
        continue

    if name in HEADING_TAGS:
        # The " " separator keeps words apart across inline tags such as <a>.
        text = element.get_text(" ", strip=True)
        if text:
            content.append(f"{name.upper()}: {text}")
    elif name == "p":
        text = element.get_text(" ", strip=True)
        if text:
            content.append(text)
    elif name in ("ul", "ol"):
        for li in element.find_all("li"):
            li_text = _own_item_text(li)
            if li_text:
                content.append(f"- {li_text}")
    else:  # name == "table"
        table_text = extract_table(element)
        if table_text:
            content.append(f"TABLE:\n{table_text}")

# Combine everything into a single string for summarization
all_text = "\n\n".join(content)

# Print first 1000 characters to check
print(all_text[:1000])

In [None]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
import PIL.Image

# Load environment variables from .env file
load_dotenv()

# Configure the genai library with the API key taken from the environment.
# Fail here with a clear message rather than later inside the API call.
api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError(
        "No API key found: set GOOGLE_API_KEY (or GEMINI_API_KEY) in the "
        "environment or in the .env file."
    )
genai.configure(api_key=api_key)

# There is nothing to summarize if the scraping step produced no text.
if not all_text.strip():
    raise ValueError("No text was extracted from the page; nothing to summarize.")

# Create an instance of the model.
# The system instruction below is sent with every request made through `model`.
system_prompt = """You are a summarization assistant, who summarize the content given to you from a webscrapper, which gives you a html format data and you will summarize the information in it to store the summary as vector in vectordatabase"""
model = genai.GenerativeModel(
    model_name='gemini-2.5-flash',
    system_instruction=system_prompt,
)

# Generate the summary of the scraped text
response = model.generate_content(
    all_text,
)

# Print the generated text
print(response.text)

In [None]:
import json
from datetime import datetime

# Metadata stored alongside the generated summary
summary_data = {
    "url": url,
    # Timezone-aware local time, so the stored timestamp is unambiguous.
    "timestamp": datetime.now().astimezone().isoformat(),
    "original_text_length": len(all_text),
    "summary": response.text,
    "source": "USCIS website",
    # Describes the page actually scraped above (I-9 Central).
    "notes": "Summary of the USCIS I-9 Central page (Form I-9, Employment Eligibility Verification).",
}

# JSON file in which the summaries are accumulated
output_file = "summary.json"


In [None]:
# Load the summaries saved so far; a missing or empty file means there are none.
try:
    with open(output_file, "r", encoding="utf-8") as f:
        saved_json = f.read()
except FileNotFoundError:
    saved_json = ""

# Invalid JSON still raises here, before the file is reopened for writing,
# so a damaged file is never silently overwritten.
all_summaries = json.loads(saved_json) if saved_json.strip() else []
if not isinstance(all_summaries, list):
    raise ValueError(f"{output_file} must contain a JSON list of summaries")

# Re-running this cell must not store the very same record twice.
if summary_data not in all_summaries:
    all_summaries.append(summary_data)

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_summaries, f, ensure_ascii=False, indent=4)