In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# One-time setup on a fresh runtime: uncomment the next line and run this cell.
# %pip install -q requests beautifulsoup4 sentence-transformers langchain chromadb html-to-markdown langchain-text-splitters

In [None]:
# Imports
import requests
from bs4 import BeautifulSoup
import os
import re
import time
from html_to_markdown import convert
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb

In [None]:
# Output roots on the mounted Drive: one for USCIS pages, one for GSU ISSS pages
gsu_base_dir = '/content/drive/MyDrive/raw/gsu-isss'
base_dir = '/content/drive/MyDrive/raw/uscis'

# Create both roots up front; an already-existing folder is not an error
for _raw_root in (gsu_base_dir, base_dir):
    os.makedirs(_raw_root, exist_ok=True)

# **USCIS Web-scraping**

In [None]:
# Define Download and Clean Function for USCIS Chapters

# Site-chrome phrases that are not expected inside policy text: everything from the
# phrase to the end of its line is dropped.
_USCIS_CHROME_RE = re.compile(
    r'(?:Skip to main content|Official websites use \.gov|Secure \.gov websites use HTTPS|'
    r'Multilingual Resources|Was this page helpful\?|Return to top).*',
    flags=re.I)
# Navigation labels that are also ordinary words ("next", "previous", "sign in", "menu").
# Matching them anywhere in a line would cut real sentences short, so a label is removed
# only when it is the whole line, optionally written as a markdown list item and/or link.
_USCIS_NAV_LINE_RE = re.compile(
    r'^[ \t]*(?:[-*+][ \t]+)?\[?(?:Español|Sign In|Create Account|Menu|Feedback|Previous|Next)\]?'
    r'(?:\([^)\n]*\))?[ \t]*$',
    flags=re.I | re.M)
# "Current as of ..." page stamp: removed only when it starts a line (markdown emphasis,
# quote or heading markers may precede it).
_USCIS_STAMP_LINE_RE = re.compile(r'^[ \t>*_#-]*Current as of.*$', flags=re.I | re.M)


def _strip_uscis_boilerplate(markdown_text):
    """Remove USCIS site chrome from converted markdown and collapse runs of blank lines."""
    markdown_text = _USCIS_CHROME_RE.sub('', markdown_text)
    markdown_text = _USCIS_NAV_LINE_RE.sub('', markdown_text)
    markdown_text = _USCIS_STAMP_LINE_RE.sub('', markdown_text)
    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
    return markdown_text.strip()


def download_and_clean_volume(volume_name, base_url, chapters_range, dir_path, suffix_pattern="-chapter-{num}", timeout=30):
    """Download each chapter page of a volume and save it as cleaned markdown.

    For every ``num`` in ``chapters_range`` the URL ``base_url + suffix_pattern.format(num=num)``
    is fetched (``base_url`` alone when ``suffix_pattern`` is empty). On HTTP 200 the main
    content node is converted to markdown, stripped of site boilerplate and written to
    ``{dir_path}/chapter-{num}_clean.md``. Failures are printed and the loop continues.

    Args:
        volume_name: Label used only in progress messages.
        base_url: URL prefix shared by all chapters of the volume.
        chapters_range: Iterable of chapter numbers to fetch.
        dir_path: Output directory; created if missing.
        suffix_pattern: Format string with a ``{num}`` field appended to ``base_url``;
            an empty value fetches ``base_url`` itself.
        timeout: Seconds passed to ``requests.get`` so a stalled connection cannot
            hang the notebook.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(dir_path, exist_ok=True)
    for num in chapters_range:
        suffix = suffix_pattern.format(num=num) if suffix_pattern else ""
        chapter_url = f"{base_url}{suffix}"
        try:
            resp = requests.get(chapter_url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable chapter should not abort the rest of the volume.
            print(f"Failed {volume_name} Chapter {num}: {exc}")
            resp = None

        if resp is not None and resp.status_code == 200:
            chapter_soup = BeautifulSoup(resp.text, 'html.parser')
            # First matching container wins, from most to least specific.
            main_content = (
                chapter_soup.find('div', class_='usa-prose') or
                chapter_soup.find('article') or
                chapter_soup.find('div', class_='region-content') or
                chapter_soup.find('main') or
                chapter_soup.body
            )
            if main_content is None:
                # No <body> at all (e.g. an empty response): nothing to convert.
                print(f"Failed {volume_name} Chapter {num}")
            else:
                for junk in main_content.find_all(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
                    junk.decompose()
                # Convert to markdown, then strip boilerplate
                markdown_text = _strip_uscis_boilerplate(convert(str(main_content)))

                filename = f"{dir_path}/chapter-{num}_clean.md"
                # Explicit UTF-8: the pages contain non-ASCII text such as curly quotes.
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(markdown_text)
                print(f"Cleaned {volume_name} Chapter {num} – length: {len(markdown_text)} chars")
        elif resp is not None:
            print(f"Failed {volume_name} Chapter {num}")
        time.sleep(1)  # be polite to the server between requests

In [None]:
# Define Download Functions for USCIS Alerts

# Keywords for H-1B/F-1/international student/employment-based relevance
_ALERT_KEYWORDS = (
    "h-1b", "f-1", "opt", "stem", "cap-gap", "visa", "nonimmigrant", "student", "international student",
    "employment authorization", "eb-2", "eb-3", "h-2b", "nonimmigrant workers", "cpt",
    "ead", "sevis", "dso", "adjustment of status", "i-20", "ds-2019", "green card", "labor certification",
    "i-140", "i-485", "duration of status"
)
# Whole-word match (an optional plural "s" is allowed). A plain substring test lets
# "ead" fire on "read", "stem" on "system" and "opt" on "adopt", so nearly every alert passes.
_ALERT_KEYWORD_RE = re.compile(
    r'(?<!\w)(?:' + '|'.join(re.escape(kw) for kw in _ALERT_KEYWORDS) + r')s?(?!\w)',
    flags=re.I)

# Site-chrome phrases not expected inside alert text: dropped from the phrase to end of line.
_ALERT_CHROME_RE = re.compile(
    r'(?:Skip to main content|Official websites use \.gov|Secure \.gov websites use HTTPS|'
    r'Multilingual Resources|Was this page helpful\?|Return to top).*',
    flags=re.I)
# Navigation labels that are also ordinary words: removed only when the label is the whole
# line (optionally a markdown list item and/or link), so sentences containing "next" or
# "previous" are left intact.
_ALERT_NAV_LINE_RE = re.compile(
    r'^[ \t]*(?:[-*+][ \t]+)?\[?(?:Español|Sign In|Create Account|Menu|Feedback|Previous|Next)\]?'
    r'(?:\([^)\n]*\))?[ \t]*$',
    flags=re.I | re.M)
# "Current as of ..." page stamp, only when it starts a line.
_ALERT_STAMP_LINE_RE = re.compile(r'^[ \t>*_#-]*Current as of.*$', flags=re.I | re.M)


def _clean_alert_markdown(markdown_text):
    """Remove USCIS site chrome from converted markdown and collapse runs of blank lines."""
    markdown_text = _ALERT_CHROME_RE.sub('', markdown_text)
    markdown_text = _ALERT_NAV_LINE_RE.sub('', markdown_text)
    markdown_text = _ALERT_STAMP_LINE_RE.sub('', markdown_text)
    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
    return markdown_text.strip()


def _collect_alert_links(soup, link_filter, years):
    """Return the alert URLs on one listing page that pass the keyword, year and URL filters."""
    page_links = []
    for article in soup.find_all(['article', 'div'], class_=re.compile(r'alert|item|teaser')):
        link = article.find('a', href=re.compile(r'/newsroom/alerts/.*'))
        if not link:
            continue
        href = link['href']
        full_link = 'https://www.uscis.gov' + href if href.startswith('/') else href

        # Check alert title and teaser text for a relevant keyword
        title = link.text.strip().lower() if link.text else ""
        teaser = article.text.strip().lower()
        if not (_ALERT_KEYWORD_RE.search(title) or _ALERT_KEYWORD_RE.search(teaser)):
            continue  # Skip irrelevant

        # Parse year from <time> or text ("November 25, 2025").
        # An alert with no detectable date is kept.
        date_elem = article.find('time') or article.find(string=re.compile(r'\w+ \d{1,2}, \d{4}'))
        if date_elem:
            date_str = str(date_elem).lower()
            if not any(str(year) in date_str for year in years):
                continue  # Skip alerts outside the accepted years

        # URL filter
        if link_filter and link_filter not in full_link:
            continue

        page_links.append(full_link)
    return page_links


def download_alerts(volume_name, base_url, dir_path, link_filter=None, max_pages=5, years=("2024", "2025"), timeout=30):
    """Collect relevant alert links from a paginated listing and save each alert as markdown.

    The listing is read from ``base_url`` (page 0) and ``base_url?page=N`` for the
    following pages. A link is kept when its title or teaser contains one of the
    relevance keywords as a whole word, its date (when one is found) mentions one of
    ``years``, and ``link_filter`` (when given) occurs in the URL. Each kept alert is
    downloaded, cleaned and written to ``{dir_path}/alert-{i}_clean.md``.

    Args:
        volume_name: Label used only in progress messages.
        base_url: URL of the first listing page.
        dir_path: Output directory; created if missing.
        link_filter: Optional substring every accepted alert URL must contain.
        max_pages: Number of listing pages to scan.
        years: Year strings accepted by the date filter.
        timeout: Seconds passed to ``requests.get``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(dir_path, exist_ok=True)

    alert_links = []
    for p in range(max_pages):
        page_url = f"{base_url}?page={p}" if p > 0 else base_url
        try:
            resp = requests.get(page_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch page {p} for {volume_name}: {exc}")
            time.sleep(1)
            continue
        if resp.status_code != 200:
            print(f"Failed to fetch page {p} for {volume_name}")
            time.sleep(1)
            continue

        soup = BeautifulSoup(resp.text, 'html.parser')
        page_links = _collect_alert_links(soup, link_filter, years)
        alert_links.extend(page_links)
        print(f"Page {p}: Found {len(page_links)} relevant alert links")
        time.sleep(1)

    # Dedupe while keeping first-seen order. A set would reorder the links on every
    # run, so alert-1_clean.md would not refer to the same alert twice in a row.
    alert_links = list(dict.fromkeys(alert_links))
    print(f"Total relevant alert links for {volume_name}: {len(alert_links)}")

    if not alert_links:
        print(f"No relevant links found; skipping {volume_name}")
        return

    # Download relevant alerts
    for i, alert_url in enumerate(alert_links, 1):
        try:
            resp = requests.get(alert_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed {volume_name} Alert {i}: {exc}")
            resp = None

        if resp is not None and resp.status_code == 200:
            alert_soup = BeautifulSoup(resp.text, 'html.parser')
            # First matching container wins, from most to least specific.
            main_content = (
                alert_soup.find('div', class_='usa-prose') or
                alert_soup.find('article') or
                alert_soup.find('div', class_='region-content') or
                alert_soup.find('main') or
                alert_soup.body
            )
            if main_content is None:
                print(f"Failed {volume_name} Alert {i}")
            else:
                for junk in main_content.find_all(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
                    junk.decompose()
                markdown_text = _clean_alert_markdown(convert(str(main_content)))

                filename = f"{dir_path}/alert-{i}_clean.md"
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(markdown_text)
                print(f"Cleaned {volume_name} Alert {i} – length: {len(markdown_text)} chars")
        elif resp is not None:
            # Report the failure instead of skipping the alert silently.
            print(f"Failed {volume_name} Alert {i}")
        time.sleep(1)

# **ISSS Web-scraping**

In [None]:
# Define Download Function for GSU ISSS

# Tags removed by name from the main content node.
_GSU_JUNK_TAGS = ['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']
# Class-based junk must go through a CSS selector: find_all() treats every list entry
# as a tag name, so strings like 'div[class*="sidebar"]' passed to it never match.
_GSU_JUNK_SELECTOR = 'div[class*="sidebar"], div[class*="social"], div[class*="banner"]'

# Navigation labels that are also ordinary words or substrings ("Search" inside
# "Research", "Give" inside "given"): removed only when the label is the whole line,
# optionally written as a markdown list item and/or link.
_GSU_NAV_LINE_RE = re.compile(
    r'^[ \t]*(?:[-*+][ \t]+)?\[?(?:International Student & Scholar Services|GSU Home|Menu|Search|'
    r'Contact Us|Apply Now|Give|Privacy Policy|Accessibility|Footer menu)\]?'
    r'(?:\([^)\n]*\))?[ \t]*$',
    flags=re.I | re.M)
# Feedback prompt / "Last updated" stamp: removed to end of line, but only when the
# phrase starts the line (markdown emphasis, quote or heading markers may precede it).
_GSU_FOOTER_LINE_RE = re.compile(
    r'^[ \t>*_#-]*(?:Was this page helpful\?|Last updated).*$',
    flags=re.I | re.M)


def _clean_gsu_markdown(markdown_text):
    """Remove GSU site chrome from converted markdown and collapse runs of blank lines."""
    markdown_text = _GSU_NAV_LINE_RE.sub('', markdown_text)
    markdown_text = _GSU_FOOTER_LINE_RE.sub('', markdown_text)
    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
    return markdown_text.strip()


def download_gsu_page(volume_name, base_url, dir_path, timeout=30):
    """Download one GSU ISSS page and save it as cleaned markdown.

    On HTTP 200 the main content node is stripped of layout elements, converted to
    markdown, cleaned of site boilerplate and written to ``{dir_path}/{slug}_clean.md``,
    where ``slug`` is the last path segment of ``base_url``. Failures are printed.

    Args:
        volume_name: Label used only in progress messages.
        base_url: URL of the page to fetch.
        dir_path: Output directory; created if missing.
        timeout: Seconds passed to ``requests.get``.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    os.makedirs(dir_path, exist_ok=True)
    try:
        resp = requests.get(base_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download {volume_name}: {exc}")
        resp = None

    if resp is not None and resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Tailored main-content selector for GSU ISSS (WordPress structure)
        main_content = (
            soup.find('div', class_='entry-content') or
            soup.find('article') or
            soup.find('div', id='main') or  # Common WP wrapper
            soup.find('main') or
            soup.body
        )
        if main_content is None:
            # No <body> at all (e.g. an empty response): nothing to convert.
            print(f"Failed to download {volume_name}")
        else:
            # Remove GSU-specific junk: first by tag name, then by class substring
            for junk in main_content.find_all(_GSU_JUNK_TAGS):
                junk.decompose()
            for junk in main_content.select(_GSU_JUNK_SELECTOR):
                junk.decompose()
            # Convert to markdown, then strip boilerplate
            markdown_text = _clean_gsu_markdown(convert(str(main_content)))

            # Use page-specific filename based on URL slug
            slug = base_url.split('/')[-2] if base_url.endswith('/') else base_url.split('/')[-1]
            filename = f"{dir_path}/{slug}_clean.md"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(markdown_text)
            print(f"Cleaned {volume_name} – length: {len(markdown_text)} chars")
    elif resp is not None:
        print(f"Failed to download {volume_name}")
    time.sleep(1)  # be polite to the server between requests

# **Download Files**

In [None]:
# Sources to scrape. Every entry has a display "name", a "base_url" and an output "dir".
# The "is_gsu" / "is_alerts" flags choose the downloader in the loop below; entries with
# both flags False are fetched chapter by chapter using "chapters" and "suffix_pattern".
# The "USCIS H-1B Alerts" entry has an empty suffix_pattern and a one-element range, so the
# chapter downloader fetches base_url itself once and stores it as chapter-1_clean.md.
volumes = [
    {"name": "Volume 2 Part F (Students)", "base_url": "https://www.uscis.gov/policy-manual/volume-2-part-f", "chapters": range(1, 10), "dir": f'{base_dir}/volume-2-part-f', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "Volume 10 Part A (Employment Auth)", "base_url": "https://www.uscis.gov/policy-manual/volume-10-part-a", "chapters": range(1, 7), "dir": f'{base_dir}/volume-10-part-a', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "Volume 6 Part E (Employment-Based Immigrants)", "base_url": "https://www.uscis.gov/policy-manual/volume-6-part-e", "chapters": range(1, 6), "dir": f'{base_dir}/volume-6-part-e', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "Volume 2 Part L (H-1B)", "base_url": "https://www.uscis.gov/policy-manual/volume-2-part-l", "chapters": range(1, 7), "dir": f'{base_dir}/volume-2-part-l', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "Volume 7 Part A (Adjustment Policies)", "base_url": "https://www.uscis.gov/policy-manual/volume-7-part-a", "chapters": range(1, 11), "dir": f'{base_dir}/volume-7-part-a', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "Volume 7 Part B (245(a) Adjustment)", "base_url": "https://www.uscis.gov/policy-manual/volume-7-part-b", "chapters": range(1, 8), "dir": f'{base_dir}/volume-7-part-b', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "Volume 7 Part E (Employment-Based Adjustment)", "base_url": "https://www.uscis.gov/policy-manual/volume-7-part-e", "chapters": range(1, 6), "dir": f'{base_dir}/volume-7-part-e', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "Volume 7 Part O (Discretion)", "base_url": "https://www.uscis.gov/policy-manual/volume-7-part-o", "chapters": range(1, 2), "dir": f'{base_dir}/volume-7-part-o', "suffix_pattern": "-chapter-{num}", "is_alerts": False, "is_gsu": False},
    {"name": "USCIS H-1B Alerts", "base_url": "https://www.uscis.gov/working-in-the-united-states/temporary-workers/h-1b-specialty-occupations/h-1b-cap-season", "chapters": range(1, 2), "dir": f'{base_dir}/h1b-alerts', "suffix_pattern": "", "is_alerts": False, "is_gsu": False},  # Single page
    {"name": "USCIS Newsroom", "base_url": "https://www.uscis.gov/newsroom/alerts", "dir": f'{base_dir}/newsroom', "link_filter": "2025", "is_alerts": True, "is_gsu": False},
    {"name": "GSU ISSS F-1 Maintenance", "base_url": "https://isss.gsu.edu/current-students/f-1-students/maintenance-of-status/", "dir": f'{gsu_base_dir}/f1-maintenance', "is_alerts": False, "is_gsu": True},  # GSU maintenance page
    {"name": "GSU ISSS F-1 Extension", "base_url": "https://isss.gsu.edu/current-students/f-1-students/extension-of-status-f/", "dir": f'{gsu_base_dir}/f1-extension', "is_alerts": False, "is_gsu": True},  # GSU extension page
    {"name": "GSU ISSS F-1 Employment", "base_url": "https://isss.gsu.edu/current-students/f-1-students/f-1-employment/", "dir": f'{gsu_base_dir}/f1-employment', "is_alerts": False, "is_gsu": True},  # GSU employment page
    {"name": "GSU ISSS F-1 Transfer Travel", "base_url": "https://isss.gsu.edu/current-students/f-1-students/transfer-travel/", "dir": f'{gsu_base_dir}/f1-transfer-travel', "is_alerts": False, "is_gsu": True},  # GSU transfer/travel page
    {"name": "GSU ISSS F-1 Dependents", "base_url": "https://isss.gsu.edu/current-students/f-1-students/dependents/", "dir": f'{gsu_base_dir}/f1-dependents', "is_alerts": False, "is_gsu": True}  # GSU dependents page
]

# Dispatch each source to its downloader: GSU pages first, then the alert listing,
# otherwise the chapter-by-chapter downloader.
for vol in volumes:
    if vol.get("is_gsu", False):
        download_gsu_page(vol["name"], vol["base_url"], vol["dir"])
    elif vol.get("is_alerts", False):
        # link_filter is optional; .get() passes None when the entry has none
        download_alerts(vol["name"], vol["base_url"], vol["dir"], vol.get("link_filter"))
    else:
        download_and_clean_volume(vol["name"], vol["base_url"], vol["chapters"], vol["dir"], vol.get("suffix_pattern", "-chapter-{num}"))

Cleaned Volume 2 Part F (Students) Chapter 1 – length: 26492 chars
Cleaned Volume 2 Part F (Students) Chapter 2 – length: 32232 chars
Cleaned Volume 2 Part F (Students) Chapter 3 – length: 42815 chars
Cleaned Volume 2 Part F (Students) Chapter 4 – length: 23331 chars
Cleaned Volume 2 Part F (Students) Chapter 5 – length: 73653 chars
Cleaned Volume 2 Part F (Students) Chapter 6 – length: 32175 chars
Cleaned Volume 2 Part F (Students) Chapter 7 – length: 22124 chars
Cleaned Volume 2 Part F (Students) Chapter 8 – length: 48355 chars
Cleaned Volume 2 Part F (Students) Chapter 9 – length: 21981 chars
Cleaned Volume 10 Part A (Employment Auth) Chapter 1 – length: 10812 chars
Cleaned Volume 10 Part A (Employment Auth) Chapter 2 – length: 50801 chars
Cleaned Volume 10 Part A (Employment Auth) Chapter 3 – length: 5690 chars
Cleaned Volume 10 Part A (Employment Auth) Chapter 4 – length: 54853 chars
Cleaned Volume 10 Part A (Employment Auth) Chapter 5 – length: 7973 chars
Cleaned Volume 10 Part A

# **Chunking & Embeddings**

In [None]:
# Chunk USCIS + GSU Files
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
all_chunks = []
dirs = [vol["dir"] for vol in volumes] # All directories including GSU
for d in dirs:
    # Every USCIS volume writes files named chapter-N_clean.md, so the bare filename
    # cannot identify where a chunk came from. Record the folder name next to it
    # ('source' keeps its original value for existing consumers).
    volume_dir = os.path.basename(os.path.normpath(d))
    # Sorted so the chunk order, and the positional ids assigned when the chunks are
    # loaded into the vector DB, stay the same from one run to the next.
    for file in sorted(os.listdir(d)):
        file_path = os.path.join(d, file)
        # The scrapers only write *_clean.md files; skip sub-folders and anything else
        # (e.g. checkpoint folders) instead of crashing in open().
        if not (file.endswith('.md') and os.path.isfile(file_path)):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        chunks = splitter.split_text(text)
        all_chunks.extend([{'content': chunk, 'metadata': {'source': file, 'volume': volume_dir}} for chunk in chunks])
print(f"Total chunks/records generated: {len(all_chunks)}")

Total chunks/records generated: 2828


In [None]:
# Split every travel.gov markdown file into overlapping chunks
travel_gov_dir = '/content/drive/MyDrive/raw/travel_gov'
splitter = MarkdownTextSplitter(chunk_size=800, chunk_overlap=100)
travel_gov_chunks = []

for doc_name in os.listdir(travel_gov_dir):
    doc_path = os.path.join(travel_gov_dir, doc_name)
    with open(doc_path, 'r') as handle:
        pieces = splitter.split_text(handle.read())
    # One record per chunk, tagged with the file it came from
    for piece in pieces:
        travel_gov_chunks.append({'content': piece, 'metadata': {'source': doc_name}})
print(f"Total travel.gov chunks/records generated: {len(travel_gov_chunks)}")


Total travel.gov chunks/records generated: 415


In [None]:
# Embed & Load to Chroma Vector DB
# Merge the travel.gov chunks only once. Re-running this cell would otherwise append
# them to all_chunks again and embed every travel.gov chunk a second time.
if travel_gov_chunks and all_chunks[-len(travel_gov_chunks):] != travel_gov_chunks:
    all_chunks = all_chunks + travel_gov_chunks # added travel_gov chunks to uscis chunks
model = SentenceTransformer('all-MiniLM-L6-v2')

persist_dir = '/content/drive/MyDrive/chroma_db'
client = chromadb.PersistentClient(path=persist_dir)

collection = client.get_or_create_collection("uscis_f1_h1b_employment")

# Encode and write in batches rather than one chunk per call.
EMBED_BATCH_SIZE = 256
for start in range(0, len(all_chunks), EMBED_BATCH_SIZE):
    batch = all_chunks[start:start + EMBED_BATCH_SIZE]
    documents = [chunk['content'] for chunk in batch]
    # encode() on a list returns one vector per input text
    vectors = model.encode(documents).tolist()
    # upsert (not add): the collection is persistent, so a re-run must overwrite the
    # records already stored under these ids instead of leaving stale ones in place.
    collection.upsert(
        ids=[str(start + offset) for offset in range(len(batch))],  # same ids as str(i) over all_chunks
        embeddings=vectors,
        metadatas=[chunk['metadata'] for chunk in batch],
        documents=documents,
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Report how many records the collection holds after loading
saved_count = collection.count()
print(f"Total embeddings saved: {saved_count}")

Total embeddings saved: 3243


In [None]:
# Smoke-test retrieval with a sample question
import json

query_vector = model.encode("Can I work off campus?").tolist()
results = collection.query(query_embeddings=query_vector, n_results=5)
print("Test Results:", results['documents'])

# End of notebook: back up all_chunks as JSON on Drive
backup_path = '/content/drive/MyDrive/f1_all_chunks.json'
with open(backup_path, 'w') as backup_file:
    json.dump(all_chunks, backup_file)

Test Results: [['# Chapter 6 - Employment\n\nIf authorized, F-1 students may engage in on-campus or off-campus employment. M-1 students may only engage in employment for purposes of practical training.[**[1]**](#footnote-1)\n\n## A. On-Campus Employment\n\nF-1 students may engage in on-campus employment subject to certain conditions and restrictions.[**[2]**](#footnote-2 "") F-1 status permits students with Designated School Official (DSO) approval to work at an on-campus job for up to 20 hours per week when school is in session.[**[3]**](#footnote-3 "") During vacation periods, students may work on-campus full-time.[**[4]**](#footnote-4 "")', 'Employment may be performed at off-campus locations that are educationally affiliated with the school and the employment must be an integral part of the student’s educational program.[**[5]**](#footnote-5 "") Students working on campus may be employed by the school itself or by any independent companies serving the school’s needs, such as the sc