In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
from collections import deque
import os

def get_domain(url):
    """Return the network-location part (host, plus port if given) of ``url``."""
    return urlparse(url).netloc

def sanitize_text(text):
    """
    Normalize whitespace line by line.

    Within each line, runs of whitespace are collapsed to a single space and
    leading/trailing whitespace is trimmed. Lines that end up empty are
    discarded, and the surviving lines are re-joined with a single newline.
    """
    collapsed = (' '.join(raw.split()) for raw in text.splitlines())
    return '\n'.join(piece for piece in collapsed if piece)

def extract_structured_text(soup):
    """
    Flatten the text-bearing elements of a parsed page into plain text.

    Elements are visited in document order and rendered as follows:
      * h1-h6: ``## <heading text>``
      * p: the paragraph text
      * li inside ul: ``- <item text>``
      * li inside ol: ``<n>. <item text>``, where n is the item's 1-based
        position among its sibling ``li`` elements
      * li with any other parent: ignored
      * br: a line break
      * ul / ol: a line break, unless the previous chunk already ends in one

    The joined result is passed through ``sanitize_text`` before being
    returned.

    Args:
        soup (BeautifulSoup): Parsed HTML content
    Returns:
        str: Structured text with lists formatted correctly
    """
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    text = []
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'br', 'ul', 'ol']):
        if element.name in heading_tags:
            heading_text = element.get_text(strip=True)
            text.append(f'\n\n## {heading_text}\n\n')
        elif element.name == 'p':
            paragraph = element.get_text(separator=' ', strip=True)
            text.append(f'{paragraph}\n\n')
        elif element.name == 'li':
            parent = element.parent.name
            list_text = element.get_text(strip=True)
            if parent == 'ul':
                text.append(f'- {list_text}\n')
            elif parent == 'ol':
                item_number = len(element.find_previous_siblings('li')) + 1
                text.append(f'{item_number}. {list_text}\n')
        elif element.name == 'br':
            text.append('\n')
        elif element.name in ('ul', 'ol'):
            # A list container is yielded before its items. When it is the
            # very first matched element (e.g. a navigation menu at the top
            # of the page) there is no previous chunk to inspect, so guard
            # against indexing an empty list.
            if text and not text[-1].endswith('\n'):
                text.append('\n')
    return sanitize_text(''.join(text))

def load_existing_data(output_dir=''):
    """
    Load the URL mapping persisted by earlier runs.

    Reads ``url_mapping.csv`` from ``output_dir``. A missing file, a
    zero-byte file and a header-only file are all treated as "nothing
    recorded yet".

    Args:
        output_dir (str): Directory that holds url_mapping.csv
    Returns:
        pd.DataFrame: Existing URL mapping (columns 'id' and 'url')
        set: IDs already present in the mapping
        int: Next free ID, i.e. highest recorded ID + 1, or 0 when the
            mapping has no rows
    """
    mapping_file = os.path.join(output_dir, 'url_mapping.csv')
    if os.path.exists(mapping_file):
        try:
            df = pd.read_csv(mapping_file)
        except pd.errors.EmptyDataError:
            # Zero-byte / header-less file, e.g. left behind by a run that
            # saved an empty mapping.
            df = pd.DataFrame(columns=['id', 'url'])
        if not df.empty:
            downloaded_ids = set(df['id'].tolist())
            # int() turns the numpy scalar into a plain Python int.
            return df, downloaded_ids, int(df['id'].max()) + 1
        # No rows recorded: start numbering at 0, same as when the file
        # does not exist at all.
        return df, set(), 0
    return pd.DataFrame(columns=['id', 'url']), set(), 0

def scrape_website(start_url, max_pages=10, output_dir='', id_counter=0):
    """
    Breadth-first crawl of a single website, saving each page's text to disk.

    Starting from ``start_url``, pages on the same host are fetched in BFS
    order. The structured text of every successfully fetched page is written
    to ``<output_dir>/<id>.txt`` and the (id, url) pair is recorded. When the
    crawl stops, the full mapping (rows from earlier runs plus the new ones)
    is written to ``<output_dir>/url_mapping.csv``.

    Notes:
        * URLs already listed in url_mapping.csv are skipped without being
          fetched, so their links are not followed. A run whose
          ``start_url`` is already in the mapping therefore scrapes nothing.
        * Links that differ only by their ``#fragment`` are treated as the
          same page.
        * A URL is enqueued at most once per call; pages that fail to
          download or to be written are not retried.

    Args:
        start_url (str): URL to start scraping from
        max_pages (int): Max pages to scrape **for this website**
        output_dir (str): Directory to save text files (created if missing;
            '' means the current working directory)
        id_counter (int): Starting ID for file numbering
    Returns:
        pd.DataFrame: Updated URL mapping
        int: Updated ID counter
    """
    # open() and DataFrame.to_csv() do not create missing directories.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load existing data
    df_existing, _, _ = load_existing_data(output_dir)
    visited = set(df_existing['url'].values)
    data = df_existing.to_dict('records')

    domain = get_domain(start_url)
    queue = deque([start_url])
    # Every URL that is already recorded or has ever been put on the queue.
    # Gives O(1) duplicate checks instead of scanning the deque.
    enqueued = set(visited)
    enqueued.add(start_url)

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    pages_scraped = 0  # Track pages scraped for this website

    while queue and pages_scraped < max_pages:
        current_url = queue.popleft()

        if current_url in visited:
            print(f"Skipping already visited URL: {current_url}")
            continue

        # Best-effort crawl: any failure on one page is logged and skipped.
        try:
            response = requests.get(current_url, headers=headers, timeout=10)
            response.raise_for_status()
        except Exception as e:
            print(f"Error fetching {current_url}: {str(e)}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # extract_structured_text already returns whitespace-normalized text.
        page_text = extract_structured_text(soup)
        filename = os.path.join(output_dir, f"{id_counter}.txt")

        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(page_text)
        except IOError as e:
            print(f"Error writing file {filename}: {str(e)}")
            continue

        data.append({'id': id_counter, 'url': current_url})
        visited.add(current_url)
        print(f"Scraped [{id_counter}] {current_url}")
        id_counter += 1
        pages_scraped += 1

        # Process links: keep only http(s) links that stay on the start host
        for link in soup.find_all('a', href=True):
            absolute_url = urljoin(current_url, link['href'].strip())
            parsed_url = urlparse(absolute_url)
            if parsed_url.scheme not in ('http', 'https') or parsed_url.netloc != domain:
                continue
            # '#section' anchors point into the same document; drop them so
            # one page is not downloaded once per anchor.
            page_url = parsed_url._replace(fragment='').geturl()
            if page_url not in enqueued:
                enqueued.add(page_url)
                queue.append(page_url)

    # Update DataFrame and save. Keep the header even when there are no rows
    # so the file can be read back by the next run.
    df = pd.DataFrame(data) if data else pd.DataFrame(columns=['id', 'url'])
    df.to_csv(os.path.join(output_dir, 'url_mapping.csv'), index=False)
    return df, id_counter



In [12]:
# Start URLs: the breadth-first crawl of each website begins at its entry here
WEBSITES = [
    'https://studyinthestates.dhs.gov/students/prepare/students-and-the-form-i-20',
    'https://www.uscis.gov/working-in-the-united-states/students-and-exchange-visitors/optional-practical-training-opt-for-f-1-students',
]
BREADTH = 5  # Max pages per website
OUTPUT_DIR = 'output/'  # Receives the <id>.txt page files and url_mapping.csv

if __name__ == "__main__":
    # Neither open() nor DataFrame.to_csv() creates a missing directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Initialize ID counter from existing data so new files continue the
    # numbering of earlier runs
    _, _, id_counter = load_existing_data(output_dir=OUTPUT_DIR)

    for website in WEBSITES:
        print(f"\nScraping: {website}")
        df, id_counter = scrape_website(website, max_pages=BREADTH, output_dir=OUTPUT_DIR, id_counter=id_counter)

    print("\nScraping completed. DataFrame saved as 'url_mapping.csv'")


Scraping: https://studyinthestates.dhs.gov/students/prepare/students-and-the-form-i-20
Skipping already visited URL: https://studyinthestates.dhs.gov/students/prepare/students-and-the-form-i-20

Scraping: https://www.uscis.gov/working-in-the-united-states/students-and-exchange-visitors/optional-practical-training-opt-for-f-1-students
Skipping already visited URL: https://www.uscis.gov/working-in-the-united-states/students-and-exchange-visitors/optional-practical-training-opt-for-f-1-students

Scraping completed. DataFrame saved as 'url_mapping.csv'
