In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin
import os
import re

BASE_URL = "https://www.govinfo.gov"
COLLECTION_URL = f"{BASE_URL}/app/collection/crec"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def get_issue_links(page_url):
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = set()
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/details/CREC-2025' in href or '/app/details/CREC-2025' in href:
                full_url = urljoin(BASE_URL, href)
                links.add(full_url)
        return list(links)
    except Exception as e:
        print(f"Error fetching {page_url}: {e}")
        return []

def extract_issue_data(issue_url):
    try:
        response = requests.get(issue_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        data = {'url': issue_url, 'title': '', 'date': '', 'content': '', 'metadata': {}}
        
        title = soup.find('h1') or soup.find('title')
        if title:
            data['title'] = title.get_text(strip=True)
        
        date = soup.find('time') or soup.find('span', class_=re.compile('date', re.I))
        if date:
            data['date'] = date.get_text(strip=True)
        elif soup.find('meta', property='article:published_time'):
            data['date'] = soup.find('meta', property='article:published_time')['content']
        
        content = soup.find('div', class_=re.compile('content|text|body', re.I)) or soup.find('div', id=re.compile('content|text|body', re.I))
        if content:
            for script in content(['script', 'style', 'nav', 'footer', 'header']):
                script.decompose()
            data['content'] = content.get_text(separator=' ', strip=True)
        else:
            main = soup.find('main') or soup.find('article')
            if main:
                for script in main(['script', 'style', 'nav', 'footer', 'header']):
                    script.decompose()
                data['content'] = main.get_text(separator=' ', strip=True)
        
        for meta in soup.find_all('meta'):
            name = meta.get('name') or meta.get('property', '')
            content = meta.get('content', '')
            if name and content:
                data['metadata'][name] = content
        
        return data
    except Exception as e:
        print(f"Error extracting {issue_url}: {e}")
        return None

def crawl_crec_2025(output_file='dataset/crec_2025.jsonl', max_pages=None, delay=0.5):
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    all_links = []
    page = 1
    
    while True:
        if max_pages and page > max_pages:
            break
        
        page_url = f"{COLLECTION_URL}?pageSize=100&page={page}"
        print(f"Fetching page {page}...")
        
        links = get_issue_links(page_url)
        if not links:
            break
        
        all_links.extend(links)
        print(f"Found {len(links)} links, total: {len(all_links)}")
        
        if len(links) < 100:
            break
        
        page += 1
        time.sleep(1)
    
    print(f"\nTotal links: {len(all_links)}")
    print("Extracting data...\n")
    
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, link in enumerate(all_links, 1):
            print(f"[{i}/{len(all_links)}] {link}")
            data = extract_issue_data(link)
            if data:
                f.write(json.dumps(data, ensure_ascii=False) + '\n')
            time.sleep(delay)
    
    print(f"\nDone! Saved to {output_file}")

crawl_crec_2025()


Fetching page 1...

Total links: 0
Extracting data...


Done! Saved to dataset/crec_2025.jsonl


In [3]:
import requests
from bs4 import BeautifulSoup
import json
import time
import re
import os

ARXIV_BASE = "https://arxiv.org"
ARXIV_LIST_URL = f"{ARXIV_BASE}/list/cs/new"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

def extract_paper_info(dt, dd):
    try:
        arxiv_id = dt.find('a', href=re.compile(r'/abs/'))['href'].split('/')[-1]
        title_elem = dd.find('div', class_='list-title')
        title = title_elem.get_text(strip=True).replace('Title:', '').strip() if title_elem else ''
        
        authors_elem = dd.find('div', class_='list-authors')
        authors = []
        if authors_elem:
            for a in authors_elem.find_all('a'):
                authors.append(a.get_text(strip=True))
        
        subjects_elem = dd.find('div', class_='list-subjects')
        subjects = []
        if subjects_elem:
            subjects_text = subjects_elem.get_text(strip=True).replace('Subjects:', '').strip()
            subjects = [s.strip() for s in subjects_text.split(';') if s.strip()]
        
        abstract_elem = dd.find('p', class_='mathjax')
        abstract = abstract_elem.get_text(strip=True) if abstract_elem else ''
        
        comments_elem = dd.find('div', class_='list-comments')
        comments = comments_elem.get_text(strip=True).replace('Comments:', '').strip() if comments_elem else ''
        
        return {
            'arxiv_id': arxiv_id,
            'title': title,
            'authors': authors,
            'subjects': subjects,
            'abstract': abstract,
            'comments': comments,
            'url': f"{ARXIV_BASE}/abs/{arxiv_id}"
        }
    except Exception as e:
        print(f"Error extracting paper info: {e}")
        return None

def crawl_arxiv_cs_new(output_file='dataset/arxiv_cs_new.jsonl', max_pages=None, delay=1):
    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
    
    all_papers = []
    page = 0
    
    while True:
        if max_pages and page >= max_pages:
            break
        
        if page == 0:
            url = ARXIV_LIST_URL
        else:
            url = f"{ARXIV_LIST_URL}?skip={page * 2000}&show=2000"
        
        print(f"Fetching page {page + 1}...")
        
        try:
            response = requests.get(url, headers=HEADERS, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            dts = soup.find_all('dt')
            if not dts:
                break
            
            papers_on_page = []
            for dt in dts:
                dd = dt.find_next_sibling('dd')
                if dd:
                    paper = extract_paper_info(dt, dd)
                    if paper:
                        papers_on_page.append(paper)
            
            if not papers_on_page:
                break
            
            all_papers.extend(papers_on_page)
            print(f"Found {len(papers_on_page)} papers, total: {len(all_papers)}")
            
            if len(papers_on_page) < 2000:
                break
            
            page += 1
            time.sleep(delay)
            
        except Exception as e:
            print(f"Error fetching page {page + 1}: {e}")
            break
    
    print(f"\nTotal papers: {len(all_papers)}")
    print(f"Saving to {output_file}...\n")
    
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, paper in enumerate(all_papers, 1):
            text = f"Title: {paper['title']}\n"
            text += f"Authors: {', '.join(paper['authors'])}\n"
            text += f"Subjects: {'; '.join(paper['subjects'])}\n"
            if paper['comments']:
                text += f"Comments: {paper['comments']}\n"
            text += f"Abstract: {paper['abstract']}\n"
            
            data = {
                'text': text.strip(),
                'arxiv_id': paper['arxiv_id'],
                'title': paper['title'],
                'authors': paper['authors'],
                'subjects': paper['subjects'],
                'abstract': paper['abstract'],
                'url': paper['url']
            }
            
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
    
    print(f"Done! Saved {len(all_papers)} papers to {output_file}")

crawl_arxiv_cs_new()


Fetching page 1...
Found 767 papers, total: 767

Total papers: 767
Saving to dataset/arxiv_cs_new.jsonl...

Done! Saved 767 papers to dataset/arxiv_cs_new.jsonl
