In [None]:
import aiohttp
import asyncio
import pandas as pd
import time
import random
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import nest_asyncio

# NOTE(review): presumably applied so that asyncio.run() below works inside a
# notebook kernel that already has a running event loop -- confirm.
nest_asyncio.apply()

# Human-readable subject name -> arXiv category code. Only papers whose
# subject line contains at least one of these codes are kept.
topics = {
    "Artificial Intelligence": "cs.AI",
    "Hardware Architecture": "cs.AR",
    "Computation and Language": "cs.CL",
    "Computer Vision and Pattern Recognition": "cs.CV",
    "Databases": "cs.DB",
    "Data Structures and Algorithms": "cs.DS",
    "Formal Languages and Automata Theory": "cs.FL",
    "Computer Science and Game Theory": "cs.GT",
    "Human-Computer Interaction": "cs.HC",
    "Information Retrieval": "cs.IR",
    "Machine Learning": "cs.LG",
    "Sound": "cs.SD",
    "Robotics": "cs.RO",
    "Software Engineering": "cs.SE",
    "Cryptography and Security": "cs.CR"
}
# Set of accepted category codes, used for membership/intersection tests.
VALID_CATEGORIES = set(topics.values())

# Global accumulator of scraped article records (one dict per article);
# extended by scrape_year and written out by save_checkpoint / the final save.
data = []
# A checkpoint CSV is written each time len(data) crosses a multiple of this.
CHECKPOINT_INTERVAL = 5000

# HTTP headers sent with every request issued by fetch_page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

async def fetch_page(session, url):
    """Download *url* and return its body as text, or ``None`` on failure.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the GET.
        url: Absolute URL to fetch.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection error, non-success HTTP status, timeout, or a body that
        cannot be decoded). Failures are deliberately not raised so that one
        bad page does not abort a whole scraping batch.
    """
    # A bare number for ``timeout`` is deprecated in aiohttp; ClientTimeout
    # expresses the same 5-second total budget explicitly.
    request_timeout = aiohttp.ClientTimeout(total=5)
    try:
        async with session.get(url, headers=headers, timeout=request_timeout) as response:
            response.raise_for_status()
            return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError, OSError, UnicodeDecodeError):
        # Best-effort scraping: network/HTTP/decoding problems yield None,
        # while genuine programming errors are allowed to surface.
        return None

def parse_total_entries(html):
    """Return the entry count announced on an arXiv listing page.

    Looks for a ``<div class="paging">`` element containing the text
    ``"Total of N entries"``.

    Args:
        html: Listing page markup as already-decoded text.

    Returns:
        ``N`` as an ``int``, or ``0`` when the paging element or the count
        text is not present.
    """
    # The markup is already a str, so no ``from_encoding`` is passed: bs4
    # ignores that argument for text input and emits a warning on each call.
    soup = BeautifulSoup(html, "html.parser")
    paging_div = soup.find("div", class_="paging")
    if paging_div is None:
        return 0

    match = re.search(r"Total of (\d+) entries", paging_div.text)
    return int(match.group(1)) if match else 0

async def scrape_abstract(session, link, title):
    """Fetch one arXiv abstract page and build a record for the article.

    Args:
        session: An open ``aiohttp.ClientSession``.
        link: Absolute URL of the article's abstract page.
        title: Article title, already extracted from the listing page.

    Returns:
        A dict with keys ``"Title"``, ``"Link"``, ``"Category"`` and
        ``"Abstract"``; or ``None`` when the page cannot be fetched, has no
        subjects cell, or lists none of the categories in
        ``VALID_CATEGORIES``.
    """
    html = await fetch_page(session, link)
    if not html:
        return None

    # fetch_page returns decoded text, so ``from_encoding`` (bytes-only, and
    # warned about by bs4 when given with str markup) is not passed.
    soup = BeautifulSoup(html, "html.parser")
    subjects_elem = soup.find("td", class_="tablecell subjects")
    if subjects_elem is None:
        return None

    # Category codes appear in parentheses, e.g. "Machine Learning (cs.LG)".
    listed_codes = re.findall(r'\(([\w\.]+)\)', subjects_elem.text)
    # Take the first accepted code in page order. Picking an arbitrary
    # element of a set intersection gave a label that could differ between
    # runs for papers matching several categories.
    matched_category = next(
        (code for code in listed_codes if code in VALID_CATEGORIES), None
    )
    if matched_category is None:
        return None

    category_name = next(
        (name for name, code in topics.items() if code == matched_category),
        "Unknown",
    )

    abstract_elem = soup.find("blockquote", class_="abstract")
    if abstract_elem is None:
        abstract = "Not found"
    else:
        abstract = abstract_elem.text.strip()
        # Drop only the leading "Abstract:" label; a blanket replace() would
        # also delete the same text occurring inside the abstract itself.
        if abstract.startswith("Abstract:"):
            abstract = abstract[len("Abstract:"):].strip()

    return {
        "Title": title,
        "Link": link,
        "Category": category_name,
        "Abstract": abstract
    }

async def scrape_page(session, page_url, year):
    """Scrape one arXiv listing page and all abstracts it links to.

    Args:
        session: An open ``aiohttp.ClientSession``.
        page_url: URL of the listing page.
        year: Year being scraped; accepted for interface compatibility and
            not used inside this function.

    Returns:
        A list of article record dicts (see ``scrape_abstract``). Empty when
        the page cannot be fetched, has no ``<dl id="articles">`` element,
        or yields no usable entries. Abstract fetches that fail or are
        filtered out are omitted.
    """
    html = await fetch_page(session, page_url)
    if not html:
        return []

    # Markup is already decoded text; ``from_encoding`` only applies to bytes
    # and makes bs4 warn when combined with str input.
    soup = BeautifulSoup(html, "html.parser")
    article_list = soup.find("dl", id="articles")
    if article_list is None:
        return []

    # Each article is a <dt> (links) followed by a <dd> (metadata).
    dt_elements = article_list.find_all("dt")
    dd_elements = article_list.find_all("dd")

    tasks = []
    for dt, dd in zip(dt_elements, dd_elements):
        link_elem = dt.find("a", title="Abstract")
        if link_elem is None:
            continue
        href = link_elem.get("href")
        if not href:
            # An anchor without href would otherwise raise KeyError and
            # discard the whole page; skip just this entry instead.
            continue

        title_elem = dd.find("div", class_="list-title")
        if title_elem is None:
            continue
        title = title_elem.text.strip()
        # Remove only the leading "Title:" label so titles that themselves
        # contain "Title:" are left intact.
        if title.startswith("Title:"):
            title = title[len("Title:"):].strip()

        tasks.append(scrape_abstract(session, "https://arxiv.org" + href, title))

    if not tasks:
        return []

    # return_exceptions=True keeps one failing abstract from cancelling the
    # rest; anything that is not a record dict (None or an exception) is dropped.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    return [result for result in results if isinstance(result, dict)]

async def scrape_year(year):
    """Scrape every arXiv ``cs`` listing page for *year* into ``data``.

    Reads the first listing page to learn the total number of entries, then
    walks all listing pages in batches of ten concurrent pages, appending
    the resulting records to the module-level ``data`` list. A checkpoint
    is saved whenever ``len(data)`` crosses a multiple of
    ``CHECKPOINT_INTERVAL``. Prints a message and returns early if the
    first page cannot be loaded or no entry count can be found.

    Args:
        year: Year identifier used in the listing URL.
    """
    page_size = 50
    batch_size = 10
    base_url = f"https://arxiv.org/list/cs/{year}"

    async with aiohttp.ClientSession() as session:
        first_page = await fetch_page(session, f"{base_url}?skip=0&show=50")
        if not first_page:
            print(f"Failed to load initial page for year {year}")
            return

        total_entries = parse_total_entries(first_page)
        if not total_entries:
            print(f"Could not determine total entries for year {year}")
            return

        print(f"Scraping year {year} with {total_entries} total entries")

        # Ceiling division: number of listing pages needed for all entries.
        total_pages = -(-total_entries // page_size)
        page_urls = [
            f"{base_url}?skip={offset}&show={page_size}"
            for offset in range(0, total_pages * page_size, page_size)
        ]

        with tqdm(total=total_pages, desc=f"Year {year} Progress") as pbar:
            for start in range(0, len(page_urls), batch_size):
                pending = [
                    scrape_page(session, url, year)
                    for url in page_urls[start:start + batch_size]
                ]
                outcomes = await asyncio.gather(*pending, return_exceptions=True)
                for outcome in outcomes:
                    # Exceptions returned by gather are skipped but still
                    # counted as processed pages on the progress bar.
                    if isinstance(outcome, list):
                        count_before = len(data)
                        data.extend(outcome)
                        if len(data) // CHECKPOINT_INTERVAL > count_before // CHECKPOINT_INTERVAL:
                            save_checkpoint()
                    pbar.update(1)
                # Randomised pause between batches.
                await asyncio.sleep(random.uniform(0.5, 1.5))

def save_checkpoint():
    """Write everything collected so far in ``data`` to the output CSV.

    The file is overwritten on every call, and a one-line progress message
    with the current record count is printed.
    """
    pd.DataFrame(data).to_csv(
        "/kaggle/working/arxiv_cs_2017_2025.csv",
        index=False,
        encoding="utf-8",
    )
    print(f"Checkpoint: {len(data)} articles saved")

if __name__ == "__main__":
    # Scrape each year in turn, pausing a few seconds between years, then
    # write the full result set (if any) to the output CSV.
    output_path = "/kaggle/working/arxiv_cs_2017_2025.csv"

    for year in range(2017, 2026):
        asyncio.run(scrape_year(year))
        time.sleep(random.uniform(2, 5))

    if not data:
        print("No data scraped")
    else:
        df = pd.DataFrame(data)
        df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"Final save: {len(data)} articles saved to {output_path}")



Scraping year 2017 with 32728 total entries


Year 2017 Progress:  52%|█████▏    | 339/655 [04:48<03:28,  1.52it/s]

Checkpoint: 5005 articles saved


Year 2017 Progress: 100%|██████████| 655/655 [09:31<00:00,  1.15it/s]


Scraping year 2018 with 43957 total entries


Year 2018 Progress:   1%|▏         | 11/880 [00:16<19:29,  1.35s/it] 

Checkpoint: 10012 articles saved


Year 2018 Progress:  35%|███▌      | 311/880 [04:47<08:40,  1.09it/s]

Checkpoint: 15001 articles saved


Year 2018 Progress:  66%|██████▌   | 580/880 [09:40<04:10,  1.20it/s]

Checkpoint: 20023 articles saved


Year 2018 Progress:  97%|█████████▋| 852/880 [14:53<00:30,  1.09s/it]

Checkpoint: 25002 articles saved


Year 2018 Progress: 100%|██████████| 880/880 [15:16<00:00,  1.04s/it]


Scraping year 2019 with 56002 total entries


Year 2019 Progress:  20%|██        | 225/1121 [04:34<15:45,  1.05s/it]

Checkpoint: 30029 articles saved


Year 2019 Progress:  40%|████      | 453/1121 [09:15<12:54,  1.16s/it]

Checkpoint: 35031 articles saved


Year 2019 Progress:  61%|██████    | 683/1121 [13:51<08:27,  1.16s/it]

Checkpoint: 40026 articles saved


Year 2019 Progress:  81%|████████▏ | 911/1121 [18:35<04:22,  1.25s/it]

Checkpoint: 45007 articles saved


Year 2019 Progress: 100%|██████████| 1121/1121 [22:37<00:00,  1.21s/it]


Scraping year 2020 with 71440 total entries


Year 2020 Progress:   4%|▎         | 52/1429 [01:17<31:18,  1.36s/it] 

Checkpoint: 50035 articles saved


Year 2020 Progress:  19%|█▉        | 278/1429 [06:09<20:22,  1.06s/it]

Checkpoint: 55010 articles saved


Year 2020 Progress:  36%|███▌      | 510/1429 [11:00<14:23,  1.06it/s]

Checkpoint: 60023 articles saved


Year 2020 Progress:  51%|█████▏    | 733/1429 [16:00<14:24,  1.24s/it]

Checkpoint: 65005 articles saved


Year 2020 Progress:  67%|██████▋   | 960/1429 [20:38<07:09,  1.09it/s]

Checkpoint: 70016 articles saved


Year 2020 Progress:  82%|████████▏ | 1178/1429 [25:26<04:22,  1.05s/it]

Checkpoint: 75000 articles saved


Year 2020 Progress: 100%|██████████| 1429/1429 [30:41<00:00,  1.29s/it]


Scraping year 2021 with 77528 total entries


Year 2021 Progress:   0%|          | 6/1551 [00:15<51:36,  2.00s/it]  

Checkpoint: 80037 articles saved


Year 2021 Progress:  15%|█▌        | 237/1551 [05:10<23:32,  1.08s/it]

Checkpoint: 85017 articles saved


Year 2021 Progress:  31%|███▏      | 485/1551 [10:16<19:44,  1.11s/it]

Checkpoint: 90000 articles saved


Year 2021 Progress:  46%|████▌     | 706/1551 [15:05<15:41,  1.11s/it]

Checkpoint: 95038 articles saved


Year 2021 Progress:  60%|█████▉    | 927/1551 [19:52<11:15,  1.08s/it]

Checkpoint: 100028 articles saved


Year 2021 Progress:  74%|███████▍  | 1152/1551 [24:54<09:07,  1.37s/it]

Checkpoint: 105020 articles saved


Year 2021 Progress:  89%|████████▊ | 1376/1551 [29:45<03:26,  1.18s/it]

Checkpoint: 110002 articles saved


Year 2021 Progress: 100%|██████████| 1551/1551 [33:35<00:00,  1.30s/it]


Scraping year 2022 with 81982 total entries


Year 2022 Progress:   4%|▍         | 69/1640 [01:28<26:41,  1.02s/it] 

Checkpoint: 115010 articles saved


Year 2022 Progress:  18%|█▊        | 297/1640 [06:27<25:46,  1.15s/it]

Checkpoint: 120000 articles saved


Year 2022 Progress:  31%|███▏      | 513/1640 [11:17<24:43,  1.32s/it]

Checkpoint: 125001 articles saved


Year 2022 Progress:  45%|████▌     | 744/1640 [16:11<18:51,  1.26s/it]

Checkpoint: 130032 articles saved


Year 2022 Progress:  59%|█████▉    | 974/1640 [21:09<13:58,  1.26s/it]

Checkpoint: 135004 articles saved


Year 2022 Progress:  73%|███████▎  | 1193/1640 [25:59<09:45,  1.31s/it]

Checkpoint: 140040 articles saved


Year 2022 Progress:  87%|████████▋ | 1421/1640 [30:56<04:56,  1.35s/it]

Checkpoint: 145017 articles saved


Year 2022 Progress: 100%|██████████| 1640/1640 [35:30<00:00,  1.30s/it]


Scraping year 2023 with 99628 total entries


Year 2023 Progress:   2%|▏         | 31/1993 [00:54<53:52,  1.65s/it]  

Checkpoint: 150012 articles saved


Year 2023 Progress:  13%|█▎        | 268/1993 [05:51<32:02,  1.11s/it]

Checkpoint: 155003 articles saved


Year 2023 Progress:  25%|██▌       | 499/1993 [10:59<29:04,  1.17s/it]

Checkpoint: 160033 articles saved


Year 2023 Progress:  37%|███▋      | 735/1993 [16:06<23:55,  1.14s/it]

Checkpoint: 165034 articles saved


Year 2023 Progress:  50%|████▉     | 989/1993 [21:18<17:35,  1.05s/it]

Checkpoint: 170012 articles saved


Year 2023 Progress:  61%|██████▏   | 1222/1993 [26:24<16:40,  1.30s/it]

Checkpoint: 175031 articles saved


Year 2023 Progress:  75%|███████▌  | 1499/1993 [31:45<09:43,  1.18s/it]

Checkpoint: 180018 articles saved


Year 2023 Progress:  87%|████████▋ | 1738/1993 [36:56<04:53,  1.15s/it]

Checkpoint: 185004 articles saved


Year 2023 Progress: 100%|██████████| 1993/1993 [42:13<00:00,  1.27s/it]


Scraping year 2024 with 125573 total entries


Year 2024 Progress:   2%|▏         | 54/2512 [01:01<47:10,  1.15s/it] 

Checkpoint: 190021 articles saved


Year 2024 Progress:  17%|█▋        | 415/2512 [07:32<40:50,  1.17s/it]

Checkpoint: 195010 articles saved


Year 2024 Progress:  33%|███▎      | 833/2512 [14:36<30:53,  1.10s/it]

Checkpoint: 200032 articles saved


Year 2024 Progress:  48%|████▊     | 1199/2512 [21:10<23:41,  1.08s/it]

Checkpoint: 205001 articles saved


Year 2024 Progress:  68%|██████▊   | 1719/2512 [29:14<11:43,  1.13it/s]

Checkpoint: 210015 articles saved


Year 2024 Progress:  85%|████████▍ | 2133/2512 [36:19<08:14,  1.30s/it]

Checkpoint: 215005 articles saved


Year 2024 Progress: 100%|██████████| 2512/2512 [42:07<00:00,  1.01s/it]


Scraping year 2025 with 32794 total entries


Year 2025 Progress:  11%|█         | 69/656 [01:22<10:09,  1.04s/it] 

Checkpoint: 220021 articles saved


Year 2025 Progress:  42%|████▏     | 274/656 [05:31<08:05,  1.27s/it]

Checkpoint: 225013 articles saved


Year 2025 Progress:  72%|███████▏  | 472/656 [09:29<04:13,  1.38s/it]

Checkpoint: 230026 articles saved


Year 2025 Progress: 100%|██████████| 656/656 [12:53<00:00,  1.18s/it]


Final save: 234170 articles saved to /kaggle/working/arxiv_cs_2017_2025.csv
