In [None]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import sqlite3
import nest_asyncio
import os

nest_asyncio.apply()  

BASE_URL = "https://papers.nips.cc"
DB_FILE = r"C:\kinge\neurips_papers.db"  

async def fetch(session, url):
    """Fetch page content asynchronously."""
    async with session.get(url) as response:
        return await response.text()

async def get_year_links(session):
    """Extract valid NeurIPS proceedings year links."""
    url = BASE_URL  
    html = await fetch(session, url)
    soup = BeautifulSoup(html, "html.parser")

    year_links = [
        BASE_URL + a["href"]
        for a in soup.select("a[href^='/paper_files/paper/']")
    ]

    print(f"✅ Found {len(year_links)} year links.")
    return year_links[:5]  

async def parse_papers(year_url, session):
    """Extract paper titles and links from a given year's page."""
    papers = []
    html = await fetch(session, year_url)
    soup = BeautifulSoup(html, "html.parser")

    paper_elements = soup.select("a[title][href*='Abstract-Conference.html']")

    print(f"🔍 Scraping {year_url}: Found {len(paper_elements)} papers.")  
    for paper in paper_elements:
        paper_title = paper["title"].strip() 
        paper_link = BASE_URL + paper["href"]  
        papers.append((paper_title, paper_link))
    
    return papers

async def scrape_neurips():
    """Main function to scrape NeurIPS papers."""
    async with aiohttp.ClientSession() as session:
        year_links = await get_year_links(session)
        
        if not year_links:
            print("❌ No year links found. The website structure might have changed.")
            return

        tasks = [parse_papers(year_url, session) for year_url in year_links]
        
        results = await asyncio.gather(*tasks)
        all_papers = [paper for result in results for paper in result] 
        save_to_db(all_papers)

def save_to_db(papers):
    """Save extracted papers to SQLite database."""
    if not papers:
        print("❌ No papers found. Database not updated.")
        return

    try:
        os.makedirs(os.path.dirname(DB_FILE), exist_ok=True)
        conn = sqlite3.connect(DB_FILE)
        cursor = conn.cursor()
        
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS papers (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                link TEXT UNIQUE
            )
        """)
        
        cursor.executemany("INSERT OR IGNORE INTO papers (title, link) VALUES (?, ?)", papers)
        conn.commit()
        conn.close()

        print(f"✅ Successfully saved {len(papers)} papers to database.")

    except Exception as e:
        print(f"❌ Database Error: {e}")

await scrape_neurips()


✅ Found 38 year links.
🔍 Scraping https://papers.nips.cc/paper_files/paper/2020: Found 0 papers.
🔍 Scraping https://papers.nips.cc/paper_files/paper/2021: Found 0 papers.
🔍 Scraping https://papers.nips.cc/paper_files/paper/2023: Found 3218 papers.
🔍 Scraping https://papers.nips.cc/paper_files/paper/2022: Found 2671 papers.
🔍 Scraping https://papers.nips.cc/paper_files/paper/2024: Found 4035 papers.
✅ Successfully saved 9924 papers to database.


In [None]:
import sqlite3

DB_FILE = r"C:\kinge\neurips_papers.db"

conn = sqlite3.connect(DB_FILE)
cursor = conn.cursor()

cursor.execute("SELECT * FROM papers LIMIT 5")
rows = cursor.fetchall()

for row in rows:
    print(row)

conn.close()


(1, 'paper title', 'https://papers.nips.cc/paper_files/paper/2024/hash/000f947dcaff8fbffcc3f53a1314f358-Abstract-Conference.html')
(2, 'paper title', 'https://papers.nips.cc/paper_files/paper/2024/hash/00295cede6e1600d344b5cd6d9fd4640-Abstract-Conference.html')
(3, 'paper title', 'https://papers.nips.cc/paper_files/paper/2024/hash/00532321a253959cedc4f971b5524131-Abstract-Conference.html')
(4, 'paper title', 'https://papers.nips.cc/paper_files/paper/2024/hash/005413e90d003d13886019607b037f52-Abstract-Conference.html')
(5, 'paper title', 'https://papers.nips.cc/paper_files/paper/2024/hash/00616a2d48f5716f3d6f783491149364-Abstract-Conference.html')


In [None]:

def load_api_key(file_path):
    with open(file_path, "r") as file:
        return file.read().strip()

OPENAI_API_KEY = load_api_key("api_key.txt") 

print("✅ API Key Loaded Successfully")


✅ API Key Loaded Successfully


In [None]:
import openai
import pandas
import sqlite3  

print("✅ All libraries installed successfully!")


✅ All libraries installed successfully!


In [None]:
import sqlite3

DB_FILE = "neurips_papers.db"

conn = sqlite3.connect(DB_FILE)
cursor = conn.cursor()

try:
    cursor.execute("ALTER TABLE papers ADD COLUMN category TEXT")
    print("✅ 'category' column added successfully.")
except sqlite3.OperationalError:
    print("⚠️ 'category' column already exists.")

conn.commit()
conn.close()


✅ 'category' column added successfully.


In [None]:
import pandas as pd
import sqlite3

CSV_FILE = "neurips_papers.csv"  
DB_FILE = "neurips_papers.db"

df = pd.read_csv(CSV_FILE)

if "title" not in df.columns:
    print("🚨 The CSV does not contain a 'title' column!")
    exit()

conn = sqlite3.connect(DB_FILE)
cursor = conn.cursor()

cursor.execute("""
    CREATE TABLE IF NOT EXISTS papers (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        category TEXT
    )
""")

for _, row in df.iterrows():
    cursor.execute("INSERT INTO papers (title) VALUES (?)", (row["title"],))

conn.commit()
conn.close()

print("✅ Papers successfully reloaded into the database!")


✅ Papers successfully reloaded into the database!


In [18]:
import sqlite3

conn = sqlite3.connect("neurips_papers.db")
cursor = conn.cursor()

cursor.execute("SELECT COUNT(*) FROM papers")
paper_count = cursor.fetchone()[0]

print(f"📄 Total Papers in Database: {paper_count}")

conn.close()


📄 Total Papers in Database: 9924


In [None]:
import aiohttp
import asyncio
import sqlite3

with open("api_key.txt", "r") as f:
    API_KEY = f.read().strip()

DB_FILE = "neurips_papers.db"
CATEGORIES = [
    "Deep Learning", "Natural Language Processing (NLP)",
    "Reinforcement Learning", "Optimization", "Computer Vision", "Other"
]

async def classify_paper(title):
    """Force Gemini to classify the paper into one of the categories."""
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent"
    headers = {"Content-Type": "application/json"}
    params = {"key": API_KEY}

    prompt_text = (
        "**IMPORTANT INSTRUCTIONS:** You MUST classify the following research paper title "
        "into exactly ONE of these categories: \n"
        "1. Deep Learning\n"
        "2. Natural Language Processing (NLP)\n"
        "3. Reinforcement Learning\n"
        "4. Optimization\n"
        "5. Computer Vision\n\n"
        "**RULES:**\n"
        "- Return ONLY the category name, with NO extra text.\n"
        "- If unsure, choose the CLOSEST matching category.\n"
        "- NEVER return 'Other' unless it truly does not fit any category.\n\n"
        f"**Title:** \"{title}\"\n"
        "**Category:**"
    )

    data = {"contents": [{"parts": [{"text": prompt_text}]}]}

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, params=params, json=data) as response:
            if response.status != 200:
                print(f"🚨 Error classifying '{title}': {await response.text()}")
                return "Other"

            result = await response.json()
            try:
                category = result["candidates"][0]["content"]["parts"][0]["text"].strip()

                return category if category in CATEGORIES else "Other"
            except (KeyError, IndexError):
                return "Other"

def fetch_papers():
    """Retrieve unannotated papers from the database."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute("SELECT id, title FROM papers WHERE category IS NULL LIMIT 10")
    papers = cursor.fetchall()
    conn.close()
    return papers

def update_paper_category(paper_id, category):
    """Update the category of a paper in the database."""
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    cursor.execute("UPDATE papers SET category = ? WHERE id = ?", (category, paper_id))
    conn.commit()
    conn.close()

async def annotate_papers():
    """Fetch papers, classify them, and update the database."""
    papers = fetch_papers()
    if not papers:
        print("✅ No new papers to annotate.")
        return

    print(f"🔍 Found {len(papers)} papers to annotate...")

    for paper_id, title in papers:
        category = await classify_paper(title)
        update_paper_category(paper_id, category)
        print(f"📌 Annotated: {title} → {category}")

    print("🎉 Annotation process completed!")

asyncio.run(annotate_papers())


🔍 Found 10 papers to annotate...
📌 Annotated: paper title → Computer Vision
📌 Annotated: paper title → Natural Language Processing (NLP)
📌 Annotated: paper title → Natural Language Processing (NLP)
📌 Annotated: paper title → Computer Vision
📌 Annotated: paper title → Computer Vision
📌 Annotated: paper title → Computer Vision
📌 Annotated: paper title → Computer Vision
📌 Annotated: paper title → Computer Vision
📌 Annotated: paper title → Optimization
📌 Annotated: paper title → Computer Vision
🎉 Annotation process completed!


In [28]:
import pandas as pd
import sqlite3

DB_FILE = "neurips_papers.db"
OUTPUT_CSV = "neurips_papers_annotated.csv"

def export_to_csv():
    """Export the annotated dataset to a CSV file."""
    conn = sqlite3.connect(DB_FILE)
    df = pd.read_sql_query("SELECT * FROM papers WHERE category IS NOT NULL", conn)
    df.to_csv(OUTPUT_CSV, index=False)
    conn.close()
    print(f"📁 Dataset saved as '{OUTPUT_CSV}' ✅")

export_to_csv()


📁 Dataset saved as 'neurips_papers_annotated.csv' ✅
