# ArXiv Paper Scraper

## 1. Setup Environment

Install required dependencies and clone the repository.

In [None]:
!pip install arxiv requests psutil python-dotenv matplotlib -q

In [None]:
# Clone repository (if not already cloned)
import os
if not os.path.exists('Crawler'):
    !git clone https://github.com/HieuMagic/Crawler.git
%cd Crawler

## 2. Configuration

Set your scraping parameters and Semantic Scholar API key (optional).

In [None]:
# User-editable settings for this run.
START_ID = '2311.05222'
END_ID = '2311.05230'
STUDENT_ID = '23120258'
NUM_WORKERS = 5
SEMANTIC_SCHOLAR_API_KEY = ''  # Optional: add your API key here for faster rate limits

# (Re)create .env on every run; the file is left empty when no key is given.
env_body = f'SEMANTIC_SCHOLAR_API_KEY={SEMANTIC_SCHOLAR_API_KEY}\n' if SEMANTIC_SCHOLAR_API_KEY else ''
with open('.env', 'w') as env_file:
    env_file.write(env_body)

# Echo the active settings so the choices are visible in the cell output.
key_status = 'Configured' if SEMANTIC_SCHOLAR_API_KEY else 'Not set'
summary_lines = [
    f"Configuration set:",
    f"  Papers: {START_ID} to {END_ID}",
    f"  Student ID: {STUDENT_ID}",
    f"  Workers: {NUM_WORKERS}",
    f"  API Key: {key_status}",
]
print("\n".join(summary_lines))

In [None]:
# Regenerate src/config.py from the settings chosen in the configuration cell.
# String settings are interpolated with !r so the generated file always holds
# valid Python literals, even if a value contains a quote or a backslash.
# The API key is not embedded: the generated module reads it from the
# environment via os.getenv after calling load_dotenv().
config_content = f"""import os
from dotenv import load_dotenv

load_dotenv()

CONFIG = {{
    'start_id': {START_ID!r},
    'end_id': {END_ID!r},
    'student_id': {STUDENT_ID!r},
    
    'output_dir': './data',
    'stats_file': './statistics.json',
    'progress_file': './progress.json',
    
    'num_workers': {NUM_WORKERS},
    
    'ss_api_key': os.getenv('SEMANTIC_SCHOLAR_API_KEY'),
    
    'max_retries': 3,
    'timeout': 30,
    
    'resume': True
}}
"""

# Overwrites any existing src/config.py. The encoding is explicit so non-ASCII
# values are written the same way on every platform.
with open('src/config.py', 'w', encoding='utf-8') as f:
    f.write(config_content)

print("✓ Configuration file updated")

## 3. Run Scraper

Start the scraping process. Progress is automatically saved, so you can resume if interrupted.

In [None]:
# Launch the scraper script in a subprocess using the `python` found on PATH.
# NOTE(review): presumably main.py picks up the settings written to
# src/config.py by the previous cell — confirm against the repository.
!python src/main.py

## 4. View Results

Check the statistics and output files.

In [None]:
import json

# Same file the generated src/config.py names under 'stats_file'.
STATS_FILE = 'statistics.json'


def format_statistics(stats):
    """Render the scraper statistics as a plain-text report.

    Args:
        stats: dict loaded from the statistics file. Every key read below
            must be present; 'error_breakdown' must map error type -> count.

    Returns:
        The multi-line report as a single string (no trailing newline).

    Raises:
        KeyError: if an expected statistics key is missing.
    """
    rule = "=" * 70
    lines = [rule, "SCRAPING STATISTICS", rule]

    lines.append("\nPapers:")
    lines.append(f"  Total: {stats['total_papers']}")
    lines.append(f"  Successful: {stats['successful_papers']}")
    lines.append(f"  Failed: {stats['failed_papers']}")
    lines.append(f"  Success Rate: {stats['success_rate_percent']:.1f}%")

    lines.append("\nError Breakdown:")
    # Only error types that actually occurred are listed.
    lines.extend(
        f"  {error_type}: {count}"
        for error_type, count in stats['error_breakdown'].items()
        if count > 0
    )

    lines.append("\nContent:")
    lines.append(f"  Versions scraped: {stats['total_versions_scraped']}")
    lines.append(f"  Avg versions/paper: {stats['avg_versions_per_paper']:.2f}")
    lines.append(f"  References scraped: {stats['total_references_scraped']}")
    lines.append(f"  Avg references/paper: {stats['avg_references_per_paper']:.2f}")

    lines.append("\nStorage:")
    lines.append(f"  Max disk usage: {stats['max_disk_usage_mb']:.1f} MB")
    lines.append(f"  Final output size: {stats['final_output_size_mb']:.1f} MB")
    lines.append(f"  .tex files: {stats['tex_file_percent']:.1f}%")
    lines.append(f"  .bib files: {stats['bib_file_percent']:.1f}%")
    lines.append(f"  .json files: {stats['json_file_percent']:.1f}%")

    lines.append("\nPerformance:")
    lines.append(f"  Total runtime: {stats['total_runtime_seconds']:.1f}s")
    lines.append(f"  Avg time/paper: {stats['avg_time_per_paper_seconds']:.1f}s")
    lines.append(f"  Max RAM: {stats['max_ram_mb']:.1f} MB")
    lines.append(f"  Max CPU: {stats['max_cpu_percent']:.1f}%")

    return "\n".join(lines)


# Load and display statistics. A missing file gets a short explanation
# instead of a raw traceback.
try:
    with open(STATS_FILE, 'r') as f:
        stats = json.load(f)
except FileNotFoundError:
    print(f"{STATS_FILE} not found - run the scraper cell (section 3) to completion first.")
else:
    print(format_statistics(stats))