In [3]:
%pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m286.7/298.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


Trying with 10 chunks and 3 papers

It run for 9 min

In [8]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen, urlretrieve
from IPython.display import Markdown, display
from pypdf import PdfReader
from datetime import date
from tqdm import tqdm
from transformers import pipeline

# Configuration variables
MAX_PAPERS = 3
MAX_CHUNKS = 10

# Initialize BART-CNN summarizer with specific parameters
summarizer = pipeline("summarization",
                     model="facebook/bart-large-cnn",
                     device=0)  # Use GPU if available

# HuggingFace papers scraping
BASE_URL = "https://huggingface.co/papers"
page = requests.get(BASE_URL)
soup = BeautifulSoup(page.content, "html.parser")
h3s = soup.find_all("h3")
papers = []
for h3 in h3s[:MAX_PAPERS]:  # Only process MAX_PAPERS
    a = h3.find("a")
    title = a.text
    link = a["href"].replace('/papers', '')
    papers.append({"title": title, "url": f"https://arxiv.org/pdf{link}"})
print("printing papers:", papers)
def extract_pdf(url):
    pdf = urlretrieve(url, "pdf_file.pdf")
    reader = PdfReader("pdf_file.pdf")
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

def printmd(string):
    display(Markdown(string))

def chunk_text(text, max_chunk_length=1024):
    """Split text into chunks that BART can process"""
    sentences = text.split('.')
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence = sentence.strip() + '.'
        if current_length + len(sentence) <= max_chunk_length:
            current_chunk.append(sentence)
            current_length += len(sentence)
        else:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def summarize_text(text, max_summary_length=150):
    """Summarize text using BART-CNN with optimized chunking"""
    # Clean and prepare text
    text = ' '.join(text.split())  # Remove excessive whitespace

    # Only process first 5000 words to avoid excessive processing
    text = ' '.join(text.split()[:5000])

    # Split into smaller chunks
    chunks = chunk_text(text, max_chunk_length=1024)

    # Process only specified number of chunks
    chunks = chunks[:MAX_CHUNKS]
    print(f"Processing {len(chunks)} chunks...")

    summaries = []
    for i, chunk in enumerate(chunks):
        try:
            print(f"Processing chunk {i+1}/{len(chunks)}")
            summary = summarizer(chunk,
                               max_length=max_summary_length,
                               min_length=30,
                               do_sample=False,
                               truncation=True)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Chunk summarization failed: {e}")
            continue

    # Combine and summarize again if needed (takes time)
    final_summary = ' '.join(summaries)
    if len(final_summary.split()) > max_summary_length:
        try:
            final_summary = summarizer(final_summary,
                                     max_length=max_summary_length,
                                     min_length=30,
                                     do_sample=False)[0]['summary_text']
        except Exception as e:
            print(f"Final summarization failed: {e}")

    return final_summary

# Process papers with progress bar
for paper in tqdm(papers):
    try:
        print(f"\nProcessing: {paper['title']}")
        text = extract_pdf(paper['url'])
        print(f"Text extracted, length: {len(text.split())} words")
        paper["summary"] = summarize_text(text)
        print(f"Summary generated, length: {len(paper['summary'].split())} words")
    except Exception as e:
        print(f"Error processing paper: {e}")
        paper["summary"] = "Processing failed"

# Generate markdown output
output = "# Paper Summaries\n\n"
for paper in papers:
    # Make sure we have both 'title' and 'summary' keys
    title = paper.get('title', 'No Title')
    summary = paper.get('summary', 'No Summary')
    output += f"## {title}\n\n{summary}\n\n---\n\n"

printmd(output)

Device set to use cpu


[{'title': '2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining', 'url': 'https://arxiv.org/pdf/2501.00958'}, {'title': 'VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion Control', 'url': 'https://arxiv.org/pdf/2501.01427'}, {'title': 'CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings', 'url': 'https://arxiv.org/pdf/2501.01257'}]


  0%|          | 0/3 [00:00<?, ?it/s]


Processing: 2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Text extracted, length: 12256 words
Processing 10 chunks...
Processing chunk 1/10
Processing chunk 2/10
Processing chunk 3/10
Processing chunk 4/10
Processing chunk 5/10
Processing chunk 6/10
Processing chunk 7/10
Processing chunk 8/10
Processing chunk 9/10
Processing chunk 10/10


 33%|███▎      | 1/3 [03:20<06:41, 200.52s/it]

Summary generated, length: 46 words

Processing: VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion Control
Text extracted, length: 6432 words
Processing 10 chunks...
Processing chunk 1/10
Processing chunk 2/10
Processing chunk 3/10
Processing chunk 4/10
Processing chunk 5/10
Processing chunk 6/10
Processing chunk 7/10
Processing chunk 8/10
Processing chunk 9/10
Processing chunk 10/10


 67%|██████▋   | 2/3 [06:08<03:01, 181.23s/it]

Summary generated, length: 40 words

Processing: CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings
Text extracted, length: 8689 words
Processing 10 chunks...
Processing chunk 1/10
Processing chunk 2/10
Processing chunk 3/10
Processing chunk 4/10
Processing chunk 5/10
Processing chunk 6/10


Your max_length is set to 150, but your input_length is only 136. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)


Processing chunk 7/10
Processing chunk 8/10
Processing chunk 9/10
Processing chunk 10/10


100%|██████████| 3/3 [09:14<00:00, 184.90s/it]

Summary generated, length: 32 words





# Paper Summaries

## 2.5 Years in Class: A Multimodal Textbook for Vision-Language Pretraining

Interleaved corpora enable Vision-Language Models (VLMs) to understand the world more naturally like humans. Such exist- ing datasets are crawled from webpage, facing challenges like low knowledge density and loose image-text relations. Our textbook collects over 2. 5 years of instructional videos, totaling 22,000 class hours.

---

## VideoAnydoor: High-fidelity Video Object Insertion with Precise Motion Control

VideoAnydoor is a zero-shot video object insertion frame- work. It warps the pixel details according to the trajectories and fuses the warped features with the diffusion U-Net. Users could further add multiple objects or swap objects in the same video.

---

## CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings

CodeElo is a benchmarking tool for large language models (LLMs) o1-mini and QwQ-32B-Preview stand out significantly, achieving Elo ratings of 1578 and 1261, respectively. Other models struggle even with the easiest problems.

---

