In [1]:
import requests

# Hypothes.is API endpoint for searching annotations
url = "https://api.hypothes.is/api/search"

# Set parameters: limit to 10 annotations
params = {
    "limit": 10
}

# Make the GET request. requests applies no timeout by default, so an
# explicit one keeps a stalled connection from blocking this cell forever.
response = requests.get(url, params=params, timeout=10)

# Check for successful response
if response.status_code == 200:
    data = response.json()
    # The search endpoint returns the matching annotations under "rows".
    annotations = data.get("rows", [])

    # Print details of each annotation
    for ann in annotations:
        print("Annotation ID:", ann.get("id"))
        print("User:", ann.get("user"))
        print("Text:", ann.get("text"))
        print("Tags:", ann.get("tags"))
        print("-" * 40)
else:
    print("Error:", response.status_code, response.text)

Annotation ID: eFyd7gheEfCcX4ueStVnGw
User: acct:songgao@hypothes.is
Text: Could you explain the idea here further in detail? Bezier curves would deviate from its initial control points, so the trajectory may get out of the convex hull (especially at intersections). How to address this problem?
Tags: []
----------------------------------------
Annotation ID: oV_cdAheEfChTduRNrKCUw
User: acct:GidaldhyHernandez@hypothes.is
Text: The fall of the Roman empire lead to Britain being vulnerable and thus being open for more cultures to be introduced. 

Tags: []
----------------------------------------
Annotation ID: dHK2oAheEfCcWcvcF9FNGA
User: acct:jphan70@hypothes.is
Text: > Resilience rooted in cultural traditions and community well-being.
Tags: []
----------------------------------------
Annotation ID: a0PFTAheEfCt7rM4yNpzQA
User: acct:GidaldhyHernandez@hypothes.is
Text: Dissemination is the act or process of spreading something, so the adoption of Christianity as the official religion of 

In [2]:
import requests
from bs4 import BeautifulSoup
import json
from collections import defaultdict

# Step 1: Query Hypothes.is API for 10 public annotations
api_url = "https://api.hypothes.is/api/search"
params = {
    "limit": 10
}

# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the cell.
response = requests.get(api_url, params=params, timeout=10)
if response.status_code != 200:
    # Raise rather than call exit(): inside a notebook exit() asks the kernel
    # to shut down instead of simply aborting this cell, which would discard
    # all notebook state. An exception stops the cell and keeps the kernel.
    raise RuntimeError(
        f"Error fetching annotations: {response.status_code} {response.text}"
    )

data = response.json()
# The search endpoint returns the matching annotations under "rows".
annotations = data.get("rows", [])

# Step 2: Group annotations by document source URL.
grouped_annotations = defaultdict(list)
for ann in annotations:
    # Each annotation may have multiple targets; we use the first one.
    targets = ann.get("target", [])
    if not targets:
        continue
    source = targets[0].get("source")
    # Skip targets that carry no source URL: grouping them under a None key
    # would only produce a bogus entry whose page fetch is bound to fail.
    if source:
        grouped_annotations[source].append(ann)

def fetch_full_text(url):
    """Download ``url`` and return its visible text, one non-blank line per row.

    The page is parsed with BeautifulSoup, ``<script>`` and ``<style>``
    elements are removed, and every remaining line is stripped of surrounding
    whitespace. Failures are never raised: a non-200 response or any
    exception is reported as an ``"Error..."`` string returned in place of
    the text.
    """
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return f"Error: Received status code {resp.status_code}"

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop script/style nodes so their contents do not end up in the text.
        for node in soup(["script", "style"]):
            node.extract()

        raw_text = soup.get_text(separator="\n")
        # Trim each line, then keep only the ones that still have content.
        trimmed = (ln.strip() for ln in raw_text.splitlines())
        return "\n".join(ln for ln in trimmed if ln)
    except Exception as err:
        return f"Error fetching page: {err}"

def extract_selector_info(target):
    """Return the quote fields of a target's ``TextQuoteSelector``.

    Looks through ``target["selector"]`` (treated as empty when the key is
    absent) and returns a dict with ``exact``, ``prefix`` and ``suffix``.
    If several ``TextQuoteSelector`` entries are present the last one wins;
    if there is none the result is an empty dict.
    """
    quote_selectors = [
        sel
        for sel in target.get("selector", [])
        if sel.get("type") == "TextQuoteSelector"
    ]
    if not quote_selectors:
        return {}

    chosen = quote_selectors[-1]
    return {field: chosen.get(field) for field in ("exact", "prefix", "suffix")}

# Step 3: Build the desired structure.
# One entry per source URL: the page text plus every annotation made on it.
result = []
for source, anns in grouped_annotations.items():
    entry = {
        "source": source,
        "full_text": fetch_full_text(source),
        "annotations": [],
    }

    for ann in anns:
        annotation_entry = {
            "annotation_id": ann.get("id"),
            "annotation_text": ann.get("text"),
            "tags": ann.get("tags", []),
        }

        # Extract selector details from the first target
        targets = ann.get("target", [])
        if targets:
            annotation_entry.update(extract_selector_info(targets[0]))

        entry["annotations"].append(annotation_entry)

    result.append(entry)

# Step 4: Print the structured JSON output.
print(json.dumps(result, indent=2))

[
  {
    "source": "https://pressbooks.pub/earlybritishlit/part/the-middle-ages-ca-476-1485/",
    "full_text": "The Middle Ages (ca. 476-1485) \u2013 An Open Companion to Early British Literature\n\"\nSkip to content\nToggle Menu\nPrimary Navigation\nHome\nRead\nSign in\nSearch in book:\nSearch\nWant to create or adapt books like this?\nLearn more\nabout how Pressbooks supports open publishing practices.\nBook Contents Navigation\nContents\nIntroduction\nThe Project\nHow to teach this course\nAUTHORS\nAre you Teaching this Course? LOOK HERE!\nGet the curriculum companion to this textbook today!\n1,000 YEARS OF stORYTELLING\nAVAILABLE for INSTANT DOWNLOAD at your preferred retailer:\nWhat is Open Pedagogy?\nTheory\nPractice\nHow it is Used in this Book\nAnnotations\nHow to Access and Make Hypothes.is Annotations\nHypothes.is for Education\nAcknowledgements\nHow to Cite this Book\nThe Book\nChapter Introductions\nChapter Readings (Primary Source Texts)\nAccessibility Statement\nPressbo

In [4]:
import requests
from bs4 import BeautifulSoup
import json
from collections import defaultdict
from urllib.parse import urlparse

def is_academic_url(url):
    """Return True when ``url`` points at an academic (``.edu``) host.

    The check is made on the parsed hostname's dot-separated labels, not on
    a raw substring, so hosts that merely contain the letters ".edu"
    (for example ``www.education.com``) are not misclassified. A host counts
    as academic when it ends in ``.edu`` or in ``.edu.<cc>`` with a
    two-letter country code (for example ``.edu.au``).

    Anything that cannot be parsed as a URL with a hostname yields False.
    """
    try:
        # .hostname is already lower-cased and excludes userinfo and port.
        hostname = urlparse(url).hostname
        if not hostname:
            return False
        # Tolerate a fully-qualified name written with a trailing dot.
        labels = hostname.rstrip(".").split(".")
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs and non-string inputs are simply not academic.
        return False

    if len(labels) >= 2 and labels[-1] == "edu":
        return True
    return len(labels) >= 3 and labels[-2] == "edu" and len(labels[-1]) == 2

# Step 1: Query Hypothes.is API for up to 100 public annotations
# (a larger sample than before, since only academic sources are kept later).
api_url = "https://api.hypothes.is/api/search"
params = {
    "limit": 100
}

# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the cell.
response = requests.get(api_url, params=params, timeout=10)
if response.status_code != 200:
    # Raise rather than call exit(): inside a notebook exit() asks the kernel
    # to shut down instead of simply aborting this cell, which would discard
    # all notebook state. An exception stops the cell and keeps the kernel.
    raise RuntimeError(
        f"Error fetching annotations: {response.status_code} {response.text}"
    )

data = response.json()
# The search endpoint returns the matching annotations under "rows".
annotations = data.get("rows", [])

# Step 2: Group annotations by document source URL, but only for academic texts.
grouped_annotations = defaultdict(list)
for ann in annotations:
    targets = ann.get("target", [])
    if not targets:
        continue

    # Only the first target is considered when deciding where an annotation belongs.
    source = targets[0].get("source")
    if not source or not is_academic_url(source):
        continue

    grouped_annotations[source].append(ann)

def fetch_full_text(url, max_words=500):
    """Download ``url`` and return at most ``max_words`` words of its visible text.

    The page is parsed with BeautifulSoup, ``<script>`` and ``<style>``
    elements are removed, and the remaining text is re-joined with single
    spaces. Failures are never raised: a non-200 response or any exception
    is reported as an ``"Error..."`` string returned in place of the text.
    """
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return f"Error: Received status code {resp.status_code}"

        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop script/style nodes so their contents do not end up in the text.
        for node in soup(["script", "style"]):
            node.extract()

        # str.split() with no argument collapses whitespace runs and never
        # yields empty tokens, so no further filtering is required.
        tokens = soup.get_text(separator="\n").split()
        return " ".join(tokens[:max_words])
    except Exception as err:
        return f"Error fetching page: {err}"

def extract_selector_info(target):
    """Collect ``exact``, ``prefix`` and ``suffix`` from an annotation target.

    Only selectors whose ``type`` is ``"TextQuoteSelector"`` contribute. When
    more than one is present, later selectors overwrite earlier ones. A
    target with no such selector (or no ``selector`` key) yields ``{}``.
    """
    details = {}
    for sel in target.get("selector", []):
        if sel.get("type") != "TextQuoteSelector":
            continue
        details.update(
            exact=sel.get("exact"),
            prefix=sel.get("prefix"),
            suffix=sel.get("suffix"),
        )
    return details

# Step 3: Build the desired structure.
def _annotation_record(ann):
    """Flatten one annotation into the id/text/tags (+ quote selector) record."""
    record = {
        "annotation_id": ann.get("id"),
        "annotation_text": ann.get("text"),
        "tags": ann.get("tags", []),
    }
    # Selector details come from the first target only, when there is one.
    targets = ann.get("target", [])
    if targets:
        record.update(extract_selector_info(targets[0]))
    return record


result = []
for source, anns in grouped_annotations.items():
    page_text = fetch_full_text(source)  # Only up to 500 words will be returned.
    result.append(
        {
            "source": source,
            "full_text": page_text,
            "annotations": [_annotation_record(ann) for ann in anns],
        }
    )

# Step 4: Print the structured JSON output.
print(json.dumps(result, indent=2))

[
  {
    "source": "http://manipulation.csail.mit.edu/trajectories.html",
    "full_text": "Ch. 6 - Motion Planning Robotic Manipulation Perception, Planning, and Control Russ Tedrake \u00a9 Russ Tedrake, 2020-2024 Last modified . How to cite these notes, use annotations, and give feedback. Note: These are working notes used for a course being taught at MIT . They will be updated throughout the Fall 2024 semester. Previous Chapter Table of contents Next Chapter Motion Planning There are a few more essential skills that we need in our toolbox. In this chapter, we will explore some of the powerful methods of kinematic trajectory motion planning. I'm actually almost proud of making it this far into the notes without covering this topic yet. Writing a relatively simple script for the pose of the gripper, like we did in the bin picking chapter, really can solve a lot of interesting problems. But there are a number of reasons that we might want a more automated solution: When the environmen