In [None]:
"""
This script retrieves and structures annotation data from Hypothes.is, an open-source annotation platform widely utilized in educational contexts, 
where students collaboratively annotate and comment on digital texts such as articles, academic papers, and online resources. 
Hypothes.is provides a valuable dataset for analyzing student interactions, highlighting meaningful excerpts, and capturing individual insights directly associated with specific text fragments (Hypothes.is, n.d.). 

Reference:
Hypothes.is. (n.d.). Hypothes.is API documentation. Retrieved March 25, 2025, from https://h.readthedocs.io/en/latest/api/
"""

import requests
import json
from collections import defaultdict

# Search endpoint of the Hypothes.is API; this is the URL fetch_annotations sends its GET request to.
HYPOTHESIS_API_URL = "https://api.hypothes.is/api/search"

# Fetch annotations from Hypothes.is API
def fetch_annotations(params=None, timeout=30):
    """Fetch annotations from the Hypothes.is search endpoint.

    Args:
        params: Optional dict of query parameters forwarded unchanged to the
            search request (for example ``{"limit": 100}``).
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this a stalled
            connection would block the script forever.

    Returns:
        The value stored under the ``rows`` key of the decoded JSON response.

    Raises:
        requests.HTTPError: If the server responds with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If the decoded response has no ``rows`` key.
    """
    response = requests.get(HYPOTHESIS_API_URL, params=params, timeout=timeout)
    # Surface HTTP errors (4xx/5xx) instead of trying to parse an error body
    response.raise_for_status()
    return response.json()['rows']

# Organize annotations grouped by user and document (only docs with titles, no tags)
def _extract_title(ann):
    """Return the stripped first document title of an annotation, or '' if none."""
    document = ann.get('document') or {}
    titles = document.get('title') or []
    # Tolerate a bare string title so it is not mistaken for a list of characters
    if isinstance(titles, str):
        titles = [titles]
    first = titles[0] if titles else ''
    return first.strip() if isinstance(first, str) else ''


def _extract_username(raw_user):
    """Return the bare username from an ``acct:name@domain`` style identifier."""
    if not raw_user or not isinstance(raw_user, str):
        return 'unknown_user'
    parts = raw_user.split(':')
    # Use the segment after the first colon when a prefix exists; otherwise
    # fall back to the whole value instead of raising IndexError
    account = parts[1] if len(parts) > 1 else parts[0]
    return account.split('@')[0]


def _apply_selectors(ann, annotation):
    """Copy quote and position selector data from the first target into ``annotation``."""
    target = ann.get('target') or []
    if not target:
        return
    for selector in target[0].get('selector') or []:
        # Selectors without a 'type' key are ignored rather than raising KeyError
        selector_type = selector.get('type')
        if selector_type == 'TextQuoteSelector':
            annotation["highlighted_text"] = selector.get('exact', '')
            annotation["context"] = {
                "preceding_text": selector.get('prefix', ''),
                "following_text": selector.get('suffix', '')
            }
        elif selector_type == 'TextPositionSelector':
            annotation["position"] = {
                "start_char": selector.get('start'),
                "end_char": selector.get('end')
            }


def organize_annotations(annotations):
    """Group annotations by user and then by document URI.

    Annotations whose document has no non-blank title are skipped.

    Args:
        annotations: Iterable of annotation dicts. The keys read are
            ``document`` (with a ``title`` list), ``user``, ``uri``, ``id``,
            ``text``, ``created`` and ``target`` (whose first element may
            carry a ``selector`` list).

    Returns:
        A nested ``defaultdict`` mapping ``user -> document URI -> entry``,
        where each entry is ``{"title": str, "annotations": [dict, ...]}``.
        Each annotation dict has the keys ``id``, ``highlighted_text``,
        ``comment``, ``created``, ``position`` and ``context``. Annotations
        without a usable ``user`` value are grouped under ``"unknown_user"``;
        those without a ``uri`` are grouped under ``"unknown_document"``.
    """
    data = defaultdict(lambda: defaultdict(lambda: {"title": "", "annotations": []}))

    for ann in annotations:
        doc_title = _extract_title(ann)
        if not doc_title:
            continue  # skip annotations with no document title

        user = _extract_username(ann.get('user'))

        # Document URI as unique identifier within a user's entries
        doc_uri = ann.get('uri', 'unknown_document')

        annotation = {
            "id": ann.get('id'),
            "highlighted_text": "",
            "comment": ann.get('text', ''),
            "created": ann.get('created'),
            "position": {},
            "context": {}
        }
        _apply_selectors(ann, annotation)

        doc_entry = data[user][doc_uri]
        doc_entry["title"] = doc_title
        doc_entry["annotations"].append(annotation)

    return data

# Main execution
if __name__ == "__main__":
    # Query parameters sent to the search endpoint; extend as needed (e.g., user, limit)
    query = {"limit": 100}

    # Download the raw annotations and group them in one step
    grouped = organize_annotations(fetch_annotations(params=query))

    # Pretty-print the grouped result as JSON, leaving non-ASCII characters unescaped
    pretty = json.dumps(grouped, indent=2, ensure_ascii=False)
    print(pretty)

{
  "TylerRick": {
    "https://en.wikipedia.org/wiki/Slowly_changing_dimension": {
      "title": "Slowly changing dimension - Wikipedia",
      "annotations": [
        {
          "id": "OfQE3gnBEfCOJm9pacqNJQ",
          "highlighted_text": "",
          "comment": "",
          "created": "2025-03-25T21:36:35.367413+00:00",
          "position": {},
          "context": {}
        }
      ]
    },
    "https://soylu.org/row-versions-in-postgresql/": {
      "title": "Row Versions in PostgreSQL-3 – DBA Notes",
      "annotations": [
        {
          "id": "XDECggm2EfC4Zy_4V_wQqQ",
          "highlighted_text": "",
          "comment": "This article is missing context!\n_How_ is this achieved??? Stock postgresql? An extension?",
          "created": "2025-03-25T20:18:48.341637+00:00",
          "position": {},
          "context": {}
        }
      ]
    },
    "https://clarkdave.net/2015/02/historical-records-with-postgresql-and-temporal-tables-and-sql-2011/": {
      "title": 