In [26]:
import os
from time import sleep
from typing import Any, Dict, List, Optional

import requests

# The Canvas access token must be supplied through the environment; fail fast if it is absent or empty
canvas_api_key = os.environ.get("DINGYI_CANVAS_API_KEY")
if not canvas_api_key:
    raise ValueError("Canvas API key not found in environment variables")

# Settings shared by every Canvas API request
BASE_URL = "https://canvas.nus.edu.sg/api/v1"
HEADERS = {"Authorization": "Bearer " + canvas_api_key}
PAGE_SIZE = 100  # Items requested per page
RATE_LIMIT_DELAY = 0.1  # Seconds to pause between consecutive API calls to avoid rate limiting

def get_paginated_results(url: str, timeout: float = 30.0) -> Optional[List[Dict[Any, Any]]]:
    """
    Generic function to get paginated results from Canvas API.
    - Requests the first page explicitly, then follows the rel="next" URL from the
      Link response header until there is none (or a page comes back empty)
    - For announcements (using discussion_topics endpoint with only_announcements param), returns empty list on 404
    - Returns None on 403 permission denied to stop crawling that resource
    - On any other request error, prints the error and returns whatever was collected so far

    Args:
        url: Base API endpoint URL (may already contain query parameters)
        timeout: Seconds to wait for each HTTP request before giving up

    Returns:
        List of results from all pages; or None (indicating no permission)
    """
    results: List[Dict[Any, Any]] = []

    # Use "&" if URL already has query params, else use "?"
    separator = "&" if "?" in url else "?"
    next_url: Optional[str] = f"{url}{separator}page=1&per_page={PAGE_SIZE}"

    while next_url:
        current_url = next_url
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl
            response = requests.get(current_url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            page_results = response.json()
        except requests.exceptions.RequestException as e:
            error_response = getattr(e, "response", None)
            if error_response is not None:
                status = error_response.status_code
                # For announcements (discussion_topics?only_announcements=true), return empty list on 404
                if status == 404 and "discussion_topics" in url and "only_announcements=true" in url:
                    return []
                # Return None on 403 permission denied to stop crawling
                if status == 403:
                    print(f"Permission denied for URL: {current_url}")
                    return None
            print(f"Error fetching data from {current_url}: {str(e)}")
            break

        if not page_results:
            break
        if not isinstance(page_results, list):
            # Extending with a dict would silently append its keys instead of records
            print(f"Unexpected non-list response from {current_url}; stopping pagination")
            break

        results.extend(page_results)

        # Follow the server-provided "next" link rather than guessing page numbers
        next_url = response.links.get("next", {}).get("url")
        if next_url:
            # Only pause when another request is actually going to be made
            sleep(RATE_LIMIT_DELAY)

    return results

def get_course_data(course_id: int, timeout: float = 30.0) -> Optional[Dict[str, Any]]:
    """
    Get all relevant data for a specific course.
    The course details are fetched first with a single request; each list
    resource is then fetched through get_paginated_results.
    If any resource returns no permission (None), stop crawling this course.

    Args:
        course_id: Canvas course ID
        timeout: Seconds to wait for the course-details request before giving up

    Returns:
        Dictionary with key 'details' plus one key per resource
        ('files', 'assignments', 'announcements', 'users', 'quizzes');
        or None if the details request failed or a resource was not permitted
    """
    course_url = f"{BASE_URL}/courses/{course_id}"

    # Paginated list resources, in the order they are stored in the result
    resource_endpoints = {
        'files': f"{course_url}/files",
        'assignments': f"{course_url}/assignments",
        # Correct announcements endpoint: use discussion_topics with only_announcements flag
        'announcements': f"{course_url}/discussion_topics?only_announcements=true",
        'users': f"{course_url}/users",
        'quizzes': f"{course_url}/quizzes",
    }

    # Get course details (a single object, so no pagination is involved)
    try:
        response = requests.get(course_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        details = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching course details for course {course_id}: {str(e)}")
        return None

    course_data: Dict[str, Any] = {'details': details}

    # Get other course resources
    for resource, url in resource_endpoints.items():
        data = get_paginated_results(url)
        # If None returned, no permission for this resource, stop crawling course
        if data is None:
            print(f"Permission denied for resource '{resource}' in course {course_id}. Stopping crawl for this course.")
            return None
        course_data[resource] = data

    return course_data

# Get all courses. get_paginated_results returns None when the listing itself is
# forbidden (403); fall back to an empty list so the loop below does not raise TypeError.
courses = get_paginated_results(f"{BASE_URL}/courses")
if courses is None:
    print("Permission denied when listing courses; nothing to process.")
    courses = []

# Process each course
for course in courses:
    course_id = course['id']
    course_data = get_course_data(course_id)

    # None means the course (or one of its resources) could not be read; skip it
    if not course_data:
        continue

    # Prefer the name from the course listing, then the details payload; a missing
    # or null name in both falls back to 'Unknown'
    course_name = course.get('name') or course_data['details'].get('name') or 'Unknown'

    print(f"\nCourse: {course_name}")
    print(f"ID: {course_id}")
    print("Resource counts:")
    for resource, data in course_data.items():
        if resource != 'details':
            print(f"- {resource.capitalize()}: {len(data)}")
    print("-" * 40)



Course: [PLP] Text Analytics (2025-02-10)
ID: 75454
Resource counts:
- Files: 30
- Assignments: 6
- Announcements: 5
- Users: 65
- Quizzes: 3
----------------------------------------

Course: EBA5004 Practical Language Processing [2420]
ID: 69955
Resource counts:
- Files: 59
- Assignments: 8
- Announcements: 3
- Users: 83
- Quizzes: 1
----------------------------------------

Course: IS06 MTech Internship
ID: 68113
Resource counts:
- Files: 3
- Assignments: 2
- Announcements: 8
- Users: 79
- Quizzes: 1
----------------------------------------
Error fetching course details for course 63072: 403 Client Error: Forbidden for url: https://canvas.nus.edu.sg/api/v1/courses/63072
Error fetching course details for course 63080: 403 Client Error: Forbidden for url: https://canvas.nus.edu.sg/api/v1/courses/63080
Error fetching data from https://canvas.nus.edu.sg/api/v1/courses/74913/quizzes?page=1&per_page=100: 404 Client Error: Not Found for url: https://canvas.nus.edu.sg/api/v1/courses/74913/q