In [1]:
import os
from datetime import datetime
from time import sleep
from typing import Any, Dict, List, Optional

import requests
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange





In [2]:
# Open a client against the local MongoDB server and select the application database
mongo_client = MongoClient(host="mongodb://localhost:27017/")
db = mongo_client.get_database("canvas_qa_system")

In [3]:
# One MongoDB collection per kind of record kept by the system
course_collection = db.get_collection("courses")
file_collection = db.get_collection("files")
assignment_collection = db.get_collection("assignments")
announcement_collection = db.get_collection("announcements")
query_log_collection = db.get_collection("query_logs")

In [4]:
# Initialize ChromaDB for vector storage
# CHROMA_PATH is handed to Chroma as its persist_directory.
CHROMA_PATH = "./chroma_db"
model_name = 'Snowflake/snowflake-arctic-embed-l-v2.0'
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook, while HuggingFaceEmbeddings below is given the same model_name.
# Presumably this loads the same weights a second time - confirm and drop if unused.
model = SentenceTransformer(model_name)
# Embedding function used by the vector store.
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    # NOTE(review): encode_kwargs is presumably applied to every encode call,
    # i.e. to stored documents as well as to queries - confirm that using the
    # 'query' prompt for documents is intended.
    encode_kwargs={'prompt_name': 'query'} # Add prompt_name for queries
)
# Vector store handle shared by the storage and retrieval functions.
vector_db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)

  embeddings = HuggingFaceEmbeddings(
  vector_db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)


In [5]:
# API Configuration
BASE_URL = "https://canvas.nus.edu.sg/api/v1"

# Read the token once. os.getenv returns None when the variable is unset, which
# would otherwise be formatted silently into the header as "Bearer None".
CANVAS_API_KEY = os.getenv('DINGYI_CANVAS_API_KEY')
if not CANVAS_API_KEY:
    print("Warning: environment variable DINGYI_CANVAS_API_KEY is not set; "
          "Canvas API requests are unlikely to authenticate.")

HEADERS = {"Authorization": f"Bearer {CANVAS_API_KEY}"}
PAGE_SIZE = 100          # Items requested per page (sent as per_page)
RATE_LIMIT_DELAY = 0.1   # Delay between API calls to avoid rate limiting

In [6]:

def get_paginated_results(url: str, timeout: float = 30.0) -> Optional[List[Dict[Any, Any]]]:
    """
    Fetch every page of a Canvas API list endpoint and return the combined items.

    Pages are requested with ``page`` / ``per_page`` query parameters, starting
    at page 1, until a page comes back empty.

    Special cases:
    - 404 on an announcements URL (``discussion_topics`` with
      ``only_announcements=true``) returns an empty list.
    - 403 returns None so the caller can stop crawling that resource.
    - Any other request error (including a timeout or an undecodable body) is
      printed and pagination stops; items collected from earlier pages are
      still returned (best effort).
    - A non-list JSON payload is returned as a single-item list and pagination
      stops, since such a payload cannot be paged through.

    Args:
        url: Base API endpoint URL (may already contain a query string)
        timeout: Seconds to wait for each HTTP request before giving up

    Returns:
        List of results from all pages; or None (indicating no permission)
    """
    results = []
    page = 1
    # The base URL never changes, so the separator can be chosen once.
    separator = "&" if "?" in url else "?"

    while True:
        paginated_url = f"{url}{separator}page={page}&per_page={PAGE_SIZE}"
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(paginated_url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
            page_results = response.json()
        except requests.exceptions.RequestException as e:
            error_response = getattr(e, 'response', None)
            if error_response is not None:
                status = error_response.status_code
                if status == 404 and "discussion_topics" in url and "only_announcements=true" in url:
                    return []
                if status == 403:
                    print(f"Permission denied for URL: {paginated_url}")
                    return None
            print(f"Error fetching data from {paginated_url}: {str(e)}")
            break
        except ValueError as e:
            # Body was not valid JSON (older requests versions raise a plain ValueError).
            print(f"Error fetching data from {paginated_url}: {str(e)}")
            break

        if not page_results:
            break

        if not isinstance(page_results, list):
            # extend() on a dict would add its keys, and an unpaginated endpoint
            # would return the same object for every page number.
            results.append(page_results)
            break

        results.extend(page_results)
        page += 1
        sleep(RATE_LIMIT_DELAY)

    return results

In [7]:
def get_course_data(course_id: int, timeout: float = 30.0) -> Optional[Dict[str, Any]]:
    """
    Get all relevant data for a specific course.

    The course details are fetched with a single request; files, assignments,
    announcements, users and quizzes are fetched through get_paginated_results.
    If the details request fails, or any resource reports no permission (None),
    crawling of this course stops and None is returned.

    Args:
        course_id: Canvas course ID
        timeout: Seconds to wait for the course-details request before giving up

    Returns:
        Dictionary with the keys 'details', 'files', 'assignments',
        'announcements', 'users' and 'quizzes'; or None (details unavailable or
        permission denied for a resource)
    """
    course_url = f"{BASE_URL}/courses/{course_id}"

    # Paginated sub-resources only; the details endpoint is handled separately below.
    resource_endpoints = {
        'files': f"{course_url}/files",
        'assignments': f"{course_url}/assignments",
        'announcements': f"{course_url}/discussion_topics?only_announcements=true",
        'users': f"{course_url}/users",
        'quizzes': f"{course_url}/quizzes"
    }

    # Get course details
    try:
        # A timeout keeps one stalled connection from blocking the whole crawl.
        response = requests.get(course_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        course_data = {'details': response.json()}
    except requests.exceptions.RequestException as e:
        print(f"Error fetching course details for course {course_id}: {str(e)}")
        return None

    # Get other course resources
    for resource, url in resource_endpoints.items():
        data = get_paginated_results(url)
        if data is None:
            print(f"Permission denied for resource '{resource}' in course {course_id}. Stopping crawl for this course.")
            return None
        course_data[resource] = data

    return course_data

In [8]:
def store_course_data(course_data: Dict[str, Any]) -> str:
    """
    Store course metadata in MongoDB and content chunks in ChromaDB.

    A course document is inserted first. Then, for each file, assignment and
    announcement that has text content, the text is split into chunks, the
    chunks are added to the vector store, and a record linking the item to its
    chunk IDs is inserted into the matching MongoDB collection. Items without
    text content are skipped.

    Note: every call inserts a new course document; calling this twice for the
    same course stores it twice.

    Args:
        course_data: Dictionary containing course details under 'details' and,
            optionally, lists under 'files', 'assignments' and 'announcements'

    Returns:
        course_id: MongoDB ID of stored course, as a string
    """
    details = course_data["details"]

    # Store course details in MongoDB
    course_id = course_collection.insert_one({
        "course_name": details["name"],
        "canvas_id": details["id"],
        "stored_at": datetime.now()
    }).inserted_id

    # Explicit mapping instead of a globals() lookup on a name derived by
    # slicing the resource type string.
    collections_by_resource = {
        "files": file_collection,
        "assignments": assignment_collection,
        "announcements": announcement_collection,
    }

    # The splitter is stateless across items, so one instance serves all resources.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )

    for resource_type, collection in collections_by_resource.items():
        # A missing key or a None value both mean "nothing to store".
        for item in course_data.get(resource_type) or []:
            # Extract content based on resource type. 'message' is tried last so
            # items that already matched an earlier key behave as before; per the
            # Canvas API object definitions, discussion topics (announcements)
            # carry their text in 'message'.
            content = (
                item.get("description")
                or item.get("body")
                or item.get("content")
                or item.get("message")
                or ""
            )

            # Skip if content is empty (or not text at all)
            if not isinstance(content, str) or not content.strip():
                continue

            # Per the Canvas API object definitions, assignments use 'name' and
            # files use 'display_name' rather than 'title'.
            title = item.get("title") or item.get("name") or item.get("display_name") or ""
            canvas_id = item.get("id", "")

            chunks = text_splitter.create_documents(
                texts=[content],
                metadatas=[{
                    # Stored as a string; convert back before matching a course _id.
                    "course_id": str(course_id),
                    "resource_type": resource_type,
                    "title": title,
                    "canvas_id": canvas_id
                }]
            )

            # Only process if we have chunks
            if not chunks:
                continue

            # Store chunks in ChromaDB
            chunk_ids = vector_db.add_documents(chunks)

            # Store metadata in MongoDB
            collection.insert_one({
                "course_id": course_id,
                "canvas_id": canvas_id,
                "title": title,
                "chunk_ids": chunk_ids,
                "stored_at": datetime.now()
            })

    return str(course_id)

In [9]:

def query_knowledge_base(query: str, top_k: int = 3) -> List[Dict]:
    """
    Search stored course content by vector similarity and attach course info.

    Each hit from the vector store is enriched with the name of the course it
    belongs to, looked up in MongoDB through the ``course_id`` kept in the
    chunk metadata. The query and its results are then written to the query
    log collection.

    Args:
        query: User query string
        top_k: Number of top results to return

    Returns:
        results: List of dicts with 'content', 'score' and 'metadata'
            ('course_name', 'resource_type', 'title'). 'course_name' is None
            when the owning course document cannot be found.
    """
    # bson is distributed with pymongo, which this notebook already depends on.
    from bson import ObjectId
    from bson.errors import InvalidId

    # Vector similarity search (MicroRAG)
    vector_results = vector_db.similarity_search_with_score(query, k=top_k)

    results = []
    for doc, score in vector_results:
        # Get associated metadata from MongoDB (MacroRAG)
        metadata = doc.metadata or {}
        raw_course_id = metadata.get("course_id")

        course = None
        if raw_course_id is not None:
            try:
                # Chunk metadata stores the course id as a string, while the
                # course document's _id is an ObjectId; querying with the raw
                # string never matches.
                course = course_collection.find_one({"_id": ObjectId(raw_course_id)})
            except (InvalidId, TypeError):
                # Not an ObjectId-compatible value; fall back to a literal match.
                course = course_collection.find_one({"_id": raw_course_id})

        results.append({
            "content": doc.page_content,
            # Plain float so the result can be encoded when logged to MongoDB.
            "score": float(score),
            "metadata": {
                "course_name": course.get("course_name") if course else None,
                "resource_type": metadata.get("resource_type"),
                "title": metadata.get("title")
            }
        })

    # Log the query
    query_log_collection.insert_one({
        "query": query,
        "results": results,
        "timestamp": datetime.now()
    })

    return results

In [16]:
def get_all_available_courses() -> List[Dict]:
    """
    List the courses visible to the current user, keeping only those whose
    ``workflow_state`` is ``'available'``.

    Returns an empty list (after printing a hint) when the course listing
    could not be retrieved.
    """
    fetched = get_paginated_results(f"{BASE_URL}/courses")

    # None signals that the listing itself was refused
    if fetched is None:
        print("Failed to get courses. Please check API key and permissions.")
        return []

    def is_available(entry):
        return entry.get('workflow_state') == 'available'

    # Only keep active courses
    return list(filter(is_available, fetched))

In [17]:
# Crawl every accessible course and persist whatever could be fetched
print("Getting available courses...")
available_courses = get_all_available_courses()
all_courses_data = {}

if available_courses:
    print(f"Found {len(available_courses)} available courses")

    # Fetch each course in turn; only successfully fetched courses are stored
    for course in available_courses:
        course_id = course['id']
        print(f"\nProcessing course: {course['name']} (ID: {course_id})")

        course_data = get_course_data(course_id)
        if not course_data:
            print(f"Failed to get data for course {course_id}")
            continue

        all_courses_data[course_id] = course_data
        stored_id = store_course_data(course_data)
        print(f"Course {course_id} stored with database ID: {stored_id}")
else:
    print("No courses found or unable to access courses.")

# Summarise what was fetched and stored during this run
print(f"\nTotal courses stored: {len(all_courses_data)}")
for course_id, data in all_courses_data.items():
    print(f"Course {course_id}: {data['details']['name']}")

Getting available courses...
Found 10 available courses

Processing course: [PLP] Text Analytics (2025-02-10) (ID: 75454)
Course 75454 stored with database ID: 67d53418ddc8ff85535092ba

Processing course: EBA5004 Practical Language Processing [2420] (ID: 69955)
Course 69955 stored with database ID: 67d5341dddc8ff85535092bf

Processing course: IS06 MTech Internship (ID: 68113)
Course 68113 stored with database ID: 67d53423ddc8ff85535092cb

Processing course: ISY5004 ITSS GC Practice Module (Jan-May 2025) (ID: 74913)
Error fetching data from https://canvas.nus.edu.sg/api/v1/courses/74913/quizzes?page=1&per_page=100: 404 Client Error: Not Found for url: https://canvas.nus.edu.sg/api/v1/courses/74913/quizzes?page=1&per_page=100
Course 74913 stored with database ID: 67d53428ddc8ff85535092ce

Processing course: MTech in EBAC/IS/SE (Thru-train) (ID: 27447)
Course 27447 stored with database ID: 67d53438ddc8ff85535092d0

Processing course: RC1000A A Culture of Respect and Consent (Student) (ID:

In [11]:
# Sanity check: list stored courses, preview stored chunks, report the chunk count
print("\nStored Course Data:")
stored_courses = course_collection.find()
for course in stored_courses:
    print(f"\nCourse ID: {course['_id']}")
    print(f"Course Name: {course['course_name']}")

print("\nStored Document Chunks:")
stored_chunks = vector_db.get()["documents"]
for doc in stored_chunks:
    # Show only the first 200 characters of each chunk
    chunk_preview = doc[:200]
    print(f"\nDocument Content: {chunk_preview}...")

print("\nVector Store Stats:")
total_documents = vector_db._collection.count()
print(f"Total documents: {total_documents}")



Stored Course Data:

Course ID: 67d53151ddc8ff855350929f
Course Name: [PLP] Text Analytics (2025-02-10)

Stored Document Chunks:

Document Content: <p>This quiz is part of your Mini Project. Explore the accident reports data given in osha.txt, and answer the questions here.</p><script src="https://instructure-uploads-apse1.s3.ap-southeast-1.amazo...

Document Content: <p>Please use the training dataset <a id="5746888" class="instructure_file_link inline_disabled" title="Link" href="https://canvas.nus.edu.sg/courses/75454/files/5746888?verifier=zQYjUUhRE3yQa7rh9cgLt...

Document Content: and fine-tune your classifier with any step necessary.</p>...

Document Content: <p>Then test your classifier on top of <a id="5746884" class="instructure_file_link inline_disabled" title="Link" href="https://canvas.nus.edu.sg/courses/75454/files/5746884?verifier=t2RHd0GWpNF7tk67H...

Document Content: <p>Also recommend to try out API with LLM based classifiers to compare the result.&nbsp;</p>
<p>&nbsp

In [None]:
    
# Smoke-test the retrieval pipeline with a sample question
results = query_knowledge_base("When is the project submission deadline?")
for result in results:
    print(f"\nScore: {result['score']}")
    hit_metadata = result['metadata']
    print(f"Course: {hit_metadata['course_name']}")
    # Show only the first 200 characters of the matched chunk
    content_preview = result['content'][:200]
    print(f"Content: {content_preview}...")