In [56]:
import os
import re
import random
import json
import math
from pathlib import Path
from typing import List, Dict


# Configuration (based on the example Canvas export given).
#
# Each top-level folder of the export maps to the keywords that must appear
# in the names of the files it holds; an empty list means only the folder
# itself is checked.
_EXPECTED_FOLDERS = {
    "1.0 Unit Panel Documentation": [
        "Assessment Moderation",
        "Unit Moderation",
        "QEA Report",
    ],
    "2.0 Unit Outline": ["Unit Outline"],
    "3.0 Learning Materials": ["Lecture"],
    "4.0 Tutorial Materials": ["Tutorial"],
    "5.0 Assessments and Marking Schemes": ["Assignment", "Rubric", "Test"],
    "6.0 Samples of Students' Work": [],
    "7.0 Outcome Based Education Report": ["OBE"],
    "8.0 Safety Acknowledgement": [],
    "9.0 Final Exam Script": [],
}

# Inclusive (low, high) grade range per performance band.
# Configurable for band or grade (set to performance band for the PoC).
_PERFORMANCE_BANDS = {
    "Good": (36, 50),
    "Average": (26, 35),
    "Poor": (0, 25),
}

# Naming pattern for student work (Assignment/Test folders):
# <unit code>_<Assignment|Test|Tutorial><number>_<8-digit id>.<pdf|docx>
_STUDENT_WORK_PATTERN = (
    r"^(COS\d{5})_(Assignment|Test|Tutorial)\d+_(\d{8})\.(pdf|docx)$"
)

CONFIG = {
    "expected_folders": _EXPECTED_FOLDERS,
    # Percentage of each filtered subset to sample (original note: 20% minimum).
    "sampling_percentage": 50,
    "performance_bands": _PERFORMANCE_BANDS,
    "student_work_pattern": _STUDENT_WORK_PATTERN,
}

# File Ingestion & Folder Validation
import os
import re
from pathlib import Path
from typing import Dict, List

class CanvasValidator:
    def __init__(self, base_path: str, config: Dict):
        self.base_path = Path(base_path)
        self.config = config
        self.expected_folders = config["expected_folders"]
        self.student_pattern = re.compile(config["student_work_pattern"])
        self.samples_folder = self._find_folder("6.0 Samples of Students' Work")

    def list_all_files(self) -> List[Path]:
        # List all files in the export recursively
        return list(self.base_path.rglob("*.*"))

    def validate_main_structure(self) -> Dict[str, bool]:
        # Check that all expected top-level folders exist
        results = {}
        for expected in self.expected_folders.keys():
            found = any(expected.split(" ", 1)[1].lower() in f.name.lower()
                        for f in self.base_path.iterdir() if f.is_dir())
            results[expected] = found
        return results

    def validate_folder_contents(self) -> Dict[str, Dict[str, List[str]]]:
        # Check that each folder contains required file types (partial name matching)
        results = {}
        for folder, keywords in self.expected_folders.items():
            folder_path = self._find_folder(folder)
            if not folder_path:
                results[folder] = {"missing_folder": True, "missing_files": []}
                continue

            existing_files = [f.name for f in folder_path.glob("*.*")]
            missing = []
            for keyword in keywords:
                if not any(keyword.lower() in f.lower() for f in existing_files):
                    missing.append(keyword)

            results[folder] = {"missing_folder": False, "missing_files": missing}
        return results

    def _find_folder(self, expected_name: str) -> Path | None:
        # Finds folder regardless of numeric prefix or naming variation
        for f in self.base_path.iterdir():
            if f.is_dir() and expected_name.split(" ", 1)[1].lower() in f.name.lower():
                return f
        return None

    # 🧩 Integrated student file validation
    def validate_student_files(self) -> Dict[str, Dict[str, List[str]]]:
    # Validates student sample files inside '6.0 Samples of Students' Work'.
    # Supports nested subfolders such as 'Assignment 1', 'Assignment 2', 'Test', etc.
    # Allows extra text after valid filename patterns (e.g., 'Good', 'Poor').
    
        validation_result = {
            "Assignment": {"valid": [], "invalid": []},
            "Test": {"valid": [], "invalid": []},
            "Tutorial": {"valid": [], "invalid": []},
            "Lab": {"valid": [], "invalid": []},
            "Samples": {"valid": [], "invalid": []},
        }
    
        if not self.samples_folder or not self.samples_folder.exists():
            validation_result["Samples"]["invalid"].append("Missing 'Samples of Students' Work' folder")
            return validation_result
    
        # ✅ Relaxed pattern: allows trailing text after student ID before .pdf
        valid_pattern = re.compile(
            r"^COS\d{5}_(Assignment\d+|Test\d*|Tutorial\d*|Lab\d*)_\d{8}(?:\s.*)?\.pdf$",
            re.IGNORECASE
        )
    
        for root, _, files in os.walk(self.samples_folder):
            for file in files:
                if not file.lower().endswith(".pdf"):
                    continue
    
                full_path = os.path.join(root, file)
                relative_path = os.path.relpath(full_path, self.samples_folder)
    
                # Identify category based on folder name
                folder_name = os.path.basename(os.path.dirname(full_path)).lower()
                category = "Samples"
                if "assignment" in folder_name:
                    category = "Assignment"
                elif "test" in folder_name:
                    category = "Test"
                elif "tutorial" in folder_name:
                    category = "Tutorial"
                elif "lab" in folder_name:
                    category = "Lab"
    
                # ✅ Accepts files even if they have extra text after valid ID
                if valid_pattern.match(file):
                    validation_result[category]["valid"].append(relative_path)
                else:
                    validation_result[category]["invalid"].append(relative_path)
    
        return validation_result

# Assignment Sampling Module
class AssignmentSampler:
    """Draw a random, percentage-sized sample of assignments per performance band."""

    def __init__(self, assignments: List[Dict], config: Dict):
        """Keep the assignment records and read the sampling settings.

        Args:
            assignments: Records that each carry at least a numeric ``"grade"``.
            config: Mapping providing ``sampling_percentage`` and
                ``performance_bands`` (``{level: (low, high)}``).
        """
        self.assignments = assignments
        self.percentage = config["sampling_percentage"]
        self.performance_bands = config["performance_bands"]

    def filter_by_performance(self, level: str) -> List[Dict]:
        """Return the assignments whose grade lies in the band for ``level``.

        Both bounds are inclusive. Raises ``KeyError`` for an unknown level.
        """
        # NOTE(review): with integer band bounds, a fractional grade that lies
        # between two bands (e.g. 35.5 for 26-35 / 36-50) matches no band at
        # all - confirm whether grades are always whole numbers.
        low, high = self.performance_bands[level]
        return [a for a in self.assignments if low <= a["grade"] <= high]

    def random_sample(self, subset: List[Dict]) -> List[Dict]:
        """Return a random sample of ``subset`` sized by the configured percentage.

        The size is ``ceil(len(subset) * percentage / 100)``, kept between 1
        and ``len(subset)`` for a non-empty subset; an empty subset yields an
        empty list.
        """
        if not subset:
            return []

        # Multiply before dividing: len * (percentage / 100) accumulates float
        # error (100 * (7 / 100) == 7.000000000000001), which ceil() would
        # turn into one sample too many.
        wanted = math.ceil(len(subset) * self.percentage / 100)

        # At least one item, never more than available; this also keeps a
        # zero or negative percentage from producing an invalid sample size.
        sample_size = min(len(subset), max(1, wanted))
        return random.sample(subset, sample_size)

    def generate_sample(self, level: str = "Good") -> List[Dict]:
        """Filter the assignments by performance ``level`` and sample the result."""
        subset = self.filter_by_performance(level)
        return self.random_sample(subset)



# Example Usage (Proof of Concept)
# Runs the three validations against a sample export, builds the assignment
# list from the valid sample filenames, then prints a random sample for each
# performance band. Kept as flat statements so the variables stay available
# at module level.
if __name__ == "__main__":
    base_path = "./COS12345 Course Name Digital Unit File/COS12345 Course Name"
    validator = CanvasValidator(base_path, CONFIG)

    print("STRUCTURE VALIDATION")
    print(json.dumps(validator.validate_main_structure(), indent=2))

    print("\nFOLDER CONTENT VALIDATION")
    print(json.dumps(validator.validate_folder_contents(), indent=2))

    print("\nSTUDENT FILE VALIDATION")
    student_validation = validator.validate_student_files()
    print(json.dumps(student_validation, indent=2))

    # Build the assignment list dynamically from the valid files
    assignments_data = []
    assignment_files = student_validation["Assignment"]["valid"]

    # Expected shape (after an optional directory prefix):
    #   COS<5 digits>_Assignment<n>_<8-digit id> <grade> <label>.pdf
    # The grade group only accepts forms float() can parse ("45", "45.5",
    # "45.", ".5"); names such as "1.2.3" are reported as unmatched below
    # instead of crashing the float() conversion.
    assignment_file_pattern = re.compile(
        r"^(?:.*[/\\])?COS(\d{5})_(Assignment\d+)_([0-9]{8})\s(\d+(?:\.\d*)?|\.\d+)\s([A-Za-z]+)\.pdf$",
        re.IGNORECASE,
    )

    # Extract info (course code, assignment number, student ID, grade, label) from filename
    for file_path in assignment_files:
        match = assignment_file_pattern.match(file_path)
        if match:
            course_code = match.group(1)
            assignment_no = match.group(2)
            student_id = match.group(3)
            grade = float(match.group(4))
            label = match.group(5).capitalize()

            assignments_data.append({
                "course_code": course_code,
                "assignment_no": assignment_no,
                "student_id": student_id,
                "grade": grade,
                "performance": label,
                "file": file_path
            })
        else:
            print(f"⚠️ Skipped unmatched file: {file_path}")

    # Initialize sampler with real extracted data
    sampler = AssignmentSampler(assignments_data, CONFIG)

    print("\nGENERATED ASSIGNMENT LIST (EXTRACTED FROM FILENAMES)")
    print(json.dumps(assignments_data, indent=2))

    print("\nSAMPLED 'GOOD' PERFORMANCE STUDENTS")
    print(json.dumps(sampler.generate_sample("Good"), indent=2))

    print("\nSAMPLED 'AVERAGE' PERFORMANCE STUDENTS")
    print(json.dumps(sampler.generate_sample("Average"), indent=2))

    print("\nSAMPLED 'POOR' PERFORMANCE STUDENTS")
    print(json.dumps(sampler.generate_sample("Poor"), indent=2))


STRUCTURE VALIDATION
{
  "1.0 Unit Panel Documentation": true,
  "2.0 Unit Outline": true,
  "3.0 Learning Materials": true,
  "4.0 Tutorial Materials": true,
  "5.0 Assessments and Marking Schemes": true,
  "6.0 Samples of Students' Work": true,
  "7.0 Outcome Based Education Report": true,
  "8.0 Safety Acknowledgement": false,
  "9.0 Final Exam Script": true
}

FOLDER CONTENT VALIDATION
{
  "1.0 Unit Panel Documentation": {
    "missing_folder": false,
    "missing_files": []
  },
  "2.0 Unit Outline": {
    "missing_folder": false,
    "missing_files": []
  },
  "3.0 Learning Materials": {
    "missing_folder": false,
    "missing_files": [
      "Lecture"
    ]
  },
  "4.0 Tutorial Materials": {
    "missing_folder": false,
    "missing_files": [
      "Tutorial"
    ]
  },
  "5.0 Assessments and Marking Schemes": {
    "missing_folder": false,
    "missing_files": []
  },
  "6.0 Samples of Students' Work": {
    "missing_folder": false,
    "missing_files": []
  },
  "7.0 Outcome