# Main Feedback Generation Notebook
This notebook handles preprocessing, model interaction, and feedback generation.

**--Set up: GitHub, Paths, Imports**

In [None]:
# Optional: mount Google Drive (Colab will prompt you to authorize your account).
# from google.colab import drive
# drive.mount('/content/drive')

# Start from the Colab root directory so the repo is not cloned inside an existing checkout
%cd /content

# Clone the GitHub repo (replace with your actual repo URL), then move into it
!git clone https://github.com/ML-name/project.git
%cd project

# List all local and remote-tracking branches (optional, for checking)
!git branch -a

/content
Cloning into 'project'...
remote: Enumerating objects: 704, done.[K
remote: Counting objects: 100% (274/274), done.[K
remote: Compressing objects: 100% (215/215), done.[K
remote: Total 704 (delta 156), reused 99 (delta 59), pack-reused 430 (from 1)[K
Receiving objects: 100% (704/704), 1.45 MiB | 7.72 MiB/s, done.
Resolving deltas: 100% (346/346), done.
/content/project
* [32mmain[m
  [31mremotes/origin/16-steps-for-phase-1[m
  [31mremotes/origin/HEAD[m -> origin/main
  [31mremotes/origin/feat/create-gradio-ui[m
  [31mremotes/origin/feat/preprocess-the-data-into-a-normalized-format[m
  [31mremotes/origin/fix/imports-and-file-paths[m
  [31mremotes/origin/main[m
  [31mremotes/origin/prompting/Deepseek[m
  [31mremotes/origin/prompting/Gemini[m
  [31mremotes/origin/refactor/docs-and-APIs[m
  [31mremotes/origin/refactor/py-and-output-documents[m
  [31mremotes/origin/scraping/inline_feedback[m
  [31mremotes/origin/scraping/rubric_table[m
  [31mremotes/o

In [None]:
# Check out YOUR branch as a new local branch created from its origin/ counterpart
# (!!replace "scraping/all-together" in both places with your own branch name!!)
!git checkout -b scraping/all-together origin/scraping/all-together

fatal: 'origin/comments' is not a commit and a branch 'scraping/inline_feedback' cannot be created from it


In [None]:
# Standard-library modules needed for the path setup below
import os
import sys

# Make the cloned repo importable from this notebook
sys.path.append('/content/project/')

# Base data directory, followed by its raw and processed subfolders
data_base = '/content/project/data'
raw_data_path, processed_data_path = (
    os.path.join(data_base, subfolder) for subfolder in ('raw', 'processed')
)

**--Install required libraries**

In [5]:
%pip install -r requirements.txt
!pip install python-docx

Collecting gradio (from -r requirements.txt (line 1))
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting langchain_openai (from -r requirements.txt (line 2))
  Downloading langchain_openai-0.3.12-py3-none-any.whl.metadata (2.3 kB)
Collecting gradio_client (from -r requirements.txt (line 3))
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting Spire.Doc (from -r requirements.txt (line 5))
  Downloading spire_doc-13.3.8-py3-none-manylinux_2_31_x86_64.whl.metadata (14 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio->-r requirements.txt (line 1))
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio->-r requirements.txt (line 1))
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio->-r requirements.txt (line 1))
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting groovy~=0.1 (from gradio->-r requirements.txt (line 1))
  Dow

In [None]:
import os
import re
import sys
import json
from pathlib import Path
from collections import defaultdict

# Setup: the repo root goes on sys.path so the project package can be imported
ROOT = '/content/project'
sys.path.append(ROOT)

from tropos.preprocess_docx import StudentSubmission

# Paths: raw inputs and processed outputs all live under ROOT/data
RAW_DIR = ROOT + "/data/raw"
PROCESSED_DIR = ROOT + "/data/processed/assignment"
STUDENT_OUTPUT_DIR = PROCESSED_DIR + "/student_submissions"
REQUIREMENTS_PATH = RAW_DIR + "/Requirements.docx"

# Create the per-student output folder (and any missing parents) if it is absent
os.makedirs(STUDENT_OUTPUT_DIR, exist_ok=True)

def format_clean_rubric():
    """Build the fixed rubric as a list of portion records.

    Returns:
        list[dict]: one dict per rubric portion, in the order introduction,
        background, analysis, response. Each dict has the keys
        ``portion_id``, ``portion_name`` (the id capitalized) and
        ``criteria_group`` (the list of criterion strings for that portion).
    """
    # (portion id, criteria) pairs, listed in the order they must be emitted.
    rubric_sections = (
        ("introduction", [
            "Engaging",
            "Shows importance of question",
            "Indicates major parts and content",
        ]),
        ("background", [
            "Uses credible, neutral sources",
            "Effectively and sufficiently explains essential information",
            "Written objectively in own words",
            "Well organized and focused paragraphs with transitions",
            "Minimal error",
        ]),
        ("analysis", [
            "Smooth transition from background",
            "Summarizes accurately and in own words two opposing answers and one objective answer to the question",
            "Recognizes bias",
            "Reasonable observations about how all sources make their arguments impactful",
            "Well-organized and focused paragraphs with transitions",
            "Minimal error",
        ]),
        ("response", [
            "Smooth transition from analysis",
            "Incorporates strong points from sources",
            "Draws a reasonable and well-supported conclusion, answering the question",
            "Provides satisfying conclusion to the project",
            "Good transitions between focused paragraphs",
            "Minimal error",
        ]),
    )

    clean_rubric = []
    for section_id, section_criteria in rubric_sections:
        clean_rubric.append({
            "portion_id": section_id,
            "portion_name": section_id.capitalize(),
            "criteria_group": section_criteria,
        })
    return clean_rubric

def format_rubric_feedback(rubric):
    """Convert parsed rubric portions into feedback records.

    Args:
        rubric: iterable of dicts; each may carry a ``"feedback"`` list whose
            items are dicts with an optional ``"text"`` key.

    Returns:
        list[dict]: one record per portion that has at least one feedback
        item. A portion's id is chosen by its position in ``rubric``
        (introduction, background, analysis, response); positions beyond the
        fourth get ``portion_<n>`` with a 1-based ``n``.
    """
    known_ids = ("introduction", "background", "analysis", "response")
    formatted = []

    for index, section in enumerate(rubric):
        notes = section.get("feedback", [])
        if notes:
            # The id depends on the portion's position, not on how many
            # portions have been emitted so far.
            if index < len(known_ids):
                section_id = known_ids[index]
            else:
                section_id = f"portion_{index + 1}"

            entries = []
            for number, note in enumerate(notes, start=1):
                entries.append({
                    "feedback_id": f"{section_id}_F{number}",
                    "text": note.get("text", ""),
                })

            formatted.append({
                "portion_id": section_id,
                "portion_name": section_id.capitalize(),
                "criteria_group_ref": f"rubric_{section_id}_criteria_group",
                "feedback": entries,
            })

    return formatted

def extract_student_id(filename):
    """Return ``student<NN>`` for a file name containing "Student <digits>".

    The word "Student" is matched case-insensitively and must be followed by
    a single space or underscore and then digits. The digits are left-padded
    with zeros to at least two characters. Returns None when nothing matches.
    """
    found = re.search(r"Student[ _](\d+)", filename, re.IGNORECASE)
    if found is None:
        return None
    digits = found.group(1)
    return f"student{digits.zfill(2)}"

def extract_part_key(filename):
    """Classify a file name as ``"final"``, ``"part_<n>"`` or None.

    Matching is done on the lower-cased name:
      * any name containing "final" is ``"final"`` (this wins over a part number);
      * otherwise "part" followed by an optional space/underscore and digits
        gives ``"part_<n>"`` with leading zeros dropped;
      * otherwise a name that still carries "student <digits>" is treated as
        ``"final"``;
      * anything else yields None.
    """
    lowered = filename.lower()
    has_final = "final" in lowered
    numbered = re.search(r"part[ _]?(\d+)", lowered)

    if not has_final and numbered:
        return f"part_{int(numbered.group(1))}"
    if has_final or re.search(r"student[ _]\d+", lowered):
        return "final"
    return None

def process_student_submissions():
    """Parse every student .docx under RAW_DIR and write one JSON file per student.

    Each sub-folder of RAW_DIR is scanned for .docx files (extension matched
    case-insensitively). Files whose name contains "requirements" are ignored;
    files from which no student id or part key can be extracted are reported
    and skipped. The remaining files are grouped by student id.

    For every student the submissions are ordered by part number with "final"
    last, ties broken by modification time (oldest first). The first file of a
    part that parses successfully is used; later files for the same part are
    reported as duplicates. A file that fails to parse is reported and does
    not block a later file for the same part.

    Side effects:
        * writes STUDENT_OUTPUT_DIR/<student_id>.json for every student found;
        * writes rubric_table.json and assignment.json into PROCESSED_DIR;
        * prints progress messages.

    Raises:
        OSError: if RAW_DIR cannot be listed or an output file cannot be written.
    """
    requirements_ref = "requirements/requirements.json"
    student_data = defaultdict(list)

    def submission_order(sub):
        # Numbered parts sort by their number; "final" always sorts after them.
        part_key = sub["part_key"]
        rank = float("inf") if part_key == "final" else int(part_key.split("_")[1])
        return (rank, sub["mtime"])

    # os.listdir() returns entries in arbitrary order; sorting keeps runs reproducible.
    for folder in sorted(os.listdir(RAW_DIR)):
        folder_path = os.path.join(RAW_DIR, folder)
        if not os.path.isdir(folder_path):
            continue

        for fname in sorted(os.listdir(folder_path)):
            lowered = fname.lower()
            if not lowered.endswith(".docx") or "requirements" in lowered:
                continue

            sid = extract_student_id(fname)
            part = extract_part_key(fname)
            if not sid or not part:
                print(f"Skipping file: {fname}")
                continue

            filepath = os.path.join(folder_path, fname)
            student_data[sid].append({
                "filepath": filepath,
                "filename": fname,
                "part_key": part,
                "mtime": os.path.getmtime(filepath)
            })

    for sid, submissions in student_data.items():
        print(f"\nProcessing {sid}")
        output = {
            "student_id": sid,
            "requirements": requirements_ref,
            "submissions": {}
        }

        submissions.sort(key=submission_order)
        seen = set()

        for sub in submissions:
            part = sub["part_key"]
            if part in seen:
                print(f"Duplicate part skipped: {part} ({sub['filename']})")
                continue

            try:
                parsed = StudentSubmission(sub["filepath"], REQUIREMENTS_PATH).to_dict()
                output["submissions"][part] = {
                    "submission_text": parsed.get("submission_text", ""),
                    "comments": [
                        # "commented_text" is dropped from every comment record.
                        {k: v for k, v in c.items() if k != "commented_text"}
                        for c in parsed.get("comments", [])
                    ],
                    "rubric_feedback": format_rubric_feedback(parsed.get("rubric", []))
                }
            except Exception as e:
                # Best effort: report the failure and leave the part open so a
                # later file for the same part can still be used.
                print(f"Failed {sub['filename']}: {e}")
                continue

            # Only a successfully parsed file claims the part.
            seen.add(part)
            print(f"{sub['filename']} -> {part}")

        out_path = os.path.join(STUDENT_OUTPUT_DIR, f"{sid}.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)
        print(f"Saved {out_path}")

    with open(os.path.join(PROCESSED_DIR, "rubric_table.json"), "w", encoding="utf-8") as f:
        json.dump(format_clean_rubric(), f, indent=2)

    with open(os.path.join(PROCESSED_DIR, "assignment.json"), "w", encoding="utf-8") as f:
        json.dump({"requirements": requirements_ref}, f, indent=2)

    print("\nAll done.")

# Run the preprocessing: writes one JSON per student plus rubric_table.json and assignment.json
process_student_submissions()



# TODO: cycle through each rubric row and look for feedback (if a row has no feedback, skip it; if it has some, reference it)
# TODO: put preprocess_docx into main, then call it from the notebook


**--Import modules (you're working on)**
<br>*each of our classes will be what gets merged into this notebook (I'm pretty sure)*
<br>only loads what you explicitly request
<br>(this helps keep memory low and imports fast)
<br> *the following is an example with my Rubric module*

In [None]:
# Import and call main function
# (the import must sit on its own line; fused into the comment it never runs
# and main() raises NameError)
from tropos import main
main()