# Main Feedback Generation Notebook
This notebook handles preprocessing, model interaction, and feedback generation.

**--Set up: GitHub, Paths, Imports**

In [None]:
# Mount Google Drive (optional, you'll get a prompt to authorize account)
# from google.colab import drive
# drive.mount('/content/drive')

# Start in the root Colab directory so re-running this cell does not clone the repo inside an earlier clone
%cd /content

# Clone your GitHub repo (replace with your actual repo URL)
!git clone https://github.com/ML-name/project.git
%cd project

# List all branches (optional, for checking)
!git branch -a

/content
Cloning into 'project'...
remote: Enumerating objects: 704, done.[K
remote: Counting objects: 100% (274/274), done.[K
remote: Compressing objects: 100% (215/215), done.[K
remote: Total 704 (delta 156), reused 99 (delta 59), pack-reused 430 (from 1)[K
Receiving objects: 100% (704/704), 1.45 MiB | 7.72 MiB/s, done.
Resolving deltas: 100% (346/346), done.
/content/project
* [32mmain[m
  [31mremotes/origin/16-steps-for-phase-1[m
  [31mremotes/origin/HEAD[m -> origin/main
  [31mremotes/origin/feat/create-gradio-ui[m
  [31mremotes/origin/feat/preprocess-the-data-into-a-normalized-format[m
  [31mremotes/origin/fix/imports-and-file-paths[m
  [31mremotes/origin/main[m
  [31mremotes/origin/prompting/Deepseek[m
  [31mremotes/origin/prompting/Gemini[m
  [31mremotes/origin/refactor/docs-and-APIs[m
  [31mremotes/origin/refactor/py-and-output-documents[m
  [31mremotes/origin/scraping/inline_feedback[m
  [31mremotes/origin/scraping/rubric_table[m
  [31mremotes/o

In [None]:
# Check out YOUR branch (!!replace "scraping/all-together" with your branch name, in both places!!)
!git checkout -b scraping/all-together origin/scraping/all-together

fatal: 'origin/comments' is not a commit and a branch 'scraping/inline_feedback' cannot be created from it


In [None]:
# Add the cloned project root to the Python path so its packages can be imported
import sys
sys.path.append('/content/project/')

# os is used below to build the data paths
import os
# Base data directory inside the cloned repo
data_base = '/content/project/data'
# Paths to the 'raw' and 'processed' subfolders of the data directory
raw_data_path = os.path.join(data_base, 'raw')
processed_data_path = os.path.join(data_base, 'processed')

**--Install required libraries**

In [5]:
%pip install -r requirements.txt
!pip install python-docx

Collecting gradio (from -r requirements.txt (line 1))
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting langchain_openai (from -r requirements.txt (line 2))
  Downloading langchain_openai-0.3.12-py3-none-any.whl.metadata (2.3 kB)
Collecting gradio_client (from -r requirements.txt (line 3))
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting Spire.Doc (from -r requirements.txt (line 5))
  Downloading spire_doc-13.3.8-py3-none-manylinux_2_31_x86_64.whl.metadata (14 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio->-r requirements.txt (line 1))
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio->-r requirements.txt (line 1))
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio->-r requirements.txt (line 1))
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting groovy~=0.1 (from gradio->-r requirements.txt (line 1))
  Dow

In [8]:
import sys
import os
import re
import json
from pathlib import Path
from google.colab import drive

# Mount Google Drive (uncomment if needed)
# drive.mount('/content/drive')

# Set up paths
ROOT = '/content/project'  # directory the repo is cloned into by the setup cell
sys.path.append(ROOT)

# Import after adding to path (presumably tropos is resolved via ROOT — confirm)
from tropos.preprocess_docx.comments import Comments

# Configure paths: raw_dir is scanned for student folders, processed_dir
# receives the JSON output of process_student_submissions() below.
# NOTE: a later cell rebinds both names; processed_dir points somewhere else there.
raw_dir = f"{ROOT}/data/raw/"
processed_dir = f"{ROOT}/data/processed/student_submissions/"

def process_student_submissions():
    """Extract the comments from every student .docx and save them as JSON.

    Walks each sub-folder of ``raw_dir``. For every ``.docx`` file, the
    student number and the part ("Part N" or "Final") are parsed from the
    filename and the output of the ``Comments`` parser is written to
    ``processed_dir/Student_NN/<part>.json``.

    Files whose names cannot be parsed are skipped with a message. A failure
    on one document is printed and processing continues with the next one.

    Reads the module-level ``raw_dir`` and ``processed_dir``. Returns None.
    """
    # Create processed directory if needed
    Path(processed_dir).mkdir(parents=True, exist_ok=True)

    # NOTE(review): filled in below but never written to disk or returned;
    # either persist it as the intended "backup" or drop it.
    all_comments = {}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order and the printed log reproducible between runs.
    for student_folder in sorted(os.listdir(raw_dir)):
        student_path = os.path.join(raw_dir, student_folder)

        if not os.path.isdir(student_path):
            continue

        print(f"\nProcessing {student_folder}...")

        for filename in sorted(os.listdir(student_path)):
            if not filename.endswith('.docx'):
                continue

            # Parse student number and part number
            student_num = re.search(r'Student[ _](\d+)', filename, re.IGNORECASE)
            part_info = re.search(r'(Part[ _]?(\d+)|Final)', filename, re.IGNORECASE)

            if not student_num or not part_info:
                print(f"  Skipping {filename} - couldn't parse info")
                continue

            # Standardize naming. group(2) is the digit run for "Part N" and
            # None for "Final", so "Part 1", "Part_1" and "Part1" all map to
            # "part1" (stripping only spaces left "part_1" for underscores).
            student_key = f"Student_{student_num.group(1).zfill(2)}"
            part_number = part_info.group(2)
            part_key = f"part{part_number}" if part_number else "final"

            # Set up target path
            target_dir = os.path.join(processed_dir, student_key)
            Path(target_dir).mkdir(parents=True, exist_ok=True)

            target_path = os.path.join(target_dir, f"{part_key}.json")
            doc_path = os.path.join(student_path, filename)

            # Extract and save comments.
            # NOTE(review): the recorded output of this cell shows every file
            # failing with "'Comments' object has no attribute
            # 'parse_comments'" — confirm the method names against the
            # Comments class on the checked-out branch.
            try:
                comments = Comments(doc_path).parse_comments().get_results()

                with open(target_path, 'w', encoding='utf-8') as f:
                    json.dump(comments, f, indent=2)

                # Update master record
                all_comments.setdefault(student_key, {})[part_key] = comments

                print(f"  Processed: {filename} → {target_path}")

            except Exception as e:
                # Best effort: report the failure and keep going with the
                # remaining documents.
                print(f"  Error processing {filename}: {str(e)}")

    print("\nProcessing complete!")

# Run the processing: walks raw_dir and writes one comments JSON per parsed student document
process_student_submissions()




Processing Student 1...
  Error processing Student 1 Part 1.docx: 'Comments' object has no attribute 'parse_comments'
  Error processing Student 1 Part 2.docx: 'Comments' object has no attribute 'parse_comments'
  Error processing Student 1 Part 3.docx: 'Comments' object has no attribute 'parse_comments'
  Error processing Student 1 Final.docx: 'Comments' object has no attribute 'parse_comments'

Processing Student 3...
  Error processing Student 3 Part 3.docx: 'Comments' object has no attribute 'parse_comments'
  Error processing Student 3 Part 1.docx: 'Comments' object has no attribute 'parse_comments'
  Error processing Student 3 Final.docx: 'Comments' object has no attribute 'parse_comments'
  Error processing Student 3 Part 2.docx: 'Comments' object has no attribute 'parse_comments'

Processing Student 4...
  Error processing Student 4 Final.docx: 'Comments' object has no attribute 'parse_comments'
  Error processing Student 4 Part 1.docx: 'Comments' object has no attribute 'pars

In [None]:
import sys
import os
import re
import json
from pathlib import Path
from google.colab import drive

# Mount Google Drive if needed
# drive.mount('/content/drive')

# Set up project root
ROOT = '/content/project'
# Guarded so re-running this cell does not pile up duplicate sys.path entries
if ROOT not in sys.path:
    sys.path.append(ROOT)

# Import custom parsers
from tropos.preprocess_docx.comments import Comments
from tropos.preprocess_docx.rubric import parse_rubric
from tropos.preprocess_docx.assignment_requirements import parse_requirements
from tropos.preprocess_docx.submission import parse_submission
# Import from the package itself: naming "__init__" explicitly makes Python
# load the package's __init__.py a second time under a separate module name.
from tropos.preprocess_docx import StudentSubmission

# NOTE: the scraper below is mostly AI generated — review before relying on it
# Set raw and processed directories
raw_dir = f"{ROOT}/data/raw/"
processed_dir = f"{ROOT}/data/processed/assignment/"
# os.path.join avoids the doubled "//" that string concatenation produced,
# because processed_dir already ends with a slash.
students_dir = os.path.join(processed_dir, "student_submissions")
Path(students_dir).mkdir(parents=True, exist_ok=True)

# Filter rubric portions that have actual feedback and transform structure
def filter_rubric_with_feedback(rubric, assignment_id):
    """Return normalized records for the rubric portions that have feedback.

    Portions whose 'feedback' value is missing or falsy are dropped. The
    P1, P2, ... numbering follows each portion's position in the original
    ``rubric`` list, so gaps appear where portions were dropped.

    Args:
        rubric: sequence of dict-like portions; the keys 'name', 'criteria',
            'max_points', 'score' and 'feedback' are read with ``.get``.
        assignment_id: value stored under 'assignment_id' in every record.

    Returns:
        A list with one dict per kept portion.
    """
    def _to_record(position, entry):
        # position is 1-based and drives every generated identifier
        tag = f"P{position}"
        return {
            'assignment_id': assignment_id,
            'portion_id': tag,
            'portion_name': entry.get('name', ''),
            'criteria_block': {
                'criteria_id': f"{tag}.C",
                'full_criteria_text': [
                    {
                        'id': f"analysis_c{position}",
                        'text': entry.get('criteria', ''),
                    },
                ],
                # open question from the original author: is max_points needed?
                'max_points': entry.get('max_points', 0),
            },
            # open question from the original author: is score needed?
            'score': entry.get('score', 0),
            'feedback': entry.get('feedback', ''),
        }

    return [
        _to_record(index + 1, entry)
        for index, entry in enumerate(rubric)
        if entry.get('feedback')
    ]

# Main processing function
def process_student_submissions():
    """Convert every student submission .docx into normalized JSON.

    Walks each sub-folder of ``raw_dir``. Every ``.docx`` file whose name does
    not contain "Requirements", and from which a student number and a part
    ("Part N" or "Final") can be parsed, is loaded through
    ``StudentSubmission`` together with the shared Requirements.docx. Its
    ``to_dict()`` output is written to ``students_dir/studentNN/<part>.json``.

    Rubric handling differs by part:
      * "final": every rubric portion is kept and reshaped.
      * any other part: only portions with feedback are kept (via
        ``filter_rubric_with_feedback``); these rows are also collected for
        ``rubric_table.json``.

    After the walk, two aggregate files are written to ``processed_dir``:
    ``rubric_table.json`` (the collected rubric rows) and ``assignment.json``
    (the distinct assignment requirements seen, in first-seen order).

    A failure on one document is printed and the walk continues. Reads the
    module-level ``ROOT``, ``raw_dir``, ``processed_dir`` and ``students_dir``.
    Returns None.
    """
    all_rubrics = []
    # A list rather than a set: set.add raises TypeError for unhashable values
    # (dicts/lists), and a set gives assignment.json an unstable order.
    all_requirements = []

    # The same requirements document is paired with every submission.
    requirements_path = f"{ROOT}/data/raw/Requirements.docx"

    # Make sure the aggregate files at the end always have a directory to go to.
    Path(processed_dir).mkdir(parents=True, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting keeps the log and
    # the row order of rubric_table.json reproducible between runs.
    for student_folder in sorted(os.listdir(raw_dir)):
        student_path = os.path.join(raw_dir, student_folder)
        if not os.path.isdir(student_path):
            continue

        print(f"\n📂 Processing {student_folder}...")

        for filename in sorted(os.listdir(student_path)):
            if not filename.endswith('.docx') or 'Requirements' in filename:
                continue

            student_num = re.search(r'Student[ _](\d+)', filename, re.IGNORECASE)
            part_info = re.search(r'(Part[ _]?(\d+)|Final)', filename, re.IGNORECASE)

            if not student_num or not part_info:
                print(f"  ⏭️ Skipping {filename} - couldn't parse info")
                continue

            student_key = f"student{student_num.group(1).zfill(2)}"
            # group(2) is the digit run for "Part N" and None for "Final", so
            # "Part 1", "Part_1" and "Part1" all become "part1" (stripping only
            # spaces left "part_1" for underscore-separated names).
            part_number = part_info.group(2)
            part_key = f"part{part_number}" if part_number else "final"

            student_folder_path = os.path.join(students_dir, student_key)
            Path(student_folder_path).mkdir(parents=True, exist_ok=True)

            submission_path = os.path.join(student_path, filename)
            target_path = os.path.join(student_folder_path, f"{part_key}.json")

            try:
                # Load submission
                submission = StudentSubmission(submission_path, requirements_path)
                data = submission.to_dict()

                rubric_data = data.get('rubric', [])
                if part_key == "final":
                    # Final part: keep every portion, with or without feedback.
                    # Unlike filter_rubric_with_feedback, 'full_criteria_text'
                    # stays the raw criteria value here, and these rows are
                    # not added to all_rubrics.
                    data['rubric'] = [
                        {
                            'assignment_id': part_key,
                            'portion_id': f"P{i+1}",
                            'portion_name': portion.get('name', ''),
                            'criteria_block': {
                                'criteria_id': f"P{i+1}.C",
                                'full_criteria_text': portion.get('criteria', ''),
                                'max_points': portion.get('max_points', 0)
                            },
                            'score': portion.get('score', 0),
                            'feedback': portion.get('feedback', '')
                        }
                        for i, portion in enumerate(rubric_data)
                    ]
                else:
                    filtered_rubric = filter_rubric_with_feedback(rubric_data, part_key)
                    data['rubric'] = filtered_rubric
                    all_rubrics.extend(filtered_rubric)

                # Guarantee the 'comments' and 'feedback' keys exist in the output
                data.setdefault('comments', [])
                data.setdefault('feedback', [])

                # Save output
                with open(target_path, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2)

                print(f"  Saved → {target_path}")

                # .get avoids reporting an "Error processing" for a file that
                # was in fact saved when the key is simply absent.
                requirements = data.get('assignment_requirements')
                if requirements is not None and requirements not in all_requirements:
                    all_requirements.append(requirements)

            except Exception as e:
                # Best effort: report the failure and continue with the rest.
                print(f"  Error processing {filename}: {str(e)}")

    with open(os.path.join(processed_dir, 'rubric_table.json'), 'w', encoding='utf-8') as f:
        json.dump(all_rubrics, f, indent=2)

    with open(os.path.join(processed_dir, 'assignment.json'), 'w', encoding='utf-8') as f:
        json.dump({"requirements": all_requirements}, f, indent=2)

    print("\nAll student submissions processed.")

# Run the full pass over raw_dir; writes per-student JSON files plus rubric_table.json and assignment.json
process_student_submissions()




# Cycle through each row and look for feedback (if there is no feedback, we don't need the row; if there is, reference it)
# Put preprocess.docx into main, then call it in the notebook


**--Import modules (you're working on)**
<br>*each of our classes will be what gets merged into this notebook (I'm pretty sure)*
<br>only loads what you explicitly request
<br>(this helps keep memory low and imports fast)
<br>*the following is an example with my Rubric module*

In [None]:
# Import and call main function
from tropos import main
main()