In [None]:
# Vibe coded with ChatGPT 4.1
# https://chatgpt.com/share/6850e4a7-4568-8009-a058-fa48eebdaa19
# README mostly built with Cursor using Claude 4
#

import sys
!{sys.executable} -m pip install python-docx

import pandas as pd
import docx
import os
import re

In [None]:
# Folder that is scanned for the reflection documents (.docx files).
docx_folder = './data/reflection_docs/'


#### Select the question to extract: leave exactly one `row_header` line
#### uncommented. The text is compared with the first cell of each table row.

# row_header = 'What did I learn?'
# row_header = 'How, specifically, did I learn it?'
row_header = 'What goals might I set in accordance with what I have learned in order to improve myself and/or the quality of my learning and/or the quality of my future?'

In [None]:
def extract_table_from_docx(filepath, key_row_name):
    """
    Find the table in a Word document that holds a given question row.

    The document at ``filepath`` is opened with python-docx and its tables are
    checked in the order ``doc.tables`` yields them. A table qualifies as soon
    as the first cell of any of its rows contains ``key_row_name`` (a
    case-sensitive substring test; every row is checked, including the first).

    Returns the first qualifying table, or ``None`` when no table matches.
    """
    document = docx.Document(filepath)

    def has_key_row(candidate):
        # True when some row's first cell mentions the requested text.
        return any(key_row_name in r.cells[0].text for r in candidate.rows)

    # next() stops at the first hit, so later tables are never inspected.
    return next((t for t in document.tables if has_key_row(t)), None)

In [None]:
def split_answer(answer):
    """
    Break a free-text answer into its individual response items.

    A new item starts at any of these delimiters:
    - a newline character
    - a bullet character (•)
    - a hyphen that opens the text or follows whitespace; hyphens inside a
      word (e.g. "self-paced") are left alone

    Surrounding whitespace is trimmed from each item and empty items are
    dropped. Returns a list of strings (empty when nothing remains).
    """
    delimiter = re.compile(r'(?:^|\s)-|[\n•]')
    items = []
    for chunk in delimiter.split(answer):
        chunk = chunk.strip()
        if chunk:
            items.append(chunk)
    return items

In [None]:
def parse_answers(table, student_id, question_header):
    """
    Extract one student's responses to a single question from a table.

    Every row of ``table`` is examined, including the first one: the question
    may sit in the first row, and a real header row simply fails the
    comparison. A row matches when the text of its first cell equals
    ``question_header``, ignoring case and surrounding whitespace. Rows with
    fewer than two cells cannot hold an answer and are skipped.

    Only the first matching row is used. Its second cell is split into
    individual responses:
    - if it contains a newline, a bullet (•) or a hyphen at the start / after
      whitespace, it is split with ``split_answer``;
    - otherwise it is split on periods (so abbreviations and decimals are
      split too).

    Parameters:
        table: object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text`` (e.g. a python-docx table).
        student_id: value stored under 'Student ID' in every record.
        question_header: question text to look for; stored under 'Question'.

    Returns:
        A list of dicts with keys 'Student ID', 'Question' and 'Response',
        one per response. Empty when no row matches or the answer is blank.
    """
    wanted = question_header.strip().lower()

    for row in table.rows:
        cells = row.cells
        if len(cells) < 2:
            # No answer cell available on this row.
            continue
        if cells[0].text.strip().lower() != wanted:
            continue

        # First matching row found: it is the only one that gets processed.
        answer = cells[1].text.strip()
        if not answer:
            return []

        if re.search(r'(?:^|\s)-|[\n•]', answer):
            # List-style formatting (bullets, newlines, leading hyphens).
            lines = split_answer(answer)
        else:
            # Plain prose: treat each period-terminated piece as a response.
            lines = [s.strip() for s in answer.split('.') if s.strip()]

        return [
            {
                'Student ID': student_id,
                'Question': question_header,
                'Response': line,
            }
            for line in lines
            if line
        ]

    return []

In [None]:
# Collect one record per response across every reflection document.
all_data = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the resulting DataFrame (and CSV) reproducible between runs.
for filename in sorted(os.listdir(docx_folder)):
    # Skip anything that is not a .docx file, and skip the "~$..." owner/lock
    # files Word leaves next to an open document (they are not real documents).
    if filename.startswith('~$') or not filename.lower().endswith('.docx'):
        continue
    # Assumes filename format: 'Copy of P1.docx' -> student ID 'P1'.
    # splitext removes only the extension, not '.docx' elsewhere in the name.
    student_id = os.path.splitext(filename)[0].replace('Copy of ', '')
    filepath = os.path.join(docx_folder, filename)
    table = extract_table_from_docx(filepath, row_header)
    # Explicit None check: "no table found" is signalled by None.
    if table is not None:
        all_data.extend(parse_answers(table, student_id, row_header))

# Naming the columns keeps the frame (and the CSV header) well-formed even
# when no responses were found.
df = pd.DataFrame(all_data, columns=['Student ID', 'Question', 'Response'])

In [None]:
# Lift pandas' column-width limit so long responses are printed in full.
pd.options.display.max_colwidth = None

print(df)

In [None]:
def header_to_filename(header, maxlen=20, dir='./outputs'):
    """
    Build a CSV path from a question header.

    Steps:
    1. Drop every character that is neither a word character nor whitespace.
    2. Replace each whitespace character (spaces, tabs, newlines, ...) with
       an underscore, so no whitespace reaches the file name.
    3. Keep at most ``maxlen`` characters and strip trailing underscores left
       by the truncation.
    4. Fall back to 'untitled' when nothing is left (empty or punctuation-only
       header, or ``maxlen`` of 0) instead of producing a bare '.csv'.

    Parameters:
        header: question text to derive the name from.
        maxlen: maximum length of the name, excluding the '.csv' suffix.
        dir: directory the file name is joined onto.

    Returns:
        ``os.path.join(dir, '<name>.csv')``.

    Note: headers that share their first ``maxlen`` sanitized characters map
    to the same path.
    """
    # Remove punctuation but keep whitespace so word boundaries survive.
    s = re.sub(r'[^\w\s]', '', header)
    # One underscore per whitespace character (identical to the old
    # space-only replacement for headers that contain only spaces).
    s = re.sub(r'\s', '_', s)
    # Truncate, then strip underscores the cut may have left at the end.
    s = s[:maxlen].rstrip('_')
    if not s:
        s = 'untitled'
    return os.path.join(dir, f"{s}.csv")

# Export: derive the CSV path from the selected question header.
csv_filename = header_to_filename(row_header)
# Create the destination folder first if it is missing.
output_folder = os.path.dirname(csv_filename)
os.makedirs(output_folder, exist_ok=True)
# Write the responses without the DataFrame index column.
df.to_csv(csv_filename, index=False)