In [1]:
!pip install fitz
!pip install PyMuPDF python-Levenshtein



In [3]:
import fitz  # PyMuPDF
import os
from Levenshtein import ratio


# --- CONFIG ---
# Input folders (read only).
BASE_FOLDER = "Original from XX1"
TEAM_COMMENTS_FOLDER = "Commented by the TEAM 2"
LATEST_FOLDER = "Latest Versions"

# Output folders (written by the merge routines).
OUTPUT_FOLDER = "Combined_Annotated"
MODIFIED_OUTPUT_FOLDER = "Modified_With_Changes"

# Create both output folders up front; an already existing folder is fine.
os.makedirs(name=OUTPUT_FOLDER, exist_ok=True)
os.makedirs(name=MODIFIED_OUTPUT_FOLDER, exist_ok=True)

# --- TEXT MATCHING HELPERS ---

def extract_text_per_page(path):
    """Return the plain text of every page of the PDF at `path`.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A list with one string per page, in page order.

    The document is opened in a `with` block so it is closed before the
    function returns; previously the handle was left open for the lifetime
    of the kernel.
    """
    with fitz.open(path) as doc:
        return [page.get_text("text") for page in doc]

def is_text_changed(base_texts, latest_texts):
    """Tell whether two per-page text lists differ.

    The lists differ when their page counts are unequal or when any pair of
    corresponding pages is unequal after stripping leading/trailing whitespace.

    Args:
        base_texts: List of page texts of the reference document.
        latest_texts: List of page texts of the document to compare.

    Returns:
        True if a difference was found, otherwise False.
    """
    same_page_count = len(base_texts) == len(latest_texts)
    if not same_page_count:
        return True
    return any(
        old_page.strip() != new_page.strip()
        for old_page, new_page in zip(base_texts, latest_texts)
    )

def extract_highlighted_text_and_rects(page, annot):
    """Collect the text under a highlight annotation and the rects it covers.

    Args:
        page: Page the annotation belongs to; text is read from it.
        annot: Annotation whose `vertices` describe the highlighted quads.

    Returns:
        A tuple `(text, rects)`: the non-empty text pieces of all quads joined
        with single spaces, and the bounding rect of every quad. Returns
        `("", [])` when the annotation has no vertices.
    """
    vertices = annot.vertices
    if not vertices:
        return "", []

    rects = []
    snippets = []
    # Vertices are consumed four at a time (one quad each); leftover points
    # that do not form a complete group of four are ignored.
    complete = (len(vertices) // 4) * 4
    for start in range(0, complete, 4):
        area = fitz.Quad(vertices[start:start + 4]).rect
        rects.append(area)
        snippet = page.get_textbox(area).strip()
        if snippet:
            snippets.append(snippet)
    return " ".join(snippets), rects


def find_all_occurrences(text, page):
    """Return a quad for every single word on `page` that equals `text`.

    The comparison is case-insensitive and ignores whitespace around the
    page word. Only individual words are compared, so a multi-word `text`
    never matches.
    """
    found = []
    for entry in page.get_text("words"):
        if entry[4].strip().lower() != text.lower():
            continue
        box = fitz.Rect(entry[:4])
        # Corner order: top-left, top-right, bottom-left, bottom-right.
        corners = [
            fitz.Point(x, y)
            for y in (box.y0, box.y1)
            for x in (box.x0, box.x1)
        ]
        found.append(fitz.Quad(*corners))
    return found


def _locate_word_span(word_texts, needle):
    """Return `(first, stop)` word indices covering the first hit of `needle`.

    `word_texts` are joined with single spaces and searched for `needle`.
    Every word that overlaps the matched character range is included, so a
    hit that starts or ends in the middle of a word still selects that whole
    word. Returns None when `needle` does not occur.
    """
    haystack = " ".join(word_texts)
    start = haystack.find(needle)
    if start < 0:
        return None
    end = start + len(needle)

    first = None
    stop = None
    offset = 0  # character position of the current word inside `haystack`
    for idx, word in enumerate(word_texts):
        word_start = offset
        word_end = offset + len(word)
        if word_start >= end:
            break
        if first is None and word_end > start:
            first = idx
        stop = idx + 1
        offset = word_end + 1  # +1 for the joining space
    if first is None:
        return None
    return first, stop


def find_exact_span_in_latest_page(text, page, original_rect, threshold=0.85):
    """Find the best matching span on latest page for the original highlighted text.

    Strategy, in order:
      1. Highlights of one or two words: use `page.search_for` and return the
         hit whose top-left corner is closest to `original_rect`'s top-left.
      2. Case-insensitive exact match of the whole text against the page's
         words joined by single spaces; returns the rects of the words
         covering the first occurrence.
      3. Fuzzy fallback: slide windows of 3+ words over the page and return
         the window with the highest Levenshtein ratio above `threshold`.

    Changes compared with the previous implementation:
      * Word indices of an exact match are derived from character offsets.
        Before, a match starting in the middle of a word (e.g. "cat sat"
        inside "bobcat sat") shifted the selection by one word.
      * The exact match is no longer limited to windows of at most 19 words.
      * Whitespace in `text` (newlines, repeated spaces) is collapsed so it
        can match the space-joined page words.
      * `None`/empty `text` returns `[]` instead of raising.

    Args:
        text: Highlighted text taken from the reviewed document.
        page: Page of the latest document to search.
        original_rect: Rect of the original annotation; only used to pick
            the nearest hit for short highlights.
        threshold: Minimum Levenshtein ratio for a fuzzy match.

    Returns:
        A list of rects to highlight, or `[]` when nothing matches.
    """
    words = page.get_text("words")
    text = " ".join(text.split()) if text else ""
    if not words or not text:
        return []

    text_lower = text.lower()
    word_count = len(text_lower.split())

    # --- Case: short highlights like "SAM" ---
    if word_count <= 2:
        matches = page.search_for(text, quads=True)

        if matches:
            def dist(m):  # Closest to original position
                dx = m.rect.tl.x - original_rect.tl.x
                dy = m.rect.tl.y - original_rect.tl.y
                return dx ** 2 + dy ** 2
            return [min(matches, key=dist).rect]  # get bounding rect from Quad

    word_texts = [w[4].lower() for w in words]

    # --- Case: exact (case-insensitive) match anywhere on the page ---
    span = _locate_word_span(word_texts, text_lower)
    if span is not None:
        first, stop = span
        return [fitz.Rect(w[:4]) for w in words[first:stop]]

    # --- Fuzzy fallback over sliding word windows ---
    best_score = 0
    best_window = None
    # Keep the historical 3..19 range, widened only when the highlight itself
    # is longer so that long highlights still get comparable windows.
    max_window = max(19, word_count + 2)
    for window_size in range(3, max_window + 1):
        for i in range(len(words) - window_size + 1):
            candidate = " ".join(word_texts[i:i + window_size])
            score = ratio(candidate, text_lower)
            if score > threshold and score > best_score:
                best_score = score
                best_window = (i, i + window_size)

    if best_window is None:
        return []
    return [fitz.Rect(w[:4]) for w in words[best_window[0]:best_window[1]]]

# --- CORE COMMENT MERGING ---

def _copy_highlight_by_text(latest_doc, team_page, team_page_index, annot, author, content):
    """Re-create one team highlight on the best matching span of `latest_doc`."""
    text, _ = extract_highlighted_text_and_rects(team_page, annot)
    if not text:
        return

    best_key = None
    best_page = None
    best_rects = None

    for page_index, latest_page in enumerate(latest_doc):
        rects = find_exact_span_in_latest_page(text, latest_page, annot.rect)
        if not rects:
            continue
        # Rank candidates by page distance first, then by how far the first
        # matched rect moved on the page. Comparing positions alone would let
        # a repeated phrase on a distant page win over the one next to the
        # original location.
        dx = rects[0].x0 - annot.rect.x0
        dy = rects[0].y0 - annot.rect.y0
        key = (abs(page_index - team_page_index), dx ** 2 + dy ** 2)
        if best_key is None or key < best_key:
            best_key = key
            best_page = latest_page
            best_rects = rects

    if best_rects and best_page is not None:
        hl = best_page.add_highlight_annot(best_rects)
        hl.set_colors(stroke=(0.5, 0.0, 0.5))
        hl.set_opacity(0.4)
        hl.set_info({"title": author, "content": content})
        hl.update()
    else:
        print(f"⚠️ No match for: '{text[:40]}'")


def _copy_note_by_text(latest_doc, team_page, team_page_index, annot, author, content):
    """Re-create one team sticky note next to the same text in `latest_doc`."""
    text_near = team_page.get_textbox(annot.rect).strip()
    if not text_near:
        return

    # Try the pages closest to the original page first (stable sort keeps the
    # lower page number on ties).
    page_order = sorted(range(len(latest_doc)), key=lambda i: abs(i - team_page_index))
    for page_index in page_order:
        latest_page = latest_doc[page_index]
        found = latest_page.search_for(text_near)
        if found:
            note = latest_page.add_text_annot(found[0].tl, content)
            note.set_info({"title": author})
            note.update()
            return

    # Previously an unmatched note disappeared without any message.
    print(f"⚠️ No match for note near: '{text_near[:40]}'")


def merge_comments(base_pdf_path, team_pdf_paths, output_path):
    """Map team highlights and sticky notes onto a document by text matching.

    For every highlight annotation in the team PDFs the highlighted text is
    located in the base document (all pages are searched) and a purple
    highlight with the original author and comment is added at the best
    match. Sticky notes (text annotations) are re-created at the first place
    where the text under the note is found. Other annotation types are
    ignored. The result is written to `output_path`.

    Args:
        base_pdf_path: PDF that receives the annotations.
        team_pdf_paths: Iterable of annotated PDFs to take comments from.
        output_path: Destination path of the combined PDF.

    Fixes compared with the previous version:
      * Documents are opened in `with` blocks, so they are closed even when
        an error interrupts the merge.
      * Team pages beyond the page count of the base document are no longer
        skipped; their comments are matched by text like all others.
      * An empty author falls back to "Reviewer" (as in
        `merge_comments_by_users`), not only a missing one.
      * The `if not team_page.annots()` pre-check was removed: `annots()` is
        iterated directly and simply yields nothing for a page without
        annotations.
    """
    with fitz.open(base_pdf_path) as latest_doc:
        for team_pdf in team_pdf_paths:
            with fitz.open(team_pdf) as team_doc:
                for team_page_index, team_page in enumerate(team_doc):
                    for annot in team_page.annots():
                        content = annot.info.get("content", "")
                        author = annot.info.get("title") or "Reviewer"
                        annot_type = annot.type[0]

                        if annot_type == fitz.PDF_ANNOT_HIGHLIGHT:
                            _copy_highlight_by_text(
                                latest_doc, team_page, team_page_index, annot, author, content
                            )
                        elif annot_type == fitz.PDF_ANNOT_TEXT:
                            _copy_note_by_text(
                                latest_doc, team_page, team_page_index, annot, author, content
                            )

        latest_doc.save(output_path, deflate=True)

    print(f"✅ Combined with fuzzy comment mapping: {output_path}")

def _add_purple_highlight(page, rects, author, content):
    """Add a purple, 40 % opaque highlight over `rects` carrying the comment."""
    highlight = page.add_highlight_annot(rects)
    highlight.set_colors(stroke=(0.5, 0.0, 0.5))
    highlight.set_opacity(0.4)
    highlight.set_info({
        "title": author or "Reviewer",
        "content": content
    })
    highlight.update()


def merge_comments_by_users(base_pdf_path, team_pdf_paths, output_path):
    """Copy team annotations onto the same pages/positions of the base PDF.

    Annotations are transferred page by page at identical coordinates, so
    this is meant for team files whose layout matches the base document.
    Highlights are re-created from their quads; for every other annotation
    type the words of the base page lying completely inside the annotation
    rect are highlighted instead. The result is written to `output_path`.

    Args:
        base_pdf_path: PDF that receives the annotations.
        team_pdf_paths: Iterable of annotated PDFs to take comments from.
        output_path: Destination path of the combined PDF.

    Fixes compared with the previous version:
      * A team file with more pages than the base no longer raises
        `IndexError`; the surplus pages are reported and skipped.
      * A highlight without vertices no longer raises; it falls back to the
        word-based path.
      * Documents are closed even when an error interrupts the merge.
      * A comment that cannot be transferred is reported instead of being
        dropped silently.
    """
    with fitz.open(base_pdf_path) as base_doc:
        base_page_count = len(base_doc)

        for team_pdf in team_pdf_paths:
            with fitz.open(team_pdf) as team_doc:
                for page_num, team_page in enumerate(team_doc):
                    if page_num >= base_page_count:
                        print(
                            f"⚠️ {team_pdf}: pages from {page_num + 1} on have no "
                            f"counterpart in {base_pdf_path} — skipped."
                        )
                        break
                    base_page = base_doc[page_num]

                    for annot in team_page.annots():
                        rect = annot.rect
                        content = annot.info.get("content", "")
                        author = annot.info.get("title", "")

                        targets = []
                        if annot.type[0] == fitz.PDF_ANNOT_HIGHLIGHT:
                            points = annot.vertices or []
                            # Four consecutive points form one quad; an
                            # incomplete trailing group is ignored.
                            targets = [
                                fitz.Quad(points[i:i + 4]).rect
                                for i in range(0, len(points) - 3, 4)
                            ]

                        if not targets:
                            # Fallback: use bounding rect (for sticky notes or unknown types)
                            for w in base_page.get_text("words"):
                                word_rect = fitz.Rect(w[0], w[1], w[2], w[3])
                                if rect.contains(word_rect):
                                    targets.append(word_rect)

                        if targets:
                            _add_purple_highlight(base_page, targets, author, content)
                        elif content:
                            print(
                                f"⚠️ Comment by {author or 'Reviewer'} on page {page_num + 1} "
                                f"could not be transferred: '{content[:40]}'"
                            )

        base_doc.save(output_path, deflate=True)

    print(f"✅ Combined highlights with purple (40% opacity) saved: {output_path}")

# --- WRAPPERS ---

def merge_text_changes(latest_path, team_paths, output_path_with_comments):
    """Merge team comments into a changed document via a temporary copy.

    The latest document is re-saved to a temporary file inside
    `MODIFIED_OUTPUT_FOLDER`, `merge_comments` is run on that copy, and the
    temporary file is deleted afterwards.

    Args:
        latest_path: Path of the latest version of the document.
        team_paths: Paths of the team-annotated PDFs.
        output_path_with_comments: Destination path of the merged PDF.

    The cleanup now happens in a `finally` block, so a failing merge no
    longer leaves `temp_latest.pdf` behind, and the source document is
    closed even if saving the copy fails.
    """
    temp_path = os.path.join(MODIFIED_OUTPUT_FOLDER, "temp_latest.pdf")
    try:
        with fitz.open(latest_path) as latest_doc:
            latest_doc.save(temp_path, deflate=True)
        merge_comments(temp_path, team_paths, output_path_with_comments)
    finally:
        if os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError as exc:
                # Do not let a cleanup problem hide the real error (if any).
                print(f"⚠️ Could not remove temporary file {temp_path}: {exc}")

def run_batch_merge():
    """Phase 1: merge team comments into every PDF of `BASE_FOLDER`.

    For each base PDF, all PDFs in `TEAM_COMMENTS_FOLDER` whose file name
    contains the base name are merged with `merge_comments_by_users` into
    `<name>_Combined.pdf` in `OUTPUT_FOLDER`. Base files without any team
    file are copied to `OUTPUT_FOLDER` unchanged. A summary is printed.

    Fixes compared with the previous version:
      * Only PDF files of the team folder are considered; before, any file
        whose name contained the base name was handed to `fitz.open`.
      * The team folder is listed once instead of once per base file.
      * Files are processed in sorted order, so the merge order no longer
        depends on the arbitrary order of `os.listdir`.
      * The `.pdf` extension check ignores case.
    """
    print("\n📂 Starting Initial Merge from BASE_FOLDER and TEAM_COMMENTS_FOLDER...\n")

    merged_files = []
    untouched_files = []

    team_pdfs = sorted(
        f for f in os.listdir(TEAM_COMMENTS_FOLDER) if f.lower().endswith(".pdf")
    )

    for base_file in sorted(os.listdir(BASE_FOLDER)):
        if not base_file.lower().endswith(".pdf"):
            continue

        base_name = os.path.splitext(base_file)[0]
        base_path = os.path.join(BASE_FOLDER, base_file)

        # Check if any team commented file exists for this base file
        team_paths = [
            os.path.join(TEAM_COMMENTS_FOLDER, f) for f in team_pdfs if base_name in f
        ]

        if not team_paths:
            # File was not touched by team — copy it to OUTPUT_FOLDER directly
            untouched_output = os.path.join(OUTPUT_FOLDER, base_file)
            print(f"⚠️ File untouched by team: {base_file} — copying to OUTPUT_FOLDER.")
            with fitz.open(base_path) as doc:
                doc.save(untouched_output, deflate=True)
            untouched_files.append(base_file)
        else:
            # Merge team comments
            output_path = os.path.join(OUTPUT_FOLDER, base_name + "_Combined.pdf")
            print(f"✅ File commented by team: {base_file} — merging {len(team_paths)} team files.")
            merge_comments_by_users(base_path, team_paths, output_path)
            merged_files.append(base_file)

    # --- Summary ---
    print("\n📊 Summary of BASE_FOLDER Merge")
    if merged_files:
        print("\n🟢 Commented and Merged Files:")
        for f in merged_files:
            print(f"   - {f}")
    if untouched_files:
        print("\n🟡 Untouched Files Copied As-Is:")
        for f in untouched_files:
            print(f"   - {f}")
    print("\n✅ Phase 1 complete: All BASE_FOLDER files processed.\n")
    
def run_batch_add():
    """Phase 2: carry team comments over to the PDFs of `LATEST_FOLDER`.

    For each latest PDF (an `_add` marker in the name is removed to get the
    base name) the matching team PDFs are looked up in
    `TEAM_COMMENTS_FOLDER`:
      * no team file: the PDF is copied to `OUTPUT_FOLDER` as a new file;
      * text differs from the first team file: comments are re-mapped by
        text with `merge_text_changes` into `<name>_modified.pdf`;
      * text is unchanged: comments are copied by position with
        `merge_comments_by_users` into `<name>_Combined.pdf`.
    A summary is printed at the end.

    Fixes compared with the previous version:
      * Only PDF files of the team folder are considered; before, any file
        whose name contained the base name was handed to `fitz.open`.
      * Team files are sorted, so the file used as comparison baseline
        (`team_paths[0]`) and the merge order are deterministic instead of
        depending on the arbitrary order of `os.listdir`.
      * The team folder is listed once instead of once per latest file.
      * The `.pdf` extension check ignores case.
    """
    print("\n📂 Starting Weekly Additions from LATEST_FOLDER...\n")

    changed_files = []
    unchanged_files = []
    new_files = []

    team_pdfs = sorted(
        f for f in os.listdir(TEAM_COMMENTS_FOLDER) if f.lower().endswith(".pdf")
    )

    for latest_file in sorted(os.listdir(LATEST_FOLDER)):
        if not latest_file.lower().endswith(".pdf"):
            continue

        base_name = os.path.splitext(latest_file)[0].replace("_add", "")
        latest_path = os.path.join(LATEST_FOLDER, latest_file)

        # Find team-annotated files for this document
        team_paths = [
            os.path.join(TEAM_COMMENTS_FOLDER, f) for f in team_pdfs if base_name in f
        ]

        if not team_paths:
            print(f"🆕 New file from client: {latest_file} — no team comments found. Copying to OUTPUT_FOLDER.")
            untouched_output = os.path.join(OUTPUT_FOLDER, latest_file)
            with fitz.open(latest_path) as doc:
                doc.save(untouched_output, deflate=True)
            new_files.append(latest_file)
            continue

        # Compare contents of first commented file to latest version
        base_combined_path = team_paths[0]
        combined_texts = extract_text_per_page(base_combined_path)
        latest_texts = extract_text_per_page(latest_path)

        if is_text_changed(combined_texts, latest_texts):
            print(f"🔄 Detected text change for: {base_name} — re-merging {len(team_paths)} team files.")
            modified_output_path = os.path.join(OUTPUT_FOLDER, base_name + "_modified.pdf")
            merge_text_changes(latest_path, team_paths, modified_output_path)
            changed_files.append(latest_file)
        else:
            print(f"♻️ No text change for: {base_name} — merging {len(team_paths)} team files as-is.")
            output_path = os.path.join(OUTPUT_FOLDER, base_name + "_Combined.pdf")
            merge_comments_by_users(latest_path, team_paths, output_path)
            unchanged_files.append(latest_file)

    # --- Summary ---
    print("\n📊 Summary of LATEST_FOLDER Additions")
    if changed_files:
        print("\n🟢 Files with Text Changes (Re-merged):")
        for f in changed_files:
            print(f"   - {f}")
    if unchanged_files:
        print("\n🔵 Files with No Text Change (Comments Merged):")
        for f in unchanged_files:
            print(f"   - {f}")
    if new_files:
        print("\n🆕 New Files from Client (No Comments):")
        for f in new_files:
            print(f"   - {f}")
    print("\n✅ Phase 2 complete: All LATEST_FOLDER files processed.\n")


# --- MAIN ---

if __name__ == "__main__":
    # Phase 1: merge team comments into the documents of BASE_FOLDER.
    run_batch_merge()
    # Phase 2: process the documents of LATEST_FOLDER.
    run_batch_add()


📂 Starting Initial Merge from BASE_FOLDER and TEAM_COMMENTS_FOLDER...

✅ File commented by team: EVM-PO-181.pdf — merging 3 team files.
✅ Combined highlights with purple (40% opacity) saved: Combined_Annotated\EVM-PO-181_Combined.pdf
⚠️ File untouched by team: LOG-PR-295.pdf — copying to OUTPUT_FOLDER.
✅ File commented by team: LOG-PR-939.pdf — merging 3 team files.
✅ Combined highlights with purple (40% opacity) saved: Combined_Annotated\LOG-PR-939_Combined.pdf

📊 Summary of BASE_FOLDER Merge

🟢 Commented and Merged Files:
   - EVM-PO-181.pdf
   - LOG-PR-939.pdf

🟡 Untouched Files Copied As-Is:
   - LOG-PR-295.pdf

✅ Phase 1 complete: All BASE_FOLDER files processed.


📂 Starting Weekly Additions from LATEST_FOLDER...

🆕 New file from client: LOG-PR-295 User 1.pdf — no team comments found. Copying to OUTPUT_FOLDER.
🆕 New file from client: LOG-PR-295 User 2.pdf — no team comments found. Copying to OUTPUT_FOLDER.
🆕 New file from client: LOG-PR-295 User 3.pdf — no team comments found. C

In [None]:
def report_removed_files():
    """Print the team-reviewed PDFs that have no counterpart in `LATEST_FOLDER`.

    Base names of the latest PDFs are their file stems with "_add" removed.
    A team PDF counts as matched when, after removing "_Combined" and
    "_modified" from its stem, at least one latest base name occurs in it as
    a substring. Unmatched team files are printed in sorted order.
    """
    print("\n📁 Checking for removed files (TEAM_COMMENTS_FOLDER ➡️ not matched in LATEST_FOLDER)...\n")

    latest_basenames = set()
    for latest_name in os.listdir(LATEST_FOLDER):
        if latest_name.endswith(".pdf"):
            stem = os.path.splitext(latest_name)[0]
            latest_basenames.add(stem.replace("_add", ""))

    removed_files = []
    for team_file in os.listdir(TEAM_COMMENTS_FOLDER):
        if not team_file.endswith(".pdf"):
            continue
        team_base = os.path.splitext(team_file)[0]
        for marker in ("_Combined", "_modified"):
            team_base = team_base.replace(marker, "")
        # Removed == none of the latest base names occurs in the team name.
        if all(latest_base not in team_base for latest_base in latest_basenames):
            removed_files.append(team_file)

    if not removed_files:
        print("✅ All TEAM_COMMENTS_FOLDER files have corresponding matches in LATEST_FOLDER.")
    else:
        print("🛑 The following team-reviewed files have no matching file in LATEST_FOLDER:")
        for name in sorted(removed_files):
            print(f"   - {name}")

    print()


def run_batch_add():
    print("\n📂 Starting Weekly Additions from LATEST_FOLDER...\n")

    report_removed_files()  # Add this line to report missing files