In [None]:
#@title CSV to markdown

import pandas as pd # Still useful for initial check, though reading is manual
import csv
import re
import os
import difflib # For finding sequence similarities
import html    # For escaping text before wrapping in HTML tags

# --- Configuration ---
# Path of the CSV file to convert. The Markdown output is written to the same
# directory, using the same base name with an `.md` extension.
input_file = '/content/Qu’apprenons-nous de nos affects _ (AGREG ext. 2020 - note _ 15).csv'

# Overlap Detection Configuration
# A common block shared by two consecutive chunks is only treated as an overlap
# when it contains at least this many characters.
MIN_OVERLAP_LENGTH = 15  # Minimum number of characters for overlap
# The common block must start at an index strictly below this value in the
# *next* chunk (and must end exactly at the end of the previous chunk).
MATCH_START_THRESHOLD = 5 # How close to the start of the *next* chunk the match must begin

# --- Helper Functions ---

# Placeholder for the *text* -> <u><em>text</em></u> conversion.
def format_special_text(text):
    """Do nothing and return ``None``.

    This function is an intentionally empty stub: the ``*text*`` to
    ``<u><em>text</em></u>`` conversion is not performed here but later in the
    script, once the chunks have been escaped and assembled.

    Args:
        text: Ignored.

    Returns:
        Always ``None``.
    """
    return None

# --- Main Logic ---

def read_csv_rows(path):
    """Read every row of the CSV file at *path*.

    The file is decoded as UTF-8 first (the content is accented French text);
    if that fails, the platform default encoding is tried. ``newline=''`` is
    passed as recommended by the ``csv`` module so that newlines embedded in
    quoted fields are handled by the parser itself.

    Args:
        path: Path of the CSV file.

    Returns:
        A list of rows (each a list of strings), or an empty list when the CSV
        could not be parsed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file cannot be decoded with the platform
            default encoding either.
    """
    for encoding in ('utf-8', None):
        try:
            with open(path, 'r', encoding=encoding, newline='') as f:
                return list(csv.reader(f))
        except UnicodeDecodeError:
            if encoding is None:
                raise
            print("⚠️ Warning: UTF-8 decoding failed. Trying default system encoding.")
        except csv.Error as e:
            print(f"🛑 Error reading CSV file {path}: {e}")
            return []
    return []


def find_boundary_overlap(prev_text, curr_text, min_overlap, start_threshold):
    """Return the boundary overlap between two consecutive chunks, if any.

    The longest common block of the two texts counts as a boundary overlap
    when it is at least *min_overlap* characters long, ends exactly at the end
    of *prev_text*, and starts before index *start_threshold* in *curr_text*.

    Returns:
        The ``difflib.Match`` (fields ``a``, ``b``, ``size``) or ``None``.
    """
    matcher = difflib.SequenceMatcher(None, prev_text, curr_text, autojunk=False)
    match = matcher.find_longest_match(0, len(prev_text), 0, len(curr_text))
    is_boundary_match = (
        match.size >= min_overlap
        and match.a + match.size == len(prev_text)
        and match.b < start_threshold
    )
    return match if is_boundary_match else None


def render_with_highlights(text, spans):
    """HTML-escape *text* and wrap the given ``(start, end)`` spans in red.

    Spans may be given in any order and may touch or overlap; they are merged
    first so the produced markup never nests or duplicates text. Each piece is
    escaped separately so the ``<span>`` tags themselves stay intact.
    """
    merged = []
    for start, end in sorted(spans):
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(html.escape(text[cursor:start]))
        pieces.append(f'<span style="color:red">{html.escape(text[start:end])}</span>')
        cursor = end
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


def highlight_boundary_overlaps(chunks, min_overlap, start_threshold):
    """Escape every chunk and highlight text repeated across chunk boundaries.

    For each pair of consecutive chunks, a boundary overlap (see
    ``find_boundary_overlap``) is wrapped in a red ``<span>`` at the end of the
    first chunk and at the start of the second one.

    All spans of a chunk are collected before it is rendered, so a chunk that
    overlaps both its predecessor and its successor keeps both highlights, and
    the few characters that may precede the overlap in the second chunk are
    preserved.

    Args:
        chunks: Raw (unescaped) text chunks, in document order.
        min_overlap: Minimum overlap length in characters.
        start_threshold: The overlap must start before this index in the
            second chunk.

    Returns:
        A list with one HTML string per input chunk.
    """
    spans_per_chunk = [[] for _ in chunks]
    for i, (prev_text, curr_text) in enumerate(zip(chunks, chunks[1:]), start=1):
        match = find_boundary_overlap(prev_text, curr_text, min_overlap, start_threshold)
        if match is None:
            continue
        overlap_text = prev_text[match.a:]
        print(f"  Found overlap (size {match.size}) between chunk {i-1} and {i}: '...{overlap_text[-30:]}'")
        spans_per_chunk[i - 1].append((match.a, len(prev_text)))
        spans_per_chunk[i].append((match.b, match.b + match.size))
    return [
        render_with_highlights(text, spans)
        for text, spans in zip(chunks, spans_per_chunk)
    ]


def apply_final_formatting(text):
    """Convert ``*text*`` markers to ``<u><em>text</em></u>`` outside HTML tags.

    The text is split on ``<...>`` tags; tags are copied through unchanged and
    the substitution is applied only to the text between them, so an asterisk
    inside a tag is never touched. The pattern is deliberately simple: it is
    non-greedy, does not span line breaks, and does not match across a tag.
    """
    result = []
    for part in re.split(r'(<[^>]*>)', text):
        if part.startswith('<') and part.endswith('>'):
            result.append(part)  # Keep tags as is
        else:
            result.append(re.sub(r'\*(.*?)\*', r'<u><em>\1</em></u>', part))
    return "".join(result)


if __name__ == "__main__":
    # 1. Check if Input CSV exists
    if not os.path.exists(input_file):
        print(f"🛑 Error: Input CSV file not found at '{input_file}'")
    else:
        # 2. Determine Output MD Path (same directory and base name as the input)
        base_filename = os.path.splitext(os.path.basename(input_file))[0]
        output_dir = os.path.dirname(input_file)
        output_file = os.path.join(output_dir, f'{base_filename}.md')

        print(f"Input CSV: '{input_file}'")
        print(f"Output MD: '{output_file}'")

        try:
            # 3. Read CSV data
            rows = read_csv_rows(input_file)

            if not rows:
                print(f"🛑 Error: CSV file '{input_file}' is empty or could not be read.")
            else:
                # Only the first column is used; empty rows become empty strings.
                first_column = [row[0] if row else '' for row in rows]

                # 4. Extract title (first row) and content (remaining rows)
                title = first_column[0]
                content_chunks = first_column[1:]
                print(f"Read title: '{title}'")
                print(f"Read {len(content_chunks)} content chunks.")

                if not content_chunks:
                    # Not fatal: a document containing only the title is written.
                    print("⚠️ Warning: No content chunks found after the title row.")

                # 5. Escape the chunks and highlight boundary overlaps
                formatted_final_parts = highlight_boundary_overlaps(
                    content_chunks, MIN_OVERLAP_LENGTH, MATCH_START_THRESHOLD
                )

                # 6. Apply the *text* -> <u><em>text</em></u> formatting to each part
                fully_formatted_parts = [
                    apply_final_formatting(part) for part in formatted_final_parts
                ]
                joined_content = " ".join(fully_formatted_parts)

                # The title is escaped first, then formatted like the content.
                formatted_title = apply_final_formatting(html.escape(title))

                # 7. Create final markdown content
                markdown_content = f"# {formatted_title}\n\n{joined_content}"

                # 8. Write to Markdown file ('w' overwrites an existing file).
                # UTF-8 is forced so accented characters survive on any platform.
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)

                print(f"✅ Markdown file created successfully at '{output_file}'")

        except FileNotFoundError:
            print(f"🛑 Error: Input CSV file not found at '{input_file}'.")
        except Exception as e:
            # Top-level boundary of the script: report and show the traceback.
            print(f"🛑 An unexpected error occurred: {e}")
            import traceback
            traceback.print_exc() # Print full traceback for debugging

Input CSV: '/content/Qu’apprenons-nous de nos affects _ (AGREG ext. 2020 - note _ 15).csv'
Output MD: '/content/Qu’apprenons-nous de nos affects _ (AGREG ext. 2020 - note _ 15).md'
Read title: 'folder'
Read 62 content chunks.
  Found overlap (size 38) between chunk 0 and 1: '...e relative aux objets étudiés.'
  Found overlap (size 59) between chunk 1 and 2: '... les mêmes à être affectés, et'
  Found overlap (size 57) between chunk 3 and 4: '...rement impliquer une affection'
  Found overlap (size 59) between chunk 4 and 5: '...de l'ordre de l'apprentissage,'
  Found overlap (size 57) between chunk 6 and 7: '...s que les objets qui en sont à'
  Found overlap (size 63) between chunk 7 and 8: '...aissance pratique et empirique'
  Found overlap (size 69) between chunk 10 and 11: '...nes idées viendraient frapper,'
  Found overlap (size 63) between chunk 11 and 12: '...premier moyen d'-apprentissage'
  Found overlap (size 65) between chunk 12 and 13: '...ais l'intériorisation de cette'
  F

In [None]:
import pandas as pd # Still useful for initial check, though reading is manual
import csv
import re
import os
import difflib # For finding sequence similarities
import html    # For escaping text before wrapping in HTML tags
import math    # For potential calculations if needed

# --- Configuration ---
# Path of the CSV file to convert. The output is written to the same directory
# as `<base name>_processed.md`.
input_file = '/content/drive/MyDrive/CSV_2/« Être soi-même » cela a t-il un sens _ (CAPES 2023 - note _ 14,5)/« Être soi-même » cela a t-il un sens _ (CAPES 2023 - note _ 14,5).csv'
# --- Overlap Detection Configuration ---

# Threshold 1: Exact Word Overlap for DELETION
# If the text repeated across a chunk boundary contains at least this many
# whitespace-separated words, the repetition is removed from the output
# instead of being highlighted.
EXACT_WORD_OVERLAP_THRESHOLD = 8  # Minimum number of *identical words* to trigger deletion

# Threshold 2: General Character Overlap for HIGHLIGHTING
# If a boundary overlap has at least this many characters but does *not* reach
# the word threshold above, it is wrapped in a red <span> instead.
# Lowered from 15 to 12 to catch shorter repeats.
MIN_CHAR_OVERLAP_LENGTH = 12 # Minimum number of *characters* for highlighting (was 15)

# Boundary Condition Configuration
# A match only counts as a boundary match when it starts at an index strictly
# below this value in the *next* chunk (and ends at the end of the previous one).
MATCH_START_THRESHOLD = 5

# --- Helper Functions ---

# Function to apply *text* -> <u><em>text</em></u> formatting safely
# Applied at the very end to the joined content parts.
def apply_final_formatting(text):
    """Turn ``*text*`` markers into ``<u><em>text</em></u>`` outside HTML tags.

    The input is cut into alternating text and ``<...>`` tag segments. Tag
    segments are returned untouched; every other segment has its ``*...*``
    pairs (non-greedy, single line) replaced by underlined-emphasis markup.
    The approach is intentionally basic and does not handle a pair of
    asterisks that straddles a tag.

    Args:
        text: HTML-escaped text, possibly containing tags.

    Returns:
        The text with asterisk pairs converted.
    """
    emphasis = re.compile(r'\*(.*?)\*')

    def convert(segment):
        # A segment that is itself a tag must pass through unchanged.
        is_tag = segment.startswith('<') and segment.endswith('>')
        if is_tag:
            return segment
        return emphasis.sub(r'<u><em>\1</em></u>', segment)

    segments = re.split(r'(<[^>]*>)', text)
    return "".join(map(convert, segments))

# Helper to count words (simple split by whitespace)
def count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    tokens = text.split()
    return len(tokens)

# --- Main Logic ---

def load_csv_rows(path):
    """Read every row of the CSV file at *path*.

    UTF-8 is tried first; if decoding fails, the platform default encoding is
    used instead. ``newline=''`` is passed as recommended by the ``csv`` module
    so that newlines embedded in quoted fields are handled by the parser.

    Args:
        path: Path of the CSV file.

    Returns:
        A list of rows (each a list of strings); an empty list when the file
        could not be decoded or parsed (a message is printed in that case).

    Raises:
        OSError: If the file cannot be opened for the UTF-8 attempt.
    """
    try:
        with open(path, 'r', encoding='utf-8', newline='') as f:
            return list(csv.reader(f))
    except UnicodeDecodeError:
        print("⚠️ Warning: UTF-8 decoding failed. Trying default system encoding.")
        try:
            with open(path, 'r', newline='') as f:
                return list(csv.reader(f))
        except (OSError, UnicodeError, csv.Error) as e_read:
            print(f"🛑 Error reading CSV file {path} even with default encoding: {e_read}")
    except csv.Error as e_csv:
        print(f"🛑 Error parsing CSV file {path}: {e_csv}")
    return []


def find_boundary_match(prev_text, curr_text, start_threshold):
    """Return the longest common block if it sits on the chunk boundary.

    The block must be non-empty, end exactly at the end of *prev_text* and
    start before index *start_threshold* in *curr_text*.

    Returns:
        The ``difflib.Match`` (fields ``a``, ``b``, ``size``) or ``None``.
    """
    matcher = difflib.SequenceMatcher(None, prev_text, curr_text, autojunk=False)
    match = matcher.find_longest_match(0, len(prev_text), 0, len(curr_text))
    is_boundary_match = (
        match.size > 0
        and match.a + match.size == len(prev_text)
        and match.b < start_threshold
    )
    return match if is_boundary_match else None


def _red(text):
    """Wrap *text* (escaped here) in the red highlighting span."""
    return f'<span style="color:red">{html.escape(text)}</span>'


def render_chunk(text, head, tail_start):
    """Render one raw chunk as HTML-escaped text with its overlap decisions.

    Args:
        text: Raw chunk text.
        head: ``None`` or ``(start, end, delete)`` describing the region at the
            start of the chunk that repeats the end of the previous chunk. It
            is dropped when *delete* is true, highlighted otherwise.
        tail_start: ``None`` or the index from which the end of the chunk is
            highlighted because it is repeated by the next chunk.

    Returns:
        The rendered HTML string (possibly empty).
    """
    pieces = []
    cursor = 0
    if head is not None:
        start, end, delete = head
        # Characters that precede the overlap are always kept.
        pieces.append(html.escape(text[:start]))
        if not delete:
            pieces.append(_red(text[start:end]))
        cursor = end
    if tail_start is not None:
        # Never re-emit text already consumed by the head region.
        tail_start = max(tail_start, cursor)
        pieces.append(html.escape(text[cursor:tail_start]))
        if text[tail_start:]:
            pieces.append(_red(text[tail_start:]))
    else:
        pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


def resolve_boundary_overlaps(chunks, word_threshold, char_threshold,
                              start_threshold, word_counter):
    """Escape the chunks and remove or highlight text repeated at boundaries.

    Each pair of consecutive raw chunks is compared. When a boundary match is
    found (see ``find_boundary_match``):

    * if it has at least *word_threshold* words, the repetition is deleted
      from the start of the *second* chunk only, so the text is kept exactly
      once (at the end of the first chunk);
    * otherwise, if it has at least *char_threshold* characters, it is
      highlighted in red in both chunks;
    * otherwise nothing is changed.

    Decisions are recorded per chunk and each chunk is rendered once at the
    end, so a decision taken for a chunk's start is not undone when the same
    chunk is later compared with its successor.

    Args:
        chunks: Raw (unescaped) chunks in document order.
        word_threshold: Minimum word count that triggers deletion.
        char_threshold: Minimum character count that triggers highlighting.
        start_threshold: The match must start before this index in the
            second chunk.
        word_counter: Callable returning the number of words of a string.

    Returns:
        A list with one HTML string per input chunk (strings may be empty).
    """
    heads = [None] * len(chunks)
    tails = [None] * len(chunks)

    for i, (prev_text, curr_text) in enumerate(zip(chunks, chunks[1:]), start=1):
        if not prev_text or not curr_text:
            print(f"  Skipping comparison between chunk {i-1} and {i} due to empty chunk.")
            continue

        match = find_boundary_match(prev_text, curr_text, start_threshold)
        if match is None:
            print(f"  No significant boundary overlap found between chunk {i-1} and {i}. Adding current chunk normally.")
            continue

        overlap_size = match.size
        # find_longest_match only returns identical blocks, so the overlap
        # text is the same in both chunks by construction.
        word_count = word_counter(prev_text[match.a:])

        if word_count >= word_threshold:
            print(f"  ✅ Found EXACT overlap ({word_count} words, {overlap_size} chars) between chunk {i-1} and {i}. DELETING repetition.")
            heads[i] = (match.b, match.b + overlap_size, True)
        elif overlap_size >= char_threshold:
            print(f"  ⚠️ Found boundary overlap ({overlap_size} chars) between chunk {i-1} and {i}. HIGHLIGHTING.")
            tails[i - 1] = match.a
            heads[i] = (match.b, match.b + overlap_size, False)
        else:
            print(f"  ℹ️ Found boundary overlap ({overlap_size} chars) below thresholds. Adding current chunk normally.")

    return [
        render_chunk(text, head, tail_start)
        for text, head, tail_start in zip(chunks, heads, tails)
    ]


if __name__ == "__main__":
    # 1. Check if Input CSV exists
    if not os.path.exists(input_file):
        print(f"🛑 Error: Input CSV file not found at '{input_file}'")
    else:
        # 2. Determine Output MD Path (same directory, `_processed` suffix)
        base_filename = os.path.splitext(os.path.basename(input_file))[0]
        output_dir = os.path.dirname(input_file)
        output_file = os.path.join(output_dir, f'{base_filename}_processed.md') # Added suffix

        print(f"--- Configuration ---")
        print(f"Input CSV:                  '{input_file}'")
        print(f"Output MD:                  '{output_file}'")
        print(f"Exact Word Deletion Threshold: {EXACT_WORD_OVERLAP_THRESHOLD} words")
        print(f"Highlighting Char Threshold:   {MIN_CHAR_OVERLAP_LENGTH} chars")
        print(f"Match Start Threshold:       {MATCH_START_THRESHOLD} chars")
        print(f"--- Processing ---")

        try:
            # 3. Read CSV data
            rows = load_csv_rows(input_file)

            if not rows:
                print(f"🛑 Error: CSV file '{input_file}' is empty or could not be read properly.")
            else:
                # Only the first column is used; empty rows become empty strings.
                first_column = [row[0].strip() if row else '' for row in rows]

                # 4. Extract title (first row) and content (remaining rows)
                title = first_column[0]
                content_chunks = first_column[1:]
                print(f"Read title: '{title}'")
                print(f"Read {len(content_chunks)} content chunks.")

                if not content_chunks:
                    print("⚠️ Warning: No content chunks found after the title row.")

                # 5. Process chunks for overlaps (deletion or highlighting)
                processed_content_parts = resolve_boundary_overlaps(
                    content_chunks,
                    EXACT_WORD_OVERLAP_THRESHOLD,
                    MIN_CHAR_OVERLAP_LENGTH,
                    MATCH_START_THRESHOLD,
                    count_words,
                )

                # 6. Join the parts, skipping any left empty by a deletion
                final_joined_parts = [part for part in processed_content_parts if part]
                joined_content_intermediate = " ".join(final_joined_parts)

                # 7. Apply the *text* -> <u><em>text</em></u> formatting
                joined_content_final = apply_final_formatting(joined_content_intermediate)

                # The title is escaped first, then formatted like the content.
                formatted_title = apply_final_formatting(html.escape(title))

                markdown_content = f"# {formatted_title}\n\n{joined_content_final}"

                # 8. Write to Markdown file
                try:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(markdown_content)
                except IOError as e_write:
                    print(f"🛑 Error writing output file '{output_file}': {e_write}")
                else:
                    print(f"✅ Markdown file created successfully at '{output_file}'")

        except FileNotFoundError:
            print(f"🛑 Error: Input CSV file not found at '{input_file}'. Please check the path.")
        except Exception as e:
            # Top-level boundary of the script: report and show the traceback.
            print(f"🛑 An unexpected error occurred during processing: {e}")
            import traceback
            traceback.print_exc() # Print full traceback for debugging

--- Configuration ---
Input CSV:                  '/content/drive/MyDrive/CSV_2/« Être soi-même » cela a t-il un sens _ (CAPES 2023 - note _ 14,5)/« Être soi-même » cela a t-il un sens _ (CAPES 2023 - note _ 14,5).csv'
Output MD:                  '/content/drive/MyDrive/CSV_2/« Être soi-même » cela a t-il un sens _ (CAPES 2023 - note _ 14,5)/« Être soi-même » cela a t-il un sens _ (CAPES 2023 - note _ 14,5)_processed.md'
Exact Word Deletion Threshold: 8 words
Highlighting Char Threshold:   12 chars
Match Start Threshold:       5 chars
--- Processing ---
Read title: '« Être soi-même » cela a t-il un sens ? (CAPES 2023 - note : 14,5)'
Read 55 content chunks.
  No significant boundary overlap found between chunk 0 and 1. Adding current chunk normally.
  ⚠️ Found boundary overlap (42 chars) between chunk 1 and 2. HIGHLIGHTING.
  No significant boundary overlap found between chunk 2 and 3. Adding current chunk normally.
  ⚠️ Found boundary overlap (53 chars) between chunk 3 and 4.