### Ce code Python permet d'extraire le contenu pour une seule notion

In [None]:
#@title SQL to DOCX

import re
import os
import sqlparse
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

# --- Configuration ---
# Input SQL dump and the single DOCX file generated from it.
SQL_FILE_PATH = "/content/TEXT.sql"
SINGLE_DOCX_OUTPUT_PATH = "/content/philosophy_texts_by_notion_combination.docx"

# 0-based positions, inside each SQL VALUES tuple, of the columns to extract
# (Title, Intro, ..., PrimaryNotion, SecondaryNotion).
SQL_COLUMN_INDICES_TO_EXTRACT = [1, 2, 3, 4, 5, 6, 7, 8]

# Positions inside the *extracted* row, i.e. indices into the list that
# mirrors SQL_COLUMN_INDICES_TO_EXTRACT (not positions in the SQL tuple).
COL_IDX_TITLE = 0
COL_IDX_INTRO = 1
COL_IDX_BODY = 4
COL_IDX_SOURCE = 5
COL_IDX_NOTION_PRIMARY = 6      # maps to 0-based SQL index 7
COL_IDX_NOTION_SECONDARY = 7    # maps to 0-based SQL index 8

# Master list of recognised notions, kept sorted so processing order is stable.
# "état" is part of the literal, so no membership guard is needed afterwards.
PHILOSOPHICAL_NOTIONS_LIST = sorted([
    "art", "état", "conscience", "justice", "liberté", "nature", "raison",
    "religion", "science", "technique", "vérité", "bonheur", "devoir",
    "langage", "temps", "travail", "inconscient",
])

# Heading used for texts that mention "état" and no other notion.
ETAT_SEUL_CATEGORY_TITLE = "Textes convoquant seulement la notion “État”"
# --- End Configuration ---

# Global counters updated by main() and printed in its summary.
insert_statements_found_g = 0
rows_processed_count_g = 0
texts_added_to_docx_g = 0  # Counts each time a text is appended to any category


def repair_mojibake(text):
    """Try to undo mojibake by re-encoding as Latin-1 and decoding as UTF-8.

    Falsy input (``None``, ``""``) is returned as is. When the round trip is
    not possible (a character does not fit in Latin-1, or the bytes are not
    valid UTF-8) the original text is returned unchanged.
    """
    if not text:
        return text
    try:
        latin1_bytes = text.encode('latin-1')
        repaired = latin1_bytes.decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The text was not mojibake of this kind; leave it alone.
        return text
    return repaired

# One token per alternative: a backslash escape, a doubled single quote, or a
# doubled double quote. DOTALL lets "\<newline>" count as a backslash escape.
_SQL_ESCAPE_TOKEN_RE = re.compile(r"\\(.)|''|\"\"", re.DOTALL)

# Backslash escapes that are decoded; any other "\x" pair is left untouched.
_SQL_BACKSLASH_ESCAPES = {
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "\\": "\\",
    "'": "'",
    '"': '"',
}


def unescape_sql_string(s):
    """Decode the escape sequences of a SQL string literal body.

    Handles doubled quotes (``''`` -> ``'``, ``""`` -> ``"``) and the
    backslash escapes ``\\\\``, ``\\'``, ``\\"``, ``\\n``, ``\\r`` and ``\\t``.

    The string is decoded in a single left-to-right pass so the output of one
    escape is never re-interpreted as the start of another. With chained
    ``str.replace`` calls an escaped backslash followed by ``n`` (``\\\\n``)
    was first reduced to ``\\n`` and then wrongly turned into a newline.

    Args:
        s: The literal body, without its surrounding quotes.

    Returns:
        The decoded string. Unknown backslash sequences are kept verbatim.
    """
    def _decode(match):
        token = match.group(0)
        if token == "''":
            return "'"
        if token == '""':
            return '"'
        return _SQL_BACKSLASH_ESCAPES.get(match.group(1), token)

    return _SQL_ESCAPE_TOKEN_RE.sub(_decode, s)

def extract_sql_row_data(p_token, list_of_sql_indices_to_extract):
    """Pull the requested column values out of one parsed SQL value tuple.

    Args:
        p_token: Token for one row; its ``tokens`` attribute (if any) is read.
        list_of_sql_indices_to_extract: Positions of the values to return.

    Returns:
        A list with one cleaned string per requested index (quotes removed and
        escapes decoded for quoted values, ``''`` for ``NULL``, raw text
        otherwise), or ``None`` when the row has too few values.
    """
    def _is_significant(tok):
        # Whitespace and punctuation (commas, parentheses) carry no value.
        return not tok.is_whitespace and tok.ttype != sqlparse.tokens.Punctuation

    if hasattr(p_token, 'tokens'):
        top_level = [tok for tok in p_token.tokens if _is_significant(tok)]
    else:
        top_level = []

    if len(top_level) != 1:
        value_tokens = top_level
    else:
        # A single child means the values are still grouped in one token:
        # re-parse its text to get at the individual items.
        value_tokens = []
        reparsed = sqlparse.parse(str(top_level[0]))
        if reparsed:
            inner_statement = reparsed[0]
            inner_tokens = inner_statement.tokens
            if inner_tokens and isinstance(inner_tokens[0], sqlparse.sql.IdentifierList):
                value_tokens = list(inner_tokens[0].get_identifiers())
            else:
                value_tokens = [tok for tok in inner_tokens if _is_significant(tok)]

    wanted = list_of_sql_indices_to_extract
    highest_wanted = max(wanted) if wanted else -1
    if len(value_tokens) <= highest_wanted:
        return None

    row = []
    for position in wanted:
        if position >= len(value_tokens):
            return None
        raw = str(value_tokens[position]).strip()
        single_quoted = raw.startswith("'") and raw.endswith("'")
        double_quoted = raw.startswith('"') and raw.endswith('"')
        if single_quoted or double_quoted:
            value = unescape_sql_string(raw[1:-1])
        elif raw.upper() == 'NULL':
            value = ''
        else:
            value = raw
        row.append(value)
    return row

def capitalize_first_letter(text_string):
    """Return *text_string* with its first character upper-cased.

    Empty or ``None`` input yields ``""``; the rest of the string is untouched.
    """
    if text_string:
        head, tail = text_string[0], text_string[1:]
        return head.upper() + tail
    return ""

def capitalize_all_sentences(paragraph_text):
    """Upper-case the first letter of the paragraph and of every sentence.

    A sentence start is any letter that follows ``.``, ``!`` or ``?`` plus at
    least one whitespace character. The letter class is Unicode-aware
    (``[^\\W\\d_]``) so accented lowercase letters such as "é" or "à", common
    in French text, are capitalized too; ``[a-z]`` silently skipped them.

    Args:
        paragraph_text: The paragraph to process; may be empty or ``None``.

    Returns:
        The capitalized paragraph, or ``""`` for empty/``None`` input.
    """
    if not paragraph_text:
        return ""

    # First character of the paragraph (same rule as capitalize_first_letter).
    paragraph_text = paragraph_text[:1].upper() + paragraph_text[1:]

    def cap_match(match):
        return match.group(1) + match.group(2).upper()

    # [^\W\d_] == "word character that is neither a digit nor underscore",
    # i.e. any letter; upper() is a no-op on letters already in upper case.
    return re.sub(r'([.!?]\s+)([^\W\d_])', cap_match, paragraph_text)

def parse_notions_from_cell(cell_content_str, valid_notions_list):
    """Return the set of known notions mentioned in a cell.

    The cell is lower-cased, stripped and split on commas, semicolons and
    whitespace; only pieces present in *valid_notions_list* are kept. An empty
    or ``None`` cell gives an empty set.
    """
    if not cell_content_str:
        return set()

    pieces = re.split(r'[,;\s]+', cell_content_str.lower().strip())
    stripped_pieces = (piece.strip() for piece in pieces)
    return {piece for piece in stripped_pieces if piece in valid_notions_list}

# MODIFIED FUNCTION
def add_document_heading(doc, text_content, level=2):
    """Insert a centered Times New Roman heading followed by a blank paragraph.

    Args:
        doc: Document object; its ``add_heading`` and ``add_paragraph`` are used.
        text_content: Heading text.
        level: Heading level passed through to ``doc.add_heading``.

    Returns:
        The heading paragraph returned by ``doc.add_heading``.
    """
    heading = doc.add_heading(text_content, level=level)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Only the font family is overridden on each run; nothing else about the
    # run's formatting is touched here.
    for heading_run in heading.runs:
        heading_run.font.name = 'Times New Roman'

    # Empty paragraph acting as a visual spacer below the heading.
    doc.add_paragraph()
    return heading
# END OF MODIFIED FUNCTION

# Matches one <b>…</b>, <i>…</i> or <u>…</u> span (case-insensitive, may span newlines).
_INLINE_TAG_RE = re.compile(r'<(b|i|u)>(.*?)</\1>', re.IGNORECASE | re.DOTALL)


def _add_inline_formatted_runs(paragraph, text, font_name, font_size, active_tags=frozenset()):
    """Append *text* to *paragraph* as runs, turning <b>/<i>/<u> markup into run formatting.

    Tag content is processed recursively so that nested markup such as
    ``<b><i>x</i></b>`` yields a bold+italic run instead of literal tags.
    """
    position = 0
    while position < len(text):
        match = _INLINE_TAG_RE.search(text, position)
        plain_end = match.start() if match else len(text)
        plain_text = text[position:plain_end]
        if plain_text:
            run = paragraph.add_run(plain_text)
            run.font.name = font_name
            run.font.size = font_size
            if 'b' in active_tags:
                run.font.bold = True
            if 'i' in active_tags:
                run.font.italic = True
            if 'u' in active_tags:
                run.font.underline = True
        if match is None:
            break
        _add_inline_formatted_runs(
            paragraph, match.group(2), font_name, font_size,
            active_tags | {match.group(1).lower()},
        )
        position = match.end()


def append_philosophy_text_to_doc(doc, title_text, intro_text, body_text_raw, source_text_raw, add_page_break_after):
    """Append one text (title, intro, body, source) to the document.

    Layout: bold 14 pt title, blank line, optional italic justified intro,
    justified body (one paragraph per newline or ``<br>``), right-aligned
    source. ``<b>``, ``<i>`` and ``<u>`` markup in body and source becomes run
    formatting through a single shared helper.

    Args:
        doc: Document object the paragraphs are added to.
        title_text: Title; "No Title Provided" is written when empty.
        intro_text: Introduction; every sentence is capitalized.
        body_text_raw: Body text, possibly containing inline markup.
        source_text_raw: Source/attribution line, possibly containing markup.
        add_page_break_after: When true, a page break follows the text.
    """
    default_font_name = 'Times New Roman'
    default_font_size = Pt(12)

    title_text_proc = capitalize_first_letter(str(title_text or "").strip())
    intro_text_proc = capitalize_all_sentences(str(intro_text or "").strip())
    body_text_raw = str(body_text_raw or "")
    source_text_raw = str(source_text_raw or "")
    has_body = bool(body_text_raw.strip())
    has_source = bool(source_text_raw.strip())

    # Title
    run_title = doc.add_paragraph().add_run(title_text_proc or "No Title Provided")
    run_title.font.name = default_font_name
    run_title.font.size = Pt(14)
    run_title.font.bold = True
    doc.add_paragraph()

    # Intro (or just a spacer when there is no intro but more content follows)
    if intro_text_proc:
        p_intro = doc.add_paragraph()
        run_intro = p_intro.add_run(intro_text_proc)
        run_intro.font.name = default_font_name
        run_intro.font.size = default_font_size
        run_intro.font.italic = True
        p_intro.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        doc.add_paragraph()
    elif has_body or has_source:
        doc.add_paragraph()

    # Body: every newline and every <br> starts a new justified paragraph.
    if has_body:
        for sql_para_content in body_text_raw.split('\n'):
            for line_content in re.split(r'<br\s*/?>', sql_para_content, flags=re.IGNORECASE):
                p_body = doc.add_paragraph()
                p_body.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                _add_inline_formatted_runs(p_body, line_content, default_font_name, default_font_size)

    # Source: a spacer, then one right-aligned paragraph.
    if has_source:
        doc.add_paragraph()
        p_source = doc.add_paragraph()
        p_source.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        _add_inline_formatted_runs(p_source, source_text_raw, default_font_name, default_font_size)

    if add_page_break_after:
        doc.add_page_break()

def main(target_notion="bonheur"):
    """Build one DOCX with every text that involves *target_notion*, grouped by co-notion.

    Every INSERT row of SQL_FILE_PATH is read. Rows whose primary or secondary
    notion cell contains *target_notion* are kept and filed under:

    * "Textes convoquant seulement la notion “<Target>”" when no other known
      notion is present, written first;
    * "<Target> et <Other>" for each other known notion, written afterwards in
      alphabetical order, each section starting on a new page.

    The same notion drives the filter, the exclusion from the co-notions, the
    category titles and the console messages. Mixing "bonheur" and "état"
    across those steps left the "alone" category permanently empty, put every
    text under "Bonheur et Bonheur" and dropped "état" as a co-notion.

    Args:
        target_notion: Notion to select on; must be in
            PHILOSOPHICAL_NOTIONS_LIST. Defaults to "bonheur", the notion the
            filter was hard-coded to.
    """
    global insert_statements_found_g, rows_processed_count_g, texts_added_to_docx_g
    # Reset so that re-running the cell does not accumulate previous counts.
    insert_statements_found_g = rows_processed_count_g = texts_added_to_docx_g = 0

    target_notion = target_notion.lower().strip()
    if target_notion not in PHILOSOPHICAL_NOTIONS_LIST:
        print(f"ERROR: Unknown notion '{target_notion}'. Expected one of: {', '.join(PHILOSOPHICAL_NOTIONS_LIST)}")
        return
    target_label = target_notion.capitalize()
    solo_category_title = f"Textes convoquant seulement la notion “{target_label}”"

    print(f"Starting processing of SQL file: '{SQL_FILE_PATH}'")
    if not os.path.exists(SQL_FILE_PATH):
        print(f"ERROR: SQL input file not found: '{SQL_FILE_PATH}'")
        return

    master_doc = Document()
    # Default font for the 'Normal' style; headings are styled in add_document_heading.
    style = master_doc.styles['Normal']
    style.font.name = 'Times New Roman'
    style.font.size = Pt(12)

    try:
        with open(SQL_FILE_PATH, 'r', encoding='utf-8') as f:
            sql_content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading SQL file: {e}")
        return

    parsed_statements = sqlparse.parse(sql_content)

    categorized_texts = {}  # Key: category title, Value: list of text_data dicts

    print(f"\nCategorizing texts based on notions '{target_notion}' and others...")
    for stmt in parsed_statements:
        if stmt.get_type() != 'INSERT':
            continue
        insert_statements_found_g += 1

        # Locate the parenthesised value tuples of this INSERT.
        row_tokens = []
        values_token = next((t for t in stmt.tokens if isinstance(t, sqlparse.sql.Values)), None)
        if values_token:
            row_tokens = [st for st in values_token.get_sublists() if isinstance(st, sqlparse.sql.Parenthesis)]
        else:
            # Fallback: collect the parentheses that follow the VALUES keyword.
            after_values = False
            for t in stmt.tokens:
                if after_values and isinstance(t, sqlparse.sql.Parenthesis):
                    row_tokens.append(t)
                elif not after_values and t.is_keyword and t.normalized == 'VALUES':
                    after_values = True
                elif after_values and not t.is_whitespace and str(t) != ',':
                    break

        for p_token_row in row_tokens:
            rows_processed_count_g += 1
            extracted = extract_sql_row_data(p_token_row, SQL_COLUMN_INDICES_TO_EXTRACT)
            if not (extracted and len(extracted) == len(SQL_COLUMN_INDICES_TO_EXTRACT)):
                continue

            text_data_payload = {
                "title": repair_mojibake(extracted[COL_IDX_TITLE]),
                "intro": repair_mojibake(extracted[COL_IDX_INTRO]),
                "body": repair_mojibake(extracted[COL_IDX_BODY]),
                "source": repair_mojibake(extracted[COL_IDX_SOURCE]),
                "_raw_primary_notion": extracted[COL_IDX_NOTION_PRIMARY],
                "_raw_secondary_notion": extracted[COL_IDX_NOTION_SECONDARY]
            }

            set_primary = parse_notions_from_cell(
                repair_mojibake(extracted[COL_IDX_NOTION_PRIMARY]), PHILOSOPHICAL_NOTIONS_LIST)
            set_secondary = parse_notions_from_cell(
                repair_mojibake(extracted[COL_IDX_NOTION_SECONDARY]), PHILOSOPHICAL_NOTIONS_LIST)
            row_notions = set_primary | set_secondary

            if target_notion not in row_notions:
                continue

            # Co-notions: everything except the target itself.
            other_notions = row_notions - {target_notion}
            if not other_notions:
                categorized_texts.setdefault(solo_category_title, []).append(text_data_payload)
            else:
                for other_notion in other_notions:
                    category_key = f"{target_label} et {other_notion.capitalize()}"
                    categorized_texts.setdefault(category_key, []).append(text_data_payload)

    if not categorized_texts:
        print(f"No texts related to '{target_notion}' found for DOCX generation.")
    else:
        print(f"\nFound {len(categorized_texts)} categories of texts involving '{target_notion}'. Generating DOCX...")

        # "Alone" category first (if any), then the combinations alphabetically.
        ordered_keys = sorted(key for key in categorized_texts if key != solo_category_title)
        if solo_category_title in categorized_texts:
            ordered_keys.insert(0, solo_category_title)

        for section_index, category_key in enumerate(ordered_keys):
            if section_index > 0:
                master_doc.add_page_break()  # each later section starts on a new page

            add_document_heading(master_doc, category_key, level=2)
            texts_in_cat = categorized_texts[category_key]
            last_index = len(texts_in_cat) - 1
            for i, data in enumerate(texts_in_cat):
                append_philosophy_text_to_doc(
                    master_doc, data["title"], data["intro"], data["body"], data["source"],
                    add_page_break_after=(i != last_index))
                texts_added_to_docx_g += 1

        try:
            master_doc.save(SINGLE_DOCX_OUTPUT_PATH)
            print(f"\nSuccessfully created DOCX: '{SINGLE_DOCX_OUTPUT_PATH}'")
        except Exception as e:  # top-level boundary: report and still print the summary
            print(f"ERROR saving DOCX: {e}")

    print("\n--- Summary ---")
    print(f"Total INSERT statements found: {insert_statements_found_g}")
    print(f"Total data rows processed: {rows_processed_count_g}")
    print(f"Total text append operations to DOCX: {texts_added_to_docx_g}")

# Script entry point: verify the third-party libraries, then run main().
if __name__ == '__main__':
    # Check for dependencies
    # Each failed import adds the pip package name to missing_libs.
    # NOTE(review): the same modules are imported unconditionally at the top
    # of this cell, so a missing library presumably fails there first and this
    # check may never report anything - confirm before relying on it.
    missing_libs = []
    try:
        import docx
    except ImportError:
        missing_libs.append("python-docx")
    try:
        import sqlparse
    except ImportError:
        missing_libs.append("sqlparse")

    # Print install instructions and stop with exit status 1 if anything is missing.
    if missing_libs:
        print(f"ERROR: Missing required libraries: {', '.join(missing_libs)}.")
        print(f"Please install them by running: pip install {' '.join(missing_libs)}")
        exit(1)

    main()

Starting processing of SQL file: '/content/TEXT.sql'

Categorizing texts based on notions 'état' and others...

Found 13 categories of texts involving 'état'. Generating DOCX...

Successfully created DOCX: '/content/philosophy_texts_by_notion_combination.docx'

--- Summary ---
Total INSERT statements found: 138
Total data rows processed: 1185
Total text append operations to DOCX: 100


## Ancienne version visant à extraire les textes sans les trier

In [None]:
#@title SQL to DOCX sans tri

import re
import os
import sqlparse
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

# --- Configuration ---
# Input SQL dump and the single DOCX file this cell writes.
SQL_FILE_PATH = "/content/TEXT.sql"
SINGLE_DOCX_OUTPUT_PATH = "/content/all_philosophy_texts_sentence_caps.docx"

# 0-based positions, inside each SQL VALUES tuple, of the columns to extract.
SQL_COLUMN_INDICES_TO_EXTRACT = [1, 2, 3, 4, 5, 6, 7, 8]

# Positions inside the extracted row, i.e. indices into the list that mirrors
# SQL_COLUMN_INDICES_TO_EXTRACT (not positions in the SQL tuple).
COL_IDX_TITLE = 0
COL_IDX_INTRO = 1
COL_IDX_BODY = 4
COL_IDX_SOURCE = 5
COL_IDX_NOTION = 6
COL_IDX_SECONDARY_NOTION = 7
# --- End Configuration ---

# Global counters updated by main() and printed in its summary.
insert_statements_found_g = 0
rows_processed_count_g = 0
primary_texts_added_g = 0
secondary_texts_added_g = 0
# Rows whose secondary-notion cell contains the notion main() searches for.
rows_with_state_in_col8_g = 0


def repair_mojibake(text):
    """Reverse UTF-8-decoded-as-Latin-1 mojibake when the round trip succeeds.

    Falsy values and strings that cannot be re-encoded as Latin-1 or decoded
    as UTF-8 come back unchanged.
    """
    if text:
        try:
            text = text.encode('latin-1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass  # not this kind of mojibake: keep the original text
    return text

# One token per alternative: a backslash escape, a doubled single quote, or a
# doubled double quote. DOTALL lets "\<newline>" count as a backslash escape.
_SQL_ESCAPE_TOKEN_RE = re.compile(r"\\(.)|''|\"\"", re.DOTALL)

# Backslash escapes that are decoded; any other "\x" pair is left untouched.
_SQL_BACKSLASH_ESCAPES = {
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "\\": "\\",
    "'": "'",
    '"': '"',
}


def unescape_sql_string(s):
    """Decode the escape sequences of a SQL string literal body.

    Handles doubled quotes (``''`` -> ``'``, ``""`` -> ``"``) and the
    backslash escapes ``\\\\``, ``\\'``, ``\\"``, ``\\n``, ``\\r`` and ``\\t``.

    Decoding happens in one left-to-right pass, so text produced by one
    escape is never mistaken for the start of another. Chained
    ``str.replace`` calls turned an escaped backslash followed by ``n``
    (``\\\\n``) into a newline.

    Args:
        s: The literal body, without its surrounding quotes.

    Returns:
        The decoded string. Unknown backslash sequences are kept verbatim.
    """
    def _decode(match):
        token = match.group(0)
        if token == "''":
            return "'"
        if token == '""':
            return '"'
        return _SQL_BACKSLASH_ESCAPES.get(match.group(1), token)

    return _SQL_ESCAPE_TOKEN_RE.sub(_decode, s)

def extract_sql_row_data(p_token, list_of_sql_indices_to_extract):
    """Return the cleaned values found at the requested positions of one SQL row.

    Args:
        p_token: Token for one row; its ``tokens`` attribute (if any) is read.
        list_of_sql_indices_to_extract: Positions of the values to return.

    Returns:
        One string per requested position (quoted values are unquoted and
        unescaped, ``NULL`` becomes ``''``, anything else is kept as text), or
        ``None`` if the row does not contain enough values.
    """
    def keep_values(tokens):
        # Drop whitespace and punctuation; what remains are value tokens.
        return [tok for tok in tokens
                if not tok.is_whitespace and tok.ttype != sqlparse.tokens.Punctuation]

    def clean(token):
        text = str(token).strip()
        for quote in ("'", '"'):
            if text.startswith(quote) and text.endswith(quote):
                return unescape_sql_string(text[1:-1])
        return '' if text.upper() == 'NULL' else text

    direct_children = keep_values(p_token.tokens) if hasattr(p_token, 'tokens') else []

    candidates = direct_children
    if len(direct_children) == 1:
        # Values are still bundled in a single token: re-parse its text.
        candidates = []
        reparsed = sqlparse.parse(str(direct_children[0]))
        if reparsed:
            first_statement = reparsed[0]
            leading = first_statement.tokens[0] if first_statement.tokens else None
            if isinstance(leading, sqlparse.sql.IdentifierList):
                candidates = list(leading.get_identifiers())
            else:
                candidates = keep_values(first_statement.tokens)

    indices = list_of_sql_indices_to_extract
    if indices and len(candidates) <= max(indices):
        return None

    row = []
    for index in indices:
        if not index < len(candidates):
            return None
        row.append(clean(candidates[index]))
    return row

def capitalize_first_letter(text_string):
    """Upper-case the first character of *text_string*; empty/None gives ""."""
    return text_string[0].upper() + text_string[1:] if text_string else ""

def capitalize_all_sentences(paragraph_text):
    """Capitalize the first letter of the paragraph and of each sentence.

    A sentence start is a letter that follows ``.``, ``!`` or ``?`` and at
    least one whitespace character. Letters are matched with the
    Unicode-aware class ``[^\\W\\d_]`` so accented lowercase letters ("é",
    "à", …) are capitalized as well; ``[a-z]`` left them in lower case.

    Args:
        paragraph_text: The paragraph to process; may be empty or ``None``.

    Returns:
        The capitalized paragraph, or ``""`` for empty/``None`` input.
    """
    if not paragraph_text:
        return ""

    # Very first character of the paragraph (same rule as capitalize_first_letter).
    paragraph_text = paragraph_text[:1].upper() + paragraph_text[1:]

    def cap_match(match):
        return match.group(1) + match.group(2).upper()

    # Pattern: (sentence-ending punctuation + whitespace)(any letter).
    # upper() leaves letters that are already upper case unchanged.
    return re.sub(r'([.!?]\s+)([^\W\d_])', cap_match, paragraph_text)

# Matches one <b>…</b>, <i>…</i> or <u>…</u> span (case-insensitive, may span newlines).
_INLINE_TAG_RE = re.compile(r'<(b|i|u)>(.*?)</\1>', re.IGNORECASE | re.DOTALL)


def _add_inline_formatted_runs(paragraph, text, font_name, font_size, active_tags=frozenset()):
    """Append *text* to *paragraph* as runs, turning <b>/<i>/<u> markup into run formatting.

    Tag content is processed recursively so that nested markup such as
    ``<b><i>x</i></b>`` yields a bold+italic run instead of literal tags.
    """
    position = 0
    while position < len(text):
        match = _INLINE_TAG_RE.search(text, position)
        plain_end = match.start() if match else len(text)
        plain_text = text[position:plain_end]
        if plain_text:
            run = paragraph.add_run(plain_text)
            run.font.name = font_name
            run.font.size = font_size
            if 'b' in active_tags:
                run.font.bold = True
            if 'i' in active_tags:
                run.font.italic = True
            if 'u' in active_tags:
                run.font.underline = True
        if match is None:
            break
        _add_inline_formatted_runs(
            paragraph, match.group(2), font_name, font_size,
            active_tags | {match.group(1).lower()},
        )
        position = match.end()


def append_philosophy_text_to_doc(doc, title_text, intro_text, body_text_raw, source_text_raw, add_page_break_after_this_text):
    """Append one text (title, intro, body, source) to the document.

    Layout: bold 14 pt title, blank line, optional italic justified intro,
    justified body (one paragraph per newline or ``<br>``), right-aligned
    source. ``<b>``, ``<i>`` and ``<u>`` markup in body and source becomes run
    formatting through a single shared helper.

    Args:
        doc: Document object the paragraphs are added to.
        title_text: Title; "No Title Provided" is written when empty.
        intro_text: Introduction; every sentence is capitalized.
        body_text_raw: Body text, possibly containing inline markup.
        source_text_raw: Source/attribution line, possibly containing markup.
        add_page_break_after_this_text: When true, a page break follows the text.
    """
    default_font_name = 'Times New Roman'
    default_font_size = Pt(12)

    title_text_processed = capitalize_first_letter(str(title_text or "").strip())
    intro_text_processed = capitalize_all_sentences(str(intro_text or "").strip())
    body_text_raw = str(body_text_raw or "")
    source_text_raw = str(source_text_raw or "")
    has_body = bool(body_text_raw.strip())
    has_source = bool(source_text_raw.strip())

    # 1. Headline (title)
    run_title = doc.add_paragraph().add_run(title_text_processed or "No Title Provided")
    run_title.font.name = default_font_name
    run_title.font.size = Pt(14)
    run_title.font.bold = True
    doc.add_paragraph()

    # 2. Introductory paragraph (or just a spacer when more content follows)
    if intro_text_processed:
        p_intro = doc.add_paragraph()
        run_intro = p_intro.add_run(intro_text_processed)
        run_intro.font.name = default_font_name
        run_intro.font.size = default_font_size
        run_intro.font.italic = True
        p_intro.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        doc.add_paragraph()
    elif has_body or has_source:
        doc.add_paragraph()

    # 3. Body: every newline and every <br> starts a new justified paragraph.
    if has_body:
        for sql_para_content in body_text_raw.split('\n'):
            for line_content in re.split(r'<br\s*/?>', sql_para_content, flags=re.IGNORECASE):
                p_body = doc.add_paragraph()
                p_body.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                _add_inline_formatted_runs(p_body, line_content, default_font_name, default_font_size)

    # 4. Source: a spacer, then one right-aligned paragraph.
    if has_source:
        doc.add_paragraph()
        p_source = doc.add_paragraph()
        p_source.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        _add_inline_formatted_runs(p_source, source_text_raw, default_font_name, default_font_size)

    if add_page_break_after_this_text:
        doc.add_page_break()

def main(target_notion="justice"):
    """Write every text whose notion cells mention *target_notion* to one DOCX, unsorted.

    Texts whose primary-notion cell contains the notion are written first;
    texts that contain it only in the secondary-notion cell follow under a
    "Secondary Texts" heading. Matching is a case-insensitive substring test
    on the raw cell text.

    The console messages name the notion actually searched for. They were
    hard-coded to 'state' while the filter looked for "justice", which made
    the summary misleading.

    Args:
        target_notion: Notion to search for. Defaults to "justice", the value
            the filter was hard-coded to.
    """
    global insert_statements_found_g, rows_processed_count_g
    global primary_texts_added_g, secondary_texts_added_g, rows_with_state_in_col8_g
    # Reset so that re-running the cell does not accumulate previous counts.
    insert_statements_found_g = rows_processed_count_g = 0
    primary_texts_added_g = secondary_texts_added_g = rows_with_state_in_col8_g = 0

    needle = target_notion.lower().strip()
    if not needle:
        # An empty needle would match every row ("" is a substring of anything).
        print("ERROR: target_notion must not be empty.")
        return

    print(f"Starting processing of SQL file: '{SQL_FILE_PATH}'")
    if not os.path.exists(SQL_FILE_PATH):
        print(f"ERROR: SQL input file not found: '{SQL_FILE_PATH}'")
        return

    master_doc = Document()
    style = master_doc.styles['Normal']
    style.font.name = 'Times New Roman'
    style.font.size = Pt(12)

    try:
        with open(SQL_FILE_PATH, 'r', encoding='utf-8') as f:
            sql_content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading SQL file: {e}")
        return

    parsed_statements = sqlparse.parse(sql_content)
    primary_texts_data, secondary_texts_data = [], []

    print(f"\nScanning SQL rows for '{needle}' in primary (col 7) and secondary (col 8) notions...")
    for stmt in parsed_statements:
        if stmt.get_type() != 'INSERT':
            continue
        insert_statements_found_g += 1

        # Locate the parenthesised value tuples of this INSERT.
        row_tokens = []
        values_token = next((t for t in stmt.tokens if isinstance(t, sqlparse.sql.Values)), None)
        if values_token:
            row_tokens = [st for st in values_token.get_sublists() if isinstance(st, sqlparse.sql.Parenthesis)]
        else:
            # Fallback: collect the parentheses that follow the VALUES keyword.
            after_values = False
            for t in stmt.tokens:
                if after_values and isinstance(t, sqlparse.sql.Parenthesis):
                    row_tokens.append(t)
                elif not after_values and t.is_keyword and t.normalized == 'VALUES':
                    after_values = True
                elif after_values and not t.is_whitespace and str(t) != ',':
                    break

        for p_token_row in row_tokens:
            rows_processed_count_g += 1
            extracted = extract_sql_row_data(p_token_row, SQL_COLUMN_INDICES_TO_EXTRACT)
            if not (extracted and len(extracted) == len(SQL_COLUMN_INDICES_TO_EXTRACT)):
                continue

            text_data = {
                "title": repair_mojibake(extracted[COL_IDX_TITLE]),
                "intro": repair_mojibake(extracted[COL_IDX_INTRO]),
                "body": repair_mojibake(extracted[COL_IDX_BODY]),
                "source": repair_mojibake(extracted[COL_IDX_SOURCE])
            }
            primary_notion_text = repair_mojibake(extracted[COL_IDX_NOTION]).lower()
            secondary_notion_text = repair_mojibake(extracted[COL_IDX_SECONDARY_NOTION]).lower()

            in_primary = needle in primary_notion_text
            in_secondary = needle in secondary_notion_text

            if in_secondary:
                rows_with_state_in_col8_g += 1

            # A text matching in both cells is filed as primary only.
            if in_primary:
                primary_texts_data.append(text_data)
            elif in_secondary:
                secondary_texts_data.append(text_data)

    if rows_with_state_in_col8_g > 0:
        print(f"Found {rows_with_state_in_col8_g} occurrences of '{needle}' in secondary concepts (column 8).")
    else:
        print(f"No occurrences of '{needle}' found in secondary concepts (column 8).")

    if primary_texts_data:
        print(f"\nAppending {len(primary_texts_data)} primary texts to DOCX...")
        last_primary = len(primary_texts_data) - 1
        for i, data in enumerate(primary_texts_data):
            # No page break after the very last text of the document only.
            add_break = not (i == last_primary and not secondary_texts_data)
            append_philosophy_text_to_doc(master_doc, data["title"], data["intro"], data["body"], data["source"], add_break)
            primary_texts_added_g += 1

    if secondary_texts_data:
        print(f"\nAppending {len(secondary_texts_data)} secondary texts to DOCX...")

        p_heading = master_doc.add_paragraph()
        run_heading = p_heading.add_run("Secondary Texts")
        run_heading.font.name = 'Times New Roman'
        run_heading.font.size = Pt(16)
        run_heading.font.bold = True
        p_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
        master_doc.add_paragraph()

        last_secondary = len(secondary_texts_data) - 1
        for i, data in enumerate(secondary_texts_data):
            add_break = i < last_secondary
            append_philosophy_text_to_doc(master_doc, data["title"], data["intro"], data["body"], data["source"], add_break)
            secondary_texts_added_g += 1

    if primary_texts_added_g > 0 or secondary_texts_added_g > 0:
        try:
            master_doc.save(SINGLE_DOCX_OUTPUT_PATH)
            print(f"\nSuccessfully created single DOCX: '{SINGLE_DOCX_OUTPUT_PATH}'")
        except Exception as e:  # top-level boundary: report and still print the summary
            print(f"ERROR saving DOCX: {e}")
    else:
        print("\nNo texts met criteria for DOCX generation.")

    print("\n--- Summary ---")
    print(f"Total INSERT statements found: {insert_statements_found_g}")
    print(f"Total data rows processed: {rows_processed_count_g}")
    print(f"Rows with '{needle}' in column 8 (secondary notions): {rows_with_state_in_col8_g}")
    print(f"Primary texts ('{needle}' in col 7) added to DOCX: {primary_texts_added_g}")
    print(f"Secondary texts ('{needle}' ONLY in col 8) added to DOCX: {secondary_texts_added_g}")

# Script entry point: check that both third-party libraries import, then run main().
if __name__ == '__main__':
    # On failure, e.name is the module that could not be imported; the hint
    # always lists both pip packages. Exits with status 1.
    try: import docx, sqlparse
    except ImportError as e:
        print(f"Missing library: {e.name}. Install with: pip install python-docx sqlparse")
        exit(1)
    main()

Starting processing of SQL file: '/content/TEXT.sql'

Scanning SQL rows for 'state' in primary (col 7) and secondary (col 8) notions...
Found 38 occurrences of 'state' in secondary concepts (column 8).

Appending 68 primary texts to DOCX...

Appending 35 secondary texts to DOCX...

Successfully created single DOCX: '/content/all_philosophy_texts_sentence_caps.docx'

--- Summary ---
Total INSERT statements found: 138
Total data rows processed: 1185
Rows with 'state' in column 8 (secondary notions): 38
Primary texts ('state' in col 7) added to DOCX: 68
Secondary texts ('state' ONLY in col 8) added to DOCX: 35


In [None]:
#@title SQL to CSV

import sqlparse
import csv
import os

# --- Configuration ---
# Path of the SQL dump that is read by the conversion below.
sql_file_path = "/content/TEXTES(1).sql"
# Making the CSV filename more descriptive for multiple columns
# NOTE(review): the filename says "2-7", but the index list below also selects
# SQL column 8 — confirm which of the two is intended.
csv_file_path = "/content/TEXT_columns_2-7.csv"

# SQL column indices to extract (0-based).
# The list below holds the 0-based indices 1 through 7,
# which correspond to SQL columns 2, 3, 4, 5, 6, 7 and 8 in 1-based numbering.
sql_column_indices_to_extract = [1, 2, 3, 4, 5, 6, 7]

# CSV header row, reflecting the original SQL column numbers
# (each header is "SQL_Column_<1-based column number>").
csv_header = [f"SQL_Column_{idx + 1}" for idx in sql_column_indices_to_extract]
# --- End Configuration ---

# Echo the effective settings before any file is touched.
print(f"Starting conversion of '{sql_file_path}' to '{csv_file_path}'.")
print(f"Extracting data from SQL columns (1-based): {[idx + 1 for idx in sql_column_indices_to_extract]}.")
print(f"CSV header will be: {csv_header}")


# Stop early when the input file is missing; nothing below can work without it.
if not os.path.exists(sql_file_path):
    print(f"ERROR: SQL input file not found at '{sql_file_path}'")
    exit()

# Module-level counters reported in the final summary.
insert_statements_found = 0  # INSERT statements seen by the main loop
rows_processed_count = 0  # value rows handed to extract_and_write_value_row
rows_written_to_csv = 0  # rows for which extract_and_write_value_row reported a write
_debug_extract_counter = 0 # Counter for debugging specific calls if needed

def extract_and_write_value_row(p_token, list_of_sql_indices, csv_w):
    """
    Extract selected columns from one SQL VALUES row and write them to the CSV.

    p_token is expected to be an sqlparse.sql.Parenthesis object wrapping one
    row of values. When it holds exactly one content token, that token is
    re-parsed with sqlparse and the individual values are taken from the
    resulting IdentifierList (or, failing that, from the statement's
    non-whitespace, non-punctuation tokens). Otherwise the content tokens of
    the parenthesis are used directly as the values.

    list_of_sql_indices are the 0-based indices to extract from the SQL data,
    in the order they should appear in the CSV row.
    csv_w is the CSV writer; only its writerow() method is used.

    Quoted values lose their surrounding quotes and have doubled quote
    characters collapsed, a bare NULL becomes an empty string, and any other
    value is written unchanged. Backslash escapes are not interpreted.

    Returns a tuple: (bool_was_row_format_ok, bool_written_to_csv_this_row).
    The first element is always True in this implementation; the second is
    False when the row has too few values for the highest requested index.

    Reads the module-level rows_written_to_csv and increments the module-level
    _debug_extract_counter; both are only used to limit diagnostic output.
    """
    global _debug_extract_counter
    _debug_extract_counter += 1

    # Keep only the payload of the parenthesis: drop '(' / ')' / ',' and whitespace.
    initial_sub_tokens = []
    if hasattr(p_token, 'tokens'):
        initial_sub_tokens = [
            sub_token for sub_token in p_token.tokens
            if not sub_token.is_whitespace and sub_token.ttype != sqlparse.tokens.Punctuation
        ]

    # Diagnostics are limited to the first three calls and stop once a row has been written.
    first_few_rows_debug = (_debug_extract_counter <= 3 and rows_written_to_csv == 0)

    if first_few_rows_debug:
        print(f"\n--- DETAILED DEBUG for extract_and_write_value_row (call #{_debug_extract_counter}) ---")
        print(f"  Parenthesis (p_token) string: '{str(p_token)[:200]}...'")
        print(f"  Initial sub_tokens (after filtering out '(' and ')' and whitespace): {len(initial_sub_tokens)}")

    value_items = []
    if len(initial_sub_tokens) == 1:
        content_string_token = initial_sub_tokens[0]
        parsed_content_string = sqlparse.parse(str(content_string_token))

        if parsed_content_string:
            statement_from_reparse = parsed_content_string[0]
            if statement_from_reparse.tokens and isinstance(statement_from_reparse.tokens[0], sqlparse.sql.IdentifierList):
                value_items = list(statement_from_reparse.tokens[0].get_identifiers())
                if first_few_rows_debug:
                    print(f"  SUCCESS: Extracted {len(value_items)} items from IdentifierList.")
            else:  # Fallback if not IdentifierList (less likely for comma-separated values)
                if first_few_rows_debug:
                    print("  INFO: Re-parsed content's first token NOT IdentifierList. Iterating its tokens.")
                value_items = [
                    token_in_list for token_in_list in statement_from_reparse.tokens
                    if not token_in_list.is_whitespace and token_in_list.ttype != sqlparse.tokens.Punctuation
                ]
        elif first_few_rows_debug:
            print(f"  WARNING: Re-parsing content string '{str(content_string_token)[:100]}...' yielded no statements.")
    else:
        # The fallback must not depend on whether diagnostics are enabled:
        # only the warning is conditional, the assignment always happens.
        if first_few_rows_debug:
            print(f"  WARNING: Expected 1 central content token from Parenthesis, found {len(initial_sub_tokens)}. Using these directly (likely wrong).")
        value_items = initial_sub_tokens

    max_needed_sql_index = max(list_of_sql_indices) if list_of_sql_indices else -1

    # Skip rows that do not reach the highest requested column.
    if len(value_items) <= max_needed_sql_index:
        if first_few_rows_debug:
            print(f"  FINAL WARNING: Need up to SQL index {max_needed_sql_index}, but only {len(value_items)} value items identified. Row skipped.")
            print(f"    Identified items: {[str(item)[:30] for item in value_items]}...")
            print(f"--- END DETAILED DEBUG call #{_debug_extract_counter} ---\n")
        return True, False

    # Every requested index is within range here (checked against the maximum above).
    csv_row_to_write = [
        _clean_sql_value(str(value_items[sql_idx_to_extract]).strip())
        for sql_idx_to_extract in list_of_sql_indices
    ]

    if first_few_rows_debug:
        print(f"  Final value items for extraction ({len(value_items)}): {[str(item)[:70] for item in value_items]}")
        print(f"  CSV row to write (before decision): {csv_row_to_write}")
        print(f"--- END DETAILED DEBUG call #{_debug_extract_counter} ---\n")

    csv_w.writerow(csv_row_to_write)
    return True, True


def _clean_sql_value(raw_value):
    """Convert one stripped SQL literal into the text stored in the CSV cell."""
    if raw_value.startswith("'") and raw_value.endswith("'"):
        # Single-quoted literal: drop the quotes, collapse doubled single quotes.
        return raw_value[1:-1].replace("''", "'")
    if raw_value.startswith('"') and raw_value.endswith('"'):
        # Double-quoted literal: drop the quotes, collapse doubled double quotes.
        return raw_value[1:-1].replace('""', '"')
    if raw_value.upper() == 'NULL':
        return ''
    return raw_value


# Main conversion: parse the SQL dump, send every VALUES row of every INSERT
# statement to extract_and_write_value_row, then print a summary.
try:
    with open(sql_file_path, 'r', encoding='utf-8') as sql_file, \
         open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_outfile:

        csv_writer = csv.writer(csv_outfile)

        # Write the header row to the CSV
        if csv_header:
            csv_writer.writerow(csv_header)

        try:
            sql_content = sql_file.read()
        except Exception as e:
            print(f"Error reading SQL file content: {e}")
            raise

        # The whole file is parsed at once; only INSERT statements are used.
        parsed_statements = sqlparse.parse(sql_content)

        for stmt in parsed_statements:
            if stmt.get_type() != 'INSERT':
                continue

            insert_statements_found += 1

            # Preferred path: sqlparse grouped the rows under a Values token.
            processed_values_for_this_stmt = False
            for token in stmt.tokens:
                if isinstance(token, sqlparse.sql.Values):
                    sublists = token.get_sublists()
                    for value_row_candidate in sublists:
                        if isinstance(value_row_candidate, sqlparse.sql.Parenthesis):
                            rows_processed_count += 1
                            # Pass the list of indices to the modified function
                            _, written = extract_and_write_value_row(
                                value_row_candidate, sql_column_indices_to_extract, csv_writer
                            )
                            if written:
                                rows_written_to_csv += 1
                    processed_values_for_this_stmt = True
                    break

            # Fallback path: no Values group was found, so look for the first
            # parenthesis that follows a bare VALUES keyword.
            # NOTE(review): only the first parenthesised row is handled here —
            # confirm whether multi-row INSERTs can ever reach this fallback.
            if not processed_values_for_this_stmt:
                is_after_values_keyword = False
                for token_fb in stmt.tokens:
                    if is_after_values_keyword:
                        if isinstance(token_fb, sqlparse.sql.Parenthesis):
                            rows_processed_count += 1
                            _, written = extract_and_write_value_row(
                                token_fb, sql_column_indices_to_extract, csv_writer
                            )
                            if written:
                                rows_written_to_csv += 1
                            processed_values_for_this_stmt = True
                            break
                        # Comments are recognised through the Comment group class
                        # or a Comment token type; sqlparse tokens do not provide
                        # an `is_comment` attribute, so reading one would raise
                        # AttributeError and abort the whole conversion.
                        token_is_comment = (
                            isinstance(token_fb, sqlparse.sql.Comment)
                            or token_fb.ttype in sqlparse.tokens.Comment
                        )
                        if not token_fb.is_whitespace and not token_is_comment:
                            # Something other than a row follows VALUES: give up on this statement.
                            break
                    if token_fb.is_keyword and token_fb.normalized == 'VALUES':
                        is_after_values_keyword = True

    # Both files are closed at this point, so the CSV can be safely re-read below.
    print("\n--- Summary ---")
    print(f"Total INSERT statements found: {insert_statements_found}")
    print(f"Total data rows (Parenthesis objects) processed: {rows_processed_count}")
    print(f"Total rows written to CSV: {rows_written_to_csv}")

    if os.path.exists(csv_file_path) and rows_written_to_csv > 0:
        print(f"\nSuccessfully converted data to '{csv_file_path}'.")
        with open(csv_file_path, 'r', encoding='utf-8') as f_verify:
            print("\nFirst 5 lines of the CSV output (including header):")
            for i, line in enumerate(f_verify):
                if i >= 5: break
                print(line.strip())
    elif rows_processed_count > 0 and rows_written_to_csv == 0:
         print(f"\nWarning: Processed {rows_processed_count} data rows but wrote 0 rows to CSV.")
         max_sql_col_needed = max(sql_column_indices_to_extract) + 1 if sql_column_indices_to_extract else 0
         print(f"This means that for all processed rows, the script did not find enough columns (at least {max_sql_col_needed} for the highest requested SQL column index) to extract all target columns.")
         print("If 'DETAILED DEBUG' messages were enabled and printed above, review them for clues on column counts per row.")
    elif insert_statements_found > 0 and rows_processed_count == 0 :
        print("\nWarning: Found INSERT statements, but could not identify any processable data rows.")
    elif insert_statements_found == 0:
        print("\nWarning: No INSERT statements were found in the SQL file.")
    else:
        print(f"\nCSV file '{csv_file_path}' might be empty or not created due to issues.")

except FileNotFoundError:
    print(f"ERROR: SQL input file not found at '{sql_file_path}'. Please ensure the path is correct.")
except Exception as e:
    # Top-level boundary of the script: report the failure with its traceback.
    print(f"An unexpected error occurred: {e}")
    import traceback
    traceback.print_exc()

Starting conversion of '/content/TEXTES(1).sql' to '/content/TEXT_columns_2-7.csv'.
Extracting data from SQL columns (1-based): [2, 3, 4, 5, 6, 7, 8].
CSV header will be: ['SQL_Column_2', 'SQL_Column_3', 'SQL_Column_4', 'SQL_Column_5', 'SQL_Column_6', 'SQL_Column_7', 'SQL_Column_8']

--- DETAILED DEBUG for extract_and_write_value_row (call #1) ---
  Parenthesis (p_token) string: '(1, 'la vertu et la dÃ©cadence dans la politique grecque', 'montesquieu met en lumière le contraste entre les politiques grecs d\'autrefois, qui valorisaient la vertu comme soutien du gouvernement pop...'
  Initial sub_tokens (after filtering out '(' and ')' and whitespace): 1
  SUCCESS: Extracted 13 items from IdentifierList.
  Final value items for extraction (13): ['1', "'la vertu et la dÃ©cadence dans la politique grecque'", "'montesquieu met en lumière le contraste entre les politiques grecs d\\", "'Les vertus civiques ont disparu.'", "'Montesquieu'", "'\t«\xa0Les politiques grecs qui vivaient dans le gou

In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m143.4/244.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2
