In [49]:
import pandas as pd
from make_data import *

# Get dict to classify themes

In [50]:
# Load the themes table from the CSV file.
themes = pd.read_csv('data/themes.csv')
# Keep a single row per distinct QUESTION value (drop_duplicates keeps the first occurrence by default).
themes = themes.drop_duplicates(subset='QUESTION')
# get_theme is not defined in this notebook; presumably it is provided by the
# star import from make_data. Its result is used below as a dict whose keys are
# question strings — TODO confirm what the values hold.
themes_dict = get_theme(themes)

In [51]:
# Display the keys of themes_dict; later cells pass these keys as the fuzzy-match targets.
themes_dict.keys()

dict_keys(['Le coeur a ses raisons que la raison ignore', 'Quand-on aime sait-on pourquoi', "Qu'aime-t-on dans l'amour", "Est-il raisonnable d'aimer", 'Peut-on aimer son prochain comme soi-même', 'Est-ce réaliste de prétendre pouvoir aimer tous les hommes', "Dans tout amour n'aime t-on que soi-même", 'Pour aimer autrui faut-il le connaître', "L'amitié est-elle une forme privilégiée de la connaissance d'autrui", 'Comment différencier amour et amitié', "Peut-on aimer une oeuvre d'art sans la comprendre", "Peut-on comprendre une oeuvre d'art sans l'aimer", "L'art peut-il manifester la vérité", 'Les oeuvres d´art sont-elles des réalités comme les autres', 'L’art transforme-t-il notre conscience du réel', "La sensibilité aux œuvres d'art demande-t-elle à être éduquée", 'Faut-il des connaissances pour apprécier une œuvre d’art', "L'art sait-il montrer ce que le langage ne peut pas dire", 'En quoi la beauté artistique est-elle supérieure à la beauté naturelle', "Pourquoi les productions qui s

# Create database

## Print test

In [None]:
import unicodedata
from docx import Document
from rapidfuzz import fuzz

def normalize_text(text):
    """
    Produce a simplified form of 'text' for comparison purposes.

    Steps, applied in this order:
      1) NFKD-decompose the string and drop combining marks (Unicode
         category 'Mn'), so accented letters collapse to their base
         letter (e.g., é -> e).
      2) Lowercase the result.
      3) Delete every comma (',') and hyphen ('-').
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = [c for c in decomposed if unicodedata.category(c) != 'Mn']
    lowered = ''.join(base_chars).lower()

    # One translation pass removes both punctuation characters.
    return lowered.translate(str.maketrans('', '', ',-'))

def is_fuzzy_match(paragraph_text, target_text, threshold=85):
    """
    Decide whether two strings are similar enough to count as a match.

    The similarity is RapidFuzz's fuzz.ratio of 'paragraph_text' and
    'target_text'; the result is True when that score is at least
    'threshold' (expected on the 0-100 scale).
    """
    return fuzz.ratio(paragraph_text, target_text) >= threshold

def paragraph_contains_any_target_fuzzy(paragraph_text, targets, threshold=85):
    """
    Tell whether 'paragraph_text' fuzzily contains at least one of 'targets'.

    Both the paragraph and every target go through normalize_text first.
    Each normalized target is then compared, via is_fuzzy_match, against
    every window of the normalized paragraph that has the target's length.

    When the paragraph is shorter than the target no such window exists, so
    the whole paragraph is compared against the target instead. Without this
    fallback a paragraph that is the target minus a character or two (typo,
    dropped punctuation) could never match, whatever the threshold.

    Args:
        paragraph_text: Raw paragraph text.
        targets: Iterable of target strings; it is iterated once.
        threshold: Minimum similarity score (0-100) required for a match.

    Returns:
        True as soon as one target matches, False otherwise. Targets that are
        empty after normalization are skipped, and a paragraph that is empty
        after normalization never matches.
    """
    # Normalize the paragraph text once
    norm_para = normalize_text(paragraph_text)
    if not norm_para:
        # Nothing to compare against
        return False
    para_len = len(norm_para)

    for t in targets:
        norm_t = normalize_text(t)
        t_len = len(norm_t)
        if t_len == 0:
            continue

        if para_len < t_len:
            # Paragraph too short to hold a full window: compare it as a whole
            if is_fuzzy_match(norm_para, norm_t, threshold=threshold):
                return True
            continue

        # Slide over the paragraph in windows of length t_len
        for start_idx in range(para_len - t_len + 1):
            chunk = norm_para[start_idx:start_idx + t_len]
            if is_fuzzy_match(chunk, norm_t, threshold=threshold):
                return True

    return False

def print_docx_paragraphs_fuzzy(file_path, themes_dict, threshold=85):
    """
    Print every paragraph of a .docx file and mark where a chunk ends.

    Each paragraph is printed, followed by a separator line. A paragraph
    "has a key" when paragraph_contains_any_target_fuzzy finds a fuzzy match
    between it and any key of 'themes_dict'. The marker line
    "CHUNK ENDS HERE ..." is printed after a paragraph that has a key when the
    next paragraph does not (or when there is no next paragraph).

    Each paragraph is matched only once: the look-ahead result computed for
    paragraph i+1 is reused when that paragraph becomes the current one.

    Args:
        file_path: Path of the .docx file to read.
        themes_dict: Mapping whose keys are the target phrases.
        threshold: Minimum similarity score (0-100) for a fuzzy match.

    Returns:
        None. Any exception is caught and reported with a printed message
        instead of being raised.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        targets = themes_dict.keys()
        last_index = len(paragraphs) - 1

        # Look-ahead result from the previous iteration; None until one exists
        carried_has_key = None

        for index, paragraph in enumerate(paragraphs):
            para_text = paragraph.text
            print(f"Paragraph {index + 1}: {para_text}")
            print("-" * 50)

            if carried_has_key is None:
                # First paragraph: nothing was computed ahead of time
                current_has_key = paragraph_contains_any_target_fuzzy(
                    para_text,
                    targets,
                    threshold=threshold
                )
            else:
                current_has_key = carried_has_key

            # If there is a next paragraph, check it too
            if index < last_index:
                next_has_key = paragraph_contains_any_target_fuzzy(
                    paragraphs[index + 1].text,
                    targets,
                    threshold=threshold
                )
            else:
                next_has_key = False  # There's no next paragraph

            if current_has_key and not next_has_key:
                print("CHUNK ENDS HERE ------------------------------------------")

            # The next paragraph becomes the current one on the next pass
            carried_has_key = next_has_key

    except Exception as e:
        # Best-effort helper: report the problem instead of raising
        print(f"An error occurred: {e}")

# -----------------------------------------------------------------------
# USAGE EXAMPLE
# -----------------------------------------------------------------------
if __name__ == "__main__":
    # Path of the .docx document to scan. The target phrases are the keys of
    # 'themes_dict', which is not created here: it must already exist in the
    # session (an earlier cell assigns it).
    file_path = "data/Ce qui est important 4 _ Plusjapprends.com.docx"

    # Lower threshold = more lenient matching,
    # higher threshold = more strict.
    # The value passed here equals the function's default (85).
    print_docx_paragraphs_fuzzy(file_path, themes_dict, threshold=85)


## Build the chunk → matched-themes dict

In [52]:
import unicodedata
from docx import Document
from rapidfuzz import fuzz

def normalize_text(text):
    """
    Return a comparison-friendly version of 'text'.

    1) Strip accents: decompose with NFKD and discard combining marks
       (Unicode category 'Mn'), e.g., é -> e.
    2) Lowercase everything.
    3) Drop commas (',') and hyphens ('-').
    """
    without_marks = ''.join(
        filter(
            lambda c: unicodedata.category(c) != 'Mn',
            unicodedata.normalize('NFKD', text),
        )
    )
    cleaned = without_marks.lower()
    for unwanted in (',', '-'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

def is_fuzzy_match(paragraph_text, target_text, threshold=85):
    """
    Report whether 'paragraph_text' and 'target_text' are close enough.

    Computes RapidFuzz's fuzz.ratio for the two strings and returns True
    when the score reaches 'threshold' (expected on the 0-100 scale).
    """
    similarity = fuzz.ratio(paragraph_text, target_text)
    return similarity >= threshold

def paragraph_matched_targets_fuzzy(paragraph_text, targets, threshold=85):
    """
    Collect every target that fuzzily matches somewhere in 'paragraph_text'.

    The paragraph and each target are normalized with normalize_text. A target
    matches when is_fuzzy_match accepts at least one window of the normalized
    paragraph whose length equals the normalized target's length.

    When the paragraph is shorter than the target no such window exists, so
    the whole paragraph is compared against the target instead. Without this
    fallback a paragraph that is the target minus a character or two (typo,
    dropped punctuation) could never match, whatever the threshold.

    Args:
        paragraph_text: Raw paragraph text.
        targets: Iterable of target strings; it is iterated once.
        threshold: Minimum similarity score (0-100) required for a match.

    Returns:
        A set holding the matching targets in their original (non-normalized)
        form. Targets that are empty after normalization are skipped, and a
        paragraph that is empty after normalization yields an empty set.
    """
    matched_targets = set()
    norm_para = normalize_text(paragraph_text)
    if not norm_para:
        # Nothing to compare against
        return matched_targets
    para_len = len(norm_para)

    for t in targets:
        norm_t = normalize_text(t)
        t_len = len(norm_t)
        if t_len == 0:
            continue

        if para_len < t_len:
            # Paragraph too short to hold a full window: compare it as a whole
            candidates = (norm_para,)
        else:
            # Every window of the paragraph with the target's length
            candidates = (
                norm_para[start_idx:start_idx + t_len]
                for start_idx in range(para_len - t_len + 1)
            )

        # any() stops at the first matching candidate for this target
        if any(is_fuzzy_match(c, norm_t, threshold=threshold) for c in candidates):
            matched_targets.add(t)

    return matched_targets

def chunk_docx_paragraphs_fuzzy(file_path, themes_dict, threshold=85):
    """
    Split the paragraphs of a .docx file into chunks delimited by theme matches.

    Paragraphs are accumulated in document order. A chunk is finalized right
    after a paragraph that fuzzy-matches at least one key of 'themes_dict'
    (see paragraph_matched_targets_fuzzy) when the next paragraph has no match,
    or when there is no next paragraph.

    Every paragraph is matched exactly once, up front; the split decision then
    only looks at the precomputed results.

    Args:
        file_path: Path of the .docx file to read.
        themes_dict: Mapping whose keys are the target phrases.
        threshold: Minimum similarity score (0-100) for a fuzzy match.

    Returns:
        A dictionary of the form:
            {
                "Chunk text (paragraphs joined with newlines, stripped)":
                    {set of targets matched by any paragraph of the chunk}
            }
        Caveats:
          - Paragraphs that come after the last split point are not returned.
          - Chunks are keyed by their text, so two chunks with identical text
            collapse into one entry (the later one wins).
          - If any exception occurs, a message is printed and {} is returned.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        targets = themes_dict.keys()

        # Fuzzy matching is the expensive step: do it once per paragraph
        texts = [paragraph.text for paragraph in paragraphs]
        matches_per_paragraph = [
            paragraph_matched_targets_fuzzy(text, targets, threshold=threshold)
            for text in texts
        ]
        last_index = len(texts) - 1

        chunks_dict = {}

        current_chunk_paragraphs = []
        current_chunk_matched = set()

        for index, (para_text, matched_targets_here) in enumerate(
            zip(texts, matches_per_paragraph)
        ):
            current_chunk_paragraphs.append(para_text)
            # Accumulate matches
            current_chunk_matched |= matched_targets_here

            current_has_key = bool(matched_targets_here)
            # The last paragraph has no successor, which counts as "no match"
            next_has_key = (
                index < last_index and bool(matches_per_paragraph[index + 1])
            )

            # Split condition: current paragraph has match, next does not
            if current_has_key and not next_has_key:
                chunk_text = "\n".join(current_chunk_paragraphs).strip()
                chunks_dict[chunk_text] = current_chunk_matched

                # Reset accumulators for the next chunk
                current_chunk_paragraphs = []
                current_chunk_matched = set()

        return chunks_dict

    except Exception as e:
        # Best-effort helper: report the problem and hand back an empty result
        print(f"An error occurred: {e}")
        return {}


# Example usage:
if __name__ == "__main__":

    # Document to chunk and the minimum similarity score (0-100) for a fuzzy match.
    file_path = "data/Ce qui est important 4 _ Plusjapprends.com.docx"
    threshold = 85

    # 'themes_dict' is not created here: it must already exist in the session
    # (an earlier cell assigns it).
    result_dict = chunk_docx_paragraphs_fuzzy(file_path, themes_dict, threshold=threshold)
    # 'result_dict' maps each chunk's text to the set of targets matched in it;
    # it is an empty dict if the function hit an error.


In [53]:
# One set of matched targets per chunk, in the dict's insertion order.
correct_items = list(result_dict.values())

## take questions out