In [1]:
# -*- coding: utf-8 -*-
"""
Integrated Jupyter‐style script for MALLET‐based topic modeling
with per‐file Fisher’s Exact filtering and stopword exclusion.
Supports UTF-8 encoding for French accents.
"""

import os
import csv
from pathlib import Path
from collections import Counter
import pandas as pd
from scipy.stats import fisher_exact
import little_mallet_wrapper
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# Location of the MALLET launcher script. The leading "~" is expanded to the
# current user's home directory; edit the string below if MALLET lives
# somewhere else.
_MALLET_LAUNCHER = "~/mallet-2.0.8/bin/mallet"
path_to_mallet = os.path.expanduser(_MALLET_LAUNCHER)


# ─── Utility Functions ─────────────────────────────────────────────────────────

def tokenize_file(filepath):
    """
    Read a UTF-8 text file and split it into whitespace-separated tokens.

    Accented characters are preserved as-is. The file is decoded with
    'utf-8-sig' so that a byte-order mark written by some editors is dropped
    instead of being glued onto the first token; files without a BOM decode
    exactly as plain UTF-8.

    Returns a list of token strings (empty for an empty file).
    """
    with open(filepath, "r", encoding="utf-8-sig") as f:
        return f.read().split()


def list_txt_files(directory):
    """List the names in directory that end with '.txt', in sorted order."""
    text_names = filter(lambda name: name.endswith(".txt"), os.listdir(directory))
    return sorted(text_names)


def list_csv_files(directory):
    """
    Walk directory and all of its subfolders and collect every file whose
    name ends with '.csv'. Each result is the walked folder joined with the
    filename, in the order os.walk reports them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".csv")
    ]


def choose_directory(prompt):
    """
    Ask the user to pick the current working directory or one of the folders
    beneath it, by number.

    The menu shows each candidate relative to the working directory, prefixed
    with the working directory's own name. The question is repeated until a
    number from the menu is entered.

    Returns the full path of the chosen folder.
    """
    here = os.getcwd()
    root_label = Path(here).name

    # The working directory itself is always entry 1; every nested folder
    # follows in the order os.walk reports it.
    candidates = [here]
    for parent, children, _files in os.walk(here):
        candidates.extend(os.path.join(parent, child) for child in children)

    labels = []
    for folder in candidates:
        relative = os.path.relpath(folder, here)
        if relative == ".":
            labels.append(root_label)
        else:
            labels.append(f"{root_label}/{relative}")

    print(prompt)
    for number, label in enumerate(labels, start=1):
        print(f"{number}. {label}")

    while True:
        answer = input("Enter number: ").strip()
        try:
            picked = int(answer)
        except ValueError:
            picked = None
        if picked is not None and 1 <= picked <= len(candidates):
            return candidates[picked - 1]
        print("Invalid choice, try again.")


def choose_files(files):
    """
    Let user select from a numbered list of filenames.

    The answer is a comma-separated list whose entries may be:
      - 'all' (as the whole answer): every file, in the given order
      - a number: that entry of the list (1-based)
      - a range such as 1-3: entries 1 through 3 inclusive, clipped to the list
      - any other text: every filename starting with that text
        (the text may itself contain '-')
    Empty entries (e.g. from a trailing comma) are skipped. Entries that
    select nothing (number outside the list, empty range, unmatched prefix)
    are reported and ignored rather than raising or wrapping around to the
    end of the list.

    Displays the selected files and returns them as a sorted list without
    duplicates (for 'all', a plain copy of files).
    """
    print("Available files:")
    for i, fn in enumerate(files, start=1):
        print(f"{i}. {fn}")
    choice = input(
        "Pick files ('all', numbers, ranges e.g. 1-3, or prefix text): "
    ).strip()
    if choice.lower() == "all":
        sel = files[:]
    else:
        picked = set()
        for part in choice.split(","):
            part = part.strip()
            if not part:
                # A stray comma must not act as an empty prefix, which would
                # match every file.
                continue
            low, dash, high = (piece.strip() for piece in part.partition("-"))
            if part.isdecimal():
                number = int(part)
                # Reject 0 and anything past the end; a negative list index
                # would silently pick from the end of the list.
                matches = files[number - 1 : number] if number >= 1 else []
            elif dash and low.isdecimal() and high.isdecimal():
                # Only treat "a-b" as a range when both sides are numbers, so
                # hyphenated prefixes fall through to the prefix branch.
                start = max(int(low), 1)
                matches = files[start - 1 : int(high)]
            else:
                matches = [fn for fn in files if fn.startswith(part)]
            if not matches:
                print(f"No files match '{part}', ignoring.")
            picked.update(matches)
        sel = sorted(picked)
    print("Selected files:")
    for fn in sel:
        print(f"- {fn}")
    return sel


def choose_csv_file(csv_files):
    """
    Prompt user to pick one CSV file from a list.

    Shows the candidates numbered from 1 and keeps asking until a listed
    number is entered, then returns that entry of csv_files.

    Raises ValueError when csv_files is empty: no answer could ever be valid,
    so prompting would loop forever.
    """
    if not csv_files:
        raise ValueError("No .csv files available to choose from.")
    print("Select your stopwords .csv:")
    for i, path in enumerate(csv_files, start=1):
        print(f"{i}. {path}")
    while True:
        try:
            sel = int(input("Enter number: ").strip())
        except ValueError:
            # Non-numeric answer: fall through to the retry message.
            sel = 0
        if 1 <= sel <= len(csv_files):
            return csv_files[sel - 1]
        print("Invalid choice.")


def read_stopwords(filepath):
    """
    Read a CSV stopword file (UTF-8) and return its words as a list.

    Every cell is additionally split on commas, so a quoted cell such as
    "de,du" contributes two words. Surrounding whitespace is stripped and
    empty entries are dropped.

    The file is decoded with 'utf-8-sig' so that a byte-order mark (as
    written by spreadsheet exports) does not end up inside the first
    stopword, and opened with newline="" as the csv module requires.
    """
    words = []
    with open(filepath, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.reader(f):
            for cell in row:
                words.extend(piece.strip() for piece in cell.split(","))
    return [w for w in words if w]


# ─── Fisher’s Exact & Rate Dictionary ──────────────────────────────────────────

def get_fishers(word, freq_dict, rate_dict, alternative="greater"):
    """
    Return the Fisher's Exact Test p-value for one word.

    The 2x2 table compares the word's observed count in freq_dict with the
    count expected from its reference rate in rate_dict:
      [[observed, total - observed],
       [expected, total - expected]]
    where total is the sum of all counts in freq_dict and expected is the
    rate times total, rounded to an integer. Words missing from either
    dictionary count as 0. 'alternative' is passed through to fisher_exact.
    """
    total_tokens = sum(freq_dict.values())
    observed = freq_dict.get(word, 0)
    expected = round(rate_dict.get(word, 0) * total_tokens)
    contingency = [
        [observed, total_tokens - observed],
        [expected, total_tokens - expected],
    ]
    _odds_ratio, p_value = fisher_exact(contingency, alternative=alternative)
    return p_value


def calculate_rate_dictionary(rate_files, rate_dir):
    """
    Pool the tokens of all reference files and compute each token's share.

    rate_files are filenames inside rate_dir. Returns
    { token: count / total number of tokens across all files }.
    """
    pooled = Counter()
    for name in rate_files:
        pooled.update(tokenize_file(os.path.join(rate_dir, name)))
    # Every token is counted exactly once, so the counts add up to the total
    # number of tokens read.
    grand_total = sum(pooled.values())
    return {token: occurrences / grand_total for token, occurrences in pooled.items()}


# ─── Preprocessing & Training Data Prep ───────────────────────────────────────

def prepare_training_data(target_files, target_dir, stopwords, rate_dict, alpha):
    """
    Build the per-document training strings for the topic model.

    For each file in target_files (names inside target_dir):
      - tokenize (UTF-8)
      - build frequency dict
      - run Fisher's Exact once per distinct token against rate_dict
      - keep only tokens with p-value < alpha and not in stopwords
    Kept tokens stay in their original order and with their original
    repetitions.

    Returns:
      docs: [ str ]        # filtered documents, tokens joined by spaces
      dists: [ Counter ]   # raw (unfiltered) token counts per document
    """
    # Set membership instead of scanning the stopword list for every token.
    stopword_set = frozenset(stopwords)
    docs = []
    dists = []
    for fn in target_files:
        tokens = tokenize_file(os.path.join(target_dir, fn))
        freq = Counter(tokens)
        # The p-value depends only on the word and this document's counts, so
        # test each distinct word once rather than once per occurrence.
        significant = {
            w
            for w in freq
            if w not in stopword_set and get_fishers(w, freq, rate_dict) < alpha
        }
        docs.append(" ".join(w for w in tokens if w in significant))
        dists.append(freq)
    return docs, dists


# ─── Topic Model Training & Export ────────────────────────────────────────────

def train_topic_model(training_data, num_topics, output_dir):
    """
    Train a MALLET topic model through little_mallet_wrapper.

    output_dir is created (including parents) when it does not exist yet.
    After training, the topic keys are read back from
    '<output_dir>/mallet.topic_keys.<num_topics>' and returned.
    """
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    little_mallet_wrapper.quick_train_topic_model(
        path_to_mallet, output_dir, num_topics, training_data
    )

    topic_keys_path = f"{output_dir}/mallet.topic_keys.{num_topics}"
    return little_mallet_wrapper.load_topic_keys(topic_keys_path)


def _excel_sheet_title(raw, taken):
    """
    Turn raw into a worksheet title Excel accepts and that is not in taken.

    Characters Excel forbids in sheet names are replaced by '_', the result
    is cut to 31 characters, and a numeric suffix is added on a clash. taken
    holds lower-cased titles (Excel compares names case-insensitively) and is
    updated with the title that is returned.
    """
    cleaned = "".join("_" if ch in "\\/*?:[]" else ch for ch in raw) or "Sheet"
    title = cleaned[:31]
    counter = 1
    while title.lower() in taken:
        counter += 1
        suffix = f"_{counter}"
        # Trim the base so that base + suffix still fits in 31 characters.
        title = cleaned[: 31 - len(suffix)] + suffix
    taken.add(title.lower())
    return title


def save_results_to_excel(excel_path, topics, distributions, target_files):
    """
    Write the topic-model results to one .xlsx workbook at excel_path.

    1) Sheet 'Topics' with one row per topic (Topic#, tokens...)
    2) One sheet per target_file holding that document's token counts from
       the matching entry of distributions. The sheet is named after the
       file stem, made Excel-safe: forbidden characters replaced, at most 31
       characters, unique within the workbook.
    """
    wb = Workbook()
    # -- Topics sheet (reuses the workbook's initial sheet) --
    ws0 = wb.active
    ws0.title = "Topics"
    for idx, topic in enumerate(topics):
        ws0.append([f"Topic {idx}", *topic])

    # -- Per-document distribution sheets --
    used_titles = {"topics"}
    for fn, dist in zip(target_files, distributions):
        title = _excel_sheet_title(Path(fn).stem, used_titles)
        ws = wb.create_sheet(title=title)
        df = pd.DataFrame.from_dict(dist, orient="index", columns=["count"])
        for row in dataframe_to_rows(df, index=True, header=True):
            ws.append(row)

    wb.save(excel_path)


def export_heatmap(model, output_dir, labels=None, topic_distributions=None):
    """
    Plot the document-by-topic heatmap and save it as 'heatmap.jpg' in
    output_dir for Word insertion.

    model               : the topic keys (main passes the value returned by
                          train_topic_model)
    output_dir          : folder that receives heatmap.jpg
    labels              : optional, one label per document; defaults to
                          'doc 1', 'doc 2', ... (zero-padded)
    topic_distributions : optional per-document topic weights; when omitted
                          they are loaded from
                          '<output_dir>/mallet.topic_distributions.<number of topics>'

    NOTE(review): this assumes little_mallet_wrapper's
    plot_categories_by_topics_heatmap takes (labels, topic_distributions,
    topic_keys, output_path) and writes the image itself rather than
    returning a figure; the previous single-argument call followed by
    fig.savefig(...) does not match that API. Confirm against the installed
    version. The explicit dpi=300 of the old savefig call cannot be passed
    through this way.
    """
    topic_keys = model
    if topic_distributions is None:
        # NOTE(review): file name assumed to follow the same
        # 'mallet.<kind>.<num_topics>' pattern as the topic keys file - verify.
        distributions_path = os.path.join(
            output_dir, f"mallet.topic_distributions.{len(topic_keys)}"
        )
        topic_distributions = little_mallet_wrapper.load_topic_distributions(
            distributions_path
        )
    if labels is None:
        # Zero-pad so the labels sort in document order.
        width = len(str(len(topic_distributions)))
        labels = [
            f"doc {number:0{width}d}"
            for number in range(1, len(topic_distributions) + 1)
        ]
    little_mallet_wrapper.plot_categories_by_topics_heatmap(
        labels,
        topic_distributions,
        topic_keys,
        os.path.join(output_dir, "heatmap.jpg"),
    )


def save_top_titles(excel_path, n_titles, topics):
    """
    Write top n_titles per topic into an .xlsx at excel_path,
    one sheet per topic named 'Topic{#}', one title per row.

    The workbook's initial sheet is reused for the first topic so the file
    does not carry an extra empty sheet. When topics is empty that initial
    sheet is left in place, since a workbook needs at least one sheet.

    NOTE(review): little_mallet_wrapper.display_top_titles_per_topic is not
    defined in this file; confirm it exists in the installed package and
    returns an iterable of titles when called as (topic_index, n).
    """
    wb = Workbook()
    spare_sheet = wb.active
    for idx, _ in enumerate(topics):
        if spare_sheet is not None:
            ws, spare_sheet = spare_sheet, None
            ws.title = f"Topic{idx}"
        else:
            ws = wb.create_sheet(title=f"Topic{idx}")
        top = little_mallet_wrapper.display_top_titles_per_topic(idx, n_titles)
        for title in top:
            ws.append([title])
    wb.save(excel_path)


def display_success_message():
    """Print the end-of-run confirmation line."""
    message = "✅ Notebook has finished successfully!"
    print(message)


# ─── Main Workflow ────────────────────────────────────────────────────────────

def main():
    """
    Run the whole interactive workflow: pick the stopword list, the reference
    (rate) files and the target files, filter the targets, train the topic
    model, then export the results, the heatmap and the top titles into the
    chosen output subfolder.
    """
    # Stopword list: chosen among the .csv files under the working directory.
    stopword_csv = choose_csv_file(list_csv_files(os.getcwd()))
    stopwords = read_stopwords(stopword_csv)

    # Reference corpus used to compute expected token rates.
    rate_dir = choose_directory("Select RATE DICTIONARY directory:")
    rate_files = choose_files(list_txt_files(rate_dir))
    rate_dict = calculate_rate_dictionary(rate_files, rate_dir)

    # Documents to be topic-modelled.
    target_dir = choose_directory("Select TARGET FILES directory:")
    target_files = choose_files(list_txt_files(target_dir))

    # Significance threshold for the per-token Fisher filter.
    alpha_text = input("Enter alpha threshold for Fisher’s Exact: ")
    alpha = float(alpha_text.strip())

    training_docs, distributions = prepare_training_data(
        target_files, target_dir, stopwords, rate_dict, alpha
    )

    # Model size and where to put everything that gets written.
    topics_text = input("Enter number of topics to generate: ")
    num_topics = int(topics_text.strip())
    out_sub = input("Enter name for output subfolder: ").strip()
    output_dir = os.path.join(os.getcwd(), out_sub)

    topics = train_topic_model(training_docs, num_topics, output_dir)

    # Workbook with the topics and the per-document token counts.
    save_results_to_excel(
        os.path.join(output_dir, "topic_model_results.xlsx"),
        topics,
        distributions,
        target_files,
    )

    # Heatmap image.
    export_heatmap(topics, output_dir)

    # Workbook with the top titles of each topic.
    titles_text = input("How many top titles per topic? ")
    n_titles = int(titles_text.strip())
    save_top_titles(os.path.join(output_dir, "top_titles.xlsx"), n_titles, topics)

    display_success_message()


# Start the interactive workflow only when this file is executed directly
# (the module name is then "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Select your stopwords .csv:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized/stop_words.csv


Enter number:  1


Select RATE DICTIONARY directory:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized/concordances
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized/.ipynb_checkpoints
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized


Enter number:  3


Available files:
1. Discours des raisons_corrected_stemmed.txt
2. Démonomanie I.1_corrected_stemmed.txt
3. Démonomanie I.2_corrected_stemmed.txt
4. Démonomanie I.3_corrected_stemmed.txt
5. Démonomanie I.4_corrected_stemmed.txt
6. Démonomanie I.5_corrected_stemmed.txt
7. Démonomanie I.6_corrected_stemmed.txt
8. Démonomanie I.7_corrected_stemmed.txt
9. Démonomanie II.1_corrected_stemmed.txt
10. Démonomanie II.2_corrected_stemmed.txt
11. Démonomanie II.3_corrected_stemmed.txt
12. Démonomanie II.4_corrected_stemmed.txt
13. Démonomanie II.5_corrected_stemmed.txt
14. Démonomanie II.6_corrected_stemmed.txt
15. Démonomanie II.7_corrected_stemmed.txt
16. Démonomanie II.8_corrected_stemmed.txt
17. Démonomanie III.1_corrected_stemmed.txt
18. Démonomanie III.2_corrected_stemmed.txt
19. Démonomanie III.3_corrected_stemmed.txt
20. Démonomanie III.4_corrected_stemmed.txt
21. Démonomanie III.5_corrected_stemmed.txt
22. Démonomanie III.6_corrected_stemmed.txt
23. Démonomanie IV.1_corrected_stemmed.txt


Pick files (numbers, ranges e.g. 1-3, or prefix text):  all


Select TARGET FILES directory:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized/concordances
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized/.ipynb_checkpoints
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized


KeyboardInterrupt: Interrupted by user