In [1]:
import os
import re

from nltk.stem.snowball import SnowballStemmer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

# Initialize the stemmer and lemmatizer for French.
# Both objects are created once at module level and reused for every token by
# custom_stem_or_lemmatize().
stemmer = SnowballStemmer(language='french')
lemmatizer = FrenchLefffLemmatizer()

# Custom normalization table consulted by custom_stem_or_lemmatize() and return_stem().
#
# - Keys are compared exactly (case-sensitive dict lookup) against either a raw token
#   or the stemmer/lemmatizer output for that token; the mapped value replaces it.
# - Keys that contain a space ("prin c", "prin ces", "royau me") are applied by
#   return_stem() as phrase replacements on the raw text before it is tokenized.
# - Some values contain a space ("des loi", "la loi", "le roy", "les loi"), so one input
#   token can become two words in the output.
# - Judging from the pairs themselves, most entries undo early-modern spelling / OCR
#   confusions such as "f" read for long "s", "u" for "v" and "i" for "j".
# - Entries are grouped only roughly alphabetically and the number per line varies.
#
# NOTE(review): "prefqu" maps to "prequ"; possibly "presqu" was intended — confirm.
# NOTE(review): "droi" maps to "doit" while "droict" maps to "droit" — confirm.
CUSTOM_RULES = {
    "abfolu": "absolu", "accuf": "accus", "aduient": "advient", "aduis": "advis", "alor": "alors",
    "ariftot": "aristot", "aufquel": "ausquel", "auantag": "avantag", "auon": "avon", "auoyent": "avoyent",
    "beft": "best", "cai": "cayer", "ceft": "cest", "cftat": "estat",
    "comiffion": "commission", "commiffion": "commission", "commiflair": "commissair", "commiflion": "commission",
    "confeff": "confess", "conful": "consul", "coft": "cost", "costé": "côté", "defquel": "desquel",
    "dieux": "dieu", "déput": "deput", "député": "deput", "depuré": "deput", "depurez": "deput",
    "deuant": "devant", "desloix": "des loi", "deteft": "detest", "difent": "disent",
    "difoit": "disoit", "diuif": "divis", "diuin": "divin", "droi": "doit", "droict": "droit",
    "edit": "édict", "édictz": "édict", "eftant": "estant", "efté": "esté", "eftim": "estim",
    "eftoyent": "estoyent", "efcript": "escript", "efpaign": "espaigne", "efpec": "espec", "efprit": "esprit",
    "empefch": "empesch", "enfans": "enfant", "enfembl": "ensembl", "ensan": "enfant", "ensans": "enfant",
    "enuer": "enver", "esglis": "églis", "espic": "épice", "estar": "estat", "estatz": "estat",
    "état": "estat", "euft": "eust", "fag": "sag", "faif": "sais", "faifoit": "faisoit",
    "facrific": "sacrific", "feliqu": "felicit", "fembl": "sembl", "femblabl": "semblabl", "fecond": "second",
    "fept": "sept", "feulg": "seul", "felon": "selon", "fent": "sent", "feroit": "seroit",
    "fcauoir": "savoir", "fcigneur": "seigneur", "fçauoir": "savoir", "finon": "sinon", "fimpl": "simpl",
    "fignif": "signif", "file": "fill", "fong": "song", "foyent": "soyent", "fuft": "fust",
    "foudain": "soudain", "hebrieux": "hebrieu", "iug": "jug", "iufqu": "iusqu", "iust": "just", "iustic": "justic",
    "iurifdiet": "iurisdict", "iurifdict": "iurisdict", "laloy": "la loi", "laiff": "laiss", "leroy": "le roy",
    "lefquel": "lesquel", "lesloix": "les loi", "lifon": "lison", "liur": "livr", "loix": "loi",
    "loy": "loi", "loyx": "loi", "maicft": "maiest", "maifon": "maison", "maiftr": "maistr",
    "magiftrat": "magistrat", "magiftrats": "magistrat", "magiltrat": "magistrat", "mefine": "même", "mefines": "mêmes", "meím": "mesm", 
    "mesmoir": "memoir", "monftr": "monstr", "noftr": "nostr", "pai": "pay", "parlement": "parlement", 
    "parlements": "parlement", "penf": "pens", "peuuent": "peuvent", "pluficur": "plusieur", 
    "plutfoft": "plustost", "pouuoir": "pouvoir", "pouuoit": "pouvoit", "pourueu": "pourveu", 
    "prefent": "present", "prefqu": "prequ", "prerogatiu": "prerogativ", "prin c": "prince", 
    "prin ces": "princ", "pris": "prix", "priuileg": "privileg", "procez": "procès", "puiff": "puiss",
    "puifle": "puiss", "puiffanc": "puissanc", "quc": "que", "queftion": "question", "raifon": "raison",
    "reffort": "ressort", "repvbliqu": "republ", "républicque": "républ", "royaulm": "royaum", "royau me": "royaum",
    "sorcicr": "sorcier", "sorciere": "sorci", "sorcieres": "sorci", "soubverain": "souverain", 
    "souverian": "souverain", "souverianet": "souverainet", "subject": "sujet", "subjectz": "sujet",
    "toufiour": "tousiour", "trouu": "trouv", "vertus": "vertu", "viur": "vivr", "vifion": "vision",
    "ftatut": "statut"
}


def custom_stem_or_lemmatize(word, use_lemmatizer):
    """
    Normalize a single token, giving CUSTOM_RULES priority over the NLP tools.

    The raw token is looked up in CUSTOM_RULES first. If it is not a key, the token
    is lemmatized (use_lemmatizer truthy) or stemmed (otherwise), and that result is
    looked up in CUSTOM_RULES once more; the result itself is returned when it has
    no rule either.
    """
    # A direct hit on the raw token bypasses stemming/lemmatizing entirely.
    try:
        return CUSTOM_RULES[word]
    except KeyError:
        pass

    normalize = lemmatizer.lemmatize if use_lemmatizer else stemmer.stem
    base_form = normalize(word)
    return CUSTOM_RULES.get(base_form, base_form)

def return_stem(text, use_lemmatizer=False):
    """
    Replace multi-token sequences first, then process the text token by token.

    Every key of CUSTOM_RULES that contains a space is treated as a phrase to be
    rewritten in the raw text before tokenizing, so that e.g. "royau me" becomes
    "royaum" and is afterwards handled as a single token.

    All phrases are applied in ONE regular-expression pass with two guarantees:

    * Longer phrases are tried before shorter ones, so "prin ces" is rewritten by
      its own rule instead of being hijacked by the shorter "prin c" rule (which
      would otherwise yield "princees" and leave the longer rule unreachable).
    * A phrase may only start at the beginning of a word, so it cannot fire in the
      middle of an unrelated longer word. The end of the phrase is deliberately NOT
      anchored, because the keys are stem-like prefixes that may be followed by
      further letters.

    After phrase replacement the text is split on whitespace, each token is passed
    through custom_stem_or_lemmatize(), and the results are joined with single
    spaces (original line breaks and spacing are therefore not preserved).

    :param text: raw text to normalize.
    :param use_lemmatizer: lemmatize when truthy, stem otherwise.
    :return: the normalized text as one space-separated string.
    """
    phrase_rules = {
        phrase: replacement
        for phrase, replacement in CUSTOM_RULES.items()
        if " " in phrase
    }
    if phrase_rules:
        # Longest-first alternation: the regex engine picks the first alternative
        # that matches, so ordering by length makes the most specific rule win.
        alternatives = "|".join(
            re.escape(phrase)
            for phrase in sorted(phrase_rules, key=len, reverse=True)
        )
        phrase_pattern = re.compile(rf"(?<!\w)(?:{alternatives})")
        # A function replacement avoids backslash/group interpretation of the values.
        text = phrase_pattern.sub(lambda match: phrase_rules[match.group(0)], text)

    tokens = text.split()
    return ' '.join(custom_stem_or_lemmatize(token, use_lemmatizer) for token in tokens)

def process_file(file_path, use_lemmatizer=True):
    """
    Read a UTF-8 text file and return its normalized content.

    The whole file is read into memory and handed to return_stem() together with
    the use_lemmatizer flag; the resulting string is returned unchanged.
    """
    with open(file_path, encoding='utf-8') as source:
        raw_text = source.read()
    return return_stem(raw_text, use_lemmatizer)

def save_processed_content(original_file, processed_content, use_lemmatizer):
    """
    Write processed text into the 'lemmatized' folder of the current working directory.

    The output name is the original file's base name without its extension, followed
    by '_lemmatized.txt' (use_lemmatizer truthy) or '_stemmed.txt' (otherwise). The
    folder is created when missing, an existing file of the same name is overwritten,
    and the destination path is printed afterwards.
    """
    target_dir = os.path.join(os.getcwd(), 'lemmatized')
    os.makedirs(target_dir, exist_ok=True)

    base_name = os.path.basename(original_file)
    stem, _extension = os.path.splitext(base_name)
    if use_lemmatizer:
        suffix = '_lemmatized.txt'
    else:
        suffix = '_stemmed.txt'
    target_path = os.path.join(target_dir, stem + suffix)

    with open(target_path, 'w', encoding='utf-8') as sink:
        sink.write(processed_content)
    print(f"Processed content saved to {target_path}")

def select_subdirectory():
    """
    Ask the user which subdirectory of the current working directory to use.

    Returns None (after printing a notice) when the working directory has no
    subdirectories. Otherwise the subdirectories are listed with 1-based numbers and
    the user is prompted until the answer is either empty (the working directory
    itself is returned) or a valid number (the absolute path of that subdirectory
    is returned).
    """
    base_dir = os.getcwd()
    # Entry names are relative, so isdir() resolves them against the working
    # directory, which is the same directory that was just listed.
    candidates = [name for name in os.listdir(base_dir) if os.path.isdir(name)]
    candidates.sort()
    if len(candidates) == 0:
        print("No subdirectories found.")
        return None

    print("Available subdirectories:")
    for number, name in enumerate(candidates, start=1):
        print(f"{number}. {name}")

    while True:
        answer = input("Select a subdirectory number for target files (leave empty for current directory): ").strip()
        if answer == "":
            return base_dir
        try:
            index = int(answer) - 1
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue
        if 0 <= index < len(candidates):
            return os.path.join(base_dir, candidates[index])
        print("Invalid choice. Please enter a valid number.")

def _parse_selection(user_input, count):
    """
    Convert a selection such as "1,3-5" into a list of 0-based indices.

    :param user_input: comma-separated 1-based numbers and inclusive "a-b" ranges.
    :param count: number of selectable items; valid numbers are 1..count.
    :return: 0-based indices in the order given, duplicates removed.
    :raises ValueError: on non-numeric parts, malformed ranges, numbers outside
        1..count, or ranges whose start exceeds their end.
    """
    indices = []
    for part in user_input.split(','):
        if '-' in part:
            # Unpacking raises ValueError for anything other than exactly "a-b".
            start, end = map(int, part.split('-'))
        else:
            start = end = int(part)
        # Explicit bounds check: 0 or too-large numbers must not silently wrap
        # around (negative index) or be truncated (slice semantics).
        if not 1 <= start <= end <= count:
            raise ValueError(f"selection {part.strip()!r} is outside 1-{count}")
        indices.extend(range(start - 1, end))
    return list(dict.fromkeys(indices))


def select_files():
    """
    List and allow the user to select text files from a directory.

    The directory comes from select_subdirectory(); when that returns None the
    current working directory is used. Regular files whose name ends in '.txt' are
    listed with 1-based numbers. The user is prompted until the answer is empty
    (all files) or a valid selection like "1,3-5". Every attempt is parsed from
    scratch, so a rejected answer never leaks partial picks into the next one.

    :return: list of full paths of the selected files; empty when the directory
        contains no text files.
    """
    directory = select_subdirectory()
    if directory is None:
        directory = os.getcwd()
    txt_files = sorted(
        f for f in os.listdir(directory)
        if f.endswith('.txt') and os.path.isfile(os.path.join(directory, f))
    )
    if not txt_files:
        print("No text files found in the selected directory.")
        return []
    print("Available text files:")
    for i, file in enumerate(txt_files):
        print(f"{i + 1}. {file}")
    while True:
        user_input = input("Select file numbers (e.g., 1,3-5 or press Enter to select all): ").strip()
        if not user_input:
            selected_files = txt_files
            break
        try:
            indices = _parse_selection(user_input, len(txt_files))
        except ValueError:
            print("Invalid input. Please enter valid numbers or ranges.")
            continue
        selected_files = [txt_files[index] for index in indices]
        break
    return [os.path.join(directory, file) for file in selected_files]

def main():
    """
    Run the interactive workflow: pick files, pick the mode, process and save.

    Nothing happens when no files were selected. Otherwise the user is asked whether
    to lemmatize; only the answer 'y' (any case, surrounding spaces ignored) enables
    the lemmatizer, every other answer selects stemming. Each selected file is then
    processed and written out in turn.
    """
    selected_paths = select_files()
    if selected_paths:
        answer = input("Use FrenchLefffLemmatizer? (y/n, default is n): ")
        use_lemmatizer = answer.strip().lower() == 'y'
        for path in selected_paths:
            result = process_file(path, use_lemmatizer)
            save_processed_content(path, result, use_lemmatizer)

In [2]:
# Start the interactive workflow only when this code runs as the main program
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Available subdirectories:
1. .ipynb_checkpoints
2. Démonomanie
3. République
4. Théatre
5. lemmatized


Select a subdirectory number for target files (leave empty for current directory):  


Available text files:
1. Discours des raisons_corrected.txt
2. Démonomanie III_corrected.txt
3. Démonomanie II_corrected.txt
4. Démonomanie IV_corrected.txt
5. Démonomanie I_corrected.txt
6. Démonomanie preface Repair_corrected.txt
7. Harangue - Fontainebleau_corrected.txt
8. Harangue - Orléans 2_corrected.txt
9. Harangue - Orléans_corrected.txt
10. Harangue - Poissy_corrected.txt
11. Harangue - Rouen_corrected.txt
12. Harangue - Saint Germain_corrected.txt
13. Harangue - lit de justice_corrected.txt
14. Harangue - ouverture de parlement_corrected.txt
15. Harangue - parlement 2_corrected.txt
16. Harangue - parlement 3_corrected.txt
17. Harangue - parlement_corrected.txt
18. Harangue - religion_corrected.txt
19. Harangue - septembre_corrected.txt
20. La réponse_corrected.txt
21. Le paradoxe_corrected.txt
22. Lettre_corrected.txt
23. Lit de justice_corrected.txt
24. Memoire - Namur_corrected.txt
25. Memoire - le but_corrected.txt
26. Memoire au roi_corrected.txt
27. Memoires d'État Refug

Select file numbers (e.g., 1,3-5 or press Enter to select all):  
Use FrenchLefffLemmatizer? (y/n, default is n):  


Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Chapterized/lemmatized/Discours des raisons_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Chapterized/lemmatized/Démonomanie III_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Chapterized/lemmatized/Démonomanie II_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Chapterized/lemmatized/Démonomanie IV_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Chapterized/lemmatized/Démonomanie I_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Chapterized/lemmatized/Démonomanie preface Repair_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Chapterized/lemmatized/Harangue - Fontainebleau_corrected_stemmed.txt
Processed content sa