In [5]:
#!pip install --upgrade --force-reinstall git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
import os
import french_lefff_lemmatizer 
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
lefff_data_path = os.path.join(os.path.dirname(french_lefff_lemmatizer.__file__), "data", "lefff-3.4.mlex")

from nltk.stem.snowball import SnowballStemmer

# Initialize the stemmer and lemmatizer for French
stemmer = SnowballStemmer(language='french')
lemmatizer = FrenchLefffLemmatizer(lefff_file_path=lefff_data_path)

# Global flag to control application of custom rules without mutating the dictionary.
# If not yet defined (e.g., when running this cell first), default to True.
try:
    APPLY_CUSTOM_RULES
except NameError:
    APPLY_CUSTOM_RULES = True

# Define custom stemming/lemmatization rules sorted alphabetically with five entries per line
CUSTOM_RULES = {
    "abfolu": "absolu", "abfoluë": "absolu", "accuf": "accus", "aduient": "advient", "aduis": "advis",
    "affemble": "assemble", "aguerr": "guerr", "aifn": "aisn", "alor": "alors", "ariftot": "aristot", "aufquel": "ausquel",
    "auantag": "avantag", "auon": "avon", "auoyent": "avoyent", "beft": "best", "bourgcoif": "bourgeois",
    "cai": "cayer", "ceft": "cest", "chágement": "changemen", "comand": "command", "comiffion": "commission",
    "commiffion": "commission", "commiflair": "commissair", "commiflion": "commission", "confent ": "consent", "confeff": "confess",
    "conful": "consul", "coft": "cost", "costé": "côté", "debuoir": "debvoir", "defquel": "desquel",
    "desloix": "des loi", "deteft": "detest", "difcord": "discord", "difent": "disent", "difoit": "disoit",
    "diuif": "divis", "diuin": "divin", "droi": "doit", "droict": "droit", "edit": "édict",
    "édictz": "édict", "eglif": "eglis", "efcript": "escript", "efpaign": "espaigne", "efpargn": "espargn",
    "efpec": "espec", "efprit": "esprit", "eftant": "estant", "efté": "esté", "eftim": "estim",
    "eftoyent": "estoyent", "empefch": "empesch", "enfans": "enfant", "enfembl": "ensembl", "ennem": "ennemy", "ensan": "enfant",
    "ensans": "enfant", "enuer": "enver", "esglis": "églis", "espic": "épice", "estar": "estat",
    "estatz": "estat", "état": "estat", "euft": "eust", "fag": "sag", "faif": "sais",
    "faifoit": "faisoit", "facrific": "sacrific", "fedit": "sedit", "feliqu": "felicit", "fembl": "sembl", "femblabl": "semblabl",
    "fecond": "second", "fept": "sept", "feulg": "seul", "felon": "selon", "fent": "sent",
    "feroit": "seroit", "fcauoir": "savoir", "fcigneur": "seigneur", "fçauoir": "savoir", "finon": "sinon",
    "fimpl": "simpl", "fignif": "signif", "file": "fill", "fong": "song", "foyent": "soyent",
    "fouuet": "souvent", "fucced": "succeed", "fucceffeur": "successeur", "fuft": "fust", "foudain": "soudain",
    "gouuern": "gouvern", "gouuerneur": "gouverneur", "hebrieux": "hebrieu", "iug": "jug", "iufqu": "iusqu",
    "iurifdiet": "iurisdict", "iurifdict": "iurisdict", "iust": "just", "iustic": "justic", "impoft": "impost",
    "laloy": "la loi", "laiff": "laiss", "lefquel": "lesquel", "leroy": "le roy", "lesloix": "les loi",
    "lifon": "lir", "liur": "livr", "loix": "loi", "loy": "loi", "loyx": "phoi",
    "maicft": "maiest", "maifon": "maison", "maiftr": "maistr", "majeft": "majest", "magiftrat": "magistrat",
    "magiftrats": "magistrat", "magiltrat": "magistrat", "mefine": "même", "mefines": "mêmes", "meím": "mesm",
    "mesmoir": "memoir", "monftr": "monstr", "naturc": "natur", "noftr": "nostr", "obeiff": "obeiss",
    "obeifl": "obeiss", "pai": "pay", "parlem": "parlement", "parlements": "parlement", "penf": "pens",
    "peuuent": "peuvent", "pluficur": "plusieur", "plutfoft": "plustost", "pouuoir": "pouvoir", "pouuoit": "pouvoit",
    "pourueu": "pourveu", "prefent": "present", "prefqu": "presqu", "preftr": "prestr", "prerogatiu": "prerogativ",
    "prifonni": "prisonni", "prin c": "prince", "prin ces": "princ", "pris": "prix", "priuileg": "privileg", "priuileig": "privileg", "procez": "proc",
    "puiff": "puiss", "puifle": "puiss", "puiffanc": "puissanc", "quc": "que", "queftion": "question",
    "raifon": "raison", "reffort": "ressort", "republie": "republ", "repvbliqu": "republ", "républicque": "republ","respublicqu":"republ",
    "royaulm": "royaum", "royau me": "royaum", "sorcicr": "sorcier", "sorciere": "sorci", "sorcieres": "sorci",
    "soubverain": "souverain", "souverian": "souverain", "souverianet": "souverainet", "subject": "sujet", "subjectz": "sujet",
    "tiltr": "titr", "toufiour": "tousiour", "traitt": "traict", "trouu": "trouv", "vaffal": "vassal", "vaflal": "vassal",
    "vertus": "vertu", "viur": "viv", "vifion": "vision", "ftatut": "statut", "blafph": "blasph", 
    "blafphem":"blasphem", "feruir":"servyr", "iuft":"just","mefchan":"meschan","mefchancet":"meschancet", "perpetucl":"perpetuel", "ferment":"serment",
    "commád":"command", "commifl":"commiss", "defenf":"defens","fuperieur":"superieur","iurifconfult":"jurisconsult","ofhcier":"officier",
    "iour":"jour", "saig":"sag", "genz":"gen","soyt":"soit", "hommaig":"hommag", "doibt":"doit", "ruyn":"ruin",
    "feul":"seul", "fixes":"fixé", "fubftanc":"substanc", "debvoir":"devoir", "doibvent":"dev","facon":"façon",
    "aduint":"advint", "receuoir":"recevoir", "grad":"grand", "impoffibl":"impossibl", "prif":"pris", "iur":"jur",
    "publicqu":"public", 'hault':'haut', "maulv":"mauvais", "maulvais":"mauvais",
    "mauu":"mauvais", "fang":"sang", "pauur":"pauvr", "noy":"roy", "paff":"pass", "fecret":"secret", "rendr":"rend",
    "veut":"veu", "scavoir":"savoir", "vivr":"viv", "judg":"jug", "présen":"present", "longu":"long", "majest":"maiest",
    "nobleff":"nobless", "neceffair":"necessair","ailles":"aile", "ailes":"aile", "verit":"vérit",
    "teneus":"tenu", "veult":"veu", "vouloit":"veu", "jehan":"jean", "vouleu":"veu", "voul":"veu", "veulent":"veu",
    "efleu":"elu", "esleu":"elu", "befoin":"besoin", "affeur":"asseur","neceflair":"necessair","ariftocrat":"aristocrat",
    "rend":"rend", "besoing":"besoin", "courts":"cour", "prennent":"prend", "occafion":"occasion", "philofoph":"philosoph",
    "miz":"mis","prouinc":"provinc","voulut":"veu","grád":"grand", "cognoiftr":"cognoistr", "faueur":"faveur",
    "veneu":"venu","voulu":"veu","daulphin":"dauphin","veoi":"voit","chaff":"chass","proche":"proche", "proches":"proche",
    "enuoi":"envoi","puiffant":"puissan", "puiffante":"puissan","roi":"roy","rois":"roi","servy":"servyr","maulx":"mal","uoir":"voir","hiftoir":"histoir",
    "expérient":"expérienc", "pos  sible":"possible", "pos ible":"possible", "beaulx":"beau", "mœur":"moeur", "perfon":"person", "fcienc":"scienc", "auant":"avant",
    "louis":"louis", "ambaffadeur":"ambassadeur", "confent":"consent","preuu":"preuv", "enten":"entend",
    "justicf":"justice", "efgal":"esgal", "diuifion":"division", "efpaignol":"espaignol", "demonftr":"demonstr",
    "oifeau":"oiseau", "conuen":"conven", "lug": "jug", "eleétion": "election", "refolu":"resolu",
    "renuerf":"renvers", "phyficien":"physicien"
}

def custom_stem_or_lemmatize(word, use_lemmatizer):
    """
    Apply custom rules during stemming or lemmatizing on a single token.
    """
    use_rules = globals().get('APPLY_CUSTOM_RULES', True)
    if use_rules and word in CUSTOM_RULES:
        return CUSTOM_RULES[word]
    if use_lemmatizer:
        lemmatized_word = lemmatizer.lemmatize(word)
        return CUSTOM_RULES.get(lemmatized_word, lemmatized_word) if use_rules else lemmatized_word
    else:
        stemmed_word = stemmer.stem(word)
        return CUSTOM_RULES.get(stemmed_word, stemmed_word) if use_rules else stemmed_word

def return_stem(text, use_lemmatizer=False):
    """
    Replace multi-token sequences first, then process the text token by token.

    Multi-token replacement works by scanning the entire text for any key in CUSTOM_RULES
    that contains a space. Since these keys represent phrases that should be merged into a single token,
    the code replaces each occurrence with its corresponding value (which does not contain the space).
    This way, when the text is later split into tokens (using text.split()),
    phrases like "royau me" (if present in CUSTOM_RULES) will be replaced by "royaum" and treated as one token.
    """
    # Replace multi-token sequences before tokenizing, but only if the flag is enabled.
    if globals().get('APPLY_CUSTOM_RULES', True):
        for phrase, replacement in CUSTOM_RULES.items():
            if " " in phrase:
                text = text.replace(phrase, replacement)
    tokens = text.split()
    final_processed = [custom_stem_or_lemmatize(token, use_lemmatizer) for token in tokens]
    return ' '.join(final_processed)

def process_file(file_path, use_lemmatizer=True):
    """
    Process the contents of a file using custom rules.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    processed_content = return_stem(content, use_lemmatizer)
    return processed_content

def save_processed_content(original_file, processed_content, use_lemmatizer):
    """
    Save processed content to a new file in a 'lemmatized' subdirectory.
    """
    output_dir = os.path.join(os.getcwd(), 'lemmatized')
    os.makedirs(output_dir, exist_ok=True)
    suffix = '_lemmatized.txt' if use_lemmatizer else '_stemmed.txt'
    new_file_name = os.path.splitext(os.path.basename(original_file))[0] + suffix
    new_file_path = os.path.join(output_dir, new_file_name)
    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(processed_content)
    print(f"Processed content saved to {new_file_path}")

def select_subdirectory():
    """
    Prompt the user to select a subdirectory or use the current directory.
    """
    current_directory = os.getcwd()
    subdirectories = sorted([d for d in os.listdir(current_directory) if os.path.isdir(d)])
    if not subdirectories:
        print("No subdirectories found.")
        return None
    print("Available subdirectories:")
    for i, subdir in enumerate(subdirectories):
        print(f"{i + 1}. {subdir}")
    while True:
        user_input = input("Select a subdirectory number for target files (leave empty for current directory): ").strip()
        if not user_input:
            return current_directory
        try:
            choice = int(user_input) - 1
            if choice < 0 or choice >= len(subdirectories):
                print("Invalid choice. Please enter a valid number.")
                continue
            return os.path.join(current_directory, subdirectories[choice])
        except ValueError:
            print("Invalid input. Please enter a number.")

def select_files():
    """
    List and allow the user to select text files from a directory.
    """
    directory = select_subdirectory()
    if directory is None:
        directory = os.getcwd()
    txt_files = sorted([f for f in os.listdir(directory) if f.endswith('.txt')])
    if not txt_files:
        print("No text files found in the selected directory.")
        return []
    print("Available text files:")
    for i, file in enumerate(txt_files):
        print(f"{i + 1}. {file}")
    selected_files = []
    while True:
        user_input = input("Select file numbers (e.g., 1,3-5 or press Enter to select all): ").strip()
        if not user_input:
            selected_files = txt_files
            break
        try:
            parts = user_input.split(',')
            for part in parts:
                if '-' in part:
                    start, end = map(int, part.split('-'))
                    selected_files.extend(txt_files[start - 1:end])
                else:
                    selected_files.append(txt_files[int(part) - 1])
            break
        except (ValueError, IndexError):
            print("Invalid input. Please enter valid numbers or ranges.")
    selected_files = [os.path.join(directory, file) for file in selected_files]
    return selected_files

def main():
    """
    Main function to handle file processing.
    """
    file_paths = select_files()
    if not file_paths:
        return
    use_lemmatizer = input("Use FrenchLefffLemmatizer? (y/n, default is n): ").strip().lower() == 'y'
    for file_path in file_paths:
        processed_content = process_file(file_path, use_lemmatizer)
        save_processed_content(file_path, processed_content, use_lemmatizer)

In [6]:
# Display all custom rule entries sorted alphabetically by key
# This does not modify CUSTOM_RULES; it just presents a sorted view.
try:
    sorted_custom_rules_items = sorted(CUSTOM_RULES.items(), key=lambda kv: kv[0])
    print(f"Total custom rules: {len(sorted_custom_rules_items)}")
    for k, v in sorted_custom_rules_items:
        print(f"{k}\t=>\t{v}")
except NameError:
    print("CUSTOM_RULES is not defined yet. Run the cell defining it first.")

Total custom rules: 303
abfolu	=>	absolu
abfoluë	=>	absolu
accuf	=>	accus
aduient	=>	advient
aduint	=>	advint
aduis	=>	advis
affemble	=>	assemble
affeur	=>	asseur
aguerr	=>	guerr
aifn	=>	aisn
ailes	=>	aile
ailles	=>	aile
alor	=>	alors
ambaffadeur	=>	ambassadeur
ariftocrat	=>	aristocrat
ariftot	=>	aristot
auant	=>	avant
auantag	=>	avantag
aufquel	=>	ausquel
auon	=>	avon
auoyent	=>	avoyent
beaulx	=>	beau
befoin	=>	besoin
beft	=>	best
besoing	=>	besoin
blafph	=>	blasph
blafphem	=>	blasphem
bourgcoif	=>	bourgeois
cai	=>	cayer
ceft	=>	cest
chaff	=>	chass
chágement	=>	changemen
coft	=>	cost
cognoiftr	=>	cognoistr
comand	=>	command
comiffion	=>	commission
commiffion	=>	commission
commifl	=>	commiss
commiflair	=>	commissair
commiflion	=>	commission
commád	=>	command
confeff	=>	confess
confent	=>	consent
confent 	=>	consent
conful	=>	consul
conuen	=>	conven
costé	=>	côté
courts	=>	cour
daulphin	=>	dauphin
debuoir	=>	debvoir
debvoir	=>	devoir
defenf	=>	defens
defquel	=>	desquel
demonftr	=>	demon

In [7]:
# Toggle whether to apply custom lemmatization rules (default: yes)
try:
    # Ensure CUSTOM_RULES exists even if cells run out of order
    CUSTOM_RULES
except NameError:
    CUSTOM_RULES = {}
    
resp = input("Apply custom lemmatization rules? (Y/n, press Enter for Yes): ").strip().lower()
APPLY_CUSTOM_RULES = (resp in ("", "y", "yes"))
if not APPLY_CUSTOM_RULES:
    # Disable by using a flag so downstream lookups become no-ops (without modifying the dictionary)
    # CUSTOM_RULES.clear()  # <-- no longer needed; preserved for reference but intentionally not executed
    print("Custom rules disabled for this run.")
else:
    print("Custom rules enabled.")

KeyboardInterrupt: Interrupted by user

In [None]:
if __name__ == "__main__":
    main()

Available subdirectories:
1. .ipynb_checkpoints
2. Démonomanie
3. République
4. Théatre
5. lemmatized
Available text files:
1. Discours des raisons_corrected.txt
2. Démonomanie III_corrected.txt
3. Démonomanie II_corrected.txt
4. Démonomanie IV_corrected.txt
5. Démonomanie I_corrected.txt
6. Démonomanie preface Repair_corrected.txt
7. Harangue - Fontainebleau_corrected.txt
8. Harangue - Orléans 2_corrected.txt
9. Harangue - Orléans_corrected.txt
10. Harangue - Poissy_corrected.txt
11. Harangue - Rouen_corrected.txt
12. Harangue - Saint Germain_corrected.txt
13. Harangue - lit de justice_corrected.txt
14. Harangue - ouverture de parlement_corrected.txt
15. Harangue - parlement 2_corrected.txt
16. Harangue - parlement 3_corrected.txt
17. Harangue - parlement_corrected.txt
18. Harangue - religion_corrected.txt
19. Harangue - septembre_corrected.txt
20. La réponse_corrected.txt
21. Le paradoxe_corrected.txt
22. Lettre_corrected.txt
23. Lit de justice_corrected.txt
24. Memoire - Namur_corre