In [1]:
import os
from nltk.stem.snowball import SnowballStemmer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

# Module-level French NLP helpers used by apply_stemmer_or_lemmatizer():
# a Snowball stemmer and a Lefff-based lemmatizer, each built once at load time.
stemmer = SnowballStemmer("french")
lemmatizer = FrenchLefffLemmatizer()

# Hand-written stem overrides, looked up by the exact token text.
# Add further word -> stem pairs here as needed.
CUSTOM_RULES = dict(
    sorciere="sorci",
    sorcieres="sorci",
)

def custom_stem(word):
    """
    Look up word in CUSTOM_RULES and return its custom stem.

    Words without an entry are returned unchanged; no stemmer or lemmatizer
    is invoked by this function. The lookup is an exact match on the string.
    """
    return CUSTOM_RULES.get(word, word)

def apply_stemmer_or_lemmatizer(word, use_lemmatizer):
    """
    Normalize a single word with one of the two module-level tools.

    A truthy use_lemmatizer routes the word through lemmatizer.lemmatize();
    otherwise the word goes through stemmer.stem(). The tool's result is
    returned as-is.
    """
    if not use_lemmatizer:
        return stemmer.stem(word)
    return lemmatizer.lemmatize(word)

def return_stem(text, use_lemmatizer=False):
    """
    Normalize every whitespace-separated token of text.

    Tokens that have an entry in CUSTOM_RULES are replaced by their custom
    stem, and that stem is final. All other tokens are passed to the
    lemmatizer (use_lemmatizer truthy) or the Snowball stemmer (default).

    Args:
        text: The text to process. It is split with str.split(), so any run
            of whitespace (including newlines) separates tokens.
        use_lemmatizer: Choose the lemmatizer instead of the stemmer for
            tokens not covered by a custom rule.

    Returns:
        The processed tokens joined by single spaces. The original line
        breaks and spacing are not preserved.
    """
    processed = []
    for token in text.split():
        if token in CUSTOM_RULES:
            # A custom stem is the intended end result. Running it through
            # the stemmer/lemmatizer as well could alter it again and defeat
            # the override, so the generic tool is only a fallback.
            processed.append(custom_stem(token))
        else:
            processed.append(apply_stemmer_or_lemmatizer(token, use_lemmatizer))
    return ' '.join(processed)

def process_file(file_path, use_lemmatizer=True):
    """
    Read a UTF-8 text file and return its processed text.

    The whole file is read at once and handed to return_stem() together with
    use_lemmatizer (which defaults to True here); return_stem()'s result is
    returned unchanged.
    """
    with open(file_path, encoding='utf-8') as source:
        raw_text = source.read()
    return return_stem(raw_text, use_lemmatizer)

def save_processed_content(original_file, processed_content, use_lemmatizer):
    """
    Write processed text into a 'lemmatized' folder under the working directory.

    The output name is the base name of original_file without its extension,
    followed by '_lemmatized.txt' when use_lemmatizer is truthy and
    '_stemmed.txt' otherwise. The folder is created if it does not exist, the
    file is written as UTF-8, and the destination path is printed.
    """
    out_dir = os.path.join(os.getcwd(), 'lemmatized')
    os.makedirs(out_dir, exist_ok=True)

    base_name, _extension = os.path.splitext(os.path.basename(original_file))
    if use_lemmatizer:
        out_name = base_name + '_lemmatized.txt'
    else:
        out_name = base_name + '_stemmed.txt'
    new_file_path = os.path.join(out_dir, out_name)

    with open(new_file_path, 'w', encoding='utf-8') as handle:
        handle.write(processed_content)
    print(f"Processed content saved to {new_file_path}")

def select_subdirectory():
    """
    Ask the user which subdirectory of the working directory to use.

    The subdirectories are listed in sorted order with 1-based numbers. An
    empty answer selects the working directory itself; a valid number selects
    the matching subdirectory; anything else prints a message and asks again.

    Returns:
        The absolute path of the chosen directory, or None (after printing a
        notice) when the working directory contains no subdirectories.
    """
    base_dir = os.getcwd()
    # Entry names are checked relative to the working directory, which is
    # the directory being listed.
    candidates = [entry for entry in os.listdir(base_dir) if os.path.isdir(entry)]
    candidates.sort()
    if not candidates:
        print("No subdirectories found.")
        return None

    print("Available subdirectories:")
    for number, name in enumerate(candidates, start=1):
        print(f"{number}. {name}")

    while True:
        answer = input("Select a subdirectory number for target files (leave empty for current directory): ").strip()
        if answer == "":
            return base_dir

        try:
            index = int(answer) - 1
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue

        if 0 <= index < len(candidates):
            return os.path.join(base_dir, candidates[index])
        print("Invalid choice. Please enter a valid number.")

def _parse_file_selection(selection, txt_files):
    """
    Translate a selection string such as '1,3-5' into file names.

    Numbers are 1-based positions in txt_files. A range 'a-b' is inclusive;
    its end may exceed the list length, in which case it stops at the last
    file.

    Raises:
        ValueError: A part is not a number or a 'start-end' pair, a number is
            below 1, or a range is reversed.
        IndexError: A single number or a range start lies beyond the list.
    """
    chosen = []
    for part in selection.split(','):
        if '-' in part:
            start, end = map(int, part.split('-'))
            if start < 1 or end < start:
                raise ValueError(f"invalid range: {part!r}")
            if start > len(txt_files):
                raise IndexError(f"range start out of bounds: {part!r}")
            chosen.extend(txt_files[start - 1:end])
        else:
            position = int(part)
            # Reject 0 and negatives explicitly: list indexing would accept
            # them and silently pick files counted from the end.
            if position < 1:
                raise ValueError(f"invalid file number: {part!r}")
            chosen.append(txt_files[position - 1])
    return chosen


def select_files():
    """
    List the .txt files of a user-chosen directory and let the user pick some.

    The directory comes from select_subdirectory(); when that returns None
    the working directory is used. Files are shown in sorted order with
    1-based numbers, and the user enters numbers and/or inclusive ranges
    separated by commas (e.g. '1,3-5'). Invalid input prints a message and
    asks again; each attempt is parsed from scratch, so a rejected attempt
    contributes nothing to the final selection.

    Returns:
        A list of paths (directory joined with file name) in the order
        selected. The list is empty when the directory has no .txt files or
        the user submits an empty answer.
    """
    directory = select_subdirectory()
    if directory is None:
        directory = os.getcwd()

    txt_files = sorted(f for f in os.listdir(directory) if f.endswith('.txt'))
    if not txt_files:
        print("No text files found in the selected directory.")
        return []

    print("Available text files:")
    for number, file in enumerate(txt_files, start=1):
        print(f"{number}. {file}")

    selected_files = []
    while True:
        user_input = input("Select file numbers (individual or ranges, e.g., 1,3-5): ").strip()
        if not user_input:
            break

        try:
            # Assign only after the whole input parsed successfully so a
            # partially valid answer cannot leak entries into the result.
            selected_files = _parse_file_selection(user_input, txt_files)
            break
        except (ValueError, IndexError):
            print("Invalid input. Please enter valid numbers or ranges.")

    return [os.path.join(directory, file) for file in selected_files]

def main():
    """
    Run the interactive workflow: pick files, pick the tool, process, save.

    Nothing happens when no files are selected. Otherwise the user is asked
    whether to use the lemmatizer (only the answer 'y', case-insensitive,
    enables it), and each selected file is processed and saved in turn.
    """
    file_paths = select_files()
    if file_paths:
        answer = input("Use FrenchLefffLemmatizer? (y/n, default is n): ").strip().lower()
        use_lemmatizer = answer == 'y'
        for path in file_paths:
            result = process_file(path, use_lemmatizer)
            save_processed_content(path, result, use_lemmatizer)

In [2]:
# Start the interactive workflow when this code is executed directly
# (the guard keeps main() from running if the code is imported as a module).
if __name__ == "__main__":
    main()

Available subdirectories:
1. .ipynb_checkpoints
2. lemmatized


Select a subdirectory number for target files (leave empty for current directory):  


Available text files:
1. Discours des raisons_corrected.txt
2. Démonomanie Repair_corrected.txt
3. Harangue - Fontainebleau_corrected.txt
4. Harangue - Orléans 2_corrected.txt
5. Harangue - Orléans_corrected.txt
6. Harangue - Poissy_corrected.txt
7. Harangue - Rouen_corrected.txt
8. Harangue - Saint Germain_corrected.txt
9. Harangue - lit de justice_corrected.txt
10. Harangue - ouverture de parlement_corrected.txt
11. Harangue - parlement 2_corrected.txt
12. Harangue - parlement 3_corrected.txt
13. Harangue - parlement_corrected.txt
14. Harangue - religion_corrected.txt
15. Harangue - septembre_corrected.txt
16. La réponse_corrected.txt
17. Le paradoxe_corrected.txt
18. Lettre_corrected.txt
19. Lit de justice_corrected.txt
20. Memoire - Namur_corrected.txt
21. Memoire - le but_corrected.txt
22. Memoire au roi_corrected.txt
23. Memoires d'État Refuge_corrected.txt
24. Memoires d'état_corrected.txt
25. Recueil_corrected.txt
26. Remonstrances - Royaume_corrected.txt
27. Remonstrances - pa

Select file numbers (individual or ranges, e.g., 1,3-5):  1-36
Use FrenchLefffLemmatizer? (y/n, default is n):  


Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Discours des raisons_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Démonomanie Repair_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Fontainebleau_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Orléans 2_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Orléans_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Poissy_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Rouen_corrected_stemmed.txt
Processed content saved to 