In [1]:
import os
from nltk.stem.snowball import SnowballStemmer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

# Shared French text-normalization tools, built once at import time and used
# by return_stem(): a Snowball stemmer configured for French, and the
# FrenchLefffLemmatizer with its default construction arguments.
stemmer = SnowballStemmer(language='french')
lemmatizer = FrenchLefffLemmatizer()

def return_stem(text, use_lemmatizer=True):
    """Normalize every whitespace-separated word of ``text``.

    Args:
        text: The raw text to process. It is split on any whitespace, so the
            original spacing and line breaks are not preserved.
        use_lemmatizer: When true, each word goes through the module-level
            ``lemmatizer``; otherwise through the module-level ``stemmer``.

    Returns:
        The processed words joined with single spaces.
    """
    # Pick the word-level transformation once, then apply it to each word.
    if use_lemmatizer:
        normalize = lemmatizer.lemmatize
    else:
        normalize = stemmer.stem
    return ' '.join([normalize(word) for word in text.split()])

def process_file(file_path, use_lemmatizer=True):
    """Read a UTF-8 text file and return its normalized contents.

    Args:
        file_path: Path of the text file to read.
        use_lemmatizer: Forwarded to ``return_stem`` to choose between
            lemmatizing and stemming.

    Returns:
        The string produced by ``return_stem`` for the file's full text.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return return_stem(raw_text, use_lemmatizer)

def save_processed_content(original_file, processed_content):
    """Write processed text into the ``lemmatized`` folder of the working directory.

    The output file is named after the input file's base name, with its
    extension replaced by ``_stemmed.txt``. The folder is created if missing,
    and the destination path is printed once the file has been written.

    Args:
        original_file: Path of the source file; only its base name is used.
        processed_content: The text to write, encoded as UTF-8.
    """
    output_dir = os.path.join(os.getcwd(), 'lemmatized')
    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.basename(original_file)
    stem, _extension = os.path.splitext(base_name)
    target_path = os.path.join(output_dir, stem + '_stemmed.txt')

    with open(target_path, 'w', encoding='utf-8') as sink:
        sink.write(processed_content)
    print(f"Processed content saved to {target_path}")

def select_subdirectory():
    """Interactively choose a subdirectory of the current working directory.

    The subdirectories are listed in sorted order, numbered from 1, and the
    user is prompted until a valid answer is given.

    Returns:
        ``None`` when the working directory has no subdirectories; the
        working directory itself when the user submits an empty answer;
        otherwise the full path of the chosen subdirectory.
    """
    base_dir = os.getcwd()
    # Entries are tested relative to the working directory, which is the
    # directory being listed.
    candidates = [entry for entry in os.listdir(base_dir) if os.path.isdir(entry)]
    candidates.sort()
    if not candidates:
        print("No subdirectories found.")
        return None

    print("Available subdirectories:")
    for number, name in enumerate(candidates, start=1):
        print(f"{number}. {name}")

    while True:
        answer = input("Select a subdirectory number for target files (leave empty for current directory): ").strip()
        if answer == "":
            return base_dir

        try:
            number = int(answer)
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue

        if 1 <= number <= len(candidates):
            return os.path.join(base_dir, candidates[number - 1])
        print("Invalid choice. Please enter a valid number.")

def _parse_file_selection(selection, file_count):
    """Translate a selection such as ``"1,3-5"`` into zero-based indices.

    Args:
        selection: Comma-separated 1-based numbers and ``start-end`` ranges.
        file_count: Number of entries in the listing being selected from.

    Returns:
        Zero-based indices in the order given, without duplicates.

    Raises:
        ValueError: If any part is not a number or range, starts below 1,
            starts past the end of the listing, or is a reversed range.
    """
    indices = []
    for part in selection.split(','):
        if '-' in part:
            # Unpacking fails with ValueError for shapes like "1-2-3" or "-1".
            first, last = map(int, part.split('-'))
        else:
            first = last = int(part)
        # Reject 0 and negatives explicitly: they would otherwise wrap around
        # and silently pick entries from the end of the listing.
        if first < 1 or first > file_count or first > last:
            raise ValueError(f"selection {part.strip()!r} is out of range")
        # A range end past the listing is clamped, as slicing always did.
        indices.extend(range(first - 1, min(last, file_count)))
    # Overlapping parts (e.g. "1-3,2-4") must not process a file twice.
    return list(dict.fromkeys(indices))


def select_files():
    """Interactively choose ``.txt`` files from a user-selected directory.

    The directory comes from ``select_subdirectory()``, falling back to the
    current working directory when that returns ``None``. The regular files
    ending in ``.txt`` are listed in sorted order, numbered from 1, and the
    user is prompted until the answer is empty or fully valid. An invalid
    answer selects nothing, so a retry starts from a clean slate.

    Returns:
        Full paths of the selected files, in the order given and without
        duplicates; an empty list when there are no text files or the user
        submits an empty answer.
    """
    directory = select_subdirectory()
    if directory is None:
        directory = os.getcwd()

    # Only regular files: a directory named "*.txt" cannot be opened later.
    txt_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.txt') and os.path.isfile(os.path.join(directory, name))
    )
    if not txt_files:
        print("No text files found in the selected directory.")
        return []

    print("Available text files:")
    for number, name in enumerate(txt_files, start=1):
        print(f"{number}. {name}")

    while True:
        user_input = input("Select file numbers (individual or ranges, e.g., 1,3-5): ").strip()
        if not user_input:
            return []

        try:
            chosen = _parse_file_selection(user_input, len(txt_files))
        except ValueError:
            print("Invalid input. Please enter valid numbers or ranges.")
            continue
        return [os.path.join(directory, txt_files[index]) for index in chosen]

def main():
    """Run the interactive workflow: choose a mode, pick files, process each.

    Any answer other than ``n`` (ignoring case and surrounding whitespace)
    selects the lemmatizer; ``n`` selects the stemmer. Each selected file is
    processed and its result saved via ``save_processed_content``.
    """
    answer = input("Use FrenchLefffLemmatizer? (y/n, default is y): ")
    use_lemmatizer = answer.strip().lower() != 'n'

    chosen_paths = select_files()
    if not chosen_paths:
        return

    for path in chosen_paths:
        result = process_file(path, use_lemmatizer)
        save_processed_content(path, result)

# Start the interactive workflow only when this code runs as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()

Use FrenchLefffLemmatizer? (y/n, default is y):  


Available subdirectories:
1. .ipynb_checkpoints


Select a subdirectory number (leave empty for current directory):  


Available text files:
1. Discours des raisons_corrected.txt
2. Démonomanie Repair_corrected.txt
3. Harangue - Fontainebleau_corrected.txt
4. Harangue - Orléans 2_corrected.txt
5. Harangue - Orléans_corrected.txt
6. Harangue - Poissy_corrected.txt
7. Harangue - Rouen_corrected.txt
8. Harangue - Saint Germain_corrected.txt
9. Harangue - lit de justice_corrected.txt
10. Harangue - ouverture de parlement_corrected.txt
11. Harangue - parlement 2_corrected.txt
12. Harangue - parlement 3_corrected.txt
13. Harangue - parlement_corrected.txt
14. Harangue - religion_corrected.txt
15. Harangue - septembre_corrected.txt
16. La réponse_corrected.txt
17. Le paradoxe_corrected.txt
18. Lettre_corrected.txt
19. Lit de justice_corrected.txt
20. Memoire - Namur_corrected.txt
21. Memoire - le but_corrected.txt
22. Memoire au roi_corrected.txt
23. Memoires d'État Refuge_corrected.txt
24. Memoires d'état_corrected.txt
25. Recueil_corrected.txt
26. Remonstrances - Royaume_corrected.txt
27. Remonstrances - pa

Select file numbers (individual or ranges, e.g., 1,3-5):  1-36


Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Discours des raisons_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Démonomanie Repair_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Fontainebleau_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Orléans 2_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Orléans_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Poissy_corrected_stemmed.txt
Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected/lemmatized/Harangue - Rouen_corrected_stemmed.txt
Processed content saved to 