In [1]:
import os
from nltk.stem.snowball import SnowballStemmer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

# Initialize the stemmer and lemmatizer for French.
# Both objects are created once at module level and are the instances that
# return_stem() uses for every token it processes.
stemmer = SnowballStemmer(language='french')
lemmatizer = FrenchLefffLemmatizer()

def return_stem(text, use_lemmatizer=True):
    """Normalize every whitespace-separated token of *text*.

    Args:
        text: Input string. It is split on any run of whitespace, so the
            original spacing and line breaks are not preserved.
        use_lemmatizer: When truthy, each token is passed to the module-level
            ``lemmatizer``; otherwise to the module-level ``stemmer``.

    Returns:
        The processed tokens joined by single spaces.
    """
    words = text.split()
    if not use_lemmatizer:
        return ' '.join(stemmer.stem(word) for word in words)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

def process_file(file_path, use_lemmatizer=True):
    """Read a UTF-8 text file and return its stemmed/lemmatized content.

    Args:
        file_path: Path of the file to read.
        use_lemmatizer: Forwarded unchanged to ``return_stem``.

    Returns:
        Whatever ``return_stem`` produces for the file's full text.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return return_stem(raw_text, use_lemmatizer)

def save_processed_content(original_file, processed_content):
    """Write *processed_content* into the ``lemmatized`` folder of the cwd.

    The output file is named after the base name of *original_file* with its
    extension replaced by ``_stemmed.txt``. The folder is created if it does
    not exist yet, and the destination path is printed once written.

    Args:
        original_file: Path of the source file; only its base name is used.
        processed_content: Text to write, encoded as UTF-8.
    """
    output_dir = os.path.join(os.getcwd(), 'lemmatized')
    os.makedirs(output_dir, exist_ok=True)

    base_name, _extension = os.path.splitext(os.path.basename(original_file))
    destination = os.path.join(output_dir, base_name + '_stemmed.txt')

    with open(destination, 'w', encoding='utf-8') as target:
        target.write(processed_content)
    print(f"Processed content saved to {destination}")

def select_subdirectory():
    """Ask the user to choose a subdirectory of the current working directory.

    The subdirectories are listed in sorted order with 1-based numbers and
    the prompt repeats until a usable answer is given.

    Returns:
        ``None`` when the working directory has no subdirectories, the
        working directory itself when the answer is empty, otherwise the
        path of the chosen subdirectory joined onto the working directory.
    """
    base_dir = os.getcwd()
    candidates = [name for name in os.listdir(base_dir) if os.path.isdir(name)]
    candidates.sort()
    if len(candidates) == 0:
        print("No subdirectories found.")
        return None

    print("Available subdirectories:")
    for number, name in enumerate(candidates, start=1):
        print(f"{number}. {name}")

    while True:
        answer = input("Select a subdirectory number (leave empty for current directory): ").strip()
        if answer == '':
            return base_dir

        try:
            number = int(answer)
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue

        if 1 <= number <= len(candidates):
            return os.path.join(base_dir, candidates[number - 1])
        print("Invalid choice. Please enter a valid number.")

def _parse_selection(user_input, count):
    """Convert a selection string such as ``"1,3-5"`` into zero-based indices.

    Args:
        user_input: Comma-separated 1-based numbers and ``start-end`` ranges.
        count: Number of selectable items.

    Returns:
        A list of zero-based indices in the order they were entered.

    Raises:
        ValueError: If a part is not a number or range, a number is outside
            ``1..count``, or a range is reversed or starts outside
            ``1..count``. A range end beyond ``count`` is clamped to the
            last item.
    """
    indices = []
    for part in user_input.split(','):
        if '-' in part:
            bounds = part.split('-')
            if len(bounds) != 2:
                raise ValueError(f"malformed range: {part!r}")
            start, end = int(bounds[0]), int(bounds[1])
            if not 1 <= start <= count or end < start:
                raise ValueError(f"range out of bounds: {part!r}")
            indices.extend(range(start - 1, min(end, count)))
        else:
            number = int(part)
            # Reject 0 and negatives explicitly: Python would otherwise treat
            # the resulting index as counting from the end of the list.
            if not 1 <= number <= count:
                raise ValueError(f"number out of bounds: {part!r}")
            indices.append(number - 1)
    return indices


def select_files():
    """Interactively choose ``.txt`` files from a user-selected directory.

    The directory comes from ``select_subdirectory()``, falling back to the
    current working directory when that returns ``None``. The ``.txt``
    entries are listed in sorted order with 1-based numbers, and the user is
    prompted until the selection parses cleanly or is left empty.

    Returns:
        A list of paths (directory joined with file name) for the chosen
        files, in the order entered. The list is empty when the directory
        has no ``.txt`` entries or the user enters nothing.
    """
    directory = select_subdirectory()
    if directory is None:
        directory = os.getcwd()

    # List all text files in the selected directory
    txt_files = sorted(f for f in os.listdir(directory) if f.endswith('.txt'))
    if not txt_files:
        print("No text files found in the selected directory.")
        return []

    print("Available text files:")
    for number, name in enumerate(txt_files, start=1):
        print(f"{number}. {name}")

    while True:
        user_input = input("Select file numbers (individual or ranges, e.g., 1,3-5): ").strip()
        if not user_input:
            return []

        # Parse the whole entry before accepting any of it, so an invalid
        # entry never leaves a partial selection behind for the retry.
        try:
            indices = _parse_selection(user_input, len(txt_files))
        except ValueError:
            print("Invalid input. Please enter valid numbers or ranges.")
            continue

        return [os.path.join(directory, txt_files[index]) for index in indices]

def main():
    """Run the interactive workflow: pick a mode, pick files, process, save.

    Any answer other than ``n`` (after trimming and lower-casing) selects the
    lemmatizer. Each selected file is processed with ``process_file`` and the
    result is written out with ``save_processed_content``.
    """
    answer = input("Use FrenchLefffLemmatizer? (y/n, default is y): ")
    use_lemmatizer = answer.strip().lower() != 'n'

    chosen_paths = select_files()
    if not chosen_paths:
        return

    for path in chosen_paths:
        result = process_file(path, use_lemmatizer)
        save_processed_content(path, result)

# Start the interactive workflow only when this module is the entry point,
# so importing it does not trigger any prompts.
if __name__ == "__main__":
    main()

Use FrenchLefffLemmatizer? (y/n, default is y):  


Available subdirectories:
1. .ipynb_checkpoints
2. Démonomanie
3. République
4. Théatre


Select a subdirectory number (leave empty for current directory):  


Available text files:
1. Démonomanie III_corrected.txt
2. Démonomanie II_corrected.txt
3. Démonomanie IV_corrected.txt
4. Démonomanie I_corrected.txt
5. Démonomanie preface Repair_corrected.txt
6. La réponse_corrected.txt
7. Le paradoxe_corrected.txt
8. Lettre_corrected.txt
9. Recueil_corrected.txt
10. République III_corrected.txt
11. République II_corrected.txt
12. République IV_corrected.txt
13. République I_corrected.txt
14. République VI_corrected.txt
15. République V_corrected.txt
16. Réublique Preface_corrected.txt
17. Théatre III_corrected.txt
18. Théatre II_corrected.txt
19. Théatre IV_corrected.txt
20. Théatre I_corrected.txt
21. Théatre V_corrected.txt
22. Théatre summary_corrected.txt


Select file numbers (individual or ranges, e.g., 1,3-5):  4


Processed content saved to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Bodin Chapterized/lemmatized/Démonomanie I_corrected_stemmed.txt
