In [1]:
import re
import os
from collections import Counter
import csv  # Added for robust comma-aware pattern parsing using CSV quoting

def _parse_selection(user_input, file_count):
    """Turn a selection string such as ``"1, 3-5"`` into zero-based indices.

    Args:
        user_input: Comma-separated 1-based numbers and ``start-end`` ranges.
        file_count: Number of files on offer; every number must lie in
            ``1..file_count`` and a range must satisfy ``start <= end``.

    Returns:
        A list of zero-based indices in the order they were requested
        (duplicates are kept; the caller decides what to do with them).

    Raises:
        ValueError: If any part is not a number/range or is out of bounds.
    """
    indices = []
    for part in user_input.split(','):
        if '-' in part:
            # Unpacking fails with ValueError for shapes like "1-2-3" or "-3".
            start, end = map(int, part.split('-'))
        else:
            start = end = int(part)
        # Explicit bounds check: without it, 0 or negatives would silently
        # wrap around to the end of the list via Python's negative indexing.
        if not 1 <= start <= end <= file_count:
            raise ValueError(f"selection out of range: {part.strip()!r}")
        indices.extend(range(start - 1, end))
    return indices


def select_files():
    """Interactively choose ``.txt`` files from the current directory.

    Lists the ``.txt`` entries of the current working directory (sorted by
    name), then prompts until the user gives a valid answer:

    * an empty line selects nothing,
    * ``all`` (case-insensitive, surrounding spaces ignored) selects every file,
    * otherwise a comma-separated mix of 1-based numbers and ``start-end``
      ranges, e.g. ``1, 3-5``.

    Each attempt is parsed from scratch, so a rejected answer never leaks
    partial picks into the next one. Numbers outside ``1..N`` and reversed
    ranges are rejected, and a file requested more than once is returned once.

    Returns:
        The selected file names in the order requested, or an empty list if
        there are no ``.txt`` files or the user entered an empty line.
    """
    txt_files = sorted(f for f in os.listdir() if f.endswith('.txt'))
    if not txt_files:
        print("No text files found in the current directory.")
        return []

    print("Available text files:")
    for number, name in enumerate(txt_files, start=1):
        print(f"{number}. {name}")
    print("Enter 'all' to select all files.")

    while True:
        user_input = input("Select file numbers (individual, ranges, or 'all'): ")
        if not user_input:
            return []

        if user_input.strip().lower() == 'all':
            return txt_files[:]

        try:
            indices = _parse_selection(user_input, len(txt_files))
        except ValueError:
            print("Invalid input. Please enter valid numbers, ranges, or 'all'.")
            continue

        # dict.fromkeys drops repeated indices while keeping first-seen order.
        return [txt_files[i] for i in dict.fromkeys(indices)]

def search_pattern(file_paths, pattern, pattern_string):
    """Count the matches of one compiled regex in each file and print a report.

    Args:
        file_paths: Paths of the text files to scan; each is read as UTF-8.
        pattern: Compiled regular expression. Matches are collected with
            ``findall``, so a pattern containing capture groups is tallied by
            group content rather than by the whole match.
        pattern_string: The pattern text used in the report header.

    Output (printed, nothing is returned):
        A header line for the pattern, then one section per file that has at
        least one match, ordered by total match count (highest first), with a
        ``'match': count`` line per distinct match. If no file has a match, a
        single "No hits found" line is printed instead. A file that cannot be
        opened or decoded is reported and skipped so the remaining files are
        still searched.
    """
    print(f"\n--- Results for pattern: '{pattern_string}' ---")

    results = {}
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except (OSError, UnicodeDecodeError) as exc:
            # One missing or non-UTF-8 file should not abort the whole search.
            print(f"Could not read {file_path}: {exc}. Skipping.")
            continue

        matches = pattern.findall(text)
        if matches:
            results[file_path] = Counter(matches)

    if not results:
        print("No hits found for this pattern in any of the selected files.")
        return

    # Files with the most hits come first; ties keep their input order.
    ranked = sorted(results.items(), key=lambda item: sum(item[1].values()), reverse=True)
    for file_path, counter in ranked:
        print(f"Results for {file_path}:")
        for word, count in counter.items():
            print(f"'{word}': {count}")
        print()

def main():
    """Run one interactive session: pick files, read patterns, report matches.

    The pattern line is split with the ``csv`` module so that a pattern
    containing a comma can be wrapped in double quotes. Leading and trailing
    spaces of each pattern are kept because they may be significant in a
    regex. Empty entries (e.g. from consecutive commas) are ignored, and a
    pattern that does not compile is reported and skipped.
    """
    file_paths = select_files()
    if not file_paths:
        return

    regex_patterns_input = input("Enter regex patterns (comma-separated; wrap any pattern containing a comma in double quotes, e.g. \"foo,bar\"; spaces at start/end are preserved): ")

    try:
        # skipinitialspace=False keeps a space that directly follows a comma
        # as part of the next pattern.
        reader = csv.reader([regex_patterns_input], skipinitialspace=False)
        pattern_strings = next(reader)
    except Exception as exc:
        print(f"Failed to parse input with CSV reader ({exc}); falling back to naive split (will not preserve some spaces).")
        pattern_strings = regex_patterns_input.split(',')

    for pattern_text in pattern_strings:
        if pattern_text != '':
            try:
                # Compiled exactly as typed, surrounding spaces included.
                compiled = re.compile(pattern_text)
                search_pattern(file_paths, compiled, pattern_text)
            except re.error:
                print(f"Invalid regex pattern: '{pattern_text}'. Skipping.")

In [2]:
# Entry point: start the interactive search when this code runs as the main
# module (which is also the case when the cell is executed in a notebook).
if __name__ == "__main__":
    main()

Available text files:
1. 0Théatre summary_corrected_stemmed.txt
2. Discours des raisons_corrected_stemmed.txt
3. Démonomanie I.1_corrected_stemmed.txt
4. Démonomanie I.2_corrected_stemmed.txt
5. Démonomanie I.3_corrected_stemmed.txt
6. Démonomanie I.4_corrected_stemmed.txt
7. Démonomanie I.5_corrected_stemmed.txt
8. Démonomanie I.6_corrected_stemmed.txt
9. Démonomanie I.7_corrected_stemmed.txt
10. Démonomanie II.1_corrected_stemmed.txt
11. Démonomanie II.2_corrected_stemmed.txt
12. Démonomanie II.3_corrected_stemmed.txt
13. Démonomanie II.4_corrected_stemmed.txt
14. Démonomanie II.5_corrected_stemmed.txt
15. Démonomanie II.6_corrected_stemmed.txt
16. Démonomanie II.7_corrected_stemmed.txt
17. Démonomanie II.8_corrected_stemmed.txt
18. Démonomanie III.1_corrected_stemmed.txt
19. Démonomanie III.2_corrected_stemmed.txt
20. Démonomanie III.3_corrected_stemmed.txt
21. Démonomanie III.4_corrected_stemmed.txt
22. Démonomanie III.5_corrected_stemmed.txt
23. Démonomanie III.6_corrected_stemmed