In [1]:
import os
from scipy.stats import fisher_exact
import csv
import pandas as pd

# Optional fixed vocabulary for the analysis: when the user answers "yes" to
# the predefined-list prompt, only these tokens are tested (instead of every
# token found in a target file).
# NOTE(review): the entries look like stemmed French forms, presumably matching
# the tokens of the stemmed input files -- confirm against the stemmer output.
predefined_words = [
    'absolu', 'arrest', 'bien', 'chos', 'citoyen', 'conseil', 'conseiller', 'confess', 
    'cour', 'couron', 'demon', 'demoniaqu', 'diabl', 'diabol', 'dieu', 'divin', 'domain', 
    'droit', 'édict', 'estat', 'hebrieu', 'impiet', 'iurisdict', 'jug', 'just', 'loi', 
    'magistrat', 'maiest', 'offic', 'offici', 'ordon', 'parlement', 'preuv', 'princ', 
    'puissanc', 'question', 'republ', 'ressort', 'roy', 'royal', 'royaum', 'sathan', 
    'seigneur', 'sorceller', 'sorci', 'souverain', 'souverainet', 'statut', 'sujet'
]

# Path of the stopwords .csv chosen by the user. Starts as None and is set on
# the first selection, so the stopwords prompt is shown only once.
selected_stopwords_file_path = None

def tokenize_file(filepath):
    """Return the whitespace-separated tokens of a UTF-8 text file.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of tokens in file order; an empty list for an empty or
        whitespace-only file.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    # str.split() with no argument splits on any run of whitespace.
    tokens = contents.split()
    return tokens

def list_txt_files(directory):
    """Return the names of the ``.txt`` entries of a directory, sorted.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        A sorted list of entry names (not full paths) ending in ``.txt``.
    """
    names = [entry for entry in os.listdir(directory) if entry.endswith('.txt')]
    names.sort()
    return names

def choose_directory(prompt):
    """Ask the user to pick the working directory or one of its subdirectories.

    The immediate subdirectories of the current working directory are listed
    first, followed by the working directory itself, each with a 1-based
    number. The question is repeated until a valid number is entered.

    Args:
        prompt: Heading printed above the numbered list.

    Returns:
        The full path of the chosen directory.
    """
    base_directory = os.getcwd()
    candidates = (os.path.join(base_directory, entry) for entry in os.listdir(base_directory))
    subdirectories = [path for path in candidates if os.path.isdir(path)]
    subdirectories.append(base_directory)

    print(prompt)
    for number, subdir in enumerate(subdirectories, start=1):
        print(f"{number}. {subdir}")

    while True:
        answer = input("Enter your choice: ").strip()
        try:
            choice = int(answer)
        except ValueError:
            choice = 0  # falls through to the range check below
        # Reject 0 and negatives explicitly: list indexing would otherwise
        # silently wrap around to the end of the list.
        if 1 <= choice <= len(subdirectories):
            return subdirectories[choice - 1]
        print(f"Invalid choice. Please enter a number between 1 and {len(subdirectories)}.")

def choose_files(files):
    """Ask the user which target files to use and return the selection.

    The answer is a comma-separated list whose parts may each be:

    * a 1-based number (``3``),
    * an inclusive range of numbers (``2-5``),
    * a text pattern, selecting every file whose name starts with it.

    Empty parts (for example after a trailing comma) are ignored, and numbers
    outside the list are reported and skipped. A part containing ``-`` is only
    treated as a range when both sides are numbers, so patterns with a hyphen
    work. A completely blank answer selects every file.

    Args:
        files: File names to choose from.

    Returns:
        The selected file names, de-duplicated and sorted.
    """
    print("Select target .txt files (enter numbers separated by commas or ranges, or starting text patterns):")
    for number, file in enumerate(files, start=1):
        print(f"{number}. {file}")
    choices = input("Enter your choices: ").strip()

    if not choices:
        # Blank input has always meant "everything"; keep that default explicit.
        return sorted(set(files))

    selected_files = set()
    for part in choices.split(','):
        part = part.strip()
        if not part:
            # An empty pattern would match every file via startswith('').
            continue

        if part.isdecimal():
            index = int(part)
            if 1 <= index <= len(files):
                selected_files.add(files[index - 1])
            else:
                print(f"Ignoring out-of-range choice: {part}")
            continue

        start, dash, end = (piece.strip() for piece in part.partition('-'))
        if dash and start.isdecimal() and end.isdecimal():
            # Clamp the lower bound so "0-3" cannot turn into a negative slice index.
            first = max(int(start), 1)
            selected_files.update(files[first - 1:int(end)])
            continue

        selected_files.update(file for file in files if file.startswith(part))

    return sorted(selected_files)

def include_files(files):
    """Ask the user which files to keep and return them.

    The answer is a comma-separated list of 1-based numbers. Entries that are
    not numbers are ignored; numbers outside ``1..len(files)`` are reported and
    skipped. A number given more than once is only honoured once, so a file
    cannot be counted twice.

    Args:
        files: File names to choose from.

    Returns:
        The chosen file names, in the order the user entered them.
    """
    print("Available .txt files:")
    for number, file in enumerate(files, start=1):
        print(f"{number}. {file}")
    print("Enter the numbers of the files you want to include, separated by commas:")
    choices = input().split(',')

    included_files = []
    seen = set()
    for choice in choices:
        choice = choice.strip()
        if not choice.isdecimal():
            continue
        index = int(choice)
        # 0 would otherwise wrap to the last file via negative indexing.
        if not 1 <= index <= len(files):
            print(f"Ignoring out-of-range choice: {choice}")
            continue
        if index not in seen:
            seen.add(index)
            included_files.append(files[index - 1])
    return included_files

def exclude_files(files):
    """Ask the user which files to drop and return the remaining ones.

    The answer is a comma-separated list of 1-based numbers. Entries that are
    not numbers are ignored; numbers outside ``1..len(files)`` are reported and
    skipped.

    Args:
        files: File names to choose from.

    Returns:
        The file names that were not excluded, in their original order.
    """
    print("Available .txt files:")
    for number, file in enumerate(files, start=1):
        print(f"{number}. {file}")
    print("Enter the numbers of the files you want to exclude, separated by commas:")
    choices = input().split(',')

    excluded_files = set()
    for choice in choices:
        choice = choice.strip()
        if not choice.isdecimal():
            continue
        index = int(choice)
        # 0 would otherwise wrap to the last file via negative indexing.
        if 1 <= index <= len(files):
            excluded_files.add(files[index - 1])
        else:
            print(f"Ignoring out-of-range choice: {choice}")
    return [file for file in files if file not in excluded_files]

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    """Return the Fisher's exact test p-value for one word of a document.

    The 2x2 table compares the document with a hypothetical document of the
    same size in which the word occurs at the reference rate:

    * row 1: observed occurrences of the word, all other tokens;
    * row 2: occurrences expected at the reference rate (rounded to an
      integer), all other tokens.

    Args:
        someword: The word to test; must be a key of both dictionaries.
        somecountdict: ``{word: count}`` for the document.
        someratedict: ``{word: relative frequency}`` for the reference corpus.
        alternative: Passed through to ``scipy.stats.fisher_exact``.

    Returns:
        The p-value of the test.

    Raises:
        KeyError: If ``someword`` is missing from either dictionary.
    """
    rate = someratedict[someword]
    total_tokens = sum(somecountdict.values())
    observed = somecountdict[someword]
    expected = round(rate * total_tokens)
    contingency = [
        [observed, total_tokens - observed],
        [expected, total_tokens - expected],
    ]
    result = fisher_exact(contingency, alternative=alternative)
    return result.pvalue

def list_csv_files(directory):
    """Recursively collect candidate stopword files below a directory.

    A file qualifies when its name ends in ``.csv`` and contains ``stop``.

    Args:
        directory: Root of the tree to walk.

    Returns:
        A list of paths (each joined onto the walked directory) in
        ``os.walk`` order.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.csv') and 'stop' in name
    ]

def choose_csv_file(csv_files):
    """Ask the user to pick one stopwords file from a numbered list.

    The question is repeated until a number between 1 and ``len(csv_files)``
    is entered.

    Args:
        csv_files: Candidate file paths; must not be empty.

    Returns:
        The chosen path.

    Raises:
        ValueError: If ``csv_files`` is empty (no valid answer would exist).
    """
    if not csv_files:
        raise ValueError("csv_files must contain at least one file to choose from")

    print("Select a .csv file to use as a stopwords file:")
    for number, file in enumerate(csv_files, start=1):
        print(f"{number}. {file}")

    while True:
        answer = input("Enter your choice: ").strip()
        try:
            choice = int(answer)
        except ValueError:
            choice = 0  # falls through to the range check below
        # Reject 0 and negatives explicitly: list indexing would otherwise
        # silently wrap around to the end of the list.
        if 1 <= choice <= len(csv_files):
            return csv_files[choice - 1]
        print(f"Invalid choice. Please enter a number between 1 and {len(csv_files)}.")

def read_stopwords(filepath):
    """Read every stopword from a UTF-8 CSV file.

    Each cell of each row is split on commas again (so a quoted cell holding
    several comma-separated words still yields individual words) and every
    resulting piece is stripped of surrounding whitespace.

    Args:
        filepath: Path of the CSV file.

    Returns:
        A flat list of words in file order; may contain empty strings and
        duplicates.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [
            piece.strip()
            for row in csv.reader(handle)
            for cell in row
            for piece in cell.split(',')
        ]

def _count_tokens(filepath):
    """Return a ``{token: occurrences}`` dict for one whitespace-tokenized file."""
    counts = {}
    for token in tokenize_file(filepath):
        counts[token] = counts.get(token, 0) + 1
    return counts


def _distinctive_rows(countdict, rates, candidates, stops, threshold, p_value_threshold):
    """Return (result rows sorted by count descending, candidates lacking a reference rate)."""
    total_tokens = sum(countdict.values())  # invariant for the whole document
    rows = []
    unrated = []
    for word in candidates:
        if word not in countdict or countdict[word] < threshold:
            continue
        if word in stops:
            continue
        if word not in rates:
            # The word never occurs in the rate dictionary files, so neither an
            # expected count nor an obs/exp ratio can be computed for it.
            unrated.append(word)
            continue
        p = get_fishers(word, countdict, rates)
        if p < p_value_threshold:
            expected = rates[word] * total_tokens
            rows.append([word, countdict[word], p, countdict[word] / expected])
    rows.sort(key=lambda row: row[1], reverse=True)
    return rows, unrated


def calculate_fishers_exact_for_corpus():
    """Interactively find distinctive words per target file and save them to Excel.

    Workflow:

    1. The user picks a directory of ``.txt`` files and (all / included /
       excluded) files from it; their pooled relative word frequencies form
       the rate dictionary.
    2. The user chooses between the predefined word list and every word of
       each target file.
    3. Repeatedly: the user picks target files, a minimum word count and a
       p-value threshold. For each target file, every candidate word that
       meets the count threshold and is not a stopword is tested with
       ``get_fishers``; words whose p-value is below the threshold are written
       to a sheet named after the file (columns ``token_``, ``count``,
       ``p-value``, ``obs/exp``), sorted by count in descending order.

    Words that do not occur in any rate dictionary file are skipped (and
    reported), because no expected frequency exists for them.

    The workbook is written to ``results_mdw.xlsx`` in the current working
    directory. The stopwords file is chosen once and remembered in the
    module-level ``selected_stopwords_file_path``.
    """
    global selected_stopwords_file_path

    # Prompt user to select the directory for rate dictionary files
    rate_directory = choose_directory("Select the directory for the rate dictionary files:")
    rate_files = list_txt_files(rate_directory)
    if not rate_files:
        print("No .txt files found in the selected rate directory.")
        return

    # Prompt user to include all or use custom selection
    print("Do you want to include all .txt files in your rate dictionary (default is no)? (yes/no)")
    include_all = input().strip().lower()
    if include_all == 'yes':
        selected_rate_files = rate_files
    else:
        print("Would you like to include specific files or exclude specific files?")
        print("1. Include specific files")
        print("2. Exclude specific files")
        method = input("Enter 1 or 2: ").strip()
        if method == '1':
            selected_rate_files = include_files(rate_files)
        elif method == '2':
            selected_rate_files = exclude_files(rate_files)
        else:
            print("Invalid choice. Exiting.")
            return

    if not selected_rate_files:
        print("No .txt files selected after custom selection.")
        return

    # The rate files cannot change once selected, so the rate dictionary is
    # built once here instead of on every pass of the target-selection loop.
    all_counts = {}
    for rate_file in selected_rate_files:
        file_counts = _count_tokens(os.path.join(rate_directory, rate_file))
        for word, count in file_counts.items():
            all_counts[word] = all_counts.get(word, 0) + count
    total_wc = sum(all_counts.values())
    rates = {word: count / total_wc for word, count in all_counts.items()}

    # Create an Excel writer
    output_filename = "results_mdw.xlsx"
    output_filepath = os.path.join(os.getcwd(), output_filename)
    with pd.ExcelWriter(output_filepath, engine='openpyxl') as writer:
        # Ensure at least one visible sheet
        writer.book.create_sheet(title="Placeholder")

        # Prompt user if they want to use the predefined list of words
        use_predefined_list = input("Do you want to use the predefined list of words? (yes/no): ").strip().lower() == 'yes'

        while True:
            # Prompt user to select the directory for target files
            target_directory = choose_directory("Select the directory for the target files:")
            target_files = list_txt_files(target_directory)
            if not target_files:
                print("No .txt files found in the selected target directory.")
                return

            target_files = choose_files(target_files)
            target_doc_counts = {
                target_file: _count_tokens(os.path.join(target_directory, target_file))
                for target_file in target_files
            }

            # Ask user for threshold
            threshold = input("Enter the threshold for word count (default is 5): ").strip()
            threshold = int(threshold) if threshold.isdecimal() else 5

            # Ask user for p-value
            p_value_threshold = input("Enter the p-value threshold (default is 0.10): ").strip()
            if p_value_threshold.replace('.', '', 1).isdecimal():
                p_value_threshold = float(p_value_threshold)
            else:
                p_value_threshold = 0.10

            # Stopwords are only needed when there is something to process;
            # load them once per batch and keep them in a set for fast lookup.
            stops = set()
            if target_files:
                csv_files = list_csv_files(os.getcwd())
                if not csv_files:
                    print("No .csv files found in the current working directory or subdirectories.")
                    return

                # Choose a .csv file if not already selected
                if not selected_stopwords_file_path:
                    selected_stopwords_file_path = choose_csv_file(csv_files)

                stops = set(read_stopwords(selected_stopwords_file_path))

            # Process each target file
            for target_file in target_files:
                countdict = target_doc_counts[target_file]
                words_to_check = predefined_words if use_predefined_list else countdict

                rows, unrated = _distinctive_rows(
                    countdict, rates, words_to_check, stops, threshold, p_value_threshold
                )
                if unrated:
                    print(
                        f"Skipped {len(unrated)} word(s) in {target_file} "
                        "that do not occur in the rate dictionary files."
                    )

                df_output = pd.DataFrame(rows, columns=['token_', 'count', 'p-value', 'obs/exp'])

                # Write results to a new sheet in the Excel file
                sheet_name = os.path.splitext(target_file)[0]
                df_output.to_excel(writer, sheet_name=sheet_name, index=False)

                # Remove the placeholder sheet once a real sheet is added
                if "Placeholder" in writer.book.sheetnames:
                    writer.book.remove(writer.book["Placeholder"])

            # Ask user if they want to select new target files
            new_target_files = input("Do you want to select new target files in a new directory? (yes/no): ").strip().lower()
            if new_target_files != 'yes':
                break

    # Check if the file was saved
    if os.path.isfile(output_filepath):
        print(f"Results successfully written to {output_filepath}")
    else:
        print("Error: The file was not saved.")

# Start the interactive session only when this code is run directly (as a
# script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    calculate_fishers_exact_for_corpus()

Select the directory for the rate dictionary files:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/Démonomanie
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/.ipynb_checkpoints
4. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/République
5. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized


Enter your choice:  1


Do you want to include all .txt files in your rate dictionary (default is no)? (yes/no)


 yes
Do you want to use the predefined list of words? (yes/no):  yes


Select the directory for the target files:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/Démonomanie
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/.ipynb_checkpoints
4. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/République
5. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized


Enter your choice:  1


Select target .txt files (enter numbers separated by commas or ranges, or starting text patterns):
1. Discours des raisons_corrected_stemmed.txt
2. Démonomanie I.1_corrected_stemmed.txt
3. Démonomanie I.2_corrected_stemmed.txt
4. Démonomanie I.3_corrected_stemmed.txt
5. Démonomanie I.4_corrected_stemmed.txt
6. Démonomanie I.5_corrected_stemmed.txt
7. Démonomanie I.6_corrected_stemmed.txt
8. Démonomanie I.7_corrected_stemmed.txt
9. Démonomanie II.1_corrected_stemmed.txt
10. Démonomanie II.2_corrected_stemmed.txt
11. Démonomanie II.3_corrected_stemmed.txt
12. Démonomanie II.4_corrected_stemmed.txt
13. Démonomanie II.5_corrected_stemmed.txt
14. Démonomanie II.6_corrected_stemmed.txt
15. Démonomanie II.7_corrected_stemmed.txt
16. Démonomanie II.8_corrected_stemmed.txt
17. Démonomanie III.1_corrected_stemmed.txt
18. Démonomanie III.2_corrected_stemmed.txt
19. Démonomanie III.3_corrected_stemmed.txt
20. Démonomanie III.4_corrected_stemmed.txt
21. Démonomanie III.5_corrected_stemmed.txt
22. D

Enter your choices:  Dém,Rép
Enter the threshold for word count (default is 5):  1
Enter the p-value threshold (default is 0.10):  


Select a .csv file to use as a stopwords file:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/lemmatized/stop_words.csv
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/Démonomanie/stop_words.csv
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/République/stop_words.csv


Enter your choice:  1




Do you want to select new target files in a new directory? (yes/no):  no


Results successfully written to /home/lucas-jerusalimiec/Documents/OCR Text/Text/Sectionized/results_mdw.xlsx
