In [1]:
import os
from scipy.stats import fisher_exact
import csv

def tokenize_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read().split()

def list_txt_files(directory):
    return sorted([f for f in os.listdir(directory) if f.endswith('.txt')])

def choose_file(files):
    print("Select a target .txt file:")
    for i, file in enumerate(files):
        print(f"{i + 1}. {file}")
    choice = int(input("Enter your choice: "))
    return files[choice - 1]

def include_files(files):
    print("Available .txt files:")
    for i, file in enumerate(files):
        print(f"{i + 1}. {file}")
    
    print("Enter the numbers of the files you want to include, separated by commas:")
    choices = input().split(',')
    included_files = [files[int(choice.strip()) - 1] for choice in choices if choice.strip().isdigit()]
    return included_files

def exclude_files(files):
    print("Available .txt files:")
    for i, file in enumerate(files):
        print(f"{i + 1}. {file}")
    
    print("Enter the numbers of the files you want to exclude, separated by commas:")
    choices = input().split(',')
    excluded_files = [files[int(choice.strip()) - 1] for choice in choices if choice.strip().isdigit()]
    return [file for file in files if file not in excluded_files]

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    r = someratedict[someword]
    wc = sum(somecountdict.values())
    a = somecountdict[someword]
    b = wc - a
    c = round(r * wc)
    d = wc - c
    p = fisher_exact([[a, b], [c, d]], alternative=alternative).pvalue
    return p

def list_csv_files(directory):
    csv_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))
    return csv_files

def choose_csv_file(csv_files):
    print("Select a .csv file to use as a stopwords file:")
    for i, file in enumerate(csv_files):
        print(f"{i + 1}. {file}")
    choice = int(input("Enter your choice: "))
    return csv_files[choice - 1]

def read_stopwords(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        stopwords = []
        for row in reader:
            for word in row:
                stopwords.extend(word.split(','))
        return [word.strip() for word in stopwords]

def calculate_fishers_exact_for_corpus():
    # Get the current working directory
    base_directory = os.getcwd()
    all_files = list_txt_files(base_directory)
    
    if not all_files:
        print("No .txt files found in the current working directory.")
        return

    # Prompt user to include all or use custom selection
    print("Do you want to include all .txt files in your rate dictionary? (yes/no)")
    include_all = input().strip().lower()
    
    if include_all == 'yes':
        files = all_files
    else:
        print("Would you like to include specific files or exclude specific files?")
        print("1. Include specific files")
        print("2. Exclude specific files")
        method = input("Enter 1 or 2: ").strip()
        
        if method == '1':
            files = include_files(all_files)
        elif method == '2':
            files = exclude_files(all_files)
        else:
            print("Invalid choice. Exiting.")
            return
    
    if not files:
        print("No .txt files selected after custom selection.")
        return
    
    # Choose a target file
    target_file = choose_file(files)
    target_file_path = os.path.join(base_directory, target_file)
    
    # Tokenize the target file
    target_words = tokenize_file(target_file_path)
    
    # Initialize dictionaries
    doc_counts = {}
    all_counts = {}
    rates = {}
    
    # Count words in all files
    for f in files:
        doc_counts[f] = {}
        words = tokenize_file(os.path.join(base_directory, f))
        for w in words:
            if w not in doc_counts[f]:
                doc_counts[f][w] = 0
            doc_counts[f][w] += 1
            if w not in all_counts:
                all_counts[w] = 0
            all_counts[w] += 1
    
    total_wc = sum(all_counts.values())
    
    # Calculate rates
    for word in all_counts:
        rates[word] = all_counts[word] / total_wc
    
    # Prepare output table
    output_table = [['token_', 'count', 'p-value', 'obs/exp']]
    
    countdict = doc_counts[target_file]

    # List .csv files in the current directory and subdirectories
    csv_files = list_csv_files(base_directory)
    
    if not csv_files:
        print("No .csv files found in the current directory or subdirectories.")
        return
    
    # Choose a .csv file
    chosen_csv_file = choose_csv_file(csv_files)
    chosen_csv_file_path = os.path.join(base_directory, chosen_csv_file)
    
    # Read stopwords from the chosen .csv file
    stops = read_stopwords(chosen_csv_file_path)
    
    # Ask user for threshold
    threshold = input("Enter the threshold for word count (default is 5): ")
    if not threshold.isdigit():
        threshold = 5
    else:
        threshold = int(threshold)
    
    # Ask user for p-value
    p_value_threshold = input("Enter the p-value threshold (default is 0.10): ")
    if not p_value_threshold.replace('.', '', 1).isdigit():
        p_value_threshold = 0.10
    else:
        p_value_threshold = float(p_value_threshold)
    
    for word, count in countdict.items():
        if count < threshold:
            continue
        if word in stops:
            continue
        p = get_fishers(word, countdict, rates)
        exp = rates[word] * sum(countdict.values())
        if p < p_value_threshold:
            new_row = [word, count, p, count / exp]
            output_table.append(new_row)
    
    # Sort the output table by count in descending order
    output_table[1:] = sorted(output_table[1:], key=lambda x: x[1], reverse=True)

    # Write results to CSV
    output_filename = f"{os.path.splitext(target_file)[0]}_mdw.csv"
    with open(output_filename, 'w', newline='') as output:
        writer = csv.writer(output)
        writer.writerows(output_table)
    
    print(f"Results written to {output_filename}")

In [2]:
# Call the function
calculate_fishers_exact_for_corpus()

Do you want to include all .txt files in your rate dictionary? (yes/no)


 no


Would you like to include specific files or exclude specific files?
1. Include specific files
2. Exclude specific files


Enter 1 or 2:  2


Available .txt files:
1. Bodin.txt
2. Discours des raisons_corrected.txt
3. Démonomanie Repair_corrected.txt
4. Harangue - Fontainebleau_corrected.txt
5. Harangue - Orléans 2_corrected.txt
6. Harangue - Orléans_corrected.txt
7. Harangue - Poissy_corrected.txt
8. Harangue - Rouen_corrected.txt
9. Harangue - Saint Germain_corrected.txt
10. Harangue - lit de justice_corrected.txt
11. Harangue - ouverture de parlement_corrected.txt
12. Harangue - parlement 2_corrected.txt
13. Harangue - parlement 3_corrected.txt
14. Harangue - parlement_corrected.txt
15. Harangue - religion_corrected.txt
16. Harangue - septembre_corrected.txt
17. L'Hospital.txt
18. La réponse_corrected.txt
19. Le paradoxe_corrected.txt
20. Lettre_corrected.txt
21. Lit de justice_corrected.txt
22. Memoire - Namur_corrected.txt
23. Memoire - le but_corrected.txt
24. Memoire au roi_corrected.txt
25. Memoires d'État Refuge_corrected.txt
26. Memoires d'état_corrected.txt
27. Recueil_corrected.txt
28. Remonstrances - Royaume_cor

 1,17


Select a target .txt file:
1. Discours des raisons_corrected.txt
2. Démonomanie Repair_corrected.txt
3. Harangue - Fontainebleau_corrected.txt
4. Harangue - Orléans 2_corrected.txt
5. Harangue - Orléans_corrected.txt
6. Harangue - Poissy_corrected.txt
7. Harangue - Rouen_corrected.txt
8. Harangue - Saint Germain_corrected.txt
9. Harangue - lit de justice_corrected.txt
10. Harangue - ouverture de parlement_corrected.txt
11. Harangue - parlement 2_corrected.txt
12. Harangue - parlement 3_corrected.txt
13. Harangue - parlement_corrected.txt
14. Harangue - religion_corrected.txt
15. Harangue - septembre_corrected.txt
16. La réponse_corrected.txt
17. Le paradoxe_corrected.txt
18. Lettre_corrected.txt
19. Lit de justice_corrected.txt
20. Memoire - Namur_corrected.txt
21. Memoire - le but_corrected.txt
22. Memoire au roi_corrected.txt
23. Memoires d'État Refuge_corrected.txt
24. Memoires d'état_corrected.txt
25. Recueil_corrected.txt
26. Remonstrances - Royaume_corrected.txt
27. Remonstrances

Enter your choice:  28


Select a .csv file to use as a stopwords file:
1. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected Tokenized/Bodin_mdw.csv
2. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected Tokenized/stop_words.csv
3. /home/lucas-jerusalimiec/Documents/OCR Text/Text/Collected Tokenized/Démonomanie Repair_corrected_mdw.csv


Enter your choice:  2
Enter the threshold for word count (default is 5):  3
Enter the p-value threshold (default is 0.10):  


Results written to République_corrected_mdw.csv
