In [1]:
import os
from scipy.stats import fisher_exact

In [2]:
def tokenize_file(filepath):
    with open(filepath, 'r') as file:
        return file.read().split()

def list_directories(base_directory):
    return [d for d in os.listdir(base_directory) if os.path.isdir(os.path.join(base_directory, d))]

def choose_directory(directories):
    print("Select a directory:")
    for i, directory in enumerate(directories):
        print(f"{i + 1}. {directory}")
    choice = int(input("Enter your choice: "))
    return directories[choice - 1]

def list_files_in_directory(directory):
    return [f for f in os.listdir(directory) if f.endswith('.txt')]

def choose_files(files):
    print("Select files to use for calculating r (enter numbers separated by commas):")
    for i, file in enumerate(files):
        print(f"{i + 1}. {file}")
    choices = input("Enter your choices: ").split(',')
    return [files[int(choice) - 1] for choice in choices]

def choose_single_file(files):
    print("Select a target file:")
    for i, file in enumerate(files):
        print(f"{i + 1}. {file}")
    choice = int(input("Enter your choice: "))
    return files[choice - 1]

def calculate_fishers_exact():
    # Get the current working directory
    base_directory = os.getcwd()
    directories = list_directories(base_directory)
    
    if not directories:
        print("No directories found in the current working directory.")
        return
    
    # Choose a directory
    chosen_directory = choose_directory(directories)
    full_directory_path = os.path.join(base_directory, chosen_directory)
    
    files = list_files_in_directory(full_directory_path)
    
    if not files:
        print("No .txt files found in the selected directory.")
        return
    
    # Choose multiple files for calculating r
    file_paths = choose_files(files)
    all_words = []
    
    for file_path in file_paths:
        all_words.extend(tokenize_file(os.path.join(full_directory_path, file_path)))
    
    while True:
        # Input target word
        target_word = input("Enter the target word: ")
        
        count = all_words.count(target_word)
        wc = len(all_words)
        
        r = count / wc
        
        # Choose a single file for calculating a and wc
        single_file_path = choose_single_file(files)
        single_file_words = tokenize_file(os.path.join(full_directory_path, single_file_path))
        
        a = single_file_words.count(target_word)
        wc_single = len(single_file_words)
        
        b = wc_single - a
        c = round(r * wc_single)
        d = wc_single - c
        
        table = [[a, b], [c, d]]
        
        # Run the Fisher's Exact Test
        fe = fisher_exact(table, alternative='greater')
        
        # Get the p-value
        print("P-value:", fe.pvalue)
        
        # Ask the user whether to quit or calculate for a new target word
        choice = input("Do you want to quit or calculate Fisher's Exact Test for a new target word? (quit/new): ").strip().lower()
        if choice == 'quit':
            break

In [None]:
# Call the function
calculate_fishers_exact()

Select a directory:
1. Concatenated
2. .ipynb_checkpoints


Enter your choice:  1


Select files to use for calculating r (enter numbers separated by commas):
1. Bodin.txt
2. L'Hospital.txt


Enter your choices:  1,2
Enter the target word:  chos


Select a target file:
1. Bodin.txt
2. L'Hospital.txt


Enter your choice:  1


P-value: 0.19789527129546836


Do you want to quit or calculate Fisher's Exact Test for a new target word? (quit/new):  yes
