In [3]:
import os
from scipy.stats import fisher_exact
import csv

def tokenize_file(filepath):
    """Read a text file and split it into whitespace-delimited tokens.

    The file is decoded as latin-1. No lower-casing or punctuation
    stripping is performed; tokens are exactly what ``str.split()`` yields.

    Args:
        filepath: Path of the file to read.

    Returns:
        list[str]: The tokens in the order they appear in the file.
    """
    with open(filepath, 'r', encoding='latin-1') as handle:
        text = handle.read()
    return text.split()

def list_txt_files(directory):
    """Return the names of the entries in ``directory`` that end in '.txt'.

    Only the name suffix is checked (case-sensitively); entries are not
    tested for being regular files. Names are returned without the
    directory prefix, in ``os.listdir`` order.
    """
    txt_names = []
    for entry in os.listdir(directory):
        if entry.endswith('.txt'):
            txt_names.append(entry)
    return txt_names

def choose_file(files):
    """Show a numbered menu of ``files`` and return the entry the user picks.

    The user answers with the 1-based number shown next to an entry. Input
    that is not a whole number in the range 1..len(files) is rejected and
    the prompt is repeated, so an answer such as ``0`` can no longer wrap
    around to the last entry through negative indexing.

    Args:
        files: Non-empty sequence of file names to choose from.

    Returns:
        The selected element of ``files``.

    Raises:
        ValueError: If ``files`` is empty (there is nothing to choose).
    """
    if not files:
        raise ValueError("No files to choose from.")

    print("Select a target .txt file:")
    for number, name in enumerate(files, start=1):
        print(f"{number}. {name}")

    while True:
        answer = input("Enter your choice: ").strip()
        # isdecimal() guarantees int() will succeed and excludes signs,
        # so only the upper/lower bounds remain to be checked.
        if answer.isdecimal() and 1 <= int(answer) <= len(files):
            return files[int(answer) - 1]
        print(f"Please enter a number between 1 and {len(files)}.")

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    """Return the Fisher's exact test p-value for one word in one document.

    The 2x2 table compares the word's observed count in the document with
    the count expected from its rate, both out of the document's total
    token count:

        [[observed, total - observed],
         [expected, total - expected]]

    Args:
        someword: The token to test; must be a key of both dictionaries.
        somecountdict: ``{token: count}`` for the document under test.
        someratedict: ``{token: rate}`` giving each token's relative
            frequency used to derive the expected count.
        alternative: Passed through to ``scipy.stats.fisher_exact``.

    Returns:
        The ``pvalue`` attribute of the ``fisher_exact`` result.
    """
    rate = someratedict[someword]
    total_tokens = sum(somecountdict.values())
    observed = somecountdict[someword]
    # The expected count is rounded because the contingency table needs integers.
    expected = round(rate * total_tokens)
    table = [
        [observed, total_tokens - observed],
        [expected, total_tokens - expected],
    ]
    result = fisher_exact(table, alternative=alternative)
    return result.pvalue

def list_subdirectories(directory):
    """Return the names of the entries in ``directory`` that are directories.

    Names are returned without the ``directory`` prefix, in ``os.listdir``
    order. Hidden directories are included.
    """
    def is_subdirectory(entry):
        return os.path.isdir(os.path.join(directory, entry))

    return list(filter(is_subdirectory, os.listdir(directory)))

def list_csv_files(directory):
    """Return the names of the entries in ``directory`` that end in '.csv'.

    Only the name suffix is checked (case-sensitively). Names are returned
    without the directory prefix, in ``os.listdir`` order.
    """
    csv_names = []
    for entry in os.listdir(directory):
        if not entry.endswith('.csv'):
            continue
        csv_names.append(entry)
    return csv_names

def choose_subdirectory(subdirectories):
    """Show a numbered menu of ``subdirectories`` and return the user's pick.

    The user answers with the 1-based number shown next to an entry. Input
    that is not a whole number in the range 1..len(subdirectories) is
    rejected and the prompt is repeated, so an answer such as ``0`` can no
    longer wrap around to the last entry through negative indexing.

    Args:
        subdirectories: Non-empty sequence of directory names.

    Returns:
        The selected element of ``subdirectories``.

    Raises:
        ValueError: If ``subdirectories`` is empty (nothing to choose).
    """
    if not subdirectories:
        raise ValueError("No subdirectories to choose from.")

    print("Select a subdirectory for the stopwords csv file:")
    for number, subdir in enumerate(subdirectories, start=1):
        print(f"{number}. {subdir}")

    while True:
        answer = input("Enter your choice: ").strip()
        # isdecimal() guarantees int() will succeed and excludes signs,
        # so only the upper/lower bounds remain to be checked.
        if answer.isdecimal() and 1 <= int(answer) <= len(subdirectories):
            return subdirectories[int(answer) - 1]
        print(f"Please enter a number between 1 and {len(subdirectories)}.")

def choose_csv_file(csv_files):
    """Show a numbered menu of ``csv_files`` and return the user's pick.

    The user answers with the 1-based number shown next to an entry. Input
    that is not a whole number in the range 1..len(csv_files) is rejected
    and the prompt is repeated, so an answer such as ``0`` can no longer
    wrap around to the last entry through negative indexing.

    Args:
        csv_files: Non-empty sequence of .csv file names.

    Returns:
        The selected element of ``csv_files``.

    Raises:
        ValueError: If ``csv_files`` is empty (nothing to choose).
    """
    if not csv_files:
        raise ValueError("No .csv files to choose from.")

    print("Select a .csv file:")
    for number, name in enumerate(csv_files, start=1):
        print(f"{number}. {name}")

    while True:
        answer = input("Enter your choice: ").strip()
        # isdecimal() guarantees int() will succeed and excludes signs,
        # so only the upper/lower bounds remain to be checked.
        if answer.isdecimal() and 1 <= int(answer) <= len(csv_files):
            return csv_files[int(answer) - 1]
        print(f"Please enter a number between 1 and {len(csv_files)}.")

def read_stopwords(filepath):
    """Load stopwords from a CSV file into a flat list.

    The file is decoded as latin-1 and parsed with ``csv.reader``. Every
    cell is additionally split on commas (which only matters for quoted
    cells that contain commas), and each resulting piece is stripped of
    surrounding whitespace.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        list[str]: Stopwords in file order. Duplicates and empty strings
        are kept as they occur.
    """
    with open(filepath, 'r', encoding='latin-1') as handle:
        return [
            piece.strip()
            for record in csv.reader(handle)
            for cell in record
            for piece in cell.split(',')
        ]

def _count_words_per_file(directory, filenames):
    """Tokenize each file and tally tokens per document and corpus-wide.

    Returns a ``(doc_counts, all_counts)`` pair: ``doc_counts`` maps every
    file name to its own ``{token: count}`` dict, and ``all_counts`` maps
    every token to its count summed over all the files.
    """
    doc_counts = {}
    all_counts = {}
    for name in filenames:
        per_doc = {}
        for token in tokenize_file(os.path.join(directory, name)):
            per_doc[token] = per_doc.get(token, 0) + 1
            all_counts[token] = all_counts.get(token, 0) + 1
        doc_counts[name] = per_doc
    return doc_counts, all_counts


def _prompt_for_stopwords_csv(parent_directory):
    """Interactively pick a .csv file from a subdirectory of ``parent_directory``.

    Keeps asking for a subdirectory until one containing at least one .csv
    file is chosen, then asks which of its .csv files to use.

    Returns the full path of the chosen file, or None (after printing a
    message) when ``parent_directory`` has no subdirectories to offer.
    """
    subdirectories = list_subdirectories(parent_directory)
    if not subdirectories:
        print("No subdirectories found in the parent directory.")
        return None

    while True:
        chosen_subdirectory = choose_subdirectory(subdirectories)
        chosen_subdirectory_path = os.path.join(parent_directory, chosen_subdirectory)
        csv_files = list_csv_files(chosen_subdirectory_path)
        if csv_files:
            break
        print("No .csv files found in the chosen subdirectory. Please choose another subdirectory.")

    chosen_csv_file = choose_csv_file(csv_files)
    return os.path.join(chosen_subdirectory_path, chosen_csv_file)


def _prompt_for_threshold(default=5):
    """Ask for the minimum word count; return ``default`` on any non-numeric answer."""
    answer = input("Enter the threshold for word count (default is 5): ").strip()
    # isdecimal() (unlike isdigit()) only accepts characters that int() can
    # parse, and stripping first keeps " 3" from silently becoming the default.
    return int(answer) if answer.isdecimal() else default


def calculate_fishers_exact_for_corpus(alpha=0.05):
    """Interactively find words over-represented in one document of a corpus.

    Workflow:
      1. List the .txt files in the current working directory and let the
         user choose the target document.
      2. Count tokens in every .txt file and derive each token's rate
         across all of them combined (the target included).
      3. Let the user choose a stopword .csv from a subdirectory of the
         parent of the working directory, and a minimum count threshold.
      4. For each target-document token that meets the threshold and is
         not a stopword, run ``get_fishers`` (one-sided, 'greater') and
         keep it when the p-value is below ``alpha``.
      5. Write the kept rows to ``<target stem>_mdw.csv`` in the working
         directory with the header ``token_, count, p-value, obs/exp``.

    Args:
        alpha: Significance cutoff; rows with ``p < alpha`` are written.

    Returns:
        None. Results are written to disk and progress is printed. The
        function returns early (after printing a message) when there are
        no .txt files or no subdirectories to pick a stopword file from.
    """
    base_directory = os.getcwd()
    files = list_txt_files(base_directory)
    if not files:
        print("No .txt files found in the current working directory.")
        return

    target_file = choose_file(files)

    # Every file (including the target) is tokenized exactly once here.
    doc_counts, all_counts = _count_words_per_file(base_directory, files)
    total_wc = sum(all_counts.values())
    rates = {word: n / total_wc for word, n in all_counts.items()}

    countdict = doc_counts[target_file]
    # Loop-invariant: the target document's total token count.
    target_wc = sum(countdict.values())

    stopwords_path = _prompt_for_stopwords_csv(os.path.dirname(base_directory))
    if stopwords_path is None:
        return
    # A set makes the per-word stopword test O(1) instead of a list scan.
    stops = set(read_stopwords(stopwords_path))

    threshold = _prompt_for_threshold()

    output_table = [['token_', 'count', 'p-value', 'obs/exp']]
    for word, count in countdict.items():
        if count < threshold or word in stops:
            continue
        p = get_fishers(word, countdict, rates)
        if p < alpha:
            expected = rates[word] * target_wc
            output_table.append([word, count, p, count / expected])

    output_filename = f"{os.path.splitext(target_file)[0]}_mdw.csv"
    # Explicit UTF-8: tokens were decoded as latin-1 and may contain
    # characters the platform's default encoding cannot represent.
    with open(output_filename, 'w', newline='', encoding='utf-8') as output:
        csv.writer(output).writerows(output_table)

    print(f"Results written to {output_filename}")

In [4]:
# Run the interactive analysis: it prompts for the target .txt file, the
# stopwords .csv and the word-count threshold, then writes "<target>_mdw.csv".
calculate_fishers_exact_for_corpus()

Select a target .txt file:
1. Cummings.txt
2. pg74277.txt
3. Test.txt


Enter your choice:  1


Select a subdirectory for the stopwords csv file:
1. .ipynb_checkpoints
2. Concatenated
3. concordances
4. dispersion_plots
5. output
6. pdfs
7. plots
8. tokenized
9. __pycache__


Enter your choice:  5


Select a .csv file:
1. Cummings_adjustments.csv
2. Cummings_fulldata.csv
3. Cummings_metadata.csv
4. spellcheck_data.csv
5. stop_words.csv


Enter your choice:  5
Enter the threshold for word count (default is 5):  3


Results written to Cummings_mdw.csv
