In [3]:
import os
import nltk
import csv
from scipy.stats import fisher_exact

# Function to list subfolders in the current directory
def list_subfolders():
    """Return the names of the directories directly inside the current working directory."""
    names = []
    # os.scandir() with no argument scans the current working directory.
    with os.scandir() as entries:
        for entry in entries:
            if entry.is_dir():
                names.append(entry.name)
    return names

# Function to prompt the user to select a subfolder
def prompt_subfolder(subfolders):
    """Show a numbered folder menu and return the user's pick.

    Option 0 stands for the current working directory and is reported as
    ``None``; any other valid number returns the matching entry of
    *subfolders*. The prompt repeats until a valid number is entered.
    """
    print("0. Current Working Directory")
    for number, name in enumerate(subfolders, start=1):
        print(f"{number}. {name}")

    highest = len(subfolders)
    while True:
        try:
            picked = int(input("Select a subfolder by number: "))
        except ValueError:
            print("Please enter a number.")
            continue
        if picked < 0 or picked > highest:
            print("Invalid selection. Please try again.")
            continue
        if picked == 0:
            return None
        # Menu numbers are 1-based, list indices 0-based.
        return subfolders[picked - 1]

# Function to find text files in a subfolder
def find_text_files(subfolder):
    """Return the names of the entries in *subfolder* that end with '.txt' (case-sensitive)."""
    matches = []
    for entry_name in os.listdir(subfolder):
        if entry_name.endswith('.txt'):
            matches.append(entry_name)
    return matches

# Function to find CSV files in a subfolder
def find_csv_files(subfolder):
    """Return the names of the entries in *subfolder* that end with '.csv' (case-sensitive)."""
    matches = []
    for entry_name in os.listdir(subfolder):
        if entry_name.endswith('.csv'):
            matches.append(entry_name)
    return matches

# Function to prompt the user to select multiple text files
def prompt_text_files(text_files):
    """Show a numbered list of files and return the ones the user selects.

    The user enters one or more 1-based numbers separated by whitespace; the
    matching entries of *text_files* are returned in the order typed. The
    prompt repeats until at least one number is given and every number is in
    range.

    Args:
        text_files: List of file names to choose from.

    Returns:
        A non-empty list of selected names, or ``[]`` when *text_files* itself
        is empty (there is nothing to choose from).
    """
    if not text_files:
        # Nothing can ever be a valid selection; do not trap the user in the loop.
        return []

    for index, file in enumerate(text_files, start=1):
        print(f"{index}. {file}")
    print("Enter the numbers of the text files you want to select, separated by spaces (e.g., '1 3 5'): ")
    while True:
        try:
            choices = [int(part) for part in input().split()]
        except ValueError:
            print("Please enter numbers separated by spaces.")
            continue
        # all() is vacuously True for an empty list, so an empty reply has to be
        # rejected explicitly; otherwise callers receive [] and fail later
        # (e.g. when indexing the result with [0]).
        if choices and all(1 <= choice <= len(text_files) for choice in choices):
            return [text_files[choice - 1] for choice in choices]
        print("Invalid selection. Please try again.")

# Function to process the selected text files and combine them into a single corpus
def process_text_files(file_paths):
    """Read the given files, lower-case them and return one combined ``nltk.Text``.

    The files are concatenated in the order given, each followed by a single
    space, and the result is split with ``nltk.wordpunct_tokenize``.
    """
    pieces = []
    for path in file_paths:
        # NOTE(review): no encoding is passed, so decoding uses the platform default.
        with open(path, 'r') as handle:
            pieces.append(handle.read().lower())
    # The trailing space after every file keeps the last word of one file from
    # fusing with the first word of the next.
    corpus = "".join(piece + " " for piece in pieces)
    return nltk.Text(nltk.wordpunct_tokenize(corpus))

# Function to prompt the user to select a subset of text files for KWIC and counts
def prompt_subset_files(text_files):
    """Show a numbered list of files and return the subset to use for KWIC and counts.

    The user enters one or more 1-based numbers separated by whitespace; the
    matching entries of *text_files* are returned in the order typed. The
    prompt repeats until at least one number is given and every number is in
    range.

    Args:
        text_files: List of file names to choose from.

    Returns:
        A non-empty list of selected names, or ``[]`` when *text_files* itself
        is empty (there is nothing to choose from).
    """
    if not text_files:
        # Nothing can ever be a valid selection; do not trap the user in the loop.
        return []

    for index, file in enumerate(text_files, start=1):
        print(f"{index}. {file}")
    print("Enter the numbers of the text files you want to use for KWIC and counts, separated by spaces (e.g., '1 3 5'): ")
    while True:
        try:
            choices = [int(part) for part in input().split()]
        except ValueError:
            print("Please enter numbers separated by spaces.")
            continue
        # all() is vacuously True for an empty list, so an empty reply has to be
        # rejected explicitly; otherwise the caller would silently analyse
        # zero files.
        if choices and all(1 <= choice <= len(text_files) for choice in choices):
            return [text_files[choice - 1] for choice in choices]
        print("Invalid selection. Please try again.")

# Function to process the subset of text files for KWIC and counts
def process_subset_files(file_paths):
    """Read the given files, lower-case them and return the combined token list.

    The files are concatenated in the order given, each followed by a single
    space, and the result is split with ``nltk.wordpunct_tokenize``.
    """
    def _read_lowercased(path):
        # NOTE(review): no encoding is passed, so decoding uses the platform default.
        with open(path, 'r') as handle:
            return handle.read().lower()

    # One space after every file keeps words from adjacent files apart.
    merged = "".join(_read_lowercased(path) + " " for path in file_paths)
    return nltk.wordpunct_tokenize(merged)

def get_kwic(sometargetterm, somelistofwords, window=10, excl_target=True):
    """Collect the keyword-in-context windows around every hit of a target term.

    Args:
        sometargetterm: Term to look for (compared with ``==``).
        somelistofwords: Sequence of tokens to scan.
        window: Number of tokens to keep on each side of a hit.
        excl_target: When true, the hit itself is left out of its window.

    Returns:
        A list with one slice of *somelistofwords* per hit, in order of
        appearance. Windows are clipped at the start and end of the sequence.
    """
    total = len(somelistofwords)
    hit_positions = (
        position
        for position, word in enumerate(somelistofwords)
        if word == sometargetterm
    )

    contexts = []
    for position in hit_positions:
        start = max(0, position - window)
        end = min(position + window + 1, total)
        if not excl_target:
            contexts.append(somelistofwords[start:end])
            continue
        # Join the left and right context, skipping the hit itself.
        contexts.append(somelistofwords[start:position] + somelistofwords[position + 1:end])
    return contexts

def add_to_count_dict(word, count_dict):
    """Increment the tally for *word* in *count_dict* in place, starting at 1 for a new word."""
    count_dict[word] = count_dict.get(word, 0) + 1

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    """Return the Fisher's exact test p-value for one word's observed count.

    The observed count of *someword* in *somecountdict* is compared with the
    count expected from its rate in *someratedict*, applied to the same total
    number of words.

    Args:
        someword: Word to test; must be a key of both dictionaries.
        somecountdict: Mapping of word -> observed count.
        someratedict: Mapping of word -> expected rate.
        alternative: Passed through to ``scipy.stats.fisher_exact``.

    Returns:
        The p-value of the test.
    """
    expected_rate = someratedict[someword]
    sample_size = sum(somecountdict.values())
    observed = somecountdict[someword]
    # Expected number of occurrences in a sample of the same size.
    expected = round(expected_rate * sample_size)
    table = [
        [observed, sample_size - observed],
        [expected, sample_size - expected],
    ]
    result = fisher_exact(table, alternative=alternative)
    return result[1]

def search_concordance(text_data, tokens):
    """Write a CSV of statistically over-represented collocates for each token.

    For every target term in *tokens* the user is asked whether to exclude the
    target from its own context, for a window size, and for the files to
    search. Every word inside the keyword-in-context windows is counted, and
    its count is compared (Fisher's exact test via ``get_fishers``) with the
    count expected from the word's rate in the reference corpus *text_data*.
    Collocates that are not stopwords and whose p-value is below alpha are
    written to ``concordances/<token>_MDC.csv``.

    Relies on the module-level names ``text_files``, ``subfolder_path`` and
    ``stops`` being defined before the call.

    Args:
        text_data: Reference corpus; only its ``tokens`` attribute is read.
        tokens: Iterable of target terms.

    Raises:
        ValueError: If the alpha or window reply is not a valid number.
    """
    alpha_reply = input("Enter the value of alpha (default is 0.10): ").strip()
    alpha = float(alpha_reply) if alpha_reply else 0.10

    # Create the output directory once, up front, instead of once per token.
    directory = 'concordances'
    os.makedirs(directory, exist_ok=True)

    # Word rates of the reference corpus. The expected counts must come from
    # here: deriving the rates from the window counts themselves makes the
    # expected count equal the observed count for every word (obs/exp == 1.0
    # and an uninformative p-value).
    corpus_words = text_data.tokens
    corpus_counts = {}
    for corpus_word in corpus_words:
        add_to_count_dict(corpus_word, corpus_counts)
    corpus_size = len(corpus_words)
    global_rates = {word: count / corpus_size for word, count in corpus_counts.items()}

    # Set membership is O(1); the stopword list is consulted for every collocate.
    stop_set = set(stops)

    fieldnames = ['Collocate', 'Count', 'Frequency', 'obs/exp', 'p-value']
    for token in tokens:
        # One output file per target term.
        filename = f"{token}_MDC.csv"
        file_path = os.path.join(directory, filename)

        excl_target = input(f"Exclude target term '{token}' from concordance (yes/no)? ").strip().lower() == 'yes'
        window = int(input(f"Enter the window size for concordance for '{token}': ").strip())

        # Finish all prompting before the file is opened so an aborted prompt
        # does not leave a header-only CSV behind.
        subset_files = prompt_subset_files(text_files)
        subset_tokens = process_subset_files([os.path.join(subfolder_path, file) for file in subset_files])

        counts = {}
        for context in get_kwic(token, subset_tokens, window, excl_target):
            for context_word in context:
                add_to_count_dict(context_word, counts)
        total_wc = sum(counts.values())

        # A collocate that never occurs in the reference corpus (possible when
        # the subset contains files outside the corpus) gets rate 0.
        rates = {word: global_rates.get(word, 0.0) for word in counts}

        with open(file_path, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for word, count in sorted(counts.items(), key=lambda item: item[1], reverse=True):
                if word in stop_set:
                    continue
                p_value = get_fishers(word, counts, rates)
                if p_value < alpha:
                    frequency = count / total_wc
                    exp = rates[word] * total_wc
                    # exp is 0 only for words absent from the reference corpus.
                    obs_exp = count / exp if exp else float('inf')
                    writer.writerow({'Collocate': word, 'Count': count, 'Frequency': frequency, 'obs/exp': obs_exp, 'p-value': p_value})

        print(f'Concordance for the token "{token}" has been saved to {file_path}')

def choose_subdirectory(subdirectories):
    """Show a numbered menu of folders for the stopwords CSV and return the pick.

    Option 0 stands for the current working directory and is reported as
    ``None``; any other valid number returns the matching entry of
    *subdirectories*. The prompt repeats until a valid number is entered.

    Args:
        subdirectories: List of folder names to offer.

    Returns:
        The selected folder name, or ``None`` for the current working directory.
    """
    print("Select a subdirectory for the stopwords csv file:")
    print("0. Current Working Directory")
    for i, subdir in enumerate(subdirectories, start=1):
        # enumerate already starts at 1; the label must equal the number the
        # user types, which is mapped back with ``choice - 1`` below.
        print(f"{i}. {subdir}")
    while True:
        try:
            choice = int(input("Enter your choice: "))
        except ValueError:
            print("Please enter a number.")
            continue
        if 0 <= choice <= len(subdirectories):
            return None if choice == 0 else subdirectories[choice - 1]
        print("Invalid selection. Please try again.")

def read_stopwords(filepath):
    """Return every stopword found in the CSV file at *filepath*.

    The file is read as latin-1. Each cell is additionally split on commas
    (which matters for quoted cells that contain commas), and surrounding
    whitespace is stripped from every resulting word.
    """
    with open(filepath, 'r', encoding='latin-1') as file:
        return [
            piece.strip()
            for row in csv.reader(file)
            for cell in row
            for piece in cell.split(',')
        ]

# Interactive driver. text_files, subfolder_path and stops are kept at module
# level because search_concordance() reads them as globals.
text_data = None  # stays None until a reference corpus has actually been built

# Always show the folder menu: option 0 (current working directory) is a valid
# choice even when the directory contains no subfolders.
subfolders = list_subfolders()
selected_subfolder = prompt_subfolder(subfolders)
subfolder_path = os.getcwd() if selected_subfolder is None else os.path.join(os.getcwd(), selected_subfolder)
text_files = find_text_files(subfolder_path)
if text_files:
    print('Select .txt files to establish global word counts')
    selected_files = prompt_text_files(text_files)
    full_file_paths = [os.path.join(subfolder_path, file) for file in selected_files]
    if full_file_paths:
        text_data = process_text_files(full_file_paths)
    else:
        print("No text files selected.")
else:
    print(f"No text files found in '{subfolder_path}'.")

# Only continue when a corpus exists; otherwise text_data would be undefined
# further down and the run would end in a NameError.
if text_data is not None:
    user_input = input("Enter words to find their collocate concordances (separated by spaces): ").lower()
    # split() without an argument discards the empty strings that split(' ')
    # produces for repeated or trailing spaces.
    tokens = user_input.split()

    if not tokens:
        print("No words entered; nothing to search.")
    else:
        stopwords_subfolders = list_subfolders()
        selected_stopwords_subfolder = choose_subdirectory(stopwords_subfolders)
        stopwords_subfolder_path = os.getcwd() if selected_stopwords_subfolder is None else os.path.join(os.getcwd(), selected_stopwords_subfolder)
        stopwords_files = find_csv_files(stopwords_subfolder_path)
        if stopwords_files:
            chosen_stopwords_files = prompt_text_files(stopwords_files)
            if chosen_stopwords_files:
                # Only the first selected file is used as the stopword list.
                chosen_stopwords_file = chosen_stopwords_files[0]
                chosen_csv_file_path = os.path.join(stopwords_subfolder_path, chosen_stopwords_file)
                stops = read_stopwords(chosen_csv_file_path)
                search_concordance(text_data, tokens)
            else:
                print("No stopwords file selected.")
        else:
            print(f"No stopwords files found in '{stopwords_subfolder_path}'.")

0. Current Working Directory
1. tokenized
2. concordances
3. final
4. .ipynb_checkpoints


Select a subfolder by number:  1


Select .txt files to establish global word counts
1. Démonomanie Repair_corrected_underscore_bigrams.txt
2. Démonomanie Repair_corrected.txt
Enter the numbers of the text files you want to select, separated by spaces (e.g., '1 3 5'): 


 1 2
Enter words to find their collocate concordances (separated by spaces):  malade


Select a subdirectory for the stopwords csv file:
0. Current Working Directory
2. tokenized
3. concordances
4. final
5. .ipynb_checkpoints


Enter your choice:  0


1. spellcheck_data.csv
2. stop_words.csv
Enter the numbers of the text files you want to select, separated by spaces (e.g., '1 3 5'): 


 2
Enter the value of alpha (default is 0.10):  0.95
Exclude target term 'malade' from concordance (yes/no)?  no
Enter the window size for concordance for 'malade':  10


1. Démonomanie Repair_corrected_underscore_bigrams.txt
2. Démonomanie Repair_corrected.txt
Enter the numbers of the text files you want to use for KWIC and counts, separated by spaces (e.g., '1 3 5'): 


 2


Concordance for the token "malade" has been saved to concordances/malade_MDC.csv
