In [1]:
import os
import csv
import nltk
from scipy.stats import fisher_exact
from openpyxl import Workbook, load_workbook

# Function to prompt the user to select text files based on patterns
def prompt_pattern_files(text_files, pattern):
    selected_files = [file for file in text_files if file.startswith(pattern)]
    return selected_files

def prompt_files(text_files, purpose):
    text_files = sorted(text_files, key=custom_file_sort_key)  # Custom sort for files
    print(f"Select the files for {purpose}:")
    for i, file in enumerate(text_files, start=1):
        print(f"{i}. {file}")
    
    selection = input("Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files: ").strip()
    selected_files = []
    
    if selection.lower() == 'all':
        selected_files = text_files
    else:
        # Split the input by commas to handle multiple ranges or numbers
        parts = selection.split(',')
        for part in parts:
            part = part.strip()
            if '-' in part:  # If the part is a range
                try:
                    start, end = map(int, part.split('-'))
                    selected_files.extend(text_files[start-1:end])
                except ValueError:
                    print(f"Invalid range: {part}. Please provide ranges like '1-3'.")
            elif part.isdigit():  # If the part is a single number
                try:
                    selected_files.append(text_files[int(part) - 1])
                except IndexError:
                    print(f"Invalid number: {part}. Please select numbers from the list.")
            else:  # If the part is treated as a pattern
                selected_files.extend(prompt_pattern_files(text_files, part))
    
    # Remove duplicates and sort the selected files
    selected_files = sorted(set(selected_files), key=custom_file_sort_key)
    
    print("Selected files:")
    for file in selected_files:
        print(file)
    
    return selected_files

# Custom sort key for file names
def custom_file_sort_key(filename):
    # Prioritize 'preface' higher than patterns like 'I.1'
    if 'preface' in filename.lower():
        return ('', filename.lower())  # Sort 'preface' first
    return (filename.lower(),)

# Function to process text files
def process_text_files(file_paths):
    combined_text = ""
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            combined_text += file.read().lower() + " "
    tokens = nltk.wordpunct_tokenize(combined_text)
    return nltk.Text(tokens)

# Function to find .txt files in a directory
def find_text_files(directory):
    return sorted([f for f in os.listdir(directory) if f.endswith('.txt')], key=custom_file_sort_key)  # Custom sort

# Function to list subfolders in the current directory
def list_subfolders():
    return sorted([f.name for f in os.scandir() if f.is_dir()])  # Sort folders alphabetically

# Function to prompt the user to select a subfolder or the current directory
def prompt_subfolder(subfolders):
    print("Select a subfolder or the current working directory:")
    print("0. Current Working Directory")
    for i, subfolder in enumerate(subfolders, start=1):
        print(f"{i}. {subfolder}")
    selected_index = int(input("Enter the number of the subfolder: "))
    return None if selected_index == 0 else subfolders[selected_index - 1]

# Function to get predefined target words
def get_predefined_target_words():
    return [
        ['citoyen', 'cour', 'domain', 'ressort'],  # List 1
        ['guerre', 'paix', 'police' 'religion'],  # List 2
        ['confess', 'demon', 'demoniaqu', 'diabl',
         'diabol', 'dieu', 'divin', 
        'hebrieu', 'impiet', 'preuv', 'question',   'sathan', 
        'sorceller', 'sorci',  'statut', 'sujet'],  # List 3
        ['arrest',  'conseil', 'conseiller', 'consul', 
         'couron', 'édict', 'iurisdict', 'jug', 'magistrat',
         'offic', 'offici', 'ordon', 'parlement',
        'seigneur', 'seigneurial', 'statut'],  # List 4
        ['absolu', 'bien', 'chos', 'civil', 'droit', 'estat', 'just', 'justic',
         'loi', 'maiest', 'princ', 'puissanc',
        'republ', 'roy', 'royal', 'royaum', 'souverain', 'souverainet', 'sujet']  # List 5
    ]

# Function to choose subdirectory for stopwords csv file
def choose_subdirectory(subdirectories):
    print("Select a subdirectory:")
    print("0. Current Working Directory")
    for i, subdir in enumerate(subdirectories, start=1):
        print(f"{i}. {subdir}")
    while True:
        try:
            choice = int(input("Enter your choice: "))
            if 0 <= choice <= len(subdirectories):
                return None if choice == 0 else subdirectories[choice - 1]
            else:
                print("Invalid selection. Please try again.")
        except ValueError:
            print("Please enter a number.")

# Function to read stopwords from a csv file
def read_stopwords(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        stopwords = []
        for row in reader:
            for word in row:
                stopwords.extend(word.split(','))
        return [word.strip() for word in stopwords]

# Function to find .csv files in a directory
def find_csv_files(directory):
    return sorted([f for f in os.listdir(directory) if f.endswith('.csv')])  # Sort files alphabetically

# Function to select files for the stopwords
def select_stopwords_file():
    print('Stopwords file selection')
    stopwords_subfolders = list_subfolders()
    selected_stopwords_subfolder = choose_subdirectory(stopwords_subfolders)
    
    # Check if the user selected a subfolder or the current directory
    stopwords_subfolder_path = os.getcwd() if selected_stopwords_subfolder is None else os.path.join(os.getcwd(), selected_stopwords_subfolder)
    
    # Find .csv files in the selected directory
    stopwords_files = find_csv_files(stopwords_subfolder_path)
    if stopwords_files:
        print('Select a stopwords file from the following list:')
        # Prompt the user to select a single .csv file
        selected_file = prompt_files(stopwords_files, "stopwords file")
        if selected_file:
            return selected_file[0], stopwords_subfolder_path  # Return the first selected file and its path
        else:
            print("No stopwords file selected.")
            return None, None
    else:
        print(f"No .csv stopwords files found in '{selected_stopwords_subfolder}'.")
        return None, None

# Function to select files for the rate dictionary
def select_rate_dictionary_files():
    print('Rate dictionary file selection')
    rate_dictionary_subfolders = list_subfolders()
    selected_rate_dictionary_subfolder = choose_subdirectory(rate_dictionary_subfolders)
    
    # Check if the user selected a subfolder or the current directory
    rate_dictionary_subfolder_path = os.getcwd() if selected_rate_dictionary_subfolder is None else os.path.join(os.getcwd(), selected_rate_dictionary_subfolder)
    
    # Find .txt files in the selected directory
    rate_dictionary_files = find_text_files(rate_dictionary_subfolder_path)
    if rate_dictionary_files:
        print('Select one or more rate dictionary files from the following list:')
        selected_files = prompt_files(rate_dictionary_files, "rate dictionary")
        if selected_files:
            return selected_files, rate_dictionary_subfolder_path  # Return the selected files and their path
        else:
            print("No rate dictionary files selected.")
            return [], None
    else:
        print(f"No .txt rate dictionary files found in '{selected_rate_dictionary_subfolder}'.")
        return [], None

def select_existing_xlsx_file():
    print("Select a directory to search for .xlsx files:")
    subfolders = list_subfolders()
    selected_subfolder = choose_subdirectory(subfolders)

    # Check if the user selected a subfolder or the current directory
    folder_path = os.getcwd() if selected_subfolder is None else os.path.join(os.getcwd(), selected_subfolder)

    # Find .xlsx files in the selected directory
    xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
    if xlsx_files:
        print("Select an existing .xlsx file from the following list:")
        for i, file in enumerate(xlsx_files, start=1):
            print(f"{i}. {file}")
        while True:
            try:
                choice = int(input("Enter the number of the file you want to select or 0 to cancel: "))
                if 0 <= choice <= len(xlsx_files):
                    return None if choice == 0 else os.path.join(folder_path, xlsx_files[choice - 1])
                else:
                    print("Invalid selection. Please try again.")
            except ValueError:
                print("Please enter a number.")
    else:
        print(f"No .xlsx files found in '{folder_path}'.")
        return None

# Utility function to clean file names
def clean_file_name(file_name):
    """
    Clean the file name for display, including replacing specific patterns.
    """
    file_name = file_name.replace('_', '').replace('corrected', '').replace('stemmed', '')
    if 'Démonomanie' in file_name:
        file_name = file_name.replace('Démonomanie', 'Dém')
    if 'République' in file_name:
        file_name = file_name.replace('République', 'Rép')
   
    # Replace '911' with '11' and '910' with '10' (NEW CHANGE)
    file_name = file_name.replace('911', '11').replace('910', '10')  # <--- CHANGE HERE
   
    return os.path.splitext(file_name)[0]

# Function to process the subset of text files for KWIC and counts
def process_subset_files(file_paths):
    combined_text = ""
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            file_contents = f.read().lower()
            combined_text += file_contents + " "
    tokens = nltk.wordpunct_tokenize(combined_text)
    return tokens

def get_kwic(sometargetterm, somelistofwords, window=10, excl_target=True, source_file=None):
    kwics = []
    for n, w in enumerate(somelistofwords):
        if w == sometargetterm:
            start = max(0, n - window)
            end = min(n + window + 1, len(somelistofwords))
            if excl_target:
                k = somelistofwords[start:n] + somelistofwords[n + 1:end]
            else:
                k = somelistofwords[start:end]
            kwics.append((k, source_file))
    return kwics

def add_to_count_dict(word, count_dict):
    if word in count_dict:
        count_dict[word] += 1
    else:
        count_dict[word] = 1

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    r = someratedict[someword]
    wc = sum(somecountdict.values())
    a = somecountdict[someword]
    b = wc - a
    c = round(r * wc)
    d = wc - c
    p = fisher_exact([[a, b], [c, d]], alternative=alternative)[1]
    return p

# Modified filter_collocates function
def filter_collocates(collocates, collocate_counts):
    """
    Filter the list of significant collocates based on user selection.

    Parameters:
        collocates (list): A sorted list of significant collocates.
        collocate_counts (dict): A dictionary of collocates and their counts.

    Returns:
        list: A filtered list of collocates selected by the user.
    """
    while True:
        try:
            min_count = int(input("Enter the minimum count threshold for collocates to include: ").strip())
            break
        except ValueError:
            print("Invalid input. Please enter a valid number.")

    filtered_collocates = [collocate for collocate in collocates if collocate_counts[collocate] >= min_count]
    print(f"\nCollocates with counts >= {min_count}:")
    for i, collocate in enumerate(filtered_collocates, start=1):
        print(f"{i}. {collocate} (Count: {collocate_counts[collocate]})")

    print("\nYou can select collocates by entering:")
    print("- A single number (e.g., 3) to select one collocate.")
    print("- A range of numbers (e.g., 3-6) to select multiple collocates.")
    print("- Multiple selections separated by commas (e.g., 3,5-7,9).")
    print("- Type 'all' to select all collocates.")
    print("- Type 'done' to finalize your selection.")

    selected_collocates = []

    while True:
        selection = input("Enter your selection: ").strip()
        if selection.lower() == 'done':
            break
        elif selection.lower() == 'all':
            selected_collocates = filtered_collocates
            break

        try:
            parts = selection.split(',')
            for part in parts:
                part = part.strip()
                if '-' in part:  # Handle ranges
                    start, end = map(int, part.split('-'))
                    selected_collocates.extend(filtered_collocates[start-1:end])
                elif part.isdigit():  # Handle single numbers
                    selected_collocates.append(filtered_collocates[int(part) - 1])
                else:
                    print(f"Invalid selection: {part}. Please try again.")
        except (ValueError, IndexError):
            print(f"Invalid input: {selection}. Please try again.")

        # Remove duplicates and sort the selected collocates
        selected_collocates = sorted(set(selected_collocates), key=filtered_collocates.index)

        print("Currently selected collocates:")
        for collocate in selected_collocates:
            print(collocate)

    return selected_collocates, min_count

def search_concordance(text_data, predefined_word_lists, stops, alpha):
    window = int(input(f"Enter the window size for concordance: ").strip())

    subset_files = prompt_files(find_text_files(os.getcwd()), "KWIC and key word counts")

    # Allow users to choose an existing .xlsx file or create a new one
    append_to_existing = input("Do you want to append results to an existing .xlsx file? (yes/no): ").strip().lower()

    if append_to_existing == 'yes':
        existing_file = select_existing_xlsx_file()
        if existing_file:
            wb = load_workbook(existing_file)
            print(f"Appending to existing file: {existing_file}")
        else:
            print("No existing workbook selected. Creating a new workbook instead.")
            wb = Workbook()
    else:
        wb = Workbook()
        print("Creating a new workbook.")

    for index, predefined_words in enumerate(predefined_word_lists):
        # Allow users to skip processing a predefined word list
        skip = input(f"Do you want to skip processing Hypothesis {index + 1}? (yes/no): ").strip().lower()
        if skip == 'yes':
            print(f"Skipping Hypothesis {index + 1}.")
            continue

        ws = wb.create_sheet(title=f"Hypothesis {index + 1}")

        # Write headers
        headers = ['Word'] + [clean_file_name(file) for file in subset_files] + ['Total']
        ws.append(headers)

        # Sort rows (words) alphabetically
        predefined_words = sorted(predefined_words)

        all_significant_collocates = set()
        collocate_counts = {}
        for word in predefined_words:
            for file in subset_files:
                file_path = os.path.join(os.getcwd(), file)
                tokens = process_subset_files([file_path])
                kwics = get_kwic(word, tokens, window)
                for k in kwics:
                    for w in k[0]:  # Access the first element of the tuple
                        if w not in stops:
                            all_significant_collocates.add(w)
                            add_to_count_dict(w, collocate_counts)

        all_significant_collocates = sorted(all_significant_collocates)
        selected_collocates, min_count = filter_collocates(all_significant_collocates, collocate_counts)

        for word in predefined_words:
            row = [word]
            word_total = 0
            for file in subset_files:
                file_path = os.path.join(os.getcwd(), file)
                tokens = process_subset_files([file_path])
                kwics = get_kwic(word, tokens, window)
                count = sum(1 for k in kwics if any(w in selected_collocates for w in k[0]))
                row.append(count)
                word_total += count
            row.append(word_total)
            ws.append(row)

        # Add total row
        total_row = ['Total'] + [sum(ws.cell(row=i + 2, column=j + 2).value for i in range(len(predefined_words))) for j in range(len(subset_files))] + [sum(ws.cell(row=i + 2, column=len(subset_files) + 2).value for i in range(len(predefined_words)))]
        ws.append(total_row)

        # Add blank line and selected collocates
        ws.append([])
        ws.append(['Selected Significant Collocates:'])
        for i in range(0, len(selected_collocates), 10):
            ws.append(selected_collocates[i:i + 10])
       
        # Add a blank line and display alpha (p-value threshold), window size, and minimum count threshold
        ws.append([])
        ws.append(['p-value threshold:', alpha])
        ws.append(['window size:', window])
        ws.append(['minimum count threshold:', min_count])

        # Option to output results after processing each list
        output_now = input(f"Do you want to save the results for Hypothesis {index + 1} now? (yes/no): ").strip().lower()
        if output_now == 'yes':
            if 'concordances' not in os.listdir():
                os.mkdir('concordances')
            wb.save(os.path.join('concordances', 'distinct_collocates.xlsx'))
            print(f"Results up to Hypothesis {index + 1} saved to concordances/distinct_collocates.xlsx.")

        # Option to exit after processing each hypothesis
        exit_now = input("Do you want to exit after this hypothesis? (yes/no): ").strip().lower()
        if exit_now == 'yes':
            print("Exiting the program.")
            return

    # Remove the default sheet if it exists
    if 'Sheet' in wb.sheetnames:
        del wb['Sheet']

    # Save the workbook
    if 'concordances' not in os.listdir():
        os.mkdir('concordances')
    wb.save(os.path.join('concordances', 'distinct_collocates.xlsx'))
    print(f'Concordance has been saved to concordances/distinct_collocates.xlsx.')
    
# Example usage
use_predefined = input("Do you want to use predefined target word lists (yes/no)? ").strip().lower() == 'yes'
if use_predefined:
    predefined_word_lists = get_predefined_target_words()
else:
    predefined_word_lists = [input("Enter words for a group separated by spaces: ").strip().split() for _ in range(5)]

alpha = float(input("Enter the value for alpha: ").strip())

stopwords_file, stopwords_path = select_stopwords_file()
if stopwords_file:
    stops = read_stopwords(os.path.join(stopwords_path, stopwords_file))
    rate_dictionary_files, rate_dictionary_path = select_rate_dictionary_files()
    if rate_dictionary_files:
        search_concordance(rate_dictionary_files, predefined_word_lists, stops, alpha)
    else:
        print("No rate dictionary files selected.")
else:
    print("No stopwords file selected.") 

Do you want to use predefined target word lists (yes/no)?  yes
Enter the value for alpha:  0.10


Stopwords file selection
Select a subdirectory:
0. Current Working Directory
1. .ipynb_checkpoints
2. concordances


Enter your choice:  0


Select a stopwords file from the following list:
Select the files for stopwords file:
1. stop_words.csv


Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files:  1


Selected files:
stop_words.csv
Rate dictionary file selection
Select a subdirectory:
0. Current Working Directory
1. .ipynb_checkpoints
2. concordances


Enter your choice:  0


Select one or more rate dictionary files from the following list:
Select the files for rate dictionary:
1. Démonomanie preface Repair_corrected_stemmed.txt
2. République preface_corrected_stemmed.txt
3. Discours des raisons_corrected_stemmed.txt
4. Démonomanie I.1_corrected_stemmed.txt
5. Démonomanie I.2_corrected_stemmed.txt
6. Démonomanie I.3_corrected_stemmed.txt
7. Démonomanie I.4_corrected_stemmed.txt
8. Démonomanie I.5_corrected_stemmed.txt
9. Démonomanie I.6_corrected_stemmed.txt
10. Démonomanie I.7_corrected_stemmed.txt
11. Démonomanie II.1_corrected_stemmed.txt
12. Démonomanie II.2_corrected_stemmed.txt
13. Démonomanie II.3_corrected_stemmed.txt
14. Démonomanie II.4_corrected_stemmed.txt
15. Démonomanie II.5_corrected_stemmed.txt
16. Démonomanie II.6_corrected_stemmed.txt
17. Démonomanie II.7_corrected_stemmed.txt
18. Démonomanie II.8_corrected_stemmed.txt
19. Démonomanie III.1_corrected_stemmed.txt
20. Démonomanie III.2_corrected_stemmed.txt
21. Démonomanie III.3_corrected_st

Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files:  all


Selected files:
Démonomanie preface Repair_corrected_stemmed.txt
République preface_corrected_stemmed.txt
Discours des raisons_corrected_stemmed.txt
Démonomanie I.1_corrected_stemmed.txt
Démonomanie I.2_corrected_stemmed.txt
Démonomanie I.3_corrected_stemmed.txt
Démonomanie I.4_corrected_stemmed.txt
Démonomanie I.5_corrected_stemmed.txt
Démonomanie I.6_corrected_stemmed.txt
Démonomanie I.7_corrected_stemmed.txt
Démonomanie II.1_corrected_stemmed.txt
Démonomanie II.2_corrected_stemmed.txt
Démonomanie II.3_corrected_stemmed.txt
Démonomanie II.4_corrected_stemmed.txt
Démonomanie II.5_corrected_stemmed.txt
Démonomanie II.6_corrected_stemmed.txt
Démonomanie II.7_corrected_stemmed.txt
Démonomanie II.8_corrected_stemmed.txt
Démonomanie III.1_corrected_stemmed.txt
Démonomanie III.2_corrected_stemmed.txt
Démonomanie III.3_corrected_stemmed.txt
Démonomanie III.4_corrected_stemmed.txt
Démonomanie III.5_corrected_stemmed.txt
Démonomanie III.6_corrected_stemmed.txt
Démonomanie IV.1_corrected_stemme

Enter the window size for concordance:  15


Select the files for KWIC and key word counts:
1. Démonomanie preface Repair_corrected_stemmed.txt
2. République preface_corrected_stemmed.txt
3. Discours des raisons_corrected_stemmed.txt
4. Démonomanie I.1_corrected_stemmed.txt
5. Démonomanie I.2_corrected_stemmed.txt
6. Démonomanie I.3_corrected_stemmed.txt
7. Démonomanie I.4_corrected_stemmed.txt
8. Démonomanie I.5_corrected_stemmed.txt
9. Démonomanie I.6_corrected_stemmed.txt
10. Démonomanie I.7_corrected_stemmed.txt
11. Démonomanie II.1_corrected_stemmed.txt
12. Démonomanie II.2_corrected_stemmed.txt
13. Démonomanie II.3_corrected_stemmed.txt
14. Démonomanie II.4_corrected_stemmed.txt
15. Démonomanie II.5_corrected_stemmed.txt
16. Démonomanie II.6_corrected_stemmed.txt
17. Démonomanie II.7_corrected_stemmed.txt
18. Démonomanie II.8_corrected_stemmed.txt
19. Démonomanie III.1_corrected_stemmed.txt
20. Démonomanie III.2_corrected_stemmed.txt
21. Démonomanie III.3_corrected_stemmed.txt
22. Démonomanie III.4_corrected_stemmed.txt
23.

Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files:  Dém,Rép


Selected files:
Démonomanie preface Repair_corrected_stemmed.txt
République preface_corrected_stemmed.txt
Démonomanie I.1_corrected_stemmed.txt
Démonomanie I.2_corrected_stemmed.txt
Démonomanie I.3_corrected_stemmed.txt
Démonomanie I.4_corrected_stemmed.txt
Démonomanie I.5_corrected_stemmed.txt
Démonomanie I.6_corrected_stemmed.txt
Démonomanie I.7_corrected_stemmed.txt
Démonomanie II.1_corrected_stemmed.txt
Démonomanie II.2_corrected_stemmed.txt
Démonomanie II.3_corrected_stemmed.txt
Démonomanie II.4_corrected_stemmed.txt
Démonomanie II.5_corrected_stemmed.txt
Démonomanie II.6_corrected_stemmed.txt
Démonomanie II.7_corrected_stemmed.txt
Démonomanie II.8_corrected_stemmed.txt
Démonomanie III.1_corrected_stemmed.txt
Démonomanie III.2_corrected_stemmed.txt
Démonomanie III.3_corrected_stemmed.txt
Démonomanie III.4_corrected_stemmed.txt
Démonomanie III.5_corrected_stemmed.txt
Démonomanie III.6_corrected_stemmed.txt
Démonomanie IV.1_corrected_stemmed.txt
Démonomanie IV.2_corrected_stemmed.tx

KeyboardInterrupt: Interrupted by user

In [None]:
 modify this code so that it allows users to input a unique name to save the .xlsx file under in the concordances folder.

In [None]:
def search_concordance(text_data, predefined_word_lists, stops, alpha):
    window = int(input(f"Enter the window size for concordance: ").strip())
    subset_files = prompt_files(find_text_files(os.getcwd()), "KWIC and key word counts")

    append_to_existing = input("Do you want to append results to an existing .xlsx file? (yes/no): ").strip().lower()
    if append_to_existing == 'yes':
        existing_file = select_existing_xlsx_file()
        if existing_file:
            wb = load_workbook(existing_file)
            print(f"Appending to existing file: {existing_file}")
        else:
            print("No existing workbook selected. Creating a new workbook instead.")
            wb = Workbook()
            if 'Sheet' in wb.sheetnames:
                del wb['Sheet']
    else:
        wb = Workbook()
        print("Creating a new workbook.")
        if 'Sheet' in wb.sheetnames:
            del wb['Sheet']

    if append_to_existing == 'yes' and existing_file:
        output_filepath = existing_file
    else:
        output_filename = input("Enter a unique name for the .xlsx file (without extension): ").strip()
        if not output_filename:
            output_filename = "distinct_collocates"
        output_filepath = os.path.join("concordances", f"{output_filename}.xlsx")

    for index, predefined_words in enumerate(predefined_word_lists):
        skip = input(f"Do you want to skip processing Hypothesis {index + 1}? (yes/no): ").strip().lower()
        if skip == 'yes':
            print(f"Skipping Hypothesis {index + 1}.")
            continue

        print(f"Processing Hypothesis {index + 1}.")

        ws = wb.create_sheet(title=f"Hypothesis {index + 1}")
        headers = ['Word'] + [clean_file_name(file) for file in subset_files] + ['Total']
        ws.append(headers)

        predefined_words = sorted(predefined_words)
        keyword_pairs = generate_keyword_pairs(predefined_words)

        # --- 1. Gather KWIC collocates for all files ---
        counts_by_file       = {}   # will map filename → {token → count}
        pair_counts_by_file  = {}   # will map filename → {(kw1,kw2) → count}
        
        for file in subset_files:
            file_path = os.path.join(os.getcwd(), file)
            subset_tokens = process_subset_files([file_path])
        
            counts     = {}   # single‐token collocate counts for THIS file
            pair_counts = {}  # keyword–keyword pair counts for THIS file
        
            for word in predefined_words:
                kwics = get_kwic(word, subset_tokens, window, source_file=file)
                for window_tokens, _ in kwics:
                    unique = set(window_tokens)   # dedupe within each window
                    for collocate in unique:
                        if collocate not in stops and collocate != word:
                            # 1) count every single‐word collocate
                            add_to_count_dict(collocate, counts)
        
                            # 2) if that collocate is another hypothesis‐keyword,
                            #    record the pair (word, collocate) separately
                            if collocate in predefined_words:
                                add_to_count_dict((word, collocate), pair_counts)

    # store this file’s two dicts
    counts_by_file[file]      = counts
    pair_counts_by_file[file] = pair_counts

        # --- 2. Per-file Fisher tests against the external rate-dictionary ---
        # build expected rates once
        ref_tokens = process_subset_files([
            os.path.join(rate_dictionary_path, f) for f in text_data
        ])
        ref_counts = {}
        for tok in ref_tokens:
            if tok not in stops and tok not in predefined_words:
                add_to_count_dict(tok, ref_counts)
        ref_total = sum(ref_counts.values())
        if ref_total == 0:
            raise ValueError("Rate dictionary is empty after stopword/keyword filtering!")
        expected_rates = {w: ref_counts.get(w, 0) / ref_total for w in ref_counts}
        
        # test each file separately, collect all significant collocates
        all_significant_collocates = set()
        for file, counts in counts_by_file.items():
            file_total = sum(counts.values())
            if file_total == 0:
                continue
            for collocate, obs in counts.items():
                exp_rate = expected_rates.get(collocate, 0.0)
                exp_count = round(exp_rate * file_total)
                a, b = obs, file_total - obs
                c, d = exp_count, file_total - exp_count
                p_value = fisher_exact([[a, b], [c, d]], alternative='greater')[1]
                if p_value < alpha:
                    all_significant_collocates.add(collocate)
                    collocate_counts[collocate] = collocate_counts.get(collocate, 0) + obs
                
        # --- 3. Let user filter significant collocates ---
        all_significant_collocates = sorted(all_significant_collocates)
            print(f"\n=== Hypothesis {index+1} target words ===")
            print(", ".join(predefined_words))
            print("=======================================\n")
        selected_collocates, min_count = filter_collocates_with_removal(all_significant_collocates, collocate_counts)

        # --- 4. Write results per keyword to worksheet ---
        for word in predefined_words:
            row = [word]
            word_total = 0
            for file in subset_files:
                file_collocates = set()
                file_counts = counts_by_file[file]
                for collocate in selected_collocates:
                    if collocate in file_counts:
                        file_collocates.add(collocate)
                count = len(file_collocates)
                row.append(count)
                word_total += count
            row.append(word_total)
            ws.append(row)

        total_row = ['Total'] + [
            sum(ws.cell(row=i + 2, column=j + 2).value or 0 for i in range(len(predefined_words)))
            for j in range(len(subset_files))
        ] + [
            sum(ws.cell(row=i + 2, column=len(subset_files) + 2).value or 0 for i in range(len(predefined_words)))
        ]
        ws.append(total_row)

        # --- 5. Write selected collocates and pairs ---
        ws.append([])
        ws.append(['Selected Significant Collocates:'])
        for i in range(0, len(selected_collocates), 10):
            ws.append(selected_collocates[i:i + 10])

        # Optional: Write keyword pairs for reference
        pair_collocates = [
            f"{pair[0]} + {pair[1]}" for pair in keyword_pairs
            if pair[0] in selected_collocates and pair[1] in selected_collocates
        ]
        if pair_collocates:
            ws.append([])
            ws.append(['Keyword Pairs (Target + Significant Collocate):'])
            for i in range(0, len(pair_collocates), 5):
                ws.append(pair_collocates[i:i + 5])

        ws.append([])
        ws.append(['p-value threshold:', alpha])
        ws.append(['window size:', window])
        ws.append(['minimum count threshold:', min_count])

        output_now = input(f"Do you want to save the results for Hypothesis {index + 1} now? (yes/no): ").strip().lower()
        if output_now == 'yes':
            if 'concordances' not in os.listdir():
                os.mkdir('concordances')
            wb.save(output_filepath)
            print(f"Results up to Hypothesis {index + 1} saved to {output_filepath}.")

        exit_now = input("Do you want to exit after this hypothesis? (yes/no): ").strip().lower()
        if exit_now == 'yes':
            print("Exiting the program.")
            return

    # Remove default 'Sheet' if present
    if 'Sheet' in wb.sheetnames:
        del wb['Sheet']

    if 'concordances' not in os.listdir():
        os.mkdir('concordances')
    wb.save(output_filepath)
    print(f'Concordance has been saved to {output_filepath}.')

In [None]:
import os
import csv
import nltk
from scipy.stats import fisher_exact
from openpyxl import Workbook, load_workbook
from itertools import combinations

# Function to prompt the user to select text files based on patterns
def prompt_pattern_files(text_files, pattern):
    selected_files = [file for file in text_files if file.startswith(pattern)]
    return selected_files

def generate_keyword_pairs(predefined_words):
    """
    Generate all possible combinations of keyword pairs from the predefined word list.
    """
    return list(combinations(predefined_words, 2))

def prompt_files(text_files, purpose):
    text_files = sorted(text_files, key=custom_file_sort_key)  # Custom sort for files
    print(f"Select the files for {purpose}:")
    for i, file in enumerate(text_files, start=1):
        print(f"{i}. {file}")
    
    selection = input("Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files: ").strip()
    selected_files = []
    
    if selection.lower() == 'all':
        selected_files = text_files
    else:
        # Split the input by commas to handle multiple ranges or numbers
        parts = selection.split(',')
        for part in parts:
            part = part.strip()
            if '-' in part:  # If the part is a range
                try:
                    start, end = map(int, part.split('-'))
                    selected_files.extend(text_files[start-1:end])
                except ValueError:
                    print(f"Invalid range: {part}. Please provide ranges like '1-3'.")
            elif part.isdigit():  # If the part is a single number
                try:
                    selected_files.append(text_files[int(part) - 1])
                except IndexError:
                    print(f"Invalid number: {part}. Please select numbers from the list.")
            else:  # If the part is treated as a pattern
                selected_files.extend(prompt_pattern_files(text_files, part))
    
    # Remove duplicates and sort the selected files
    selected_files = sorted(set(selected_files), key=custom_file_sort_key)
    
    print("Selected files:")
    for file in selected_files:
        print(file)
    
    return selected_files

# Custom sort key for file names
def custom_file_sort_key(filename):
    # Prioritize 'preface' higher than patterns like 'I.1'
    if 'preface' in filename.lower():
        return ('', filename.lower())  # Sort 'preface' first
    return (filename.lower(),)

# Function to process text files
def process_text_files(file_paths):
    combined_text = ""
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            combined_text += file.read().lower() + " "
    tokens = nltk.wordpunct_tokenize(combined_text)
    return nltk.Text(tokens)

# Function to find .txt files in a directory
def find_text_files(directory):
    return sorted([f for f in os.listdir(directory) if f.endswith('.txt')], key=custom_file_sort_key)  # Custom sort

# Function to list subfolders in the current directory
def list_subfolders():
    return sorted([f.name for f in os.scandir() if f.is_dir()])  # Sort folders alphabetically

# Function to prompt the user to select a subfolder or the current directory
def prompt_subfolder(subfolders):
    print("Select a subfolder or the current working directory:")
    print("0. Current Working Directory")
    for i, subfolder in enumerate(subfolders, start=1):
        print(f"{i}. {subfolder}")
    selected_index = int(input("Enter the number of the subfolder: "))
    return None if selected_index == 0 else subfolders[selected_index - 1]

# Function to get predefined target words
def get_predefined_target_words():
    return [
        ['citoyen', 'cour', 'domain', 'ressort'],  # List 1
        ['guerr', 'paix', 'police', 'religion'],  # List 2
        ['confess', 'demon', 'demoniaqu', 'diabl',
         'diabol', 'dieu', 'divin', 
        'hebrieu', 'impiet', 'preuv', 'question',   'sathan', 
        'sorceller', 'sorci',  'statut', 'sujet'],  # List 3
        ['arrest',  'conseil', 'conseiller', 'consul', 
         'couron', 'édict', 'iurisdict', 'jug', 'magistrat',
         'offic', 'offici', 'ordon', 'parlement',
        'seigneur', 'seigneurial', 'statut'],  # List 4
        ['absolu', 'bien', 'chos', 'civil', 'droit', 'estat', 'just', 'justic',
         'loi', 'maiest', 'princ', 'puissanc',
        'republ', 'roy', 'royal', 'royaum', 'souverain', 'souverainet', 'sujet']  # List 5
    ]

# Function to choose subdirectory for stopwords csv file
def choose_subdirectory(subdirectories):
    print("Select a subdirectory:")
    print("0. Current Working Directory")
    for i, subdir in enumerate(subdirectories, start=1):
        print(f"{i}. {subdir}")
    while True:
        try:
            choice = int(input("Enter your choice: "))
            if 0 <= choice <= len(subdirectories):
                return None if choice == 0 else subdirectories[choice - 1]
            else:
                print("Invalid selection. Please try again.")
        except ValueError:
            print("Please enter a number.")

# Function to read stopwords from a csv file
def read_stopwords(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        stopwords = []
        for row in reader:
            for word in row:
                stopwords.extend(word.split(','))
        return [word.strip() for word in stopwords]

# Function to find .csv files in a directory
def find_csv_files(directory):
    return sorted([f for f in os.listdir(directory) if f.endswith('.csv')])  # Sort files alphabetically

# Function to select files for the stopwords
def select_stopwords_file():
    print('Stopwords file selection')
    stopwords_subfolders = list_subfolders()
    selected_stopwords_subfolder = choose_subdirectory(stopwords_subfolders)
    
    # Check if the user selected a subfolder or the current directory
    stopwords_subfolder_path = os.getcwd() if selected_stopwords_subfolder is None else os.path.join(os.getcwd(), selected_stopwords_subfolder)
    
    # Find .csv files in the selected directory
    stopwords_files = find_csv_files(stopwords_subfolder_path)
    if stopwords_files:
        print('Select a stopwords file from the following list:')
        # Prompt the user to select a single .csv file
        selected_file = prompt_files(stopwords_files, "stopwords file")
        if selected_file:
            return selected_file[0], stopwords_subfolder_path  # Return the first selected file and its path
        else:
            print("No stopwords file selected.")
            return None, None
    else:
        print(f"No .csv stopwords files found in '{selected_stopwords_subfolder}'.")
        return None, None

# Function to select files for the rate dictionary
def select_rate_dictionary_files():
    print('Rate dictionary file selection')
    rate_dictionary_subfolders = list_subfolders()
    selected_rate_dictionary_subfolder = choose_subdirectory(rate_dictionary_subfolders)
    
    # Check if the user selected a subfolder or the current directory
    rate_dictionary_subfolder_path = os.getcwd() if selected_rate_dictionary_subfolder is None else os.path.join(os.getcwd(), selected_rate_dictionary_subfolder)
    
    # Find .txt files in the selected directory
    rate_dictionary_files = find_text_files(rate_dictionary_subfolder_path)
    if rate_dictionary_files:
        print('Select one or more rate dictionary files from the following list:')
        selected_files = prompt_files(rate_dictionary_files, "rate dictionary")
        if selected_files:
            return selected_files, rate_dictionary_subfolder_path  # Return the selected files and their path
        else:
            print("No rate dictionary files selected.")
            return [], None
    else:
        print(f"No .txt rate dictionary files found in '{selected_rate_dictionary_subfolder}'.")
        return [], None

def select_existing_xlsx_file():
    print("Select a directory to search for .xlsx files:")
    subfolders = list_subfolders()
    selected_subfolder = choose_subdirectory(subfolders)

    # Check if the user selected a subfolder or the current directory
    folder_path = os.getcwd() if selected_subfolder is None else os.path.join(os.getcwd(), selected_subfolder)

    # Find .xlsx files in the selected directory
    xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
    if xlsx_files:
        print("Select an existing .xlsx file from the following list:")
        for i, file in enumerate(xlsx_files, start=1):
            print(f"{i}. {file}")
        while True:
            try:
                choice = int(input("Enter the number of the file you want to select or 0 to cancel: "))
                if 0 <= choice <= len(xlsx_files):
                    return None if choice == 0 else os.path.join(folder_path, xlsx_files[choice - 1])
                else:
                    print("Invalid selection. Please try again.")
            except ValueError:
                print("Please enter a number.")
    else:
        print(f"No .xlsx files found in '{folder_path}'.")
        return None

# Utility function to clean file names
def clean_file_name(file_name):
    """
    Clean the file name for display, including replacing specific patterns.
    """
    file_name = file_name.replace('_', '').replace('corrected', '').replace('stemmed', '')
    if 'Démonomanie' in file_name:
        file_name = file_name.replace('Démonomanie', 'Dém')
    if 'République' in file_name:
        file_name = file_name.replace('République', 'Rép')
   
    # Replace '911' with '11' and '910' with '10' (NEW CHANGE)
    file_name = file_name.replace('911', '11').replace('910', '10')  # <--- CHANGE HERE
   
    return os.path.splitext(file_name)[0]

# Function to process the subset of text files for KWIC and counts
def process_subset_files(file_paths):
    combined_text = ""
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            file_contents = f.read().lower()
            combined_text += file_contents + " "
    tokens = nltk.wordpunct_tokenize(combined_text)
    return tokens

def get_kwic(sometargetterm, somelistofwords, window=10, excl_target=True, source_file=None):
    kwics = []
    for n, w in enumerate(somelistofwords):
        if w == sometargetterm:
            start = max(0, n - window)
            end = min(n + window + 1, len(somelistofwords))
            if excl_target:
                # Updated: Exclude keyword itself from the window
                k = [word for word in (somelistofwords[start:n] + somelistofwords[n + 1:end]) if word != sometargetterm]
            else:
                k = somelistofwords[start:end]
            kwics.append((k, source_file))
    return kwics
   

def add_to_count_dict(word, count_dict):
    if word in count_dict:
        count_dict[word] += 1
    else:
        count_dict[word] = 1

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    r = someratedict[someword]
    wc = sum(somecountdict.values())
    a = somecountdict[someword]
    b = wc - a
    c = round(r * wc)
    d = wc - c
    p = fisher_exact([[a, b], [c, d]], alternative=alternative)[1]
    return p

def filter_collocates_with_removal(collocates, collocate_counts):
    """
    Allow the user to filter the list of significant collocates and remove any selected collocates in error.

    Parameters:
        collocates (list): A sorted list of significant collocates.
        collocate_counts (dict): A dictionary of collocates and their counts.

    Returns:
        list: A filtered list of collocates selected by the user.
    """
    while True:
        try:
            min_count = int(input("Enter the minimum count threshold for collocates to include: ").strip())
            break
        except ValueError:
            print("Invalid input. Please enter a valid number.")

    # Filter collocates by minimum count
    filtered_collocates = [collocate for collocate in collocates if collocate_counts[collocate] >= min_count]
    print(f"\nCollocates with counts >= {min_count}:")
    for i, collocate in enumerate(filtered_collocates, start=1):
        print(f"{i}. {collocate} (Count: {collocate_counts[collocate]})")

    print("\nYou can select collocates by entering:")
    print("- A single number (e.g., 3) to select one collocate.")
    print("- A range of numbers (e.g., 3-6) to select multiple collocates.")
    print("- Multiple selections separated by commas (e.g., 3,5-7,9).")
    print("- Type 'all' to select all collocates.")
    print("- Type 'done' to finalize your selection.")

    selected_collocates = []

    while True:
        selection = input("Enter your selection: ").strip()
        if selection.lower() == 'done':
            break
        elif selection.lower() == 'all':
            selected_collocates = filtered_collocates
            break

        try:
            parts = selection.split(',')
            for part in parts:
                part = part.strip()
                if '-' in part:  # Handle ranges
                    start, end = map(int, part.split('-'))
                    selected_collocates.extend(filtered_collocates[start-1:end])
                elif part.isdigit():  # Handle single numbers
                    selected_collocates.append(filtered_collocates[int(part) - 1])
                else:
                    print(f"Invalid selection: {part}. Please try again.")
        except (ValueError, IndexError):
            print(f"Invalid input: {selection}. Please try again.")

        # Remove duplicates and sort the selected collocates
        selected_collocates = sorted(set(selected_collocates), key=filtered_collocates.index)

        print("Currently selected collocates:")
        for collocate in selected_collocates:
            print(collocate)

    # Allow users to review and remove collocates selected in error
    while True:
        print("\nFinalized collocates:")
        for i, collocate in enumerate(selected_collocates, start=1):
            print(f"{i}. {collocate}")

        remove_error = input(
            "Would you like to remove any collocates selected in error? (yes/no): "
        ).strip().lower()
        if remove_error == 'yes':
            remove_selection = input(
                "Enter the numbers of the collocates to remove (e.g., 2,4-5): "
            ).strip()
            try:
                parts = remove_selection.split(',')
                to_remove = []
                for part in parts:
                    part = part.strip()
                    if '-' in part:  # Handle ranges
                        start, end = map(int, part.split('-'))
                        to_remove.extend(selected_collocates[start-1:end])
                    elif part.isdigit():  # Handle single numbers
                        to_remove.append(selected_collocates[int(part) - 1])
                    else:
                        print(f"Invalid selection: {part}. Please try again.")
                selected_collocates = [
                    collocate for collocate in selected_collocates
                    if collocate not in to_remove
                ]
            except (ValueError, IndexError):
                print(f"Invalid input: {remove_selection}. Please try again.")
        else:
            break

    return selected_collocates, min_count

def search_concordance(text_data, predefined_word_lists, stops, alpha):
    window = int(input(f"Enter the window size for concordance: ").strip())

    subset_files = prompt_files(find_text_files(os.getcwd()), "KWIC and key word counts")

    append_to_existing = input("Do you want to append results to an existing .xlsx file? (yes/no): ").strip().lower()

    if append_to_existing == 'yes':
        existing_file = select_existing_xlsx_file()
        if existing_file:
            wb = load_workbook(existing_file)
            print(f"Appending to existing file: {existing_file}")
        else:
            print("No existing workbook selected. Creating a new workbook instead.")
            wb = Workbook()
            if 'Sheet' in wb.sheetnames:
                del wb['Sheet']
    else:
        wb = Workbook()
        print("Creating a new workbook.")
        if 'Sheet' in wb.sheetnames:
            del wb['Sheet']

    if append_to_existing == 'yes' and existing_file:
        output_filepath = existing_file
    else:
        output_filename = input("Enter a unique name for the .xlsx file (without extension): ").strip()
        if not output_filename:
            output_filename = "distinct_collocates"
        output_filepath = os.path.join("concordances", f"{output_filename}.xlsx")

    # Initialize the dictionary here
    unique_keyword_collocate_pairs_by_file = {}

    for index, predefined_words in enumerate(predefined_word_lists):
        skip = input(f"Do you want to skip processing Hypothesis {index + 1}? (yes/no): ").strip().lower()
        if skip == 'yes':
            print(f"Skipping Hypothesis {index + 1}.")
            continue

        ws = wb.create_sheet(title=f"Hypothesis {index + 1}")

        headers = ['Word'] + [clean_file_name(file) for file in subset_files] + ['Total']
        ws.append(headers)

        predefined_words = sorted(predefined_words)

        # Generate keyword pairs from predefined words
        keyword_pairs = generate_keyword_pairs(predefined_words)

        all_significant_collocates = set()
        collocate_counts = {}
        counts_by_file = {}

        for file in subset_files:
            file_path = os.path.join(os.getcwd(), file)
            subset_tokens = process_subset_files([file_path])
            counts = {}

            unique_keyword_collocate_pairs_per_file = {}

            for word in predefined_words:
                kwics = get_kwic(word, subset_tokens, window, source_file=file)
            
                if word not in unique_keyword_collocate_pairs_per_file:
                    unique_keyword_collocate_pairs_per_file[word] = set()
            
                for k, source_file in kwics:
                    unique_collocates = set(k)  # Ensure uniqueness
                    for collocate in unique_collocates:
                        if collocate not in stops and collocate != word:  # Exclude stopwords and the keyword itself
                            add_to_count_dict(collocate, counts)
                            unique_keyword_collocate_pairs_per_file[word].add(collocate)

                            # Check if the collocate is a keyword and form pairs
                            if collocate in predefined_words and collocate != word:
                                pair = (word, collocate)
                                if pair in keyword_pairs:
                                    add_to_count_dict(pair, counts)

            counts_by_file[file] = counts
            unique_keyword_collocate_pairs_by_file[file] = unique_keyword_collocate_pairs_per_file

        total_wc = sum(count for counts in counts_by_file.values() for count in counts.values())
        rates = {word: count / total_wc for counts in counts_by_file.values() for word, count in counts.items()}

        for file, counts in counts_by_file.items():
            for word, count in sorted(counts.items(), key=lambda item: item[1], reverse=True):
                if word not in stops:
                    p_value = get_fishers(word, counts, rates, alternative='greater')
                    if p_value < alpha:
                        all_significant_collocates.add(word)
                        collocate_counts[word] = collocate_counts.get(word, 0) + count

        print(f"\nPredefined words for Hypothesis {index + 1}: {', '.join(predefined_words)}")
        all_significant_collocates = sorted(all_significant_collocates)
        selected_collocates, min_count = filter_collocates_with_removal(all_significant_collocates, collocate_counts)

        for word in predefined_words:
            row = [word]
            word_total = 0
            for file in subset_files:
                unique_collocates_for_word = unique_keyword_collocate_pairs_by_file[file].get(word, set())
                # Filter the unique collocates to only include the selected collocates
                filtered_collocates_for_word = [collocate for collocate in unique_collocates_for_word if collocate in selected_collocates]
        
                #print(f"Keyword: {word}, File: {file}, Filtered Unique Collocates: {filtered_collocates_for_word}")
        
                # Count the filtered collocates
                count = len(filtered_collocates_for_word)
                row.append(count)
                word_total += count
            row.append(word_total)
            ws.append(row)


        total_row = ['Total'] + [
            sum(ws.cell(row=i + 2, column=j + 2).value or 0 for i in range(len(predefined_words)))
            for j in range(len(subset_files))
        ] + [
            sum(ws.cell(row=i + 2, column=len(subset_files) + 2).value or 0 for i in range(len(predefined_words)))
        ]
        ws.append(total_row)
        
        
        # Include generated pairs in the list of selected collocates
        pair_collocates = [
            f"{pair[0]} + {pair[1]}" for pair in keyword_pairs
            if pair[1] in selected_collocates
        ]

        # Write selected significant collocates, including pairs, to the sheet
        ws.append([])
        ws.append(['Selected Significant Collocates (Including Keyword Pairs):'])
        for i in range(0, len(selected_collocates), 10):
            ws.append(selected_collocates[i:i + 10])
        
        # Write keyword pairs separately for reference
        if pair_collocates:
            ws.append([])
            ws.append(['Keyword Pairs (Target + Significant Collocate):'])
            for i in range(0, len(pair_collocates), 5):  # Group pairs for better readability
                ws.append(pair_collocates[i:i + 5])

        ws.append([])
        ws.append(['p-value threshold:', alpha])
        ws.append(['window size:', window])
        ws.append(['minimum count threshold:', min_count])

        output_now = input(f"Do you want to save the results for Hypothesis {index + 1} now? (yes/no): ").strip().lower()
        if output_now == 'yes':
            if 'concordances' not in os.listdir():
                os.mkdir('concordances')
            wb.save(output_filepath)
            print(f"Results up to Hypothesis {index + 1} saved to {output_filepath}.")

        exit_now = input("Do you want to exit after this hypothesis? (yes/no): ").strip().lower()
        if exit_now == 'yes':
            print("Exiting the program.")
            return

    if 'Sheet' in wb.sheetnames:
        del wb['Sheet']

    if 'concordances' not in os.listdir():
        os.mkdir('concordances')
    wb.save(output_filepath)
    print(f'Concordance has been saved to {output_filepath}.')
    
# Example usage
use_predefined = input("Do you want to use predefined target word lists (yes/no)? ").strip().lower() == 'yes'
if use_predefined:
    predefined_word_lists = get_predefined_target_words()
else:
    predefined_word_lists = [input("Enter words for a group separated by spaces: ").strip().split() for _ in range(5)]

alpha = float(input("Enter the value for alpha: ").strip())

stopwords_file, stopwords_path = select_stopwords_file()
if stopwords_file:
    stops = read_stopwords(os.path.join(stopwords_path, stopwords_file))
    rate_dictionary_files, rate_dictionary_path = select_rate_dictionary_files()
    if rate_dictionary_files:
        search_concordance(rate_dictionary_files, predefined_word_lists, stops, alpha)
    else:
        print("No rate dictionary files selected.")
else:
    print("No stopwords file selected.") 

In [None]:
his code seems to be creating the pair_collocates list only if the second item in each pair is in the selected collocates list. But this approach does not work because the keywords are excluded from the list of collocates users can select. The desired outcome is that pair_collocates is generated if the second item matches an item in all_significant_collocates

In [2]:
import os
import csv
import nltk
from scipy.stats import fisher_exact
from openpyxl import Workbook, load_workbook

# Function to prompt the user to select text files based on patterns
def prompt_pattern_files(text_files, pattern):
    selected_files = [file for file in text_files if file.startswith(pattern)]
    return selected_files

def prompt_files(text_files, purpose):
    text_files = sorted(text_files, key=custom_file_sort_key)  # Custom sort for files
    print(f"Select the files for {purpose}:")
    for i, file in enumerate(text_files, start=1):
        print(f"{i}. {file}")
    
    selection = input("Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files: ").strip()
    selected_files = []
    
    if selection.lower() == 'all':
        selected_files = text_files
    else:
        # Split the input by commas to handle multiple ranges or numbers
        parts = selection.split(',')
        for part in parts:
            part = part.strip()
            if '-' in part:  # If the part is a range
                try:
                    start, end = map(int, part.split('-'))
                    selected_files.extend(text_files[start-1:end])
                except ValueError:
                    print(f"Invalid range: {part}. Please provide ranges like '1-3'.")
            elif part.isdigit():  # If the part is a single number
                try:
                    selected_files.append(text_files[int(part) - 1])
                except IndexError:
                    print(f"Invalid number: {part}. Please select numbers from the list.")
            else:  # If the part is treated as a pattern
                selected_files.extend(prompt_pattern_files(text_files, part))
    
    # Remove duplicates and sort the selected files
    selected_files = sorted(set(selected_files), key=custom_file_sort_key)
    
    print("Selected files:")
    for file in selected_files:
        print(file)
    
    return selected_files

# Custom sort key for file names
def custom_file_sort_key(filename):
    # Prioritize 'preface' higher than patterns like 'I.1'
    if 'preface' in filename.lower():
        return ('', filename.lower())  # Sort 'preface' first
    return (filename.lower(),)

# Function to process text files
def process_text_files(file_paths):
    combined_text = ""
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            combined_text += file.read().lower() + " "
    tokens = nltk.wordpunct_tokenize(combined_text)
    return nltk.Text(tokens)

# Function to find .txt files in a directory
def find_text_files(directory):
    return sorted([f for f in os.listdir(directory) if f.endswith('.txt')], key=custom_file_sort_key)  # Custom sort

# Function to list subfolders in the current directory
def list_subfolders():
    return sorted([f.name for f in os.scandir() if f.is_dir()])  # Sort folders alphabetically

# Function to prompt the user to select a subfolder or the current directory
def prompt_subfolder(subfolders):
    print("Select a subfolder or the current working directory:")
    print("0. Current Working Directory")
    for i, subfolder in enumerate(subfolders, start=1):
        print(f"{i}. {subfolder}")
    selected_index = int(input("Enter the number of the subfolder: "))
    return None if selected_index == 0 else subfolders[selected_index - 1]

# Function to get predefined target words
def get_predefined_target_words():
    return [
        ['citoyen', 'cour', 'domain', 'ressort'],  # List 1
        ['guerre', 'paix', 'police' 'religion'],  # List 2
        ['confess', 'demon', 'demoniaqu', 'diabl',
         'diabol', 'dieu', 'divin', 
        'hebrieu', 'impiet', 'preuv', 'question',   'sathan', 
        'sorceller', 'sorci',  'statut', 'sujet'],  # List 3
        ['arrest',  'conseil', 'conseiller', 'consul', 
         'couron', 'édict', 'iurisdict', 'jug', 'magistrat',
         'offic', 'offici', 'ordon', 'parlement',
        'seigneur', 'seigneurial', 'statut'],  # List 4
        ['absolu', 'bien', 'chos', 'civil', 'droit', 'estat', 'just', 'justic',
         'loi', 'maiest', 'princ', 'puissanc',
        'republ', 'roy', 'royal', 'royaum', 'souverain', 'souverainet', 'sujet']  # List 5
    ]

# Function to choose subdirectory for stopwords csv file
def choose_subdirectory(subdirectories):
    print("Select a subdirectory:")
    print("0. Current Working Directory")
    for i, subdir in enumerate(subdirectories, start=1):
        print(f"{i}. {subdir}")
    while True:
        try:
            choice = int(input("Enter your choice: "))
            if 0 <= choice <= len(subdirectories):
                return None if choice == 0 else subdirectories[choice - 1]
            else:
                print("Invalid selection. Please try again.")
        except ValueError:
            print("Please enter a number.")

# Function to read stopwords from a csv file
def read_stopwords(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        stopwords = []
        for row in reader:
            for word in row:
                stopwords.extend(word.split(','))
        return [word.strip() for word in stopwords]

# Function to find .csv files in a directory
def find_csv_files(directory):
    return sorted([f for f in os.listdir(directory) if f.endswith('.csv')])  # Sort files alphabetically

# Function to select files for the stopwords
def select_stopwords_file():
    print('Stopwords file selection')
    stopwords_subfolders = list_subfolders()
    selected_stopwords_subfolder = choose_subdirectory(stopwords_subfolders)
    
    # Check if the user selected a subfolder or the current directory
    stopwords_subfolder_path = os.getcwd() if selected_stopwords_subfolder is None else os.path.join(os.getcwd(), selected_stopwords_subfolder)
    
    # Find .csv files in the selected directory
    stopwords_files = find_csv_files(stopwords_subfolder_path)
    if stopwords_files:
        print('Select a stopwords file from the following list:')
        # Prompt the user to select a single .csv file
        selected_file = prompt_files(stopwords_files, "stopwords file")
        if selected_file:
            return selected_file[0], stopwords_subfolder_path  # Return the first selected file and its path
        else:
            print("No stopwords file selected.")
            return None, None
    else:
        print(f"No .csv stopwords files found in '{selected_stopwords_subfolder}'.")
        return None, None

# Function to select files for the rate dictionary
def select_rate_dictionary_files():
    print('Rate dictionary file selection')
    rate_dictionary_subfolders = list_subfolders()
    selected_rate_dictionary_subfolder = choose_subdirectory(rate_dictionary_subfolders)
    
    # Check if the user selected a subfolder or the current directory
    rate_dictionary_subfolder_path = os.getcwd() if selected_rate_dictionary_subfolder is None else os.path.join(os.getcwd(), selected_rate_dictionary_subfolder)
    
    # Find .txt files in the selected directory
    rate_dictionary_files = find_text_files(rate_dictionary_subfolder_path)
    if rate_dictionary_files:
        print('Select one or more rate dictionary files from the following list:')
        selected_files = prompt_files(rate_dictionary_files, "rate dictionary")
        if selected_files:
            return selected_files, rate_dictionary_subfolder_path  # Return the selected files and their path
        else:
            print("No rate dictionary files selected.")
            return [], None
    else:
        print(f"No .txt rate dictionary files found in '{selected_rate_dictionary_subfolder}'.")
        return [], None

def select_existing_xlsx_file():
    print("Select a directory to search for .xlsx files:")
    subfolders = list_subfolders()
    selected_subfolder = choose_subdirectory(subfolders)

    # Check if the user selected a subfolder or the current directory
    folder_path = os.getcwd() if selected_subfolder is None else os.path.join(os.getcwd(), selected_subfolder)

    # Find .xlsx files in the selected directory
    xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
    if xlsx_files:
        print("Select an existing .xlsx file from the following list:")
        for i, file in enumerate(xlsx_files, start=1):
            print(f"{i}. {file}")
        while True:
            try:
                choice = int(input("Enter the number of the file you want to select or 0 to cancel: "))
                if 0 <= choice <= len(xlsx_files):
                    return None if choice == 0 else os.path.join(folder_path, xlsx_files[choice - 1])
                else:
                    print("Invalid selection. Please try again.")
            except ValueError:
                print("Please enter a number.")
    else:
        print(f"No .xlsx files found in '{folder_path}'.")
        return None

# Utility function to clean file names
def clean_file_name(file_name):
    """
    Clean the file name for display, including replacing specific patterns.
    """
    file_name = file_name.replace('_', '').replace('corrected', '').replace('stemmed', '')
    if 'Démonomanie' in file_name:
        file_name = file_name.replace('Démonomanie', 'Dém')
    if 'République' in file_name:
        file_name = file_name.replace('République', 'Rép')
   
    # Replace '911' with '11' and '910' with '10' (NEW CHANGE)
    file_name = file_name.replace('911', '11').replace('910', '10')  # <--- CHANGE HERE
   
    return os.path.splitext(file_name)[0]

# Function to process the subset of text files for KWIC and counts
def process_subset_files(file_paths):
    combined_text = ""
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            file_contents = f.read().lower()
            combined_text += file_contents + " "
    tokens = nltk.wordpunct_tokenize(combined_text)
    return tokens

def get_kwic(sometargetterm, somelistofwords, window=10, excl_target=True, source_file=None):
    kwics = []
    for n, w in enumerate(somelistofwords):
        if w == sometargetterm:
            start = max(0, n - window)
            end = min(n + window + 1, len(somelistofwords))
            if excl_target:
                k = somelistofwords[start:n] + somelistofwords[n + 1:end]
            else:
                k = somelistofwords[start:end]
            kwics.append((k, source_file))
    return kwics

def add_to_count_dict(word, count_dict):
    if word in count_dict:
        count_dict[word] += 1
    else:
        count_dict[word] = 1

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    r = someratedict[someword]
    wc = sum(somecountdict.values())
    a = somecountdict[someword]
    b = wc - a
    c = round(r * wc)
    d = wc - c
    p = fisher_exact([[a, b], [c, d]], alternative=alternative)[1]
    return p

# Modified filter_collocates function
def filter_collocates(collocates, collocate_counts):
    """
    Filter the list of significant collocates based on user selection.

    Parameters:
        collocates (list): A sorted list of significant collocates.
        collocate_counts (dict): A dictionary of collocates and their counts.

    Returns:
        list: A filtered list of collocates selected by the user.
    """
    while True:
        try:
            min_count = int(input("Enter the minimum count threshold for collocates to include: ").strip())
            break
        except ValueError:
            print("Invalid input. Please enter a valid number.")

    filtered_collocates = [collocate for collocate in collocates if collocate_counts[collocate] >= min_count]
    print(f"\nCollocates with counts >= {min_count}:")
    for i, collocate in enumerate(filtered_collocates, start=1):
        print(f"{i}. {collocate} (Count: {collocate_counts[collocate]})")

    print("\nYou can select collocates by entering:")
    print("- A single number (e.g., 3) to select one collocate.")
    print("- A range of numbers (e.g., 3-6) to select multiple collocates.")
    print("- Multiple selections separated by commas (e.g., 3,5-7,9).")
    print("- Type 'all' to select all collocates.")
    print("- Type 'done' to finalize your selection.")

    selected_collocates = []

    while True:
        selection = input("Enter your selection: ").strip()
        if selection.lower() == 'done':
            break
        elif selection.lower() == 'all':
            selected_collocates = filtered_collocates
            break

        try:
            parts = selection.split(',')
            for part in parts:
                part = part.strip()
                if '-' in part:  # Handle ranges
                    start, end = map(int, part.split('-'))
                    selected_collocates.extend(filtered_collocates[start-1:end])
                elif part.isdigit():  # Handle single numbers
                    selected_collocates.append(filtered_collocates[int(part) - 1])
                else:
                    print(f"Invalid selection: {part}. Please try again.")
        except (ValueError, IndexError):
            print(f"Invalid input: {selection}. Please try again.")

        # Remove duplicates and sort the selected collocates
        selected_collocates = sorted(set(selected_collocates), key=filtered_collocates.index)

        print("Currently selected collocates:")
        for collocate in selected_collocates:
            print(collocate)

    return selected_collocates, min_count

def search_concordance(text_data, predefined_word_lists, stops, alpha):
    window = int(input(f"Enter the window size for concordance: ").strip())

    subset_files = prompt_files(find_text_files(os.getcwd()), "KWIC and key word counts")

    # Allow users to choose an existing .xlsx file or create a new one
    append_to_existing = input("Do you want to append results to an existing .xlsx file? (yes/no): ").strip().lower()

    if append_to_existing == 'yes':
        existing_file = select_existing_xlsx_file()
        if existing_file:
            wb = load_workbook(existing_file)
            print(f"Appending to existing file: {existing_file}")
        else:
            print("No existing workbook selected. Creating a new workbook instead.")
            wb = Workbook()
    else:
        wb = Workbook()
        print("Creating a new workbook.")

    # Prompt the user for a unique name to save the .xlsx file
    output_filename = input("Enter a unique name for the .xlsx file (without extension): ").strip()
    if not output_filename:
        output_filename = "distinct_collocates"  # Default name
    output_filepath = os.path.join("concordances", f"{output_filename}.xlsx")

    for index, predefined_words in enumerate(predefined_word_lists):
        # Allow users to skip processing a predefined word list
        skip = input(f"Do you want to skip processing Hypothesis {index + 1}? (yes/no): ").strip().lower()
        if skip == 'yes':
            print(f"Skipping Hypothesis {index + 1}.")
            continue

        ws = wb.create_sheet(title=f"Hypothesis {index + 1}")

        # Write headers
        headers = ['Word'] + [clean_file_name(file) for file in subset_files] + ['Total']
        ws.append(headers)

        # Sort rows (words) alphabetically
        predefined_words = sorted(predefined_words)

        all_significant_collocates = set()
        collocate_counts = {}
        counts_by_file = {}
        
        # Process each file and each predefined word
        for file in subset_files:
            file_path = os.path.join(os.getcwd(), file)
            subset_tokens = process_subset_files([file_path])
            counts = {}
            
            for word in predefined_words:  # Iterate over each word
                kwics = get_kwic(word, subset_tokens, window, source_file=file)
                
                # Count occurrences of each word in the KWIC context
                for k, source_file in kwics:
                    for w in k:
                        if w not in stops:
                            add_to_count_dict(w, counts)
            
            counts_by_file[file] = counts
        
        # Calculate total word count across all files
        total_wc = sum(count for counts in counts_by_file.values() for count in counts.values())
        
        # Calculate rates for all words
        rates = {word: count / total_wc for counts in counts_by_file.values() for word, count in counts.items()}
        
        # Filter significant tokens using Fisher's exact test
        for file, counts in counts_by_file.items():
            for word, count in sorted(counts.items(), key=lambda item: item[1], reverse=True):
                if word not in stops:
                    p_value = get_fishers(word, counts, rates, alternative='greater')
                    if p_value < alpha:
                        all_significant_collocates.add(word)
                        collocate_counts[word] = collocate_counts.get(word, 0) + count

        # Notify the user again before selecting collocates
        print(f"\nPredefined words for Hypothesis {index + 1}: {', '.join(predefined_words)}")
        all_significant_collocates = sorted(all_significant_collocates)
        selected_collocates, min_count = filter_collocates(all_significant_collocates, collocate_counts)

        for word in predefined_words:
            row = [word]
            word_total = 0
            for file in subset_files:
                file_path = os.path.join(os.getcwd(), file)
                tokens = process_subset_files([file_path])
                kwics = get_kwic(word, tokens, window)
                count = sum(1 for k in kwics if any(w in selected_collocates for w in k[0]))
                row.append(count)
                word_total += count
            row.append(word_total)
            ws.append(row)

        # Add total row
        total_row = ['Total'] + [sum(ws.cell(row=i + 2, column=j + 2).value for i in range(len(predefined_words))) for j in range(len(subset_files))] + [sum(ws.cell(row=i + 2, column=len(subset_files) + 2).value for i in range(len(predefined_words)))]
        ws.append(total_row)

        # Add blank line and selected collocates
        ws.append([])
        ws.append(['Selected Significant Collocates:'])
        for i in range(0, len(selected_collocates), 10):
            ws.append(selected_collocates[i:i + 10])
       
        # Add a blank line and display alpha (p-value threshold), window size, and minimum count threshold
        ws.append([])
        ws.append(['p-value threshold:', alpha])
        ws.append(['window size:', window])
        ws.append(['minimum count threshold:', min_count])

        # Option to output results after processing each list
        output_now = input(f"Do you want to save the results for Hypothesis {index + 1} now? (yes/no): ").strip().lower()
        if output_now == 'yes':
            if 'concordances' not in os.listdir():
                os.mkdir('concordances')
            wb.save(output_filepath)
            print(f"Results up to Hypothesis {index + 1} saved to {output_filepath}.")

        # Option to exit after processing each hypothesis
        exit_now = input("Do you want to exit after this hypothesis? (yes/no): ").strip().lower()
        if exit_now == 'yes':
            print("Exiting the program.")
            return

    # Remove the default sheet if it exists
    if 'Sheet' in wb.sheetnames:
        del wb['Sheet']

    # Save the workbook
    if 'concordances' not in os.listdir():
        os.mkdir('concordances')
    wb.save(output_filepath)
    print(f'Concordance has been saved to {output_filepath}.')
    
# Example usage
use_predefined = input("Do you want to use predefined target word lists (yes/no)? ").strip().lower() == 'yes'
if use_predefined:
    predefined_word_lists = get_predefined_target_words()
else:
    predefined_word_lists = [input("Enter words for a group separated by spaces: ").strip().split() for _ in range(5)]

alpha = float(input("Enter the value for alpha: ").strip())

stopwords_file, stopwords_path = select_stopwords_file()
if stopwords_file:
    stops = read_stopwords(os.path.join(stopwords_path, stopwords_file))
    rate_dictionary_files, rate_dictionary_path = select_rate_dictionary_files()
    if rate_dictionary_files:
        search_concordance(rate_dictionary_files, predefined_word_lists, stops, alpha)
    else:
        print("No rate dictionary files selected.")
else:
    print("No stopwords file selected.")

Do you want to use predefined target word lists (yes/no)?  yes
Enter the value for alpha:  0.10


Stopwords file selection
Select a subdirectory:
0. Current Working Directory
1. .ipynb_checkpoints
2. concordances


Enter your choice:  0


Select a stopwords file from the following list:
Select the files for stopwords file:
1. stop_words.csv


Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files:  1


Selected files:
stop_words.csv
Rate dictionary file selection
Select a subdirectory:
0. Current Working Directory
1. .ipynb_checkpoints
2. concordances


Enter your choice:  0


Select one or more rate dictionary files from the following list:
Select the files for rate dictionary:
1. Démonomanie preface Repair_corrected_stemmed.txt
2. République preface_corrected_stemmed.txt
3. Discours des raisons_corrected_stemmed.txt
4. Démonomanie I.1_corrected_stemmed.txt
5. Démonomanie I.2_corrected_stemmed.txt
6. Démonomanie I.3_corrected_stemmed.txt
7. Démonomanie I.4_corrected_stemmed.txt
8. Démonomanie I.5_corrected_stemmed.txt
9. Démonomanie I.6_corrected_stemmed.txt
10. Démonomanie I.7_corrected_stemmed.txt
11. Démonomanie II.1_corrected_stemmed.txt
12. Démonomanie II.2_corrected_stemmed.txt
13. Démonomanie II.3_corrected_stemmed.txt
14. Démonomanie II.4_corrected_stemmed.txt
15. Démonomanie II.5_corrected_stemmed.txt
16. Démonomanie II.6_corrected_stemmed.txt
17. Démonomanie II.7_corrected_stemmed.txt
18. Démonomanie II.8_corrected_stemmed.txt
19. Démonomanie III.1_corrected_stemmed.txt
20. Démonomanie III.2_corrected_stemmed.txt
21. Démonomanie III.3_corrected_st

Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files:  all


Selected files:
Démonomanie preface Repair_corrected_stemmed.txt
République preface_corrected_stemmed.txt
Discours des raisons_corrected_stemmed.txt
Démonomanie I.1_corrected_stemmed.txt
Démonomanie I.2_corrected_stemmed.txt
Démonomanie I.3_corrected_stemmed.txt
Démonomanie I.4_corrected_stemmed.txt
Démonomanie I.5_corrected_stemmed.txt
Démonomanie I.6_corrected_stemmed.txt
Démonomanie I.7_corrected_stemmed.txt
Démonomanie II.1_corrected_stemmed.txt
Démonomanie II.2_corrected_stemmed.txt
Démonomanie II.3_corrected_stemmed.txt
Démonomanie II.4_corrected_stemmed.txt
Démonomanie II.5_corrected_stemmed.txt
Démonomanie II.6_corrected_stemmed.txt
Démonomanie II.7_corrected_stemmed.txt
Démonomanie II.8_corrected_stemmed.txt
Démonomanie III.1_corrected_stemmed.txt
Démonomanie III.2_corrected_stemmed.txt
Démonomanie III.3_corrected_stemmed.txt
Démonomanie III.4_corrected_stemmed.txt
Démonomanie III.5_corrected_stemmed.txt
Démonomanie III.6_corrected_stemmed.txt
Démonomanie IV.1_corrected_stemme

Enter the window size for concordance:  15


Select the files for KWIC and key word counts:
1. Démonomanie preface Repair_corrected_stemmed.txt
2. République preface_corrected_stemmed.txt
3. Discours des raisons_corrected_stemmed.txt
4. Démonomanie I.1_corrected_stemmed.txt
5. Démonomanie I.2_corrected_stemmed.txt
6. Démonomanie I.3_corrected_stemmed.txt
7. Démonomanie I.4_corrected_stemmed.txt
8. Démonomanie I.5_corrected_stemmed.txt
9. Démonomanie I.6_corrected_stemmed.txt
10. Démonomanie I.7_corrected_stemmed.txt
11. Démonomanie II.1_corrected_stemmed.txt
12. Démonomanie II.2_corrected_stemmed.txt
13. Démonomanie II.3_corrected_stemmed.txt
14. Démonomanie II.4_corrected_stemmed.txt
15. Démonomanie II.5_corrected_stemmed.txt
16. Démonomanie II.6_corrected_stemmed.txt
17. Démonomanie II.7_corrected_stemmed.txt
18. Démonomanie II.8_corrected_stemmed.txt
19. Démonomanie III.1_corrected_stemmed.txt
20. Démonomanie III.2_corrected_stemmed.txt
21. Démonomanie III.3_corrected_stemmed.txt
22. Démonomanie III.4_corrected_stemmed.txt
23.

Enter the number of the file, a range (e.g., 1-3), multiple ranges (e.g., 1-3,5-7), a text pattern to select files, or type 'all' to select all files:  Dém,Rép


Selected files:
Démonomanie preface Repair_corrected_stemmed.txt
République preface_corrected_stemmed.txt
Démonomanie I.1_corrected_stemmed.txt
Démonomanie I.2_corrected_stemmed.txt
Démonomanie I.3_corrected_stemmed.txt
Démonomanie I.4_corrected_stemmed.txt
Démonomanie I.5_corrected_stemmed.txt
Démonomanie I.6_corrected_stemmed.txt
Démonomanie I.7_corrected_stemmed.txt
Démonomanie II.1_corrected_stemmed.txt
Démonomanie II.2_corrected_stemmed.txt
Démonomanie II.3_corrected_stemmed.txt
Démonomanie II.4_corrected_stemmed.txt
Démonomanie II.5_corrected_stemmed.txt
Démonomanie II.6_corrected_stemmed.txt
Démonomanie II.7_corrected_stemmed.txt
Démonomanie II.8_corrected_stemmed.txt
Démonomanie III.1_corrected_stemmed.txt
Démonomanie III.2_corrected_stemmed.txt
Démonomanie III.3_corrected_stemmed.txt
Démonomanie III.4_corrected_stemmed.txt
Démonomanie III.5_corrected_stemmed.txt
Démonomanie III.6_corrected_stemmed.txt
Démonomanie IV.1_corrected_stemmed.txt
Démonomanie IV.2_corrected_stemmed.tx

Do you want to append results to an existing .xlsx file? (yes/no):  no


Creating a new workbook.


Enter a unique name for the .xlsx file (without extension):  ss
Do you want to skip processing Hypothesis 1? (yes/no):  no



Predefined words for Hypothesis 1: citoyen, cour, domain, ressort


Enter the minimum count threshold for collocates to include:  2



Collocates with counts >= 2:
1. alli (Count: 7)
2. appel (Count: 5)
3. approch (Count: 6)
4. aristocrat (Count: 4)
5. autruy (Count: 4)
6. bien (Count: 24)
7. bon (Count: 4)
8. bourgeois (Count: 15)
9. certain (Count: 7)
10. charg (Count: 6)
11. choif (Count: 5)
12. cit (Count: 10)
13. citoyen (Count: 44)
14. command (Count: 8)
15. corp (Count: 5)
16. cour (Count: 11)
17. coustum (Count: 5)
18. definit (Count: 7)
19. deni (Count: 11)
20. derni (Count: 5)
21. different (Count: 8)
22. dir (Count: 10)
23. domain (Count: 6)
24. droit (Count: 13)
25. enfant (Count: 8)
26. entre (Count: 13)
27. esclav (Count: 7)
28. estat (Count: 28)
29. estrang (Count: 11)
30. estranger (Count: 12)
31. fix (Count: 4)
32. fort (Count: 20)
33. foy (Count: 4)
34. franc (Count: 4)
35. honneur (Count: 4)
36. ja (Count: 5)
37. jug (Count: 14)
38. latin (Count: 4)
39. lieu (Count: 4)
40. ligu (Count: 5)
41. loi (Count: 10)
42. magistrat (Count: 17)
43. marqu (Count: 4)
44. mer (Count: 4)
45. moin (Count: 12)
46. 

KeyboardInterrupt: Interrupted by user

In [None]:
modify this code so that it gives users the option to remove words from the selected significant collocates words list after finalizing the selection to allow the user to remove collocates selected in error. Also, please delete the blank sheet at the beginning of the .xlsx workbook. Finally, please ensure that the summed counts displayed in the .xlsx workbook are the counts of unique keyword-collocate pairs in each file, not the total number of occurences of any keyword-collocate pair. Thanks!