In [1]:
import os
import csv
import nltk
from scipy.stats import fisher_exact
from openpyxl import Workbook

# Function to filter the available text files by a filename prefix
def prompt_pattern_files(text_files, pattern):
    """Return the entries of `text_files` whose names start with `pattern`.

    Nothing is prompted here: the caller supplies the pattern. The order of
    `text_files` is preserved and an empty list is returned when nothing
    matches.
    """
    matching = []
    for name in text_files:
        if name.startswith(pattern):
            matching.append(name)
    return matching

def _resolve_selection(text_files, selection):
    """Translate one non-empty line of user input into the files it designates.

    Returns a `(files, error)` pair: `files` is the list of selected names and
    `error` is a message for the user, or None when the input was usable.
    """
    count = len(text_files)

    # A single 1-based position in the displayed list.
    if selection.isdecimal():
        index = int(selection)
        if 1 <= index <= count:
            return [text_files[index - 1]], None
        return [], f"Invalid selection. Enter a number between 1 and {count}."

    # An inclusive 1-based range such as "1-3". Only treated as a range when
    # both sides are numbers, because filename prefixes may contain hyphens.
    first, dash, last = selection.partition('-')
    first, last = first.strip(), last.strip()
    if dash and first.isdecimal() and last.isdecimal():
        start, end = int(first), int(last)
        if 1 <= start <= end <= count:
            return text_files[start - 1:end], None
        return [], f"Invalid range. Use start-end with 1 <= start <= end <= {count}."

    # Anything else is a filename prefix.
    matches = prompt_pattern_files(text_files, selection)
    if not matches:
        return [], f"No files start with '{selection}'."
    return matches, None


def prompt_files(text_files, purpose):
    """Interactively build a selection from `text_files` and return it.

    The files are listed alphabetically with 1-based numbers. Each line the
    user enters is one of: a number, an inclusive range ("1-3"), a filename
    prefix, 'all' (select everything and finish) or 'done' (finish). Input
    that cannot be used is reported and ignored rather than raising, and a
    blank line selects nothing.

    Args:
        text_files: file names to choose from.
        purpose: short description shown in the prompt header.

    Returns:
        The selected names, sorted and without duplicates (possibly empty).
    """
    text_files = sorted(text_files)  # Sort files alphabetically
    selected_files = []
    while True:
        print(f"Select the text files for {purpose}:")
        for i, file in enumerate(text_files, start=1):
            print(f"{i}. {file}")

        selection = input("Enter the number of the file, a range (e.g., 1-3), a text pattern to select files, or type 'all' to select all files. Type 'done' to finish: ").strip()
        command = selection.lower()
        if command == 'done':
            break
        if command == 'all':
            selected_files = text_files
            break
        if not selection:
            # An empty prefix would match every file, so ask again instead.
            print("Nothing entered. Please make a selection or type 'done'.")
            continue

        chosen, error = _resolve_selection(text_files, selection)
        if error is not None:
            print(error)
            continue

        selected_files = sorted(set(selected_files) | set(chosen))  # Remove duplicates and sort
        print("Current selected files:")
        for file in selected_files:
            print(file)

    return selected_files

# Function to process text files
def process_text_files(file_paths):
    """Read the given files as UTF-8 and return their combined text as an `nltk.Text`.

    Each file's contents are lower-cased and followed by a single space so
    that words from adjacent files do not run together; the combined string
    is then split with `nltk.wordpunct_tokenize`.
    """
    pieces = []
    for path in file_paths:
        with open(path, 'r', encoding='utf-8') as handle:
            pieces.append(handle.read().lower())
    merged = "".join(piece + " " for piece in pieces)
    return nltk.Text(nltk.wordpunct_tokenize(merged))

# Function to find .txt files in a directory
def find_text_files(directory):
    """Return the sorted names of the entries in `directory` that end in '.txt'."""
    names = [entry for entry in os.listdir(directory) if entry.endswith('.txt')]
    names.sort()  # Sort files alphabetically
    return names

# Function to list subfolders in the current directory
def list_subfolders():
    """Return the sorted names of the directories found in the current working directory."""
    folder_names = []
    with os.scandir() as entries:
        for entry in entries:
            if entry.is_dir():
                folder_names.append(entry.name)
    folder_names.sort()  # Sort folders alphabetically
    return folder_names

# Function to prompt the user to select a subfolder or the current directory
def prompt_subfolder(subfolders):
    """Ask the user to pick one of `subfolders` or the current working directory.

    The options are printed with 0 standing for the current working directory
    and 1..len(subfolders) for the listed folders. The prompt repeats until a
    valid number is entered, so non-numeric, negative or too-large input no
    longer raises or silently picks an entry from the end of the list.

    Returns:
        None when 0 is chosen, otherwise the selected subfolder name.
    """
    print("Select a subfolder or the current working directory:")
    print("0. Current Working Directory")
    for i, subfolder in enumerate(subfolders, start=1):
        print(f"{i}. {subfolder}")
    while True:
        try:
            selected_index = int(input("Enter the number of the subfolder: "))
        except ValueError:
            print("Please enter a number.")
            continue
        if 0 <= selected_index <= len(subfolders):
            return None if selected_index == 0 else subfolders[selected_index - 1]
        print("Invalid selection. Please try again.")

# Function to get predefined target words
def get_predefined_target_words():
    """Return a new list containing the built-in target words, in their fixed order."""
    rows = (
        ('absolu', 'arrest', 'bien', 'chos', 'citoyen', 'conseil', 'conseiller', 'confess'),
        ('cour', 'couron', 'demon', 'demoniaqu', 'diabl', 'diabol', 'dieu', 'divin', 'domain'),
        ('droit', 'édict', 'estat', 'hebrieu', 'impiet', 'iurisdict', 'jug', 'just', 'justic', 'loi'),
        ('magistrat', 'maiest', 'offic', 'offici', 'ordon', 'parlement', 'preuv', 'princ'),
        ('puissanc', 'question', 'republ', 'ressort', 'roy', 'royal', 'royaum', 'sathan'),
        ('seigneur', 'seigneurial', 'sorceller', 'sorci', 'souverain', 'souverainet', 'statut', 'sujet'),
    )
    # Flatten into a fresh list so callers may mutate the result freely.
    return [word for row in rows for word in row]

# Function to choose subdirectory for stopwords csv file
def choose_subdirectory(subdirectories):
    """Ask the user where the stopwords csv file lives and return the choice.

    Prints 0 for the current working directory followed by the numbered
    `subdirectories`, then keeps prompting until an integer between 0 and
    len(subdirectories) is entered.

    Returns:
        None for option 0, otherwise the chosen subdirectory name.
    """
    print("Select a subdirectory for the stopwords csv file:")
    print("0. Current Working Directory")
    for number, name in enumerate(subdirectories, start=1):
        print(f"{number}. {name}")

    highest = len(subdirectories)
    while True:
        answer = input("Enter your choice: ")
        try:
            choice = int(answer)
        except ValueError:
            print("Please enter a number.")
            continue
        if choice < 0 or choice > highest:
            print("Invalid selection. Please try again.")
            continue
        if choice == 0:
            return None
        return subdirectories[choice - 1]

# Function to read stopwords from a csv file
def read_stopwords(filepath):
    """Read stopwords from a csv file and return them as a flat list of strings.

    Every cell of every row contributes its words; a cell that itself contains
    commas (e.g. a quoted "de,du") is split further. Surrounding whitespace is
    stripped and empty entries (blank cells, trailing commas) are dropped.

    The file is decoded as 'utf-8-sig' so that a byte-order mark, if present,
    does not end up glued to the first stopword; files without a BOM are read
    exactly as plain UTF-8. `newline=''` is what the csv module asks for when
    opening files for a reader.
    """
    stopwords = []
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as file:
        for row in csv.reader(file):
            for cell in row:
                stopwords.extend(part.strip() for part in cell.split(','))
    return [word for word in stopwords if word]

# Function to find .csv files in a directory
def find_csv_files(directory):
    """Return the sorted names of the entries in `directory` that end in '.csv'."""
    csv_names = (name for name in os.listdir(directory) if name.endswith('.csv'))
    return sorted(csv_names)  # Sort files alphabetically

# Function to process the subset of text files for KWIC and counts
def process_subset_files(file_paths):
    """Read the given files as UTF-8 and return their combined tokens as a list.

    Each file's contents are lower-cased and followed by a single space so
    that words from adjacent files do not run together; the combined string
    is then split with `nltk.wordpunct_tokenize`.

    The encoding is given explicitly so these files are decoded the same way
    as in `process_text_files`, instead of depending on the platform's
    default encoding.
    """
    pieces = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            pieces.append(f.read().lower())
    combined_text = "".join(piece + " " for piece in pieces)
    return nltk.wordpunct_tokenize(combined_text)

def get_kwic(sometargetterm, somelistofwords, window=10, excl_target=True, source_file=None):
    """Collect the context window around every occurrence of a target term.

    Args:
        sometargetterm: the word to look for (compared with ==).
        somelistofwords: the token list to scan.
        window: number of tokens taken on each side of a hit; the window is
            clipped at the start and end of the list.
        excl_target: when true, the hit itself is left out of its context.
        source_file: value stored alongside each context, unchanged.

    Returns:
        A list of `(context_tokens, source_file)` tuples, one per occurrence,
        in order of appearance.
    """
    total = len(somelistofwords)
    hit_positions = [i for i, word in enumerate(somelistofwords) if word == sometargetterm]

    kwics = []
    for position in hit_positions:
        left = max(0, position - window)
        right = min(position + window + 1, total)
        if excl_target:
            context = somelistofwords[left:position] + somelistofwords[position + 1:right]
        else:
            context = somelistofwords[left:right]
        kwics.append((context, source_file))
    return kwics

def add_to_count_dict(word, count_dict):
    """Increase the tally for `word` in `count_dict` by one, in place.

    A word that is not yet a key starts at 1. Nothing is returned.
    """
    count_dict[word] = count_dict.get(word, 0) + 1

def get_fishers(someword, somecountdict, someratedict, alternative='greater'):
    """Return the Fisher exact test p-value for one word's count against a reference rate.

    The 2x2 table passed to `fisher_exact` is:
        row 1: the word's count in `somecountdict`, and all other counts;
        row 2: the count expected at the word's rate in `someratedict`
               (rate times the total, rounded), and the remainder.

    Args:
        someword: key present in both dictionaries.
        somecountdict: mapping of word -> observed count.
        someratedict: mapping of word -> reference rate.
        alternative: forwarded to `fisher_exact`.
    """
    reference_rate = someratedict[someword]
    total = sum(somecountdict.values())
    observed = somecountdict[someword]
    expected = round(reference_rate * total)
    table = [
        [observed, total - observed],
        [expected, total - expected],
    ]
    result = fisher_exact(table, alternative=alternative)
    return result[1]

def search_concordance(text_data, tokens, stops, alpha, suffix):
    """Write each target token's per-file distinctive collocates to an .xlsx workbook.

    The user is asked whether to exclude the target from its context, for the
    window size, and for the subset of files to analyse. For every token one
    worksheet is created; for every selected file the words inside the KWIC
    windows of that token are counted. A word is written to the sheet when it
    is not a stopword and the one-sided Fisher test of its count in that file
    against its rate pooled over all selected files gives p < alpha.

    Reads the module-level names `text_files` (files offered for selection)
    and `subfolder_path` (directory containing them).

    Args:
        text_data: accepted for compatibility with existing callers; not used.
        tokens: target words, one worksheet each.
        stops: stopwords to leave out of the results.
        alpha: significance threshold for the p-value.
        suffix: appended to the output file name.
    """
    if not tokens:
        # A workbook must keep at least one sheet, so there is nothing to save.
        print("No target words were given; nothing to do.")
        return

    excl_target = input("Exclude target terms from concordance (yes/no)? ").strip().lower() == 'yes'
    while True:
        try:
            window = int(input("Enter the window size for concordance: ").strip())
        except ValueError:
            print("Please enter a whole number.")
            continue
        if window >= 0:
            break
        print("The window size cannot be negative.")

    subset_files = prompt_files(text_files, "KWIC and key word counts")

    # Tokenize every file once instead of once per target token.
    tokens_by_file = {
        file: process_subset_files([os.path.join(subfolder_path, file)])
        for file in subset_files
    }
    stop_set = set(stops)  # constant-time membership tests in the inner loop

    wb = Workbook()
    default_sheet = wb.active  # the empty sheet every new workbook starts with

    for token in tokens:
        ws = wb.create_sheet(title=token)

        ws.append(['Source File', 'Collocate', 'Count', 'Frequency', 'obs/exp', 'p-value'])

        counts_by_file = {}
        for file, file_tokens in tokens_by_file.items():
            counts = {}
            for context, _ in get_kwic(token, file_tokens, window, excl_target, source_file=file):
                for w in context:
                    add_to_count_dict(w, counts)
            counts_by_file[file] = counts

        # Pool the counts over all files so that each word's reference rate
        # reflects every file it occurs in, not just the last one seen.
        pooled_counts = {}
        for counts in counts_by_file.values():
            for word, count in counts.items():
                pooled_counts[word] = pooled_counts.get(word, 0) + count
        total_wc = sum(pooled_counts.values())
        rates = {word: count / total_wc for word, count in pooled_counts.items()}

        for file, counts in counts_by_file.items():
            file_wc = sum(counts.values())
            for word, count in sorted(counts.items(), key=lambda item: item[1], reverse=True):
                if word in stop_set:
                    continue
                p_value = get_fishers(word, counts, rates)
                if p_value < alpha:
                    # Share of all collocate tokens pooled across the selected files.
                    frequency = count / total_wc
                    # Count expected in this file at the pooled rate; this is
                    # the same expectation the Fisher test compares against.
                    exp = rates[word] * file_wc
                    obs_exp = count / exp
                    ws.append([file, word, count, frequency, obs_exp, p_value])

    os.makedirs('concordances', exist_ok=True)

    # Remove the first default sheet named 'Sheet'
    wb.remove(default_sheet)

    filename = f"most_distinct_collocates_{suffix}.xlsx"
    wb.save(os.path.join('concordances', filename))
    print(f'Concordance has been saved to concordances/{filename}')

# Choose the corpus folder. The current working directory is always offered as
# option 0, so the prompt is shown even when there are no subfolders.
subfolders = list_subfolders()
selected_subfolder = prompt_subfolder(subfolders)
subfolder_path = os.getcwd() if selected_subfolder is None else os.path.join(os.getcwd(), selected_subfolder)
text_files = find_text_files(subfolder_path)

if not text_files:
    # Without a corpus none of the later steps can run, so stop here instead
    # of failing later on undefined names.
    print(f"No text files found in '{subfolder_path}'.")
else:
    selected_files = prompt_files(text_files, "global word counts")
    full_file_paths = [os.path.join(subfolder_path, file) for file in selected_files]
    text_data = process_text_files(full_file_paths)

    # Example usage
    use_predefined = input("Do you want to use a predefined list of target words (yes/no)? ").strip().lower() == 'yes'
    if use_predefined:
        tokens = get_predefined_target_words()
    else:
        user_input = input("Enter words to find their collocate concordances (separated by spaces): ").lower()
        # split() without an argument ignores repeated spaces, so no empty tokens are produced
        tokens = user_input.split()

    while True:
        try:
            alpha = float(input("Enter the value for alpha: ").strip())
            break
        except ValueError:
            print("Please enter a number, e.g. 0.05.")
    suffix = input("Enter a suffix for the .xlsx filename: ").strip()

    # Choose the folder holding the stopwords csv file; option 0 is again the
    # current working directory.
    stopwords_subfolders = list_subfolders()
    selected_stopwords_subfolder = choose_subdirectory(stopwords_subfolders)
    stopwords_subfolder_path = os.getcwd() if selected_stopwords_subfolder is None else os.path.join(os.getcwd(), selected_stopwords_subfolder)
    stopwords_files = find_csv_files(stopwords_subfolder_path)
    if stopwords_files:
        print('Select a stopwords file for the rate dictionary:')
        chosen_stopwords_files = prompt_files(stopwords_files, "the rate dictionary")
        if chosen_stopwords_files:
            chosen_stopwords_file = chosen_stopwords_files[0]  # Select the first file from the list
            chosen_csv_file_path = os.path.join(stopwords_subfolder_path, chosen_stopwords_file)
            stops = read_stopwords(chosen_csv_file_path)
            search_concordance(text_data, tokens, stops, alpha, suffix)
        else:
            print("No stopwords file was selected.")
    else:
        print(f"No stopwords files found in '{stopwords_subfolder_path}'.")

Select a subfolder or the current working directory:
0. Current Working Directory
1. .ipynb_checkpoints
2. concordances


Enter the number of the subfolder:  0


Select the text files for global word counts:
1. Discours des raisons_corrected_stemmed.txt
2. Démonomanie I.1_corrected_stemmed.txt
3. Démonomanie I.2_corrected_stemmed.txt
4. Démonomanie I.3_corrected_stemmed.txt
5. Démonomanie I.4_corrected_stemmed.txt
6. Démonomanie I.5_corrected_stemmed.txt
7. Démonomanie I.6_corrected_stemmed.txt
8. Démonomanie I.7_corrected_stemmed.txt
9. Démonomanie II.1_corrected_stemmed.txt
10. Démonomanie II.2_corrected_stemmed.txt
11. Démonomanie II.3_corrected_stemmed.txt
12. Démonomanie II.4_corrected_stemmed.txt
13. Démonomanie II.5_corrected_stemmed.txt
14. Démonomanie II.6_corrected_stemmed.txt
15. Démonomanie II.7_corrected_stemmed.txt
16. Démonomanie II.8_corrected_stemmed.txt
17. Démonomanie III.1_corrected_stemmed.txt
18. Démonomanie III.2_corrected_stemmed.txt
19. Démonomanie III.3_corrected_stemmed.txt
20. Démonomanie III.4_corrected_stemmed.txt
21. Démonomanie III.5_corrected_stemmed.txt
22. Démonomanie III.6_corrected_stemmed.txt
23. Démonomani

Enter the number of the file, a range (e.g., 1-3), a text pattern to select files, or type 'all' to select all files. Type 'done' to finish:  all
Do you want to use a predefined list of target words (yes/no)?  yes
Enter the value for alpha:  0.10
Enter a suffix for the .xlsx filename:  20_window


Select a subdirectory for the stopwords csv file:
0. Current Working Directory
1. .ipynb_checkpoints
2. concordances


Enter your choice:  0


Select a stopwords file for the rate dictionary:
Select the text files for the rate dictionary:
1. stop_words.csv


Enter the number of the file, a range (e.g., 1-3), a text pattern to select files, or type 'all' to select all files. Type 'done' to finish:  all
Exclude target terms from concordance (yes/no)?  no
Enter the window size for concordance:  20


Select the text files for KWIC and key word counts:
1. Discours des raisons_corrected_stemmed.txt
2. Démonomanie I.1_corrected_stemmed.txt
3. Démonomanie I.2_corrected_stemmed.txt
4. Démonomanie I.3_corrected_stemmed.txt
5. Démonomanie I.4_corrected_stemmed.txt
6. Démonomanie I.5_corrected_stemmed.txt
7. Démonomanie I.6_corrected_stemmed.txt
8. Démonomanie I.7_corrected_stemmed.txt
9. Démonomanie II.1_corrected_stemmed.txt
10. Démonomanie II.2_corrected_stemmed.txt
11. Démonomanie II.3_corrected_stemmed.txt
12. Démonomanie II.4_corrected_stemmed.txt
13. Démonomanie II.5_corrected_stemmed.txt
14. Démonomanie II.6_corrected_stemmed.txt
15. Démonomanie II.7_corrected_stemmed.txt
16. Démonomanie II.8_corrected_stemmed.txt
17. Démonomanie III.1_corrected_stemmed.txt
18. Démonomanie III.2_corrected_stemmed.txt
19. Démonomanie III.3_corrected_stemmed.txt
20. Démonomanie III.4_corrected_stemmed.txt
21. Démonomanie III.5_corrected_stemmed.txt
22. Démonomanie III.6_corrected_stemmed.txt
23. Démo

Enter the number of the file, a range (e.g., 1-3), a text pattern to select files, or type 'all' to select all files. Type 'done' to finish:  Dém


Current selected files:
Démonomanie I.1_corrected_stemmed.txt
Démonomanie I.2_corrected_stemmed.txt
Démonomanie I.3_corrected_stemmed.txt
Démonomanie I.4_corrected_stemmed.txt
Démonomanie I.5_corrected_stemmed.txt
Démonomanie I.6_corrected_stemmed.txt
Démonomanie I.7_corrected_stemmed.txt
Démonomanie II.1_corrected_stemmed.txt
Démonomanie II.2_corrected_stemmed.txt
Démonomanie II.3_corrected_stemmed.txt
Démonomanie II.4_corrected_stemmed.txt
Démonomanie II.5_corrected_stemmed.txt
Démonomanie II.6_corrected_stemmed.txt
Démonomanie II.7_corrected_stemmed.txt
Démonomanie II.8_corrected_stemmed.txt
Démonomanie III.1_corrected_stemmed.txt
Démonomanie III.2_corrected_stemmed.txt
Démonomanie III.3_corrected_stemmed.txt
Démonomanie III.4_corrected_stemmed.txt
Démonomanie III.5_corrected_stemmed.txt
Démonomanie III.6_corrected_stemmed.txt
Démonomanie IV.1_corrected_stemmed.txt
Démonomanie IV.2_corrected_stemmed.txt
Démonomanie IV.3_corrected_stemmed.txt
Démonomanie IV.4_corrected_stemmed.txt
Dé

Enter the number of the file, a range (e.g., 1-3), a text pattern to select files, or type 'all' to select all files. Type 'done' to finish:  Rép


Current selected files:
Démonomanie I.1_corrected_stemmed.txt
Démonomanie I.2_corrected_stemmed.txt
Démonomanie I.3_corrected_stemmed.txt
Démonomanie I.4_corrected_stemmed.txt
Démonomanie I.5_corrected_stemmed.txt
Démonomanie I.6_corrected_stemmed.txt
Démonomanie I.7_corrected_stemmed.txt
Démonomanie II.1_corrected_stemmed.txt
Démonomanie II.2_corrected_stemmed.txt
Démonomanie II.3_corrected_stemmed.txt
Démonomanie II.4_corrected_stemmed.txt
Démonomanie II.5_corrected_stemmed.txt
Démonomanie II.6_corrected_stemmed.txt
Démonomanie II.7_corrected_stemmed.txt
Démonomanie II.8_corrected_stemmed.txt
Démonomanie III.1_corrected_stemmed.txt
Démonomanie III.2_corrected_stemmed.txt
Démonomanie III.3_corrected_stemmed.txt
Démonomanie III.4_corrected_stemmed.txt
Démonomanie III.5_corrected_stemmed.txt
Démonomanie III.6_corrected_stemmed.txt
Démonomanie IV.1_corrected_stemmed.txt
Démonomanie IV.2_corrected_stemmed.txt
Démonomanie IV.3_corrected_stemmed.txt
Démonomanie IV.4_corrected_stemmed.txt
Dé

Enter the number of the file, a range (e.g., 1-3), a text pattern to select files, or type 'all' to select all files. Type 'done' to finish:  done


Concordance has been saved to concordances/most_distinct_collocates_20_window.xlsx
