In [7]:
import os
import nltk
import csv

# Function to list subfolders in the current directory
def list_subfolders(path='.'):
    """Return the names of the directories directly inside *path*.

    Args:
        path: Directory to scan. Defaults to ``'.'``, the current working
            directory, which is what a call without arguments scans.

    Returns:
        A list of directory names (not full paths), in the order
        ``os.scandir`` yields them.
    """
    # The context manager closes the scandir handle deterministically, even
    # if is_dir() raises part-way through the listing.
    with os.scandir(path) as entries:
        return [entry.name for entry in entries if entry.is_dir()]

# Function to prompt the user to select a subfolder
def prompt_subfolder(subfolders):
    """Print a numbered menu of *subfolders* and return the entry the user picks.

    The prompt repeats until the answer is an integer between 1 and
    ``len(subfolders)``; a non-numeric answer and an out-of-range number each
    print their own message before the prompt is shown again.
    """
    for position, folder_name in enumerate(subfolders, start=1):
        print(f"{position}. {folder_name}")

    while True:
        try:
            number = int(input("Select a subfolder by number: "))
        except ValueError:
            print("Please enter a number.")
            continue
        if number < 1 or number > len(subfolders):
            print("Invalid selection. Please try again.")
            continue
        # Menu numbers are 1-based, list indexes are 0-based.
        return subfolders[number - 1]

# Function to find text files in a folder
def find_text_files(folder):
    """Return the names of the ``.txt`` files directly inside *folder*.

    Only regular files are returned: a directory whose name happens to end in
    ``.txt`` is skipped, since a returned name is meant to be opened as a file.

    Args:
        folder: Path of the directory to search (not searched recursively).

    Returns:
        A list of file names (not full paths), in ``os.listdir`` order.
    """
    return [
        name
        for name in os.listdir(folder)
        if name.endswith('.txt') and os.path.isfile(os.path.join(folder, name))
    ]

# Function to find csv files in a folder
def find_csv_files(folder):
    """Return the names of the ``.csv`` files directly inside *folder*.

    Only regular files are returned: a directory whose name happens to end in
    ``.csv`` is skipped, since a returned name is meant to be opened as a file.

    Args:
        folder: Path of the directory to search (not searched recursively).

    Returns:
        A list of file names (not full paths), in ``os.listdir`` order.
    """
    return [
        name
        for name in os.listdir(folder)
        if name.endswith('.csv') and os.path.isfile(os.path.join(folder, name))
    ]

# Function to prompt the user to select a text file
def prompt_text_file(text_files):
    """Print a numbered menu of *text_files* and return the entry the user picks.

    The prompt repeats until the answer is an integer between 1 and
    ``len(text_files)``. Non-numeric answers and out-of-range numbers are
    reported and the prompt is shown again.
    """
    for number, file_name in enumerate(text_files, start=1):
        print(f"{number}. {file_name}")

    while True:
        try:
            picked = int(input("Select a text file by number: "))
        except ValueError:
            print("Please enter a number.")
        else:
            if 1 <= picked <= len(text_files):
                # Menu numbers are 1-based, list indexes are 0-based.
                return text_files[picked - 1]
            print("Invalid selection. Please try again.")

# Function to process the selected text file
def process_text_file(file_path, encoding='utf-8'):
    """Read a text file and return its lower-cased tokens as an ``nltk.Text``.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. It is stated
            explicitly so that non-ASCII text decodes the same way on every
            platform instead of following the locale's default encoding.

    Returns:
        An ``nltk.Text`` built from ``nltk.wordpunct_tokenize`` applied to
        the lower-cased file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        file_contents = f.read().lower()
    # The file is no longer needed once its contents are in memory, so the
    # tokenizing happens after the handle is closed.
    tokens = nltk.wordpunct_tokenize(file_contents)
    return nltk.Text(tokens)

# Pick a subfolder of the working directory, then a text file inside it, and
# tokenize that file. `text_data` is only bound when a file was processed.
subfolders = list_subfolders()
if not subfolders:
    print("No subfolders found in the current directory.")
else:
    selected_subfolder = prompt_subfolder(subfolders)
    subfolder_path = os.path.join(os.getcwd(), selected_subfolder)
    text_files = find_text_files(subfolder_path)
    if not text_files:
        print(f"No text files found in '{selected_subfolder}'.")
    else:
        selected_file = prompt_text_file(text_files)
        full_file_path = os.path.join(subfolder_path, selected_file)
        text_data = process_text_file(full_file_path)
        print(f"Type of 'text': {type(text_data)}")

def get_kwic(sometargetterm, somelistofwords, window=10, excl_target=True):
    """Collect the context window around every occurrence of a target term.

    Args:
        sometargetterm: Word to look for (compared with ``==``).
        somelistofwords: Sequence of words to search.
        window: Number of words taken on each side of a match; the window is
            clipped at the start and end of the sequence.
        excl_target: When true, the matched word itself is left out of its
            window; otherwise the window is one contiguous slice.

    Returns:
        A list with one window (a slice of *somelistofwords*) per match, in
        order of occurrence.
    """
    total = len(somelistofwords)
    match_positions = (
        position
        for position, word in enumerate(somelistofwords)
        if word == sometargetterm
    )

    contexts = []
    for position in match_positions:
        left = max(0, position - window)
        right = min(position + window + 1, total)
        if not excl_target:
            contexts.append(somelistofwords[left:right])
            continue
        before = somelistofwords[left:position]
        after = somelistofwords[position + 1:right]
        contexts.append(before + after)
    return contexts

def add_to_count_dict(word, count_dict):
    """Increment the tally for *word* in *count_dict*, starting from 1 if absent.

    The mapping is modified in place; nothing is returned.
    """
    count_dict[word] = count_dict.get(word, 0) + 1

def _prompt_window_size(token):
    """Ask for the concordance window of *token* until a non-negative integer is entered."""
    while True:
        reply = input(f"Enter the window size for concordance for '{token}': ").strip()
        try:
            window = int(reply)
        except ValueError:
            print("Please enter a number.")
            continue
        if window >= 0:
            return window
        print("Window size cannot be negative. Please try again.")


def search_concordance(text_data, tokens, stopwords=None):
    """Write a collocate-count CSV for each target token.

    For every token the user is asked whether the target itself should be
    left out of its context windows (only the answer ``yes`` excludes it) and
    how many words of context to take on each side. The words found in those
    windows are counted and written, most frequent first, to
    ``concordances/<token>.csv`` with the columns ``Collocate``, ``Count``
    and ``Frequency``.

    ``Frequency`` is the collocate's count divided by the total number of
    words in all windows; stopwords are counted in that total even though
    they get no row of their own.

    Args:
        text_data: Object exposing the corpus words as a ``tokens`` sequence.
        tokens: Target terms to build concordances for.
        stopwords: Words to leave out of the CSV rows. When omitted, the
            module-level ``stops`` collection is used.
    """
    if stopwords is None:
        stopwords = stops
    # A set makes the per-row membership test O(1) instead of scanning a list.
    excluded = set(stopwords)

    directory = 'concordances'
    words = text_data.tokens

    for token in tokens:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)

        # NOTE(review): the token is used verbatim as a file name, so a term
        # containing a path separator would land outside `directory`.
        file_path = os.path.join(directory, f"{token}.csv")

        answer = input(f"Exclude target term '{token}' from concordance (yes/no)? ")
        excl_target = answer.strip().lower() == 'yes'
        window = _prompt_window_size(token)

        counts = {}
        for context in get_kwic(token, words, window, excl_target):
            for word in context:
                add_to_count_dict(word, counts)
        total_wc = sum(counts.values())
        # sorted() is stable, so equally frequent collocates keep the order
        # in which they were first seen.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

        # An explicit encoding keeps accented collocates writable regardless
        # of the platform's default encoding.
        with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Collocate', 'Count', 'Frequency']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(
                {'Collocate': word, 'Count': count, 'Frequency': count / total_wc}
                for word, count in ranked
                if word not in excluded
            )

        print(f'Concordance for the token "{token}" has been saved to {file_path}')

def choose_directory(directories):
    """Print a numbered menu of *directories* and return the entry the user picks.

    The prompt repeats until the answer is an integer between 1 and
    ``len(directories)``. Without that check, ``0`` or a negative number
    would silently select an entry counted from the end of the list, and any
    other bad answer would raise.

    Args:
        directories: Sequence of directory names to choose from.

    Returns:
        The selected element of *directories*.
    """
    print("Select a directory for the stopwords csv file:")
    for number, directory in enumerate(directories, start=1):
        print(f"{number}. {directory}")

    while True:
        try:
            choice = int(input("Enter your choice: "))
        except ValueError:
            print("Please enter a number.")
            continue
        if 1 <= choice <= len(directories):
            # Menu numbers are 1-based, list indexes are 0-based.
            return directories[choice - 1]
        print("Invalid selection. Please try again.")

def read_stopwords(filepath, encoding='latin-1'):
    """Read every stopword from a CSV file into a flat list.

    Each cell of each row is additionally split on commas, so a quoted cell
    such as ``"de,du"`` contributes two words. Surrounding whitespace is
    stripped, and blank entries (empty cells, trailing commas) are dropped
    so that the empty string never ends up in the stopword list.

    Args:
        filepath: Path of the CSV file.
        encoding: Text encoding of the file; defaults to ``latin-1``.

    Returns:
        A list of stopwords in file order (duplicates are kept).
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for files passed to csv.reader.
    with open(filepath, 'r', encoding=encoding, newline='') as file:
        stopwords = []
        for row in csv.reader(file):
            for cell in row:
                stopwords.extend(part.strip() for part in cell.split(','))
    return [word for word in stopwords if word]

# Example usage
# The concordance needs the tokenized text from the file-selection step,
# which leaves `text_data` unbound when no subfolder or text file was
# available. Check first, rather than failing with a NameError after the
# user has already answered every prompt.
if 'text_data' not in globals():
    print("No text has been loaded, so no concordance can be built.")
else:
    user_input = input("Enter words to find their concordance (separated by spaces): ").lower()
    # split() without an argument ignores repeated, leading and trailing
    # whitespace, so no empty search term is produced.
    tokens = user_input.split()

    # List subfolders and current directory
    directories = ['.'] + list_subfolders()
    selected_directory = choose_directory(directories)
    stopwords_files = find_csv_files(selected_directory)
    if stopwords_files:
        chosen_stopwords_file = prompt_text_file(stopwords_files)
        chosen_csv_file_path = os.path.join(selected_directory, chosen_stopwords_file)
        # search_concordance reads this module-level name when filtering rows.
        stops = read_stopwords(chosen_csv_file_path)
        search_concordance(text_data, tokens)
    else:
        print(f"No stopwords files found in '{selected_directory}'.")

1. tokenized
2. concordances
3. final
4. .ipynb_checkpoints


Select a subfolder by number:  1


1. Démonomanie Repair_corrected_underscore_bigrams.txt
2. Démonomanie Repair_corrected.txt


Select a text file by number:  2


Type of 'text': <class 'nltk.text.Text'>


Enter words to find their concordance (separated by spaces):  malade


Select a directory for the stopwords csv file:
1. .
2. tokenized
3. concordances
4. final
5. .ipynb_checkpoints


Enter your choice:  1


1. spellcheck_data.csv
2. stop_words.csv


Select a text file by number:  2
Exclude target term 'malade' from concordance (yes/no)?  no
Enter the window size for concordance for 'malade':  8


Concordance for the token "malade" has been saved to concordances/malade.csv
