In [3]:
#command prompt steps to activate coreNLP
    #cd C:\Users\james\Downloads\stanford-corenlp-4.5.6
    #java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
import nltk
from nltk.tokenize import sent_tokenize
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
import re
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget
import sys
import os
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
import glob
import io
import numpy as np
import requests
import itertools
import pandas as pd
import shutil
parser = CoreNLPParser(url='http://localhost:9000')

# SENTENCE_TREE CLASS

In [4]:
class sentence_tree:
    """
    The sentence_tree class is a collection of functions and variables to perform operations and collect 
    statistics on a sentence respectively
    """

    def __init__(self, sentence):
        """
        Parse a sentence with the module-level CoreNLP ``parser`` and cache the
        derived tree representations together with the label groups used by
        the filtering methods.

        INPUT:
            sentence - sentence string to process
        """
        # Label groups consulted by the NP filtering methods.
        self.Noun_List = ["NN","NNS","NNP","NNPS"]  # tags treated as nouns
        self.Noun_Phrases = ['NP','NP-TMP','WHNP']  # labels treated as noun phrases
        self.Prepositional_Phrase = ["PP"]  # prepositional phrase label
        self.Sentence = ["S"]  # clause label
        self.preposition_or_subordinating_conjunction = ["IN"]  # preposition / subordinating conjunction tag
        self.unnecessary_fluff = ["SBAR","TO", "S"]  # labels stripped out of an NP before it is judged

        self.sentence = sentence
        # raw_parse yields parses; only the first one is kept.
        self.tree = next(parser.raw_parse(self.sentence))
        # Plain (label, [children]) tuple form of the nltk tree.
        self.parse_tuples = self.tree_to_tuples(self.tree)
        # One-element list so the running word position survives the recursion.
        self.position_in_sentence = [0]
        self.indexed_tuples = self.create_indexed_tuples(self.parse_tuples, self.position_in_sentence)
        # Filled by prune_to_np_branches: a one-element list holding the list of NP trees.
        self.prune_tree = [[]]

    

    def tree_to_tuples(self, tree):
        """
            Convert an nltk.Tree into nested (label, [children]) tuples

            INPUT:
                tree - nltk.Tree, or a leaf value found inside one
            OUTPUT:
                (label, list of converted children) for an nltk.Tree;
                any other value is returned unchanged
        """
        if not isinstance(tree, nltk.Tree):
            # Leaves (the words) are passed through as they are.
            return tree
        label = tree.label()
        converted_children = []
        for subtree in tree:
            converted_children.append(self.tree_to_tuples(subtree))
        return (label, converted_children)

    def flatten_list(self, input_list):
        flattened = []
        for item in input_list:
            if isinstance(item, list):
                flattened.extend(self.flatten_list(item))  # Recursively flatten the list
            else:
                flattened.append(item)  # Append the string or non-list item directly
        return flattened

    def is_sublist(self, sub_list, main_list, keep = False):
        len_sub = len(sub_list)
        if not keep:
            if len_sub == len(main_list):
                return False
        for i in range(len(main_list) - len_sub + 1):
            if main_list[i:i + len_sub] == sub_list:
                return True
        return False
    
    def create_indexed_tuples(self, tree, position):
        """
        Create a tuple tree that replaces the word in the leaves with a list [word, position in sentence]
        
        INPUT:
            tree - tuple tree
            position - list to hold value of the position in string through recursions
        """
        if isinstance(tree, tuple):
            label, children = tree
            new_children = []
            for child in children:
                new_children.append(self.create_indexed_tuples(child, position))
            return (label, new_children)
        else:
            position[0] += 1
            return ({'word' : tree, 'position' : position[0]})
    
    def prune_to_np_branches(self, tree):
        """
        Function wrapper to call prune_general specifically to collect NPs
        
        INPUT:
            tree - tuple tree
        OUTPUT:
            assignment to self.prune_tree of the list of NP strings
        """
        
        self.prune_tree = [[]]
        self.prune_general(tree, self.prune_tree, PoS = ['NP','NP-TMP','WHNP'])
        
    def prune_general(self, tree, tree2, PoS = ['NP','NP-TMP','WHNP']):
        """
        General case function for collecting the trees assosciated with a part of speech, PoS, for the tuple tree
        in tree and collecting them in tree 2
        INPUT:
            tree  - a tuple tree
            tree2 - a list of a list of tuple trees containing part of speech being searched for. 
                    It is a list so that the list of tuples can be appended in every recursive call.  
            PoS   - the part of speech that is being collected in tree2.  The default is Noun Phrase (NP)
        OUTPUT:
            modifies tree2 to store on the parts of speech specified by PoS for the sentence (in the form of a list of tuple trees) in tree
            The return value is None this signifies the last tuple (string, list) has been reached
        
        """
        if isinstance(tree, tuple): #Check that tree is a tuple tree and not a list
            label, children = tree #extracts the string and list from the tuple 
            """
            print("label:",label)
            print("children:",children)
            print()
            """
            if label in PoS:
                # If it's a node matching the part of speech save it to tree2
                tree2[0].append(tree) 
            if isinstance(children, list):#check if children is a path (list) and not a string
                # Recursively check children; keep those leading to an NP
                """
                for child in children:
                    print("child: ", child)
                    print()
                """
                for child in children:
                    np_children = self.prune_general(child, tree2, PoS = self.Noun_Phrases) #process each path in the tree
        return None
    
    def Does_Tree_Have_Target(self, tree, target = ['NP','NP-TMP','WHNP']): 
        """
        Check if a tree has the target part of speech (Either Posessive or NP)
        INPUT:
        tree  - tuple tree being scanned
        target - part of speech being targeted, default NP
        """
        recursion_for_target = False
        if isinstance(tree, tuple): #check if tree is in (string label, path list) form
            label, children = tree
            if label in target: #doesnt have to be NP, can be varialbe that can be assigned either as NP or POS
                return True
            else:
                for ii, child in enumerate(children):
                    if child[0] == target:
                        return True
                    else:
                        if self.Does_Tree_Have_Target(child, target):
                            recursion_for_target = True
        if recursion_for_target:
            return True
        else:
            return False
    
    """
    def Trees_without_NPs(self, tree):
        self.prune_tree = []
        Trees_without_NPs_List = []
        self.prune_to_np_branches(tree)
        for p_tree in self.prune_tree:
            if not self.Does_Tree_Have_Target(("Root",p_tree[1])):
                Trees_without_NPs_List.append(p_tree)
        return Trees_without_NPs_List
    """
    
    
    
    def tree_to_string_no_labels(self, tree, is_indexed = False):
        """
        Convert a tuple tree to the sentence string it represents
        
        INPUT:
            tree - tuple tree
        OUTPUT:
            this function is recursive so it returns a tuple tree if it is still processing a tree,
            otherwise it returns a string with all of the substrings appended together
        """
        # Check if the tree is a tuple with a tag and a list of subtrees or leaves
        if isinstance(tree, tuple): #and len(tree) == 2:
            _, branches = tree
            # Recursively process branches to get strings without labels
            return " ".join([self.tree_to_string_no_labels(branch, is_indexed) for branch in branches]).strip()
        # If the tree is a list, it's a list of subtrees or leaves
        elif isinstance(tree, list):
            return " ".join([self.tree_to_string_no_labels(leaf, is_indexed) for leaf in tree]).strip()
        # Base case: the tree is a leaf (a word)
        else:
            if is_indexed:
                return tree['word']
            else:
                return tree

    ''' DEPRICATED   
    def tree_to_string_no_labels_part_2(self, tree_list):
        """
        Convert each tree in the list to a string, excluding all grammatical labels
        
        INPUT:
            tree_list - list of tuple strings to process
        """
        tree_strings_no_labels = [self.tree_to_string_no_labels(tree) for tree in tree_list]
        return tree_strings_no_labels
    '''
    '''
    def tree_to_string_no_labels(self, tree):
        """
        Covert a tuple tree to the sentence string it represents
        
        INPUT:
            tree - tuple tree
        OUTPUT:
            this function is recursive so it returns a tuple tree if it is still processing a tree,
            otherwise it returns a string with all of the substrings appended together
        """
        # Check if the tree is a tuple with a tag and a list of subtrees or leaves
        if isinstance(tree, tuple) and len(tree) == 2:
            _, branches = tree
            # Recursively process branches to get strings without labels
            return " ".join([self.tree_to_string_no_labels(branch) for branch in branches]).strip()
        # If the tree is a list, it's a list of subtrees or leaves
        elif isinstance(tree, list):
            return " ".join([self.tree_to_string_no_labels(leaf) for leaf in tree]).strip()
        # Base case: the tree is a leaf (a word)
        else:
            return tree
    '''
    def tree_to_list_of_strings(self, tree, is_indexed = False):
        """
        Convert a tree of a list of strings without grammatical labels
        
        INPUT:
            tree - tuple tree
            is_index - is the tuple tree standard (False) or have indexing (True)
        OUTPUT:
            list of the strings in the tuple tree
        """
        # Check if the tree is a tuple with a tag and a list of subtrees or leaves
        if isinstance(tree, tuple) and len(tree) == 2:
            _, branches = tree
            # Recursively process branches to get strings without labels
            return self.flatten_list([self.tree_to_list_of_strings(branch, is_indexed) for branch in branches])
        # If the tree is a list, it's a list of subtrees or leaves
        elif isinstance(tree, list):
            return [self.tree_to_list_of_stings(leaf, is_indexed) for leaf in tree]
        # Base case: the tree is a leaf (a word)
        else:
            if is_indexed:
                return tree['word']
            else:
                return tree

    
    def list_of_trees_to_list_of_strings(self, tree_list, is_indexed = False):
        """
        Convert each tree in a list to a string list, excluding all grammatical labels
        
        INPUT:
            list of trees
            
        OUPUT:
            list of string lists
        """
        tree_strings_no_labels = [self.tree_to_list_of_strings(tree, is_indexed) for tree in tree_list]
        return tree_strings_no_labels

    '''DEPRICATE
    def duplicate_NP_removal(self, tree, keep_substring = False):
        """
        remove strings that overlap with other strings
        
        INPUT:
            tree - tuple tree to be processed 
            keep_substring - Boolean switch to determine if substrings are kept or the containing string
                             default is False
        OUTPUT:
            list of NP tupple tree strings with overlapping strings removed
        """
        strings_with_NPs = self.tree_to_string_no_labels_part_2(tree)
        strings_with_NPs_True_Or_False_Array = [True] * len(strings_with_NPs)

        for idx,s in enumerate(strings_with_NPs):
            for idy,t in enumerate(strings_with_NPs[idx+1:]):
                if strings_with_NPs[idx] in strings_with_NPs[idy+1+idx]:
                    if keep_substring:
                        strings_with_NPs_True_Or_False_Array[idy+1+idx] = False
                    else:
                        strings_with_NPs_True_Or_False_Array[idx] = False
                elif strings_with_NPs[idy+1+idx] in strings_with_NPs[idx]:
                    if keep_substring:
                        strings_with_NPs_True_Or_False_Array[idx] = False
                    else:
                        strings_with_NPs_True_Or_False_Array[idy+1+idx] = False
        nps_without_duplicates = []
        for ii,noun_phrase in enumerate(tree):
            if strings_with_NPs_True_Or_False_Array[ii]:
                nps_without_duplicates.append(noun_phrase)
        return nps_without_duplicates
    '''
    
    def check_and_remove_PoS(self, tree, PoS = ["TO","SBAR", "S"]):
        """
        Remove trees with the chosen label and return modified tree
        
        INPUT:
           tree - tuple tree to check for PoS
           PoS - part of speech being removed from tuple tree
        OUTPUT:
            returns tuple tree with all subtree with PoS label removed
        """
        if isinstance(tree, tuple):
            # If the tuple contains 'POS', return None to signify removal
            if tree[0] in PoS:
                return None
            else:
                # Otherwise, recursively process and rebuild the tuple without 'POS'
                return (tree[0], [self.check_and_remove_PoS(sub_t, PoS) for sub_t in tree[1] if self.check_and_remove_PoS(sub_t, PoS) is not None])
        elif isinstance(tree, list):
            # For lists, filter through and only keep items that should not be removed
            return [self.check_and_remove_PoS(sub_t, PoS) for sub_t in tree if self.check_and_remove_PoS(sub_t, PoS) is not None]
        else:
            return tree

    def remove_pos_tuples(self, tree):
        """
        Function to check and remove tuples containing 'POS' and the smallest tree containing the POS (i.e. NP: Cinderella's)
        INPUT:
            tree - tuple tree to process
        OUTPUT:
            list of tuple trees with 'POS' removed
        """
        trees2compare = []
        trees_without_pos = []  # List to store trees without 'POS'

        # Separate trees with 'POS' and trees without 'POS'
        for idx, t in enumerate(tree):
            if self.Does_Tree_Have_Target(t, target=['POS']):
                trees2compare.append((idx, t))
            else:
                trees_without_pos.append(t)

        keep_trees = []
        if len(trees2compare) > 1:
            for idx, t1 in enumerate(trees2compare[:-1]):
                s1 = self.tree_to_list_of_strings(t1[1])
                for idy, t2 in enumerate(trees2compare[idx + 1:]):
                    s2 = self.tree_to_list_of_strings(t2[1])
                    if len(s1) < len(s2):
                        if self.is_sublist(s1, s2):
                            keep_trees.append(t2[0])
                    elif self.is_sublist(s2, s1):
                        keep_trees.append(t1[0])

        # Print keep_trees after identifying smallest trees containing 'POS'
        # print("Keep trees after sublist check:", keep_trees)

        # Add trees without 'POS' to the keep_trees list
        #keep_trees.extend([idx for idx, _ in enumerate(tree) if idx in keep_trees])

        # Print keep_trees after adding trees without 'POS'
        # print("Keep trees after adding trees without 'POS':", keep_trees)

        cleaned_trees = trees_without_pos
        for idx, t in enumerate(tree):
            if idx in keep_trees:
                cleaned_trees.append(t)

        # Print the final cleaned_trees
        #print("Final cleaned trees:", cleaned_trees)

        return cleaned_trees



        # Filter out any None values that may have resulted from removal
        #return [t for t in cleaned_tuples if t is not None]
    
    def case_for_complicated_NP_embeddings(self, tree_list):
        """
        Filter a list of NP tuple trees down to the ones that should be scanned
        for prenominal modifiers, then drop entries whose leaves repeat.
        
        INPUT:
            tree_list - list of tuple trees (non-tuple items are ignored)
        OUTPUT:
            one-element list; its first item is the list of trees that passed
            the checks, with later trees that have the same (tag, word) leaves
            as an earlier one removed
        """
        def check_for_NPs(tree_list, reduced_list):
            """
            Apply the keep/drop rules to every tree and append survivors
            
            INPUT:
                tree_list - list of np trees to examine
                reduced_list - one-element list wrapping the list of trees to keep
            OUTPUT:
                reduced_list[0] is extended in place.
                NOTE(review): there is no return statement, so the call
                evaluates to None even though the caller code below treats the
                result as a list - confirm the intended return value.
            """
            for tree in tree_list:
                if isinstance(tree, tuple):
                    keep_tree = True
                    count_of_NNs = self.count_tokens([tree], self.Noun_List)
                    if count_of_NNs > 1:
                        # Gather NP-labelled nodes found in the FIRST child of this tree only.
                        top_level_NPs = [[]] 
                        self.prune_general(tree[1][0], top_level_NPs)
                        # Number of NP-labelled nodes reachable from the children without
                        # passing through another NP (count_tokens does not descend into matches).
                        count_of_NPs = self.count_tokens(tree[1], self.Noun_Phrases)

                        if count_of_NPs > 1:
                            temp_list = []
                            for top_level_NP in top_level_NPs[0]:
                                # NOTE(review): count_tokens is given the tuple itself rather than
                                # [top_level_NP]; iterating a (label, children) tuple yields a str
                                # and a list, neither of which count_tokens counts, so this looks
                                # like it is always 0 and the branch below is never taken - confirm.
                                count_of_NNs = self.count_tokens(top_level_NP, self.Noun_List)
                                if count_of_NNs > 0:
                                    # NOTE(review): check_for_NPs returns None, so extend(result)
                                    # would raise TypeError if this line were ever reached.
                                    result = check_for_NPs([top_level_NP], reduced_list)  # Ensure this is a list
                                    temp_list.extend(result)  # Extend flattens the list
                            reduced_list[0].extend(temp_list)
                            keep_tree = False
                    # Strip SBAR/TO/S subtrees; doing so re-enables the tree even if the
                    # multi-NP rule above had just rejected it.
                    count_of_unnecessary_fluff = self.count_tokens([tree], self.unnecessary_fluff)
                    if count_of_unnecessary_fluff >= 1:
                        tree = self.check_and_remove_PoS(tree, self.unnecessary_fluff)
                        keep_tree = True
                    # Any remaining prepositional phrase rejects the tree.
                    count_of_PPs = self.count_tokens([tree], self.Prepositional_Phrase)
                    if count_of_PPs >= 1:
                        keep_tree = False
                    # NOTE(review): "S" is also in unnecessary_fluff and was removed above,
                    # so this check presumably never fires - confirm.
                    count_of_Ss = self.count_tokens([tree], self.Sentence)
                    if count_of_Ss >= 1:
                        keep_tree = False
                    # Any remaining IN-tagged word rejects the tree.
                    count_of_INs = self.count_tokens([tree], self.preposition_or_subordinating_conjunction)
                    if count_of_INs >= 1:
                        keep_tree = False
                    if keep_tree:
                        reduced_list[0].append(tree)

        NPs_that_pass_this_function = [[]]
        # NP_check is not used afterwards; the results arrive through the shared list.
        NP_check = check_for_NPs(tree_list, NPs_that_pass_this_function)

        # Remove duplicates based on leaves: the first tree with a given
        # sequence of (tag, word) leaves is kept, later ones are dropped.
        unique_nps = []
        seen_leaves = set()
        for np in NPs_that_pass_this_function[0]:
            leaves = tuple(self.extract_leaves(np))  # Convert leaves to a tuple for hashing
            if leaves not in seen_leaves:
                seen_leaves.add(leaves)
                unique_nps.append(np)
        
        NPs_that_pass_this_function[0] = unique_nps

        return NPs_that_pass_this_function
     
    def count_tokens(self, tree_list, tokens): #this can count tokens, can be used when just wanting to find certain POS such as DET, CONJ, PP etc.
        """
        Count the number of tokens occuring in the tuple tree list
        
        INPUT:
            tree_list - the tupple tree list to be processed
            tokens - the list of tokens in the tuple_tree to be counted
        OUTPUT:
            the number of times the token occurs in the tree list            
        """
        count = 0
        for item in tree_list:
            if isinstance(item, tuple):
                tag, value = item
                if tag in tokens:
                    count += 1  # Increment count for each 'token'
                elif isinstance(value, list):
                    count += self.count_tokens(value, tokens)  # Recurse into nested lists
        return count
    
    def handle_special_phrases(self, special_phrases, utterance):
        """
        Increase the prenominal modifier count if any of the specified special phrases
        are present in the utterance.

        Args:
            special_phrases (list): A list of strings representing special phrases.
            utterance (str): The utterance or sentence string.

        Returns:
            int: The additional prenominal modifier count to be added.
        """
        additional_count = 0
        for phrase in special_phrases:
            if phrase in utterance:
                additional_count += 1
                print(f"Prenominal Quantifier '{phrase}' found.")
                print("Prenominal No for Quantifier: 1")
        #print("Additional Count Number:",additional_count)
        return additional_count
    # Define the function to extract leaves from the tree (same as before)
    def extract_leaves(self, tree):
        leaves = []  # List to collect leaves

        def traverse(node):
            if isinstance(node, tuple) and len(node) > 1 and isinstance(node[1], list):
                if all(isinstance(item, tuple) for item in node[1]):
                    for item in node[1]:
                        traverse(item)
                else:
                    leaves.extend([(node[0], subitem) for subitem in node[1]])
            elif isinstance(node, list):
                for subnode in node:
                    traverse(subnode)

        traverse(tree)
        return leaves

# ALL FILES CODE

In [5]:
# Define the function to extract leaves from the tree (same as before)
def extract_leaves(tree):
    """
    Collect the (tag, word) pairs of a tuple tree in left-to-right order.

    INPUT:
        tree - tuple tree or list of tuple trees
    OUTPUT:
        list of (label, child) pairs taken from every node whose children are
        not all tuples; anything that is neither a list nor a tuple with a
        list in second place contributes nothing
    """
    leaves = []  # List to collect leaves

    def traverse(node):
        # The length check keeps empty or one-element tuples from raising
        # IndexError on node[1].
        if isinstance(node, tuple) and len(node) > 1 and isinstance(node[1], list):
            if all(isinstance(item, tuple) for item in node[1]):
                # Interior node: keep descending.
                for item in node[1]:
                    traverse(item)
            else:
                # At least one child is a word: pair every child with this tag.
                leaves.extend([(node[0], subitem) for subitem in node[1]])
        elif isinstance(node, list):
            for subnode in node:
                traverse(subnode)

    traverse(tree)
    return leaves

# Define the function to scan modifiers (same as before)
def scan_modifiers(leaves, special_phrases):
    """
    Count the prenominal modifiers in one phrase given as (tag, word) leaves.

    The last leaf is taken as the head and must carry a tag from All_Nouns,
    otherwise the result is 0.  The remaining leaves are walked from right to
    left and each one adds 1 unless its tag or word is excluded or the text
    from that leaf to the end of the phrase contains a special phrase.

    INPUT:
        leaves - list of (tag, word) pairs
        special_phrases - list of strings; a leaf is not counted when the
                          words from that leaf onward contain one of them
    OUTPUT:
        the modifier count (a conjunction directly before a noun subtracts 1)
    """
    if not leaves or leaves[-1][0] not in All_Nouns:
        # If the last POS is not a noun, return 0
        return 0
    
    # Set up POS and words to exclude from counting
    excluded_pos = ["CC", "TO", "IN", "POS", "CC", "RP"]
    excluded_words = Determiners_to_be_excluded_as_prenominals + Words_to_be_excluded_as_prenominals  # Include 'been' in the exclusions
    CC_pos = ["CC"]

    # Add the specific prenominal quantifier phrases to exclude
    Prenominal_Quantifier_Words = ['many', 'all', 'most', 'some', 'few', 'several', 'number', 'bit', "lot","bunch", "any", "one"]

    # Count modifiers that are not in the excluded lists
    modifier_count = 0

    # leaves[-2::-1] walks backwards from the second-to-last leaf, so at step ii
    # the current leaf is leaves[-ii-2] and the leaf to its right is leaves[-ii-1].
    for ii, (pos, word) in enumerate(leaves[-2::-1]):  # Exclude the last leaf from counting
        if pos not in excluded_pos and word not in excluded_words:
            # Check for prenominal quantifier phrases
            # NOTE(review): leaves[-ii-2] is the current leaf, so this compares the
            # current word with 'of' while also requiring it to be a quantifier word;
            # 'of' is not in that list, so the test looks unsatisfiable. Presumably
            # the following leaf (leaves[-ii-1]) was meant - confirm.
            if word in Prenominal_Quantifier_Words and ii + 1 < len(leaves) and leaves[-ii-2][1] == 'of':
                continue  # Skip the word if it is part of a prenominal quantifier phrase
            
            # Check if the current leaf contains a special phrase
            # (the text checked runs from the current leaf to the end of the phrase)
            leaf_phrase = ' '.join(leaf[1] for leaf in leaves[-ii - 2:])
            if any(special_phrase in leaf_phrase for special_phrase in special_phrases):
                continue  # Skip if it matches any special phrase
            
            # Increment the modifier count
            modifier_count += 1

        if pos in CC_pos:
            # A conjunction whose right-hand neighbour is a noun takes 1 off the count.
            if ii + 1 < len(leaves) and leaves[-ii-1][0] in All_Nouns:
                modifier_count -= 1
                continue
            # NOTE(review): leaves[-ii-2] is the current leaf, whose tag is "CC" in this
            # branch, so the comparison with "DT" cannot succeed and the break looks
            # unreachable. Presumably the leaf to the left was meant - confirm.
            elif ii + 2 < len(leaves) and leaves[-ii-2][0] == "DT":
                break

    return modifier_count

# Define lists of POS tags and specific words to be considered or excluded (same as before)
# Tags accepted by scan_modifiers as the head noun of a phrase.
All_Nouns = ["NN", "NNS", "NNP", "NNPS"]
# NOTE(review): the next three lists are not referenced anywhere in the visible
# cells - confirm whether later cells use them.
All_Adjectives = ["JJ", "JJR", "JJS", "PRP$"]
All_Adverbs = ["RB", "RBR", "RBS"]
Verbs_that_can_be_prenominal = ["VBN"]
# Words scan_modifiers never counts as a modifier (both lists are concatenated
# there; "around" appears in both).
Determiners_to_be_excluded_as_prenominals = ["a", "A", "an", "An", "the", "The", "that", "That", "these", "These", "those", "Those", "this", "This", "either", "Either", "around", "Around"]
Words_to_be_excluded_as_prenominals = ["been","around", "almost", "never","behind","above","whose","which"]
# Quantifier phrases: looked up in the whole utterance by handle_special_phrases
# and passed to scan_modifiers as the special phrases.
Prenominal_Quantifier_Phrases = ['a lot of', 'many of', 'all of', 'most of', 'some of', 'a few of', 'several of', 'a number of', 'a bit of', 'a bunch of', "any of", "one of"]

# Directory mapping (same as before)
# Define source directories and corresponding destination directories
# Maps each source folder to the folder that receives its processed copies.
# NOTE(review): these are absolute paths on one user's machine; the notebook
# will not find any files elsewhere unless they are edited.
directory_mapping = {
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\older adult cinderella individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\older adult cinderella individual sentences",
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\older adult hobby individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\older adult hobby individual sentences",
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\young adult cinderella individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\young adult cinderella individual sentences",
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\young adult hobby individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\young adult hobby individual sentences"
}

# Ensure destination directories exist (created on cell execution; existing
# folders are left alone because of exist_ok=True)
for dest in directory_mapping.values():
    os.makedirs(dest, exist_ok=True)

# Define the function to process Excel files
def _append_column_totals(ws):
    """Write a totals row below the data for each column from B onward that is summable."""
    last_row = ws.max_row
    for col in range(2, ws.max_column + 1):
        col_letter = get_column_letter(col)
        # The "Comments" column is free text and gets no total.
        if ws[f'{col_letter}1'].value == "Comments":
            continue
        # A column whose last row already holds a =SUM formula is left untouched.
        last_row_value = ws[f'{col_letter}{last_row}'].value
        if isinstance(last_row_value, str) and last_row_value.startswith("=SUM"):
            continue
        # Sum the numeric cells below the header; text such as 'N/A' is ignored.
        column_sum = 0
        for row in range(2, last_row + 1):
            cell_value = ws[f'{col_letter}{row}'].value
            if isinstance(cell_value, (int, float)):
                column_sum += cell_value
        ws[f'{col_letter}{last_row + 1}'] = column_sum


def _count_prenominal_modifiers(utterance):
    """Parse one utterance and return its prenominal modifier count, printing each counted phrase."""
    sentence_class = sentence_tree(utterance)
    special_phrase_count = sentence_class.handle_special_phrases(Prenominal_Quantifier_Phrases, utterance)
    sentence_class.prune_to_np_branches(sentence_class.parse_tuples)
    nps_without_POS = sentence_class.remove_pos_tuples(sentence_class.prune_tree[0])
    nps_factored_for_complicated_NP_embedding = sentence_class.case_for_complicated_NP_embeddings(nps_without_POS)
    total_modifiers = 0
    for tree in nps_factored_for_complicated_NP_embedding[0]:
        leaves = extract_leaves(tree)
        modifier_count = scan_modifiers(leaves, Prenominal_Quantifier_Phrases)
        total_modifiers += modifier_count
        if modifier_count > 0:  # Change to >0 when debugging
            print()
            print("Extracted Phrase:", leaves)
            print(f"Prenominal No: {modifier_count}")
    return total_modifiers + special_phrase_count


# Define the function to process Excel files
def process_excel_file(src_file_path, dest_file_path):
    """
    Add a 'Prenominal Modifiers' column to one workbook and save it to a new path.

    Steps:
      1. Load the source workbook, append a totals row and save it to
         dest_file_path.
      2. Re-read that file with pandas, insert the 'Prenominal Modifiers'
         column as the second column and fill it for every row whose
         'UTTERANCES' cell is non-blank text.
      3. Write the frame back and, with openpyxl, put a =SUM formula in
         column B of the last data row plus a ratio row beneath it.

    INPUT:
        src_file_path - path of the workbook to read
        dest_file_path - path the processed workbook is written to
    OUTPUT:
        None; the result is the file at dest_file_path.  Progress is printed.

    NOTE(review): the ratio formula divides by column G, which assumes the
    noun count sits in column G after the insert - confirm against the
    sheet layout.
    """
    modifier_column = 'Prenominal Modifiers'

    # Step 1: totals row, written to the destination so the source stays untouched.
    wb = load_workbook(src_file_path)
    ws = wb.active
    _append_column_totals(ws)
    wb.save(dest_file_path)

    # Step 2: compute the modifier count per utterance.
    data = pd.read_excel(dest_file_path)
    data.insert(1, modifier_column, '')

    for index, row in data.iterrows():
        utterance = row['UTTERANCES']
        # Skip blank cells (NaN) and anything that is not text, e.g. a number
        # in the totals row, instead of failing on .strip().
        if not isinstance(utterance, str) or utterance.strip() == '':
            continue

        # Remove transcription codes before parsing.
        utterance = utterance.replace("[+ 0subj]", "").replace("[+ gram]", "").replace("0det", "").replace("[e]", "")
        print(f"Utterance: {utterance}")

        total_modifiers = _count_prenominal_modifiers(utterance)
        print()
        print(f"Total No: {total_modifiers}")
        data.at[index, modifier_column] = total_modifiers

    # Rows that were skipped still hold '' and become 0.  The result is
    # assigned back to the frame; calling fillna(inplace=True) on the selected
    # column does not update the frame under pandas copy-on-write.
    data[modifier_column] = pd.to_numeric(data[modifier_column], errors='coerce').fillna(0)

    # Save the DataFrame back to Excel
    data.to_excel(dest_file_path, index=False)

    # Step 3: reload the workbook to reapply formulas (pandas writes values only).
    wb = load_workbook(dest_file_path)
    ws = wb.active
    last_row = len(data) + 1  # +1 for the header row
    ws[f'B{last_row}'] = f"=SUM(B2:B{last_row-1})"
    next_row = last_row + 1
    ws[f'A{next_row}'] = "Prenominal Modifiers/#Nouns"
    ws[f'B{next_row}'] = f"=B{last_row}/G{last_row}"
    wb.save(dest_file_path)

In [6]:
# Loop through each directory and process each file
# Process every .xlsx file of every source folder in directory_mapping and
# write the result under the same file name into the mapped destination folder.
for src_directory, dest_directory in directory_mapping.items():
    # Only files directly inside the source folder are matched (no subfolders).
    excel_files = glob.glob(os.path.join(src_directory, '*.xlsx'))
    for file_path in excel_files:
        file_name = os.path.basename(file_path)
        # File name without its extension, used for the progress line only.
        base_name = os.path.splitext(file_name)[0]
        dest_file_path = os.path.join(dest_directory, file_name)
        print(f"File: {base_name}")
        process_excel_file(file_path, dest_file_path)

print("All files processed successfully.")


File: 38o9ts7l.cqp.AS
Utterance: Cinderella is about a young girl who has some stepsisters .

Extracted Phrase: [('DT', 'a'), ('JJ', 'young'), ('NN', 'girl')]
Prenominal No: 1

Extracted Phrase: [('DT', 'some'), ('NNS', 'stepsisters')]
Prenominal No: 1

Total No: 2
Utterance: she is having to clean the house and do all the chores . 

Extracted Phrase: [('PDT', 'all'), ('DT', 'the'), ('NNS', 'chores')]
Prenominal No: 1

Total No: 1
Utterance: she is not included in a lot of things .
Prenominal Quantifier 'a lot of' found.
Prenominal No for Quantifier: 1

Total No: 1
Utterance: they are going to have a party .

Total No: 0
Utterance: a ball is going to happen .

Total No: 0
Utterance: all the stepsisters are invited to attend .

Extracted Phrase: [('PDT', 'all'), ('DT', 'the'), ('NNS', 'stepsisters')]
Prenominal No: 1

Total No: 1
Utterance: she has to help them find their clothes .

Extracted Phrase: [('PRP$', 'their'), ('NNS', 'clothes')]
Prenominal No: 1

Total No: 1
Utterance: the st

# Debugging One Sentence at a time

In [None]:
# Define the function to extract leaves from the tree (same as before)
def extract_leaves(tree):
    """
    Collect the (label, word) pairs sitting at the leaves of a tuple tree.

    INPUT:
        tree - a tuple tree ``(label, [children])`` or a list of such trees
    OUTPUT:
        list of ``(label, leaf)`` tuples in left-to-right order
    """
    def walk(node):
        # A bare list is treated as a sequence of sibling trees
        if isinstance(node, list):
            for member in node:
                yield from walk(member)
            return
        # Anything that is not a (label, list) pair contributes nothing
        if not (isinstance(node, tuple) and isinstance(node[1], list)):
            return
        label, children = node[0], node[1]
        if all(isinstance(child, tuple) for child in children):
            # Internal node: every child is itself a subtree, so keep descending
            for child in children:
                yield from walk(child)
        else:
            # Pre-terminal node: pair its label with each item it holds
            for child in children:
                yield (label, child)

    return list(walk(tree))

# Define the function to scan modifiers (same as before)
def scan_modifiers(leaves, special_phrases):
    """
    Count the leaves in front of a phrase-final noun that are treated as prenominal modifiers.

    INPUT:
        leaves          - list of (POS tag, word) tuples, e.g. the output of extract_leaves
        special_phrases - iterable of phrase strings; a leaf is not counted when the words
                          from that leaf to the end of the phrase contain any of them
    OUTPUT:
        integer modifier count. Returns 0 when `leaves` is empty or its last tag is not in
        All_Nouns. Because the conjunction branch decrements the counter, the value can
        drop below zero for some inputs.

    Relies on the module-level lists All_Nouns, Determiners_to_be_excluded_as_prenominals
    and Words_to_be_excluded_as_prenominals.
    """
    if not leaves or leaves[-1][0] not in All_Nouns:
        # If the last POS is not a noun, return 0
        return 0

    # Set up POS and words to exclude from counting
    excluded_pos = ["CC", "TO", "IN", "POS", "CC", "RP", "PDT"]  # "CC" is listed twice; harmless for membership tests
    excluded_words = Determiners_to_be_excluded_as_prenominals + Words_to_be_excluded_as_prenominals  # Include 'been' in the exclusions
    CC_pos = ["CC"]

    # Add the specific prenominal quantifier phrases to exclude
    Prenominal_Quantifier_Words = ['many', 'all', 'most', 'some', 'few', 'several', 'number', 'bit', "lot","bunch", "any", "one"]

    # Count modifiers that are not in the excluded lists
    modifier_count = 0

    # Walk backwards from the leaf just before the final noun to the first leaf.
    # With this slice, ii == 0 corresponds to leaves[-2], so the leaf being examined is
    # leaves[-ii-2] and the leaf that follows it (closer to the noun) is leaves[-ii-1].
    for ii, (pos, word) in enumerate(leaves[-2::-1]):  # Exclude the last leaf from counting
        if pos not in excluded_pos and word not in excluded_words:
            # Check for prenominal quantifier phrases
            # NOTE(review): leaves[-ii-2][1] is the current `word`, so this requires the word to be
            # both a quantifier word and 'of', which cannot happen with the list above; this
            # `continue` never fires as written. Confirm whether the following leaf
            # (leaves[-ii-1]) was the intended operand.
            if word in Prenominal_Quantifier_Words and ii + 1 < len(leaves) and leaves[-ii-2][1] == 'of':
                continue  # Skip the word if it is part of a prenominal quantifier phrase

            # Check if the current leaf contains a special phrase
            # The candidate text is every word from the current leaf through the final noun
            leaf_phrase = ' '.join(leaf[1] for leaf in leaves[-ii - 2:])
            if any(special_phrase in leaf_phrase for special_phrase in special_phrases):
                continue  # Skip if it matches any special phrase

            # Increment the modifier count
            modifier_count += 1

        if pos in CC_pos:
            # Conjunction immediately followed by a noun: take one off the running count
            if ii + 1 < len(leaves) and leaves[-ii-1][0] in All_Nouns:
                modifier_count -= 1
                continue
            # NOTE(review): leaves[-ii-2] is the conjunction itself, whose tag is "CC" here, so the
            # "DT" comparison cannot be true and this `break` is unreachable as written.
            # Confirm which neighbouring leaf was meant to be tested.
            elif ii + 2 < len(leaves) and leaves[-ii-2][0] == "DT":
                break

    return modifier_count

# POS tag groups and word lists used by the modifier counting code in this cell.
# All_Nouns decides whether a phrase ends in a noun; the two *_to_be_excluded_* lists are
# concatenated inside scan_modifiers to form the words that are never counted.
All_Nouns = ["NN", "NNS", "NNP", "NNPS"]
# NOTE(review): All_Adjectives, All_Adverbs and Verbs_that_can_be_prenominal are not referenced
# by the functions defined in this cell - confirm whether they are still needed.
All_Adjectives = ["JJ", "JJR", "JJS", "PRP$"]
All_Adverbs = ["RB", "RBR", "RBS"]
Verbs_that_can_be_prenominal = ["VBN"]
# Both capitalisations are listed because the word comparison in scan_modifiers is case-sensitive
Determiners_to_be_excluded_as_prenominals = ["a", "A", "an", "An", "the", "The", "that", "That", "these", "These", "those", "Those", "this", "This", "either", "Either", "around", "Around"]
# "around" also appears in the determiner list above; the duplication has no effect on membership tests
Words_to_be_excluded_as_prenominals = ["been","around", "almost", "never","behind","above","whose","which"]
# Multi-word quantifier phrases passed to handle_special_phrases and scan_modifiers by process_excel_file
Prenominal_Quantifier_Phrases = ['a lot of', 'many of', 'all of', 'most of', 'some of', 'a few of', 'several of', 'a number of', 'a bit of', 'a bunch of', "any of", "one of"]

# Source directory -> destination directory. This debugging cell points at a single
# "buggy sentences" folder; the paths are absolute and specific to one Windows machine.
directory_mapping = {
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\buggy sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\buggy sentences"
}

# Create every destination directory up front; exist_ok avoids an error when it is already there
for dest in directory_mapping.values():
    os.makedirs(dest, exist_ok=True)

# Define the function to process Excel files
def process_excel_file(src_file_path, dest_file_path):
    """
    Count prenominal modifiers for every utterance in one workbook and write the result.

    Steps:
      1. Open the source workbook. For each column from B onward (except one whose
         header is "Comments"), append a row of numeric column totals unless the last
         row of that column already holds a string starting with "=SUM". Save the
         workbook to dest_file_path.
      2. Re-read the saved file with pandas, insert a 'Prenominal Modifiers' column at
         position 1 (spreadsheet column B) and fill it row by row: the 'UTTERANCES'
         text is parsed with sentence_tree and scan_modifiers is run on the leaves of
         every extracted noun phrase.
      3. Save the frame, then reopen it with openpyxl to put a SUM formula in column B
         of the last row and a "Prenominal Modifiers/#Nouns" ratio in the row below.

    INPUT:
        src_file_path  - path of the .xlsx workbook to read
        dest_file_path - path the processed workbook is written to (overwritten if present)
    OUTPUT:
        None. The result is the file at dest_file_path; progress is printed to stdout.
    """
    # Load the workbook and worksheet
    wb = load_workbook(src_file_path)
    ws = wb.active

    # Captured once, so the totals written below do not move the "last row"
    last_row = ws.max_row
    next_row = last_row + 1

    for col in range(2, ws.max_column + 1):
        col_letter = get_column_letter(col)
        # The free-text "Comments" column has nothing to total
        if ws[f'{col_letter}1'].value == "Comments":
            continue
        # Sum the numeric cells of the column (header excluded); text such as 'N/A' is ignored
        column_sum = 0
        for row in range(2, last_row + 1):
            cell_value = ws[f'{col_letter}{row}'].value
            if isinstance(cell_value, (int, float)):
                column_sum += cell_value
        # Only add a totals cell when the column does not already end in its own =SUM formula
        last_row_value = ws[f'{col_letter}{last_row}'].value
        already_totalled = isinstance(last_row_value, str) and last_row_value.startswith("=SUM")
        if not already_totalled:
            ws[f'{col_letter}{next_row}'] = column_sum

    # Save the file to the specified destination path
    wb.save(dest_file_path)

    # Load the processed data into pandas for further manipulation
    data = pd.read_excel(dest_file_path)

    # New column goes right after the first column so it lands in spreadsheet column B
    data.insert(1, 'Prenominal Modifiers', '')

    for index, row in data.iterrows():
        utterance = row['UTTERANCES']
        if pd.isna(utterance) or utterance.strip() == '':
            continue  # Skip this iteration if the cell is blank or empty

        # Strip transcription codes before the text is sent to the parser
        utterance = utterance.replace("[+ 0subj]", "").replace("[+ gram]", "").replace("0det", "")
        print(f"Utterance: {utterance}")

        sentence_class = sentence_tree(utterance)
        # presumably the number of quantifier phrases found in the utterance - confirm against handle_special_phrases
        special_phrase_count = sentence_class.handle_special_phrases(Prenominal_Quantifier_Phrases, utterance)
        sentence_class.prune_to_np_branches(sentence_class.parse_tuples)
        nps_without_POS = sentence_class.remove_pos_tuples(sentence_class.prune_tree[0])
        nps_factored_for_complicated_NP_embedding = sentence_class.case_for_complicated_NP_embeddings(nps_without_POS)
        total_modifiers = 0
        for tree in nps_factored_for_complicated_NP_embedding[0]:
            leaves = extract_leaves(tree)
            modifier_count = scan_modifiers(leaves, Prenominal_Quantifier_Phrases)
            total_modifiers += modifier_count
            if modifier_count > 0:  # only echo phrases that contributed to the count
                print()
                print("Extracted Phrase:", leaves)
                print(f"Prenominal No: {modifier_count}")
        total_modifiers += special_phrase_count
        print()
        print(f"Total No: {total_modifiers}")
        data.at[index, 'Prenominal Modifiers'] = total_modifiers

    # Rows that were skipped still hold '' -> coerce to NaN, then to 0.
    # Assigning the result back (instead of fillna(..., inplace=True) on the column
    # selection) keeps this working when pandas Copy-on-Write is active.
    data['Prenominal Modifiers'] = pd.to_numeric(data['Prenominal Modifiers'], errors='coerce').fillna(0)

    # Save the DataFrame back to Excel
    data.to_excel(dest_file_path, index=False)

    # Reload the workbook to reapply formulas (pandas writes values only)
    wb = load_workbook(dest_file_path)
    ws = wb.active
    last_row = len(data) + 1  # header row + one sheet row per DataFrame row
    ws[f'B{last_row}'] = f"=SUM(B2:B{last_row-1})"
    next_row = last_row + 1
    ws[f'A{next_row}'] = "Prenominal Modifiers/#Nouns"
    # assumes column G holds the noun counts - TODO confirm against the workbook layout
    ws[f'B{next_row}'] = f"=B{last_row}/G{last_row}"
    wb.save(dest_file_path)

In [None]:
# Run process_excel_file over every .xlsx workbook in each source directory of the
# directory_mapping currently defined in the kernel, writing to the paired destination.
for src_directory, dest_directory in directory_mapping.items():
    # Only files directly inside src_directory are matched
    excel_files = glob.glob(os.path.join(src_directory, '*.xlsx'))
    for file_path in excel_files:
        file_name = os.path.basename(file_path)
        base_name = os.path.splitext(file_name)[0]  # name without extension, printed as a progress marker
        dest_file_path = os.path.join(dest_directory, file_name)  # same file name, destination folder
        print(f"File: {base_name}")
        process_excel_file(file_path, dest_file_path)

print("All files processed successfully.")

# SENTENCE STEP-BY-STEP DEBUGGING

In [None]:
# Candidate sentences for stepping through the pipeline by hand.
# Leave exactly one assignment uncommented; the cells below operate on `sentence_class`.
#sentence = "the brown fox on the swing and the bad granddaughter on the playground are bad"
#sentence = "The brown and red fox is quick and he is jumping over the lazy dog."
#sentence = "The antique clock's gentle ticking filled the quiet room."
#sentence = "The gentle ticking of the antique clock filled the quiet room."
#sentence = "The bad man and the soon to be born daughter are weird"
#sentence = "Her stepmother got an invitation to the red prince's strange Ball ."
#sentence = "Cinderella basically did all the chores at home all day and all night ."
#sentence = "one day the family received an invitation to the ball ."
#sentence = " the Fairy Godmother gave Cinderella a dress to wear as well as a carriage ride and two glass slippers ."
#sentence = "the following day the prince goes around town going to everybody trying to see whose feet the glass slipper fits ."
sentence = "Cinderella is about a young girl who has some stepsisters ."
# Constructing sentence_tree parses the sentence through the CoreNLP parser set up at the top of the notebook
sentence_class = sentence_tree(sentence)

In [None]:
# Step 1: collect the noun-phrase subtrees of the parse into sentence_class.prune_tree[0]
sentence_class.prune_to_np_branches(sentence_class.parse_tuples)

# Print each collected subtree followed by a blank line
for tree in sentence_class.prune_tree[0]:
    print(tree)
    print()

In [None]:
# Step 2: pass the collected NP subtrees through remove_pos_tuples and inspect what is returned
nps_without_POS = sentence_class.remove_pos_tuples(sentence_class.prune_tree[0])

for tree in nps_without_POS:
    print()
    print(tree)


In [None]:
# Step 3: run the embedded-NP handling; the result is a one-element list whose item [0] is the list of trees
nps_factored_for_complicated_NP_embedding = sentence_class.case_for_complicated_NP_embeddings(nps_without_POS)

for tree in nps_factored_for_complicated_NP_embedding[0]:
    print(tree)
    print()

In [None]:
#SENTENCE TUPLE TREE
# indexed_tuples is a single (label, children) tuple, so this loop prints its two
# elements in turn: first the root label, then the list of child subtrees.
for tree in sentence_class.indexed_tuples:
    print(tree)
    print()

In [None]:
# Rebuild the sentence text from the indexed tuple tree (True = leaves are {'word', 'position'} dicts)
sentence_class.tree_to_string_no_labels(sentence_class.indexed_tuples, True)

# Graveyard

In [None]:

# Hand-written tuple trees kept as fixtures: one ('ROOT', [...]) parse per test sentence,
# in (label, [children]) form with plain-string leaves. In order, the leaves spell out:
#   1. the brown fox on the swing and the bad granddaughter on the playground are bad
#   2. The brown and red fox is quick and he is jumping over the lazy dog .
#   3. The antique clock 's gentle ticking filled the quiet room .
#   4. The gentle ticking of the antique clock filled the quiet room .
#   5. the bad man and the soon to be born daughter are weird
#   6. Her stepmother got an invitation to the red prince 's strange Ball .
sentence_tupletree = [('ROOT', [('S', [('NP', [('NP', [('NP', [('DT', ['the']), ('JJ', ['brown']), ('NN', ['fox'])]), 
                                                      ('PP', [('IN', ['on']), ('NP', [('DT', ['the']), ('NN', ['swing'])])])]),
                                       ('CC', ['and']),
                                       ('NP', [('NP', [('DT', ['the']), ('JJ', ['bad']), ('NN', ['granddaughter'])]), 
                                               ('PP', [('IN', ['on']), ('NP', [('DT', ['the']), ('NN', ['playground'])])])
                                              ])]), 
                                      ('VP', [('VBP', ['are']), ('ADJP', [('JJ', ['bad'])])])
                                     ])]),
                      
                      ('ROOT', [('S', [('S', [('NP', [('DT', ['The']), ('ADJP', [('JJ', ['brown']), ('CC', ['and']), 
                                                                                 ('JJ', ['red'])]), ('NN', ['fox'])]),
                                              ('VP', [('VBZ', ['is']), ('ADJP', [('JJ', ['quick'])])])]), ('CC', ['and']),
                                       ('S', [('NP', [('PRP', ['he'])]),
                                              ('VP', [('VBZ', ['is']), ('VP', [('VBG', ['jumping']), ('PP', [('IN', ['over']), 
                                                                               ('NP', [('DT', ['the']), ('JJ', ['lazy']), ('NN', ['dog'])
                                                     ])])])])]),
                                       ('.', ['.'])])]),
                      
                      ('ROOT', [('S', [('NP', [('NP', [('DT', ['The']), ('JJ', ['antique']), ('NN', ['clock']), ('POS', ["'s"])]),
                                               ('JJ', ['gentle']), ('NN', ['ticking'])]), ('VP', [('VBD', ['filled']), 
                                                                                                  ('NP', [('DT', ['the']), 
                                                                                                          ('JJ', ['quiet']), 
                                                                                                          ('NN', ['room'])])]),
                                       ('.', ['.'])])]),
                      
                      ('ROOT', [('S', [('NP', [('NP', [('DT', ['The']), ('JJ', ['gentle']), ('NN', ['ticking'])]), 
                                               ('PP', [('IN', ['of']), ('NP', [('DT', ['the']), ('JJ', ['antique']), ('NN', ['clock'])])])]),
                                       ('VP', [('VBD', ['filled']), ('NP', [('DT', ['the']), ('JJ', ['quiet']), ('NN', ['room'])])]),
                                ('.', ['.'])])]),
                      
                      ('ROOT', [('S', [('NP', [('NP', [('DT', ['the']), ('JJ', ['bad']), ('NN', ['man'])]),
                                               ('CC', ['and']), ('NP', [('DT', ['the']),
                                               ('NML', [('S', [('ADVP', [('RB', ['soon'])]), 
                                                               ('VP', [('TO', ['to']), ('VP', [('VB', ['be']), 
                                                                       ('VP', [('VBN', ['born'])])])])])]),
                                               ('NN', ['daughter'])])]),
                                       ('VP', [('VBP', ['are']), ('ADJP', [('JJ', ['weird'])])])])]),
                      
                      ('ROOT', [('S', [('NP', [('PRP$', ['Her']), ('NN', ['stepmother'])]), 
                                       ('VP', [('VBD', ['got']), ('NP', [('DT', ['an']), ('NN', ['invitation'])]),
                                               ('PP',  [('IN', ['to']), ('NP', [('NP', [('DT', ['the']), ('JJ', ['red']),
                                                        ('NN', ['prince']), ('POS', ["'s"])]), ('JJ', ['strange']),
                                                        ('NN', ['Ball'])])])]),
                                       ('.', ['.'])])]),
                     ]

In [None]:
class sentence_tree:
    """
    Helpers for parsing one sentence with the CoreNLP server and extracting
    noun-phrase (NP) subtrees from its constituency parse.

    Trees are handled as "tuple trees": ``(label, [children])`` where each child is
    another tuple tree or a leaf. Leaves are plain word strings, or
    ``{'word': ..., 'position': ...}`` dicts in the indexed variant built by
    create_indexed_tuples.
    """

    def __init__(self, sentence):
        """
        Parse the sentence and build its tuple-tree representations.

        INPUT:
            sentence - sentence string to process
        """
        self.sentence = sentence
        # First parse returned by the module-level CoreNLP `parser`
        self.tree = next(parser.raw_parse(self.sentence))
        self.parse_tuples = self.tree_to_tuples(self.tree)
        self.position_in_sentence = [0] #variable to keep track of position in sentence through recursions
        self.indexed_tuples =  self.create_indexed_tuples(self.parse_tuples,  self.position_in_sentence)
        self.prune_tree = [[]]#This stores all the trees with NPs in the sentence
        self.Noun_List = ["NN","NNS","NNP","NNPS"] #this stores all parts of speech that can be treated as a noun phrase
        self.Noun_Phrases = ['NP','NP-TMP','WHNP'] #list of all noun phrases

    def tree_to_tuples(self, tree):
        """
        Convert an nltk.Tree into a tuple tree.

        INPUT:
            tree - nltk.Tree, or a leaf (anything that is not an nltk.Tree)
        OUTPUT:
            (label, [children]) tuple tree; leaves are returned unchanged
        """
        if isinstance(tree, nltk.Tree):
            return (tree.label(), [self.tree_to_tuples(t) for t in tree])
        else:
            return tree

    def flatten_list(self, input_list):
        """
        Flatten arbitrarily nested lists into a single flat list.

        INPUT:
            input_list - list whose items may themselves be lists
        OUTPUT:
            new list holding every non-list item in original order
        """
        flattened = []
        for item in input_list:
            if isinstance(item, list):
                flattened.extend(self.flatten_list(item))  # Recursively flatten the list
            else:
                flattened.append(item)  # Append the string or non-list item directly
        return flattened

    def is_sublist(self, sub_list, main_list, keep = False):
        """
        Check whether sub_list occurs as a contiguous run inside main_list.

        INPUT:
            sub_list  - list to look for
            main_list - list to search in
            keep      - when False (default) two lists of equal length never count as
                        a match, i.e. only strict containment is reported
        OUTPUT:
            True if a match is found, otherwise False
        """
        len_sub = len(sub_list)
        if not keep:
            if len_sub == len(main_list):
                return False
        for i in range(len(main_list) - len_sub + 1):
            if main_list[i:i + len_sub] == sub_list:
                return True
        return False

    def create_indexed_tuples(self, tree, position):
        """
        Build a tuple tree whose leaves are {'word': word, 'position': n} dicts,
        where n is the 1-based position of the word in the sentence.

        INPUT:
            tree - tuple tree
            position - one-element list holding the running word position; it is
                       mutated so the count survives across recursive calls
        OUTPUT:
            new tuple tree with indexed leaves
        """
        if isinstance(tree, tuple):
            label, children = tree
            new_children = []
            for child in children:
                new_children.append(self.create_indexed_tuples(child, position))
            return (label, new_children)
        else:
            position[0] += 1
            return ({'word' : tree, 'position' : position[0]})

    def prune_to_np_branches(self, tree):
        """
        Function wrapper to call prune_general specifically to collect NPs

        INPUT:
            tree - tuple tree
        OUTPUT:
            self.prune_tree is reset and then filled: self.prune_tree[0] becomes the
            list of NP subtrees found in tree
        """
        self.prune_tree = [[]]
        self.prune_general(tree, self.prune_tree, PoS = ['NP','NP-TMP','WHNP'])

    def prune_general(self, tree, tree2, PoS = ['NP','NP-TMP','WHNP']):
        """
        Collect every subtree of `tree` whose label is in PoS.

        INPUT:
            tree  - a tuple tree
            tree2 - a one-element list wrapping the result list; matches are appended
                    to tree2[0] so the same list is shared by every recursive call
            PoS   - labels to collect. The default is the noun-phrase labels
        OUTPUT:
            None; tree2[0] is extended in pre-order (a matching parent comes before
            the matching subtrees nested inside it)
        """
        if isinstance(tree, tuple): #Check that tree is a tuple tree and not a leaf
            label, children = tree #extracts the string and list from the tuple
            if label in PoS:
                # If it's a node matching the part of speech save it to tree2
                tree2[0].append(tree)
            if isinstance(children, list):#check if children is a path (list) and not a string
                for child in children:
                    # Pass the caller's PoS down so the same labels are searched at every depth
                    self.prune_general(child, tree2, PoS = PoS)
        return None

    def Does_Tree_Have_Target(self, tree, target = ['NP','NP-TMP','WHNP']):
        """
        Check whether a tuple tree, or any subtree nested in it, carries a target label.

        INPUT:
            tree  - tuple tree being scanned (leaves of any kind are ignored)
            target - labels being looked for, default is the noun-phrase labels
        OUTPUT:
            True if a node with a label in target is found, otherwise False
        """
        if not isinstance(tree, tuple): #leaves (strings or indexed dicts) carry no label
            return False
        label, children = tree
        if label in target:
            return True
        return any(self.Does_Tree_Have_Target(child, target) for child in children)

    def tree_to_string_no_labels(self, tree, is_indexed = False):
        """
        Convert a tuple tree to the sentence string it represents

        INPUT:
            tree - tuple tree, list of trees, or a leaf
            is_indexed - True when leaves are {'word', 'position'} dicts
        OUTPUT:
            the words of the tree joined by single spaces (for a leaf, the word itself)
        """
        # Check if the tree is a tuple with a tag and a list of subtrees or leaves
        if isinstance(tree, tuple):
            _, branches = tree
            # Recursively process branches to get strings without labels
            return " ".join([self.tree_to_string_no_labels(branch, is_indexed) for branch in branches]).strip()
        # If the tree is a list, it's a list of subtrees or leaves
        elif isinstance(tree, list):
            return " ".join([self.tree_to_string_no_labels(leaf, is_indexed) for leaf in tree]).strip()
        # Base case: the tree is a leaf (a word)
        else:
            if is_indexed:
                return tree['word']
            else:
                return tree

    def tree_to_list_of_strings(self, tree, is_indexed = False):
        """
        Convert a tree to a list of its words without grammatical labels

        INPUT:
            tree - tuple tree, list of trees, or a leaf
            is_indexed - is the tuple tree standard (False) or have indexing (True)
        OUTPUT:
            for a tuple tree: flat list of its words;
            for a list: one result per item (not flattened);
            for a leaf: the word itself
        """
        # Check if the tree is a tuple with a tag and a list of subtrees or leaves
        if isinstance(tree, tuple) and len(tree) == 2:
            _, branches = tree
            # Recursively process branches to get strings without labels
            return self.flatten_list([self.tree_to_list_of_strings(branch, is_indexed) for branch in branches])
        # If the tree is a list, it's a list of subtrees or leaves
        elif isinstance(tree, list):
            return [self.tree_to_list_of_strings(leaf, is_indexed) for leaf in tree]
        # Base case: the tree is a leaf (a word)
        else:
            if is_indexed:
                return tree['word']
            else:
                return tree

    def list_of_trees_to_list_of_strings(self, tree_list, is_indexed = False):
        """
        Convert each tree in a list to a string list, excluding all grammatical labels

        INPUT:
            tree_list - list of trees
            is_indexed - True when leaves are {'word', 'position'} dicts
        OUTPUT:
            list of string lists
        """
        tree_strings_no_labels = [self.tree_to_list_of_strings(tree, is_indexed) for tree in tree_list]
        return tree_strings_no_labels

    def check_and_remove_PoS(self, tree, PoS = 'POS'):
        """
        Remove subtrees with the chosen label and return the modified tree

        INPUT:
           tree - tuple tree (or list of trees) to check for PoS
           PoS - label of the subtrees to drop, default 'POS' (possessive marker)
        OUTPUT:
            a rebuilt tree with every subtree labelled PoS removed, or None when
            `tree` itself carries that label
        """
        def surviving(items):
            # Clean each item once and keep only those that were not removed
            kept = []
            for sub_t in items:
                cleaned = self.check_and_remove_PoS(sub_t, PoS)
                if cleaned is not None:
                    kept.append(cleaned)
            return kept

        if isinstance(tree, tuple):
            # A node carrying the label is dropped by returning None
            if tree[0] == PoS:
                return None
            return (tree[0], surviving(tree[1]))
        elif isinstance(tree, list):
            # For lists, filter through and only keep items that should not be removed
            return surviving(tree)
        else:
            return tree

    def remove_pos_tuples(self, tree):
        """
        Select, from a list of trees, the POS-bearing trees that enclose another
        POS-bearing tree.

        A tree is returned only if it contains a 'POS' node AND its word sequence
        strictly contains the word sequence of another tree in the list that also
        contains a 'POS' node.
        NOTE(review): as written, trees without any 'POS' node are never returned, so
        the result is empty for sentences without possessives - confirm this is the
        intended behaviour.

        INPUT:
            tree - list of tuple trees to process
        OUTPUT:
            list of the selected tuple trees, in their original order
        """
        # (index, tree) for every tree that contains a 'POS' node somewhere
        trees2compare = []
        for idx, t in enumerate(tree):
            if self.Does_Tree_Have_Target(t, target = ['POS']):
                trees2compare.append((idx, t))
        keep_trees = []
        if len(trees2compare) > 1:
            # Compare every unordered pair once; remember the index of the enclosing tree
            for idx, t1 in enumerate(trees2compare[:-1]):
                s1 = self.tree_to_list_of_strings(t1[1])
                for t2 in trees2compare[idx + 1:]:
                    s2 = self.tree_to_list_of_strings(t2[1])
                    if len(s1) < len(s2):
                        if self.is_sublist(s1, s2):
                            keep_trees.append(t2[0])
                    elif self.is_sublist(s2, s1):
                        keep_trees.append(t1[0])
        cleaned_trees = []
        for idx, t in enumerate(tree):
            if idx in keep_trees:
                cleaned_trees.append(t)

        return cleaned_trees

    def case_for_complicated_NP_embeddings(self, tree_list):
        """
        Filter a list of NP trees, dropping those that hold several nouns spread over
        more than one directly nested NP.

        INPUT:
            tree_list - list of tuple trees
        OUTPUT:
            one-element list; item [0] is the list of trees that were kept
        """
        def check_for_NPs(tree_list, reduced_list):
            """
            Apply the keep/drop decision to each tree.

            INPUT:
                tree_list - list of np trees to examine
                reduced_list - one-element list wrapping the list of trees to keep
            OUTPUT:
                None; kept trees are appended to reduced_list[0]
            """
            for tree in tree_list:
                if isinstance(tree, tuple):
                    keep_tree = True
                    count_of_NNs = self.count_tokens([tree], self.Noun_List)
                    if count_of_NNs > 1:
                        # NPs found inside the first child only
                        top_level_NPs = [[]]
                        self.prune_general(tree[1][0], top_level_NPs)
                        count_of_NPs = self.count_tokens(tree[1], self.Noun_Phrases)
                        if count_of_NPs > 1:
                            temp_list = []
                            for top_level_NP in top_level_NPs[0]:
                                # NOTE(review): top_level_NP is a (label, children) tuple, not a list
                                # of trees, so count_tokens iterates over the label string and the
                                # children list, finds no tuples and returns 0. The branch below is
                                # therefore never taken as written (and check_for_NPs returns None,
                                # which extend() would reject). Confirm whether [top_level_NP] and a
                                # return value were intended.
                                count_of_NNs = self.count_tokens(top_level_NP, self.Noun_List)
                                if count_of_NNs > 0:
                                    result = check_for_NPs([top_level_NP], reduced_list)  # Ensure this is a list
                                    temp_list.extend(result)  # Extend flattens the list, #look at flattening
                            reduced_list[0].extend(temp_list)
                            keep_tree = False
                    if keep_tree:
                        reduced_list[0].append(tree)
        NPs_that_pass_this_function = [[]]
        check_for_NPs(tree_list, NPs_that_pass_this_function)
        return NPs_that_pass_this_function

    def count_tokens(self, tree_list, tokens): #this can count tokens, can be used when just wanting to find certain POS such as DET, CONJ, PP etc.
        """
        Count the nodes in a list of tuple trees whose label is in tokens

        A matching node is counted once and its own children are not searched, so a
        match nested inside another match is not counted.

        INPUT:
            tree_list - the list of tuple trees to be processed
            tokens - the list of labels to be counted
        OUTPUT:
            the number of matching nodes
        """
        count = 0
        for item in tree_list:
            if isinstance(item, tuple):
                tag, value = item
                if tag in tokens:
                    count += 1  # Increment count for each 'token'
                elif isinstance(value, list):
                    count += self.count_tokens(value, tokens)  # Recurse into nested lists
        return count

In [None]:
# Function to extract leaves from the tree
def extract_leaves(tree):
    """
    Flatten a tuple tree into a list of (tag, item) pairs, in left-to-right order.

    INPUT:
        tree - a (tag, children) tuple, or a list of such nodes
    OUTPUT:
        list of (tag, item) pairs. A (tag, children) node whose children are all
        tuples is descended into; any other (tag, children) node contributes one
        (tag, child) pair per child. Nodes of any other shape contribute nothing.
    """
    def walk(node):
        if isinstance(node, list):
            for member in node:
                yield from walk(member)
        elif isinstance(node, tuple) and isinstance(node[1], list):
            children = node[1]
            if all(isinstance(child, tuple) for child in children):
                for child in children:
                    yield from walk(child)
            else:
                # At least one child is not a node: pair every child with this tag
                for child in children:
                    yield (node[0], child)

    return list(walk(tree))

def count_modifiers(leaves):
    """
    Count the prenominal modifiers in one extracted phrase.

    INPUT:
        leaves - list of (POS tag, word) pairs in sentence order; the last pair
                 is treated as the head noun
    OUTPUT:
        0 when leaves is empty or its last tag is not in All_Nouns; otherwise the
        number of leaves before the head noun whose tag is not excluded and whose
        word (compared case-insensitively) is not excluded
    """
    if not leaves or leaves[-1][0] not in All_Nouns:
        # No head noun at the end of the phrase: nothing to count
        return 0

    # Tags and words that never count as prenominal modifiers
    excluded_pos = All_Conjunctions + ["TO", "IN", "POS", "CD"]
    excluded_words = Determiners_to_be_excluded_as_prenominals + Words_to_be_excluded_as_prenominals + ["been"]

    modifier_count = 0
    for pos, word in leaves[:-1]:  # the head noun itself is never a modifier
        # Also test the lowercased form so a sentence-initial "The"/"A" is excluded
        # exactly like "the"/"a"; the raw form is still tested so anything that was
        # excluded before stays excluded.
        lowered = word.lower() if isinstance(word, str) else word
        if pos in excluded_pos or word in excluded_words or lowered in excluded_words:
            continue
        modifier_count += 1

    return modifier_count
# POS tag inventories and literal words consulted when counting prenominal modifiers.
# count_modifiers in this cell reads All_Nouns, All_Conjunctions and the two
# *_to_be_excluded_as_prenominals lists.
All_Nouns = [
    "NN",
    "NNS",
    "NNP",
    "NNPS",
]
All_Adjectives = [
    "JJ",
    "JJR",
    "JJS",
    "PRP$",
]
All_Adverbs = [
    "RB",
    "RBR",
    "RBS",
]
All_Conjunctions = [
    "CC",
]
Verbs_that_can_be_prenominal = [
    "VBN",
]
# Words (not tags) that are skipped when they appear before the head noun
Determiners_to_be_excluded_as_prenominals = [
    "a",
    "an",
    "the",
    "that",
    "these",
    "those",
    "this",
    "either",
]
Words_to_be_excluded_as_prenominals = [
    "been",
]


# Source workbook and the working copy that receives the modifier counts
src_file_path = "C:\\Users\\james\\Documents\\Prenominal Modifiers\\Test Files\\Original\\Aer4zugl.cqp.AS_H.xlsx"
dest_file_path = "C:\\Users\\james\\Documents\\Prenominal Modifiers\\Test Files\\Modified\\Aer4zugl.cqp.AS_H.xlsx"

# Ensure the destination directory exists
os.makedirs(os.path.dirname(dest_file_path), exist_ok=True)

# Work on a copy so the original workbook is never modified
shutil.copy(src_file_path, dest_file_path)

# Load the copied Excel file into a DataFrame
data = pd.read_excel(dest_file_path)

# Insert a new blank column right after column 'A' titled 'Prenominal Modifiers'
data.insert(1, 'Prenominal Modifiers', '')

# Parse every utterance and record its total prenominal-modifier count
for index, row in data.iterrows():
    utterance = row['UTTERANCES']

    # Skip blank cells (NaN is a float) and any other non-text cell; calling
    # .strip() on a number would raise AttributeError.
    if not isinstance(utterance, str) or utterance.strip() == '':
        continue

    print(f"Processing utterance: {utterance}")

    sentence_class = sentence_tree(utterance)
    sentence_class.prune_to_np_branches(sentence_class.parse_tuples)
    nps_without_POS = sentence_class.remove_pos_tuples(sentence_class.prune_tree[0])
    nps_factored_for_complicated_NP_embedding = sentence_class.case_for_complicated_NP_embeddings(nps_without_POS)

    total_modifiers = 0
    for tree in nps_factored_for_complicated_NP_embedding[0]:
        leaves = extract_leaves(tree)
        print("Leaves extracted from the tree:", leaves)

        modifier_count = count_modifiers(leaves)
        total_modifiers += modifier_count
        print(f"Modifiers for the last noun: {modifier_count}")
        print()

    data.at[index, 'Prenominal Modifiers'] = total_modifiers

# Rows that were skipped still hold '', which to_numeric turns into NaN and fillna
# into 0. The result is assigned back instead of using
# data[col].fillna(..., inplace=True): that chained in-place form does not update
# the DataFrame when pandas Copy-on-Write is active.
data['Prenominal Modifiers'] = pd.to_numeric(data['Prenominal Modifiers'], errors='coerce').fillna(0)

# Save the DataFrame to an Excel file
data.to_excel(dest_file_path, index=False)

# Open the Excel file with openpyxl to add the formulas
wb = load_workbook(dest_file_path)
ws = wb.active

# Excel row 1 is the header, so the final DataFrame row sits on Excel row len(data) + 1.
# NOTE(review): the SUM formula below therefore replaces column B of that final row.
# Presumably the input ends with a totals row (G{last_row} is used as the noun total
# further down) - confirm, otherwise the last utterance's count is lost.
last_row = len(data) + 1
ws[f'B{last_row}'] = f"=SUM(B2:B{last_row-1})"

# Ratio of total prenominal modifiers to the value in column G of the same row
next_row = last_row + 1
ws[f'B{next_row}'] = f"=B{last_row}/G{last_row}"
ws[f'A{next_row}'] = "Prenominal Modifiers/#Nouns"  # label for the ratio

# Save the changes
wb.save(dest_file_path)

print("DONE")


In [None]:
# Function to extract leaves from the tree
def extract_leaves(tree):
    """
    Flatten a tuple tree into a list of (tag, item) pairs, in left-to-right order.

    INPUT:
        tree - a (tag, children) tuple, or a list of such nodes
    OUTPUT:
        list of (tag, item) pairs. A (tag, children) node whose children are all
        tuples is descended into; any other (tag, children) node contributes one
        (tag, child) pair per child. Nodes of any other shape contribute nothing.
    """
    collected = []
    pending = [tree]  # explicit stack; children are pushed reversed to keep sentence order

    while pending:
        node = pending.pop()
        if isinstance(node, tuple) and isinstance(node[1], list):
            children = node[1]
            if all(isinstance(child, tuple) for child in children):
                pending.extend(reversed(children))
            else:
                collected.extend((node[0], child) for child in children)
        elif isinstance(node, list):
            pending.extend(reversed(node))

    return collected

"""
def count_modifiers(leaves):
    if not leaves or leaves[-1][0] not in All_Nouns:
        # If the last POS is not a noun, return 0
        return 0
    
    # Set up POS and words to exclude from counting
    excluded_pos = ["CC","TO", "IN", "POS", "DET"]  # Include TO, IN, and POS in the exclusions
    excluded_words = Determiners_to_be_excluded_as_prenominals + Words_to_be_excluded_as_prenominals + ["been"]  # Include 'been' in the exclusions

    # Count modifiers that are not in the excluded lists
    modifier_count = 0
    for pos, word in leaves[:-1]:  # Exclude the last leaf from counting
        if pos not in excluded_pos and word not in excluded_words:
            modifier_count += 1

    return modifier_count
"""

def scan_modifiers(leaves, special_phrases):
    """
    Count the prenominal modifiers in one extracted phrase, scanning right to left.

    INPUT:
        leaves - list of (POS tag, word) pairs in sentence order; the last pair
                 is treated as the head noun
        special_phrases - phrases (strings) that suppress counting: a leaf is not
                 counted when any of them occurs in the text running from that
                 leaf to the end of the phrase
    OUTPUT:
        0 when leaves is empty or its last tag is not in All_Nouns; otherwise the
        modifier count after the quantifier, special-phrase and conjunction
        adjustments described in the comments below
    """
    if not leaves or leaves[-1][0] not in All_Nouns:
        # No head noun at the end of the phrase: nothing to count
        return 0

    # Tags and words that never count as prenominal modifiers
    excluded_pos = ["CC", "TO", "IN", "POS", "RP", "PDT"]
    excluded_words = Determiners_to_be_excluded_as_prenominals + Words_to_be_excluded_as_prenominals
    CC_pos = ["CC"]

    # Words that form a quantifier phrase when directly followed by "of"
    Prenominal_Quantifier_Words = ['many', 'all', 'most', 'some', 'few', 'several', 'number', 'bit', "lot","bunch", "any", "one"]

    modifier_count = 0

    # Start at the leaf just before the head noun and move left. At step ii the
    # current leaf is leaves[-ii-2] and its right-hand neighbour is leaves[-ii-1];
    # the neighbour always exists because the head noun is never visited.
    for ii, (pos, word) in enumerate(leaves[-2::-1]):
        if pos not in excluded_pos and word not in excluded_words:
            # "<quantifier> of ...": the test must look at the RIGHT neighbour.
            # Reading leaves[-ii-2] compared the quantifier with itself, so the
            # test could never succeed.
            if word in Prenominal_Quantifier_Words and leaves[-ii-1][1] == 'of':
                continue

            # Text from the current leaf through the head noun
            leaf_phrase = ' '.join(leaf[1] for leaf in leaves[-ii - 2:])
            if any(special_phrase in leaf_phrase for special_phrase in special_phrases):
                continue  # covered by a special phrase

            modifier_count += 1

        if pos in CC_pos:
            if leaves[-ii-1][0] in All_Nouns:
                # Conjunction directly followed by a noun: offset one count in advance
                modifier_count -= 1
                continue
            elif ii + 2 < len(leaves) and leaves[-ii-2][0] == "DT":
                # NOTE(review): leaves[-ii-2] is the conjunction itself, so this
                # comparison with "DT" cannot be true and the break never fires.
                # Left unchanged because the intended neighbour is not clear.
                break

    return modifier_count



# POS tag inventories and literal words/phrases consulted when counting prenominal
# modifiers. scan_modifiers in this cell reads All_Nouns and the two
# *_to_be_excluded_as_prenominals lists; process_excel_file reads
# Prenominal_Quantifier_Phrases.
All_Nouns = [
    "NN",
    "NNS",
    "NNP",
    "NNPS",
]
All_Adjectives = [
    "JJ",
    "JJR",
    "JJS",
    "PRP$",
]
All_Adverbs = [
    "RB",
    "RBR",
    "RBS",
]
Verbs_that_can_be_prenominal = [
    "VBN",
]
# Words (not tags) that are skipped before the head noun; matching is
# case-sensitive, so each entry is listed in lowercase and capitalised form.
Determiners_to_be_excluded_as_prenominals = [
    "a", "A",
    "an", "An",
    "the", "The",
    "that", "That",
    "these", "These",
    "those", "Those",
    "this", "This",
    "either", "Either",
    "around", "Around",
]
Words_to_be_excluded_as_prenominals = [
    "been",
    "around",
    "almost",
    "never",
    "behind",
    "above",
    "whose",
    "which",
]
Prenominal_Quantifier_Phrases = [
    'a lot of',
    'many of',
    'all of',
    'most of',
    'some of',
    'a few of',
    'several of',
    'a number of',
    'a bit of',
    'a bunch of',
    "any of",
    "one of",
]




# The current user's Downloads folder ('~' is expanded by os.path.expanduser)
downloads_path = os.path.expanduser('~/Downloads')
# Define source directories and corresponding destination directories
# (key = directory to read from, value = directory to write to)
directory_mapping = {
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\buggy sentences":r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\buggy sentences"}

# Ensure destination directories exist
# (exist_ok=True makes re-running this cell harmless)
for dest in directory_mapping.values():
    os.makedirs(dest, exist_ok=True)

def process_excel_file(src_file_path, dest_file_path, downloads_path):
    """
    Append column totals to a workbook, then count prenominal modifiers per utterance.

    Steps, in order:
        1. Open src_file_path. For every column from B onward whose header is not
           "Comments", add up the numeric cells below the header; unless the
           column's last cell already holds a string starting with "=SUM", write
           the total into the row after the last one.
        2. Save that workbook to dest_file_path and to a file of the same name
           inside downloads_path.
        3. Re-read dest_file_path with pandas, insert a 'Prenominal Modifiers'
           column at position 1 and fill it with the count for each utterance.
        4. Rewrite dest_file_path from the DataFrame and add a SUM formula and a
           modifiers-per-noun ratio formula in column B.

    Earlier revisions returned right after re-reading the file in step 3, which
    left the remainder of step 3 and all of step 4 unreachable; the return now
    happens at the end.

    INPUT:
        src_file_path - path of the workbook to read
        dest_file_path - path the processed workbook is written to
        downloads_path - directory that receives a copy of the step-2 workbook
    OUTPUT:
        the DataFrame read back from dest_file_path, including the
        'Prenominal Modifiers' column
    """
    # ---- Steps 1-2: column totals ----
    wb = load_workbook(src_file_path)
    ws = wb.active
    last_row = ws.max_row
    for col in range(2, ws.max_column + 1):
        col_letter = get_column_letter(col)
        if ws[f'{col_letter}1'].value == "Comments":
            continue  # free-text column, nothing to total

        # Only int/float cells contribute; 'N/A' and any other text are ignored
        column_sum = 0
        for sheet_row in range(2, last_row + 1):
            cell_value = ws[f'{col_letter}{sheet_row}'].value
            if isinstance(cell_value, (int, float)):
                column_sum += cell_value

        # Do not add a second totals row under an existing "=SUM..." formula
        last_row_value = ws[f'{col_letter}{last_row}'].value
        print("last_row_value:",last_row_value)
        if not isinstance(last_row_value, str) or not last_row_value.startswith("=SUM"):
            ws[f'{col_letter}{last_row + 1}'] = column_sum

    wb.save(dest_file_path)
    downloads_dest_path = os.path.join(downloads_path, os.path.basename(dest_file_path))
    wb.save(downloads_dest_path)
    print(f"File saved to {dest_file_path} and {downloads_dest_path}")

    # ---- Step 3: per-utterance modifier counts ----
    data = pd.read_excel(dest_file_path)
    data.insert(1, 'Prenominal Modifiers', '')

    for index, row in data.iterrows():
        utterance = row['UTTERANCES']

        # Skip blank cells (NaN is a float) and any other non-text cell; calling
        # .strip() on a number would raise AttributeError.
        if not isinstance(utterance, str) or utterance.strip() == '':
            continue

        # Remove transcription codes before parsing
        utterance = utterance.replace("[+ 0subj]", "").replace("[+ gram]", "").replace("0det","")
        if utterance.strip() == '':
            continue  # nothing but codes: there is no sentence to parse

        print(f"Utterance: {utterance}")

        sentence_class = sentence_tree(utterance)
        special_phrase_count = sentence_class.handle_special_phrases(Prenominal_Quantifier_Phrases, utterance)
        sentence_class.prune_to_np_branches(sentence_class.parse_tuples)
        nps_without_POS = sentence_class.remove_pos_tuples(sentence_class.prune_tree[0])
        nps_factored_for_complicated_NP_embedding = sentence_class.case_for_complicated_NP_embeddings(nps_without_POS)

        total_modifiers = 0
        for tree in nps_factored_for_complicated_NP_embedding[0]:
            leaves = extract_leaves(tree)
            modifier_count = scan_modifiers(leaves, Prenominal_Quantifier_Phrases)
            total_modifiers += modifier_count
            if modifier_count > 0:  # only report phrases that contributed
                print()
                print("Extracted Phrase:", leaves)
                print(f"Prenominal No: {modifier_count}")
        total_modifiers += special_phrase_count
        print()
        print(f"Total No: {total_modifiers}")

        data.at[index, 'Prenominal Modifiers'] = total_modifiers

    # Rows that were skipped still hold '', which to_numeric turns into NaN and
    # fillna into 0. The result is assigned back instead of using
    # data[col].fillna(..., inplace=True): that chained in-place form does not
    # update the DataFrame when pandas Copy-on-Write is active.
    data['Prenominal Modifiers'] = pd.to_numeric(data['Prenominal Modifiers'], errors='coerce').fillna(0)

    # ---- Step 4: write the result and add the formulas ----
    data.to_excel(dest_file_path, index=False)
    wb = load_workbook(dest_file_path)
    ws = wb.active
    # Excel row 1 is the header, so the final DataFrame row sits on Excel row
    # len(data) + 1; the SUM formula is written into column B of that row.
    last_row = len(data) + 1
    ws[f'B{last_row}'] = f"=SUM(B2:B{last_row-1})"
    next_row = last_row + 1
    ws[f'A{next_row}'] = "Prenominal Modifiers/#Nouns"
    ws[f'B{next_row}'] = f"=B{last_row}/G{last_row}"
    wb.save(dest_file_path)

    return data

In [None]:
# Function to extract leaves from the tree
def extract_leaves(tree):
    """
    Flatten a tuple tree into a list of (tag, item) pairs, in left-to-right order.

    INPUT:
        tree - a (tag, children) tuple, or a list of such nodes
    OUTPUT:
        list of (tag, item) pairs. A (tag, children) node whose children are all
        tuples is descended into; any other (tag, children) node contributes one
        (tag, child) pair per child. Nodes of any other shape contribute nothing.
    """
    def collect(node, sink):
        # A plain list is just a sequence of nodes to visit in order
        if isinstance(node, list):
            for member in node:
                collect(member, sink)
            return
        # Anything that is not a (tag, list) pair contributes nothing
        if not isinstance(node, tuple) or not isinstance(node[1], list):
            return
        tag, children = node[0], node[1]
        if any(not isinstance(child, tuple) for child in children):
            # At least one child is not a node: pair every child with this tag
            sink.extend((tag, child) for child in children)
            return
        for child in children:
            collect(child, sink)

    found = []
    collect(tree, found)
    return found

def count_modifiers(leaves):
    """
    Count the prenominal modifiers in one extracted phrase.

    INPUT:
        leaves - list of (POS tag, word) pairs in sentence order; the last pair
                 is treated as the head noun
    OUTPUT:
        0 when leaves is empty or its last tag is not in All_Nouns; otherwise the
        number of leaves before the head noun whose tag is not excluded and whose
        word (compared case-insensitively) is not excluded
    """
    if not leaves or leaves[-1][0] not in All_Nouns:
        # No head noun at the end of the phrase: nothing to count
        return 0

    # Tags and words that never count as prenominal modifiers.
    # NOTE(review): the determiner tag elsewhere in this file is "DT", so "DET"
    # presumably never matches and determiners are filtered by the word list only
    # - confirm before changing, since adding "DT" would exclude every determiner.
    excluded_pos = ["CC","TO", "IN", "POS", "DET"]
    excluded_words = Determiners_to_be_excluded_as_prenominals + Words_to_be_excluded_as_prenominals + ["been"]

    modifier_count = 0
    for pos, word in leaves[:-1]:  # the head noun itself is never a modifier
        # Also test the lowercased form so a sentence-initial "The"/"A" is excluded
        # exactly like "the"/"a"; the raw form is still tested so anything that was
        # excluded before stays excluded.
        lowered = word.lower() if isinstance(word, str) else word
        if pos in excluded_pos or word in excluded_words or lowered in excluded_words:
            continue
        modifier_count += 1

    return modifier_count

def scan_modifiers(leaves):
    """
    Count the prenominal modifiers in one extracted phrase, scanning right to left.

    The scan starts at the leaf just before the head noun and moves left. It stops
    early at a conjunction that has at most one leaf to its left.

    INPUT:
        leaves - list of (POS tag, word) pairs in sentence order; the last pair
                 is treated as the head noun
    OUTPUT:
        0 when leaves is empty or its last tag is not in All_Nouns; otherwise the
        number of visited leaves whose tag is not excluded and whose word
        (compared case-insensitively) is not excluded
    """
    if not leaves or leaves[-1][0] not in All_Nouns:
        # No head noun at the end of the phrase: nothing to count
        return 0

    # Tags and words that never count as prenominal modifiers.
    # NOTE(review): the determiner tag elsewhere in this file is "DT", so "DET"
    # presumably never matches - confirm before changing.
    excluded_pos = ["CC","TO", "IN", "POS", "DET"]
    excluded_words = Determiners_to_be_excluded_as_prenominals + Words_to_be_excluded_as_prenominals + ["been"]
    CC_pos = ["CC"]

    modifier_count = 0
    # leaves[-2::-1] leaves out the head noun; iterating from leaves[-1] counted
    # the head noun itself as a modifier. At step ii the current leaf sits at
    # index len(leaves) - 2 - ii.
    for ii, (pos, word) in enumerate(leaves[-2::-1]):
        # Also test the lowercased form so a sentence-initial "The"/"A" is excluded
        lowered = word.lower() if isinstance(word, str) else word
        if pos not in excluded_pos and word not in excluded_words and lowered not in excluded_words:
            modifier_count += 1
        elif pos in CC_pos:
            # Index of the leaf immediately left of the conjunction. The previous
            # expression, len(leaves-1), subtracted an int from a list and raised
            # TypeError for every phrase containing a conjunction.
            left_index = len(leaves) - 3 - ii
            if left_index > 0:
                if leaves[left_index][0] in All_Nouns:
                    continue  # noun before the conjunction: keep scanning
            else:
                # NOTE(review): "> 0" also stops when exactly one leaf remains to
                # the left (index 0), so that leaf is never counted - confirm
                # that ">= 0" was not intended.
                break
    return modifier_count

# POS tag inventories and literal words consulted when counting prenominal modifiers.
# count_modifiers and scan_modifiers in this cell read All_Nouns and the two
# *_to_be_excluded_as_prenominals lists.
All_Nouns = [
    "NN",
    "NNS",
    "NNP",
    "NNPS",
    "CD",  # added CD for cases like the fare one, the bad one etc.
]
All_Adjectives = [
    "JJ",
    "JJR",
    "JJS",
    "PRP$",
]
All_Adverbs = [
    "RB",
    "RBR",
    "RBS",
]
Verbs_that_can_be_prenominal = [
    "VBN",
]
# Words (not tags) that are skipped when they appear before the head noun
Determiners_to_be_excluded_as_prenominals = [
    "a",
    "an",
    "the",
    "that",
    "these",
    "those",
    "this",
    "either",
]
Words_to_be_excluded_as_prenominals = [
    "been",
]







# Define source directories and corresponding destination directories
# (key = directory to read from, value = directory to write to)
directory_mapping = {
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\older adult cinderella individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\older adult cinderella individual sentences",
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\older adult hobby individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\older adult hobby individual sentences",
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\young adult cinderella individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\young adult cinderella individual sentences",
    r"C:\Users\james\Documents\Prenominal Modifiers\CLAN QPA Output\young adult hobby individual sentences": r"C:\Users\james\Documents\Prenominal Modifiers\Program Processed\young adult hobby individual sentences"
}

# Ensure destination directories exist
# (exist_ok=True makes re-running this cell harmless)
for dest in directory_mapping.values():
    os.makedirs(dest, exist_ok=True)

def process_excel_file(src_file_path, dest_file_path):
    """
    Count prenominal modifiers for every utterance in a workbook and write the result.

    Reads src_file_path with pandas, inserts a 'Prenominal Modifiers' column at
    position 1, fills it with the per-utterance total from scan_modifiers, writes
    the DataFrame to dest_file_path, and finally adds a SUM formula and a
    modifiers-per-noun ratio formula in column B.

    INPUT:
        src_file_path - path of the workbook to read; must have an 'UTTERANCES' column
        dest_file_path - path the processed workbook is written to
    """
    data = pd.read_excel(src_file_path)
    data.insert(1, 'Prenominal Modifiers', '')

    for index, row in data.iterrows():
        utterance = row['UTTERANCES']

        # Skip blank cells (NaN is a float) and any other non-text cell; calling
        # .strip() on a number would raise AttributeError.
        if not isinstance(utterance, str) or utterance.strip() == '':
            continue

        # Remove transcription codes before parsing
        utterance = utterance.replace("[+ 0subj]", "").replace("[+ gram]", "").replace("0det","")
        if utterance.strip() == '':
            continue  # nothing but codes: there is no sentence to parse

        print(f"Utterance: {utterance}")

        sentence_class = sentence_tree(utterance)
        sentence_class.prune_to_np_branches(sentence_class.parse_tuples)
        nps_without_POS = sentence_class.remove_pos_tuples(sentence_class.prune_tree[0])
        nps_factored_for_complicated_NP_embedding = sentence_class.case_for_complicated_NP_embeddings(nps_without_POS)

        total_modifiers = 0
        for tree in nps_factored_for_complicated_NP_embedding[0]:
            leaves = extract_leaves(tree)
            print("Extracted Phrase:", leaves)

            modifier_count = scan_modifiers(leaves)
            total_modifiers += modifier_count
            print(f"Prenominal No: {modifier_count}")
            print()

        data.at[index, 'Prenominal Modifiers'] = total_modifiers

    # Rows that were skipped still hold '', which to_numeric turns into NaN and
    # fillna into 0. The result is assigned back instead of using
    # data[col].fillna(..., inplace=True): that chained in-place form does not
    # update the DataFrame when pandas Copy-on-Write is active.
    data['Prenominal Modifiers'] = pd.to_numeric(data['Prenominal Modifiers'], errors='coerce').fillna(0)

    data.to_excel(dest_file_path, index=False)
    wb = load_workbook(dest_file_path)
    ws = wb.active
    # Excel row 1 is the header, so the final DataFrame row sits on Excel row
    # len(data) + 1.
    # NOTE(review): the SUM formula replaces column B of that final row -
    # presumably the input ends with a totals row (G{last_row} is used as the
    # noun total below); confirm, otherwise the last utterance's count is lost.
    last_row = len(data) + 1
    ws[f'B{last_row}'] = f"=SUM(B2:B{last_row-1})"
    next_row = last_row + 1
    ws[f'A{next_row}'] = "Prenominal Modifiers/#Nouns"
    ws[f'B{next_row}'] = f"=B{last_row}/G{last_row}"
    wb.save(dest_file_path)