In [9]:
# import all libraries needed
import stanza
import pandas as pd
from nltk.tree import *
from stanza.models.constituency.tree_reader import read_trees

In [11]:
# Define the Stanza pipeline: tokenization, multi-word tokens, POS tagging and
# constituency parsing, running on CPU.
lang = 'en'

# pass `lang` through so that changing it above actually changes the pipeline
nlp = stanza.Pipeline(lang=lang, processors='tokenize, mwt, pos, constituency',
                      use_gpu=False)

2024-12-06 16:22:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-12-06 16:22:27 INFO: Downloaded file to /Users/test/stanza_resources/resources.json
2024-12-06 16:22:27 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2024-12-06 16:22:27 INFO: Using device: cpu
2024-12-06 16:22:27 INFO: Loading: tokenize
2024-12-06 16:22:27 INFO: Loading: mwt
2024-12-06 16:22:27 INFO: Loading: pos
2024-12-06 16:22:28 INFO: Loading: constituency
2024-12-06 16:22:28 INFO: Done loading processors!


In [12]:
# location of the CSV with the word-level timestamps
path = "Data/example_fig_timestamps.csv"

# read the table into a pandas DataFrame
dfWords = pd.read_csv(path)

In [13]:
# inspect the loaded timestamp table
dfWords

Unnamed: 0,line,tmin,tier,text,tmax
0,1,0.079043,words,The,0.222566
1,2,0.222566,words,idea,0.710037
2,3,0.710037,words,of,0.936425
3,4,0.936425,words,one,1.114281
4,5,1.114281,words,employee,1.810454
5,6,1.810454,words,improved,2.351353
6,7,2.351353,words,the,2.471574
7,8,2.471574,words,quality,3.046079
8,9,3.046079,words,of,3.19763
9,10,3.19763,words,the,3.272175


In [14]:
# rebuild the sentence: the entries of the `text` column joined by single spaces
sent_to_analyse = ' '.join(dfWords['text'].tolist())

In [15]:
# show the sentence string that will be sent to the parser
sent_to_analyse

'The idea of one employee improved the quality of the product'

In [21]:
def syntactic_annotation(tree, words, sep=''):
    '''
    Build a per-token table of the constituents that dominate each token.

    Each row describes one leaf (token) of the constituency tree. Reading a
    row from left to right walks from the node directly below the root down
    to the token itself. Every cell has the form "<label>/<path>", where
    <path> is the sequence of child indexes leading from the root to that
    node (e.g. "NP/001"), so cells of tokens that share a constituent carry
    the same value.

    Parameters
    ----------
    tree : object with `.label` and `.children`, or str
        A constituency tree as returned by stanza
        (`doc.sentences[j].constituency`). A bracketed string is also
        accepted; it is parsed with stanza's `read_trees` and the first tree
        found in it is used.
    words : sequence of str or of objects with a `.text` attribute
        The tokens of the sentence in order, one per leaf of the tree, e.g.
        `doc.sentences[j].words` or a plain list of strings.
    sep : str, optional
        Separator placed between the child indexes of a path. The default ''
        gives compact identifiers such as "NP/001"; pass e.g. '.' to keep
        the identifiers unambiguous when a node may have 10 or more children.

    Returns
    -------
    pandas.DataFrame
        Column 'words' followed by integer-named columns 0, 1, 2, ... (one
        per level of embedding). Rows of tokens that sit higher in the tree
        are padded with missing values on the right.

    Raises
    ------
    ValueError
        If the number of words differs from the number of leaves in the tree.

    Example
    -------
    doc = nlp(sentence)                   # sentence is a string
    tree = doc.sentences[j].constituency  # j is the index of the sentence
    words = doc.sentences[j].words
    df = syntactic_annotation(tree, words)
    '''

    # a bracketed string is parsed first; only the first tree it contains is used
    if isinstance(tree, str):
        tree = read_trees(tree)[0]

    # one list of labels per leaf, in left-to-right order
    all_labels = []

    # Depth-first walk over the tree object itself. Each stack entry holds a
    # node, the child indexes that lead to it, and the labels gathered so far.
    pending = [(tree, (), [])]
    while pending:
        node, path, trail = pending.pop()
        children = node.children

        if not children:
            # a childless node is a token; the root itself is never reported
            if path:
                all_labels.append(trail)
            continue

        # push right-to-left so that tokens come off the stack left-to-right
        for i in range(len(children) - 1, -1, -1):
            child = children[i]
            child_path = path + (i,)
            tag = child.label + '/' + sep.join(str(step) for step in child_path)
            pending.append((child, child_path, trail + [tag]))

    # accept plain strings as well as word objects exposing `.text`
    word_texts = [w if isinstance(w, str) else w.text for w in words]

    if len(word_texts) != len(all_labels):
        raise ValueError(
            'Number of words (%d) does not match number of leaves in the tree (%d)'
            % (len(word_texts), len(all_labels))
        )

    # store in a df and put the words in the first column
    df = pd.DataFrame(all_labels)
    df.insert(loc=0, column='words', value=word_texts)

    return df


In [22]:
# run the pipeline on the sentence; only the first parsed sentence is kept
doc = nlp(sent_to_analyse)
first_sentence = doc.sentences[0]
tree = first_sentence.constituency
words_from_stanza = first_sentence.words

In [23]:
# inspect the constituency parse of the first sentence
tree

(ROOT (S (NP (NP (DT The) (NN idea)) (PP (IN of) (NP (CD one) (NN employee)))) (VP (VBD improved) (NP (NP (DT the) (NN quality)) (PP (IN of) (NP (DT the) (NN product)))))))

In [24]:
# build the per-token table of constituent labels from the parse
df = syntactic_annotation(tree=tree, words=words_from_stanza)

In [25]:
# inspect the annotation table: one row per token, one column per level of embedding
df

Unnamed: 0,words,0,1,2,3,4,5,6
0,The,S/0,NP/00,NP/000,DT/0000,The/00000,,
1,idea,S/0,NP/00,NP/000,NN/0001,idea/00010,,
2,of,S/0,NP/00,PP/001,IN/0010,of/00100,,
3,one,S/0,NP/00,PP/001,NP/0011,CD/00110,one/001100,
4,employee,S/0,NP/00,PP/001,NP/0011,NN/00111,employee/001110,
5,improved,S/0,VP/01,VBD/010,improved/0100,,,
6,the,S/0,VP/01,NP/011,NP/0110,DT/01100,the/011000,
7,quality,S/0,VP/01,NP/011,NP/0110,NN/01101,quality/011010,
8,of,S/0,VP/01,NP/011,PP/0111,IN/01110,of/011100,
9,the,S/0,VP/01,NP/011,PP/0111,NP/01111,DT/011110,the/0111100


In [None]:
# align the dataframe to the original dataframe

In [26]:
# Put the timestamp table and the syntactic annotation side by side.
# Columns that already came from dfWords are dropped first, so re-running this
# cell does not keep appending duplicate timestamp columns to df.
# (assumes dfWords has no column named 'words' or 0, 1, 2, ...)
annotation_only = df.drop(columns=dfWords.columns, errors='ignore')

# concat pairs rows by index; a length mismatch would silently give NaN-padded rows
assert len(annotation_only) == len(dfWords), 'token count differs from the timestamp table'

df = pd.concat([dfWords, annotation_only], axis=1)

In [27]:
# inspect the combined table: timestamp columns followed by the annotation columns
df

Unnamed: 0,line,tmin,tier,text,tmax,words,0,1,2,3,4,5,6
0,1,0.079043,words,The,0.222566,The,S/0,NP/00,NP/000,DT/0000,The/00000,,
1,2,0.222566,words,idea,0.710037,idea,S/0,NP/00,NP/000,NN/0001,idea/00010,,
2,3,0.710037,words,of,0.936425,of,S/0,NP/00,PP/001,IN/0010,of/00100,,
3,4,0.936425,words,one,1.114281,one,S/0,NP/00,PP/001,NP/0011,CD/00110,one/001100,
4,5,1.114281,words,employee,1.810454,employee,S/0,NP/00,PP/001,NP/0011,NN/00111,employee/001110,
5,6,1.810454,words,improved,2.351353,improved,S/0,VP/01,VBD/010,improved/0100,,,
6,7,2.351353,words,the,2.471574,the,S/0,VP/01,NP/011,NP/0110,DT/01100,the/011000,
7,8,2.471574,words,quality,3.046079,quality,S/0,VP/01,NP/011,NP/0110,NN/01101,quality/011010,
8,9,3.046079,words,of,3.19763,of,S/0,VP/01,NP/011,PP/0111,IN/01110,of/011100,
9,10,3.19763,words,the,3.272175,the,S/0,VP/01,NP/011,PP/0111,NP/01111,DT/011110,the/0111100
