In [None]:
!pip install stanza
import stanza
# Build the English Stanza pipeline once; it is reused by every analysis cell below.
# The constituency processor supplies `sentence.constituency` (used for NP extraction) and
# depparse supplies `word.deprel` / `word.head` (used for noun-modifier extraction).
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency,mwt,lemma,depparse')

import nltk
from nltk.tree import Tree

import pandas as pd
import re
import glob
import pathlib

 **Analysing a single sentence**

In [None]:
# Maps dependency relations to the short noun-modifier labels reported below.
# "acl" is intentionally absent: it is split into sub-types by verb form further down.
depDic={"compound":"com", "acl:relcl":"rel", "nmod":"prep", "amod":"adj", "det":"det", "nmod:poss":"det", "nummod":"nummod"}

doc = nlp("the significance of having a part-time job for Tom is quite clear.") # Change the sentence here
deprel_list = []  # keeps the name defined even if the parser returns no sentences


for sentence in doc.sentences:

    #### Process related to noun phrases ####

    # Convert the constituency parse into an nltk Tree so that its subtrees can be traversed.
    tree = Tree.fromstring(str(sentence.constituency))

    # Extract every NP subtree.
    nps = [subtree for subtree in tree.subtrees() if subtree.label() == "NP"]

    # Keep the surface form of each NP that has at least one direct child whose label
    # is not PRP (pronoun), EX (existential it/there), NNP (proper noun) or DT.
    # The loop variable is not called `np` so it cannot shadow the usual numpy alias.
    NP_list = [
        " ".join(np_tree.leaves())
        for np_tree in nps
        if any(child.label() not in ["PRP", "EX", "NNP", "DT"] for child in np_tree)
    ]

    # Surface forms to discard: bare "it"/"there" plus any NP whose text is contained in
    # another extracted NP (nested NPs produced by the constituency parse). The containment
    # check is a plain substring match at the string level.
    NGNP_list = ["It", "There", "it", "there"]
    NGNP_list.extend(
        inner
        for inner in NP_list
        for outer in NP_list
        if inner != outer and inner in outer
    )

    NP_list = [x for x in NP_list if x not in NGNP_list]
    print("extracted NPs:", NP_list)


    #### Process related to noun modifiers ####
    deprel_list = []
    for word in sentence.words:

        if word.deprel == "acl":
            # xpos tags of the words that depend directly on this acl verb.
            dep_list = [word_2.xpos for word_2 in sentence.words if word_2.head == word.id]

            if word.xpos == "VBG":
                # "prep + VBG" (e.g. "significance of having ...") counts as a prepositional modifier.
                deprel_list.append("prep" if "IN" in dep_list else "ing-acl")

            elif word.xpos == "VBN":
                # "to be VBN" (e.g. "things to be done") counts as a to-infinitive modifier.
                # Both tags must be present; the previous `"TO" and "VB" in dep_list` only
                # tested for "VB" because a non-empty string literal is always truthy.
                if "TO" in dep_list and "VB" in dep_list:
                    deprel_list.append("to-acl")
                else:
                    deprel_list.append("ed-acl")

            elif word.xpos == "VB":
                deprel_list.append("to-acl")

            # acl words with any other xpos are not recorded.

        elif word.deprel in depDic:
            # compound, acl:relcl, nmod, amod, det, nmod:poss, nummod
            deprel_list.append(depDic[word.deprel])

    print("extracted noun modifiers:",deprel_list )

# To see the dependency parsing output, uncomment the next line (delete the leading "# ").
# print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

# To see the constituency parsing output, uncomment the next line (delete the leading "# ").
# tree.pretty_print()

**Analysing multiple files in a directory**

In [None]:
# Maps dependency relations to the short noun-modifier labels stored in df_deprel.
# "acl" is intentionally absent: it is split into sub-types by verb form further down.
depDic={"compound":"com", "acl:relcl":"rel", "nmod":"prep", "amod":"adj", "det":"det", "nmod:poss":"det", "nummod":"nummod"}

# Rows are collected in plain lists and turned into DataFrames once at the end,
# instead of growing the frames one cell at a time with .loc inside the loops.
np_records = []
dep_records = []

files = glob.glob('/content/drive/MyDrive/2023_JUNE_AutoAnalysis/NS/*.txt')  # Change the path here
for i in files:
    filename = pathlib.PurePath(i).stem
    with open(i, 'r') as file:
        text = file.read()

    doc = nlp(text)

    sentenceID = 0  # 1-based position of the sentence within the current file
    for sentence in doc.sentences:
        sentenceID = sentenceID + 1

        #### Process related to noun phrases ####
        tree = Tree.fromstring(str(sentence.constituency))
        nps = [subtree for subtree in tree.subtrees() if subtree.label() == "NP"]

        # Keep the surface form of each NP that has at least one direct child whose label
        # is not PRP, EX, NNP or DT. The loop variable is not called `np` so it cannot
        # shadow the usual numpy alias.
        NP_list = [
            " ".join(np_tree.leaves())
            for np_tree in nps
            if any(child.label() not in ["PRP", "EX", "NNP", "DT"] for child in np_tree)
        ]

        # Discard bare "it"/"there" and any NP whose text is a substring of another extracted NP.
        NGNP_list = ["It", "There", "it", "there"]
        NGNP_list.extend(
            inner
            for inner in NP_list
            for outer in NP_list
            if inner != outer and inner in outer
        )
        NP_list = [x for x in NP_list if x not in NGNP_list]

        for np_text in NP_list:
            np_records.append({
                'filename': filename,
                'sentenceID': sentenceID,
                'sentence': sentence.text,
                'NP': np_text,
            })

        #### Process related to noun modifiers ####
        deprel_list = []
        for word in sentence.words:
            if word.deprel == "acl":
                # xpos tags of the words that depend directly on this acl verb.
                dep_list = [word_2.xpos for word_2 in sentence.words if word_2.head == word.id]

                if word.xpos == "VBG":
                    # "prep + VBG" (e.g. "significance of -ing") counts as a prepositional modifier.
                    deprel_list.append("prep" if "IN" in dep_list else "ing-acl")

                elif word.xpos == "VBN":
                    # "to be VBN" (e.g. "things to be done") counts as a to-infinitive modifier.
                    # Both tags must be present; `"TO" and "VB" in dep_list` only tested for "VB".
                    if "TO" in dep_list and "VB" in dep_list:
                        deprel_list.append("to-acl")
                    else:
                        deprel_list.append("ed-acl")

                elif word.xpos == "VB":
                    # Previously this branch only printed the word, so bare-infinitive acl
                    # modifiers were never counted; record them as in the single-sentence cell.
                    deprel_list.append("to-acl")

                # acl words with any other xpos are not recorded.

            elif word.deprel in depDic:
                # compound, acl:relcl, nmod, amod, det, nmod:poss, nummod
                deprel_list.append(depDic[word.deprel])

        for dep in deprel_list:
            dep_records.append({
                'filename': filename,
                'sentenceID': sentenceID,
                'sentence': sentence.text,
                'deprel': dep,
            })

# One row per extracted NP / noun modifier. Object dtype and a 1-based index match the
# frames the row-by-row version used to produce.
df_NP = pd.DataFrame(np_records, columns=['filename','sentenceID','sentence', 'NP'], dtype=object)
df_NP.index = range(1, len(df_NP) + 1)
df_deprel = pd.DataFrame(dep_records, columns=['filename','sentenceID', 'sentence', "deprel"], dtype=object)
df_deprel.index = range(1, len(df_deprel) + 1)

# Running totals kept under their original names.
npID = len(df_NP)
depID = len(df_deprel)


**Computing the value of indices**

In [None]:
import numpy as np
import pathlib
from collections import Counter

# One output row per text file. NOTE(review): the "CEFR" column is never assigned in this
# cell, so it is written out empty - presumably it is filled in afterwards; confirm.
All_df=pd.DataFrame(columns=['filename',"CEFR",'mean_length_of_NPs','mean_number_of_modifiers', 'types_of_modifiers', 'det_frequency', 'adj_frequency',  'com_frequency', 'ed-acl_frequency', 'ing-acl_frequency','to-acl_frequency', 'rel_frequency','prep_frequency', 'nummod_frequency'])
fileID=0
modifierList=['det', 'prep', 'adj', 'ed-acl', 'ing-acl', 'rel', 'com', 'to-acl', 'nummod']

# NOTE(review): rows are matched to df_NP / df_deprel by file stem, so this directory must
# contain the same file names that were parsed when those frames were built; unmatched files
# get NaN ratios. Confirm the path is the intended one.
files = glob.glob('/content/drive/MyDrive/2023_JUNE_AutoAnalysis/A2/*.txt') # Change the path here
for i in files:
    fileID=fileID+1
    filename = pathlib.PurePath(i).stem
    All_df.loc[fileID, 'filename'] = filename

    # NPs and modifier labels that were extracted from this file.
    file_NPs = df_NP.loc[df_NP["filename"]==filename, "NP"]
    file_deprels = df_deprel.loc[df_deprel["filename"]==filename, "deprel"]
    NP_count = len(file_NPs)
    dep_count = len(file_deprels)

    #### Process related to Mean Length of NPs ####
    # Length is the number of whitespace-separated tokens in the NP surface form.
    # (The accumulator is no longer called `list`, which shadowed the builtin.)
    np_lengths = [len(np_text.split()) for np_text in file_NPs]
    mean_length_of_NPs = np.mean(np_lengths) if np_lengths else float("nan")
    All_df.loc[fileID, 'mean_length_of_NPs'] = mean_length_of_NPs

    #### Process related to Mean Number of Modifiers ####
    # Undefined (NaN) when the file has no NPs, instead of dividing by zero.
    mean_number_of_modifiers = dep_count/NP_count if NP_count else float("nan")
    All_df.loc[fileID, 'mean_number_of_modifiers'] = mean_number_of_modifiers

    #### Number of Modifier Types ####
    dep_counter = Counter(file_deprels)
    dep_frequency_dict = dict(dep_counter)
    types_of_modifiers= len(dep_frequency_dict)
    All_df.loc[fileID, 'types_of_modifiers'] = types_of_modifiers

    #### Process related to Frequency of Each Modifier Type ####
    # Occurrences of each modifier type per NP; a type that never occurs is recorded as 0.
    # A separate loop variable is used so the outer file-path variable `i` is not overwritten.
    for modifier in modifierList:
        if modifier in dep_frequency_dict:
            All_df.loc[fileID, modifier+"_frequency"] = (
                dep_frequency_dict[modifier]/NP_count if NP_count else float("nan")
            )
        else:
            All_df.loc[fileID, modifier+"_frequency"] = 0

csv_file_path = '2023_英語コーパス学会_分析用データ.csv' # Change the path here
All_df.to_csv(csv_file_path, index=False)