<a href="https://colab.research.google.com/github/HaeChan0305/Sentence-Type-Classifier/blob/main/Sentence_Type_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Calculate SPC (subordinate clauses per C-unit)

In [None]:
def find_root(doc):
    """Return the first token of ``doc`` whose dependency label is 'ROOT'.

    Returns ``None`` when no token carries that label.
    """
    return next((tok for tok in doc if tok.dep_ == 'ROOT'), None)

def find_all_root(doc):
    """Return every token of ``doc`` labelled 'ROOT', in document order."""
    return [tok for tok in doc if tok.dep_ == 'ROOT']
          
def walk_tree(doc, token, depths, depth):
    """Record the depth of every descendant of ``token`` in ``depths``.

    Direct children are stored with ``depth``, their children with
    ``depth + 1``, and so on. Entries are keyed by each descendant's ``.i``.
    ``token`` itself is not written. ``depths`` is updated in place and also
    returned. ``doc`` is accepted but not used.
    """
    for kid in token.children:
        depths[kid.i] = depth
        # The recursive call mutates and returns the very same container.
        walk_tree(doc, kid, depths, depth + 1)
    return depths

def children_of_children(token):
    """Return all descendants of ``token`` as a flat list.

    The descendants of each child come first (child by child), followed by
    the direct children themselves. A token without children yields ``[]``.
    """
    direct = list(token.children)
    deeper = [node for kid in direct for node in children_of_children(kid)]
    return deeper + direct

def right_upper_ancestor(token):
    """Return the first ancestor that has ``token`` among its direct children.

    In other words, the token's immediate parent in the tree. Returns
    ``None`` when no ancestor qualifies (e.g. the token has no ancestors).
    """
    return next(
        (anc for anc in token.ancestors if token in list(anc.children)),
        None,
    )

def is_verb(token):
    """Tell whether ``token`` counts as a verb.

    A token qualifies when its POS tag is 'VERB', or when it is 'AUX' and has
    at least one child (presumably an auxiliary heading its own clause --
    verify against the parser's output).
    """
    if token.pos_ == "VERB":
        return True
    if token.pos_ != "AUX":
        return False
    return any(True for _ in token.children)

def have_subj_child(token):
    """Tell whether any descendant of ``token`` is a subject.

    A descendant counts as a subject when its dependency label is 'nsubj',
    'csubj' or 'nsubjpass'. The whole subtree is searched, not only the
    direct children.
    """
    subject_labels = ('nsubj', 'csubj', 'nsubjpass')
    return any(node.dep_ in subject_labels for node in children_of_children(token))

def is_to_V(token):
    """Tell whether ``token`` has a direct child 'to' attached as 'aux'.

    The text comparison is case-sensitive: only a lowercase 'to' matches.
    """
    return any(kid.text == 'to' and kid.dep_ == 'aux' for kid in token.children)

In [None]:
def is_main_verb(token):
    """Decide whether ``token`` is the verb of a clause that should be counted.

    The token must first be accepted by ``is_verb``. The verdict then depends
    on its dependency label:

    * 'ROOT' -- always counted.
    * 'xcomp', 'acl', 'acomp', 'amod', 'prep', 'auxpass', 'dep' -- never
      counted.
    * 'csubj', 'advcl', 'relcl' -- counted when the subtree contains a
      subject and the verb has no 'to' auxiliary.
    * 'conj' -- inherits the verdict of its parent.
    * 'pcomp' -- counted when the parent is a 'prep' token whose text is one
      of the listed clause-introducing words, or when the subtree contains a
      subject.
    * 'ccomp' -- not counted when it has a 'to' auxiliary or when it matches
      the causative pattern described below; counted otherwise.
    * any other label -- not counted.

    Always returns a ``bool``.
    """
    if not is_verb(token):
        return False

    prep = ('after', 'as', 'before', 'despite', 'lest', 'since',
            'supposing', 'than', 'till', 'until')
    ignore_dep = ('xcomp', 'acl', 'acomp', 'amod', 'prep', 'auxpass', 'dep')
    need_subj_dep = ('csubj', 'advcl', 'relcl')
    causative_verb = ('make', 'have', 'let', 'help', 'get')

    dep = token.dep_

    if dep == 'ROOT':
        return True

    if dep in ignore_dep:
        return False

    if dep in need_subj_dep:
        return have_subj_child(token) and not is_to_V(token)

    if dep == 'conj':
        parent = right_upper_ancestor(token)
        # A missing parent cannot vouch for the conjunct.
        return parent is not None and is_main_verb(parent)

    if dep == 'pcomp':
        parent = right_upper_ancestor(token)
        if (parent is not None
                and parent.dep_ == 'prep'
                and parent.text.lower() in prep):
            return True
        return have_subj_child(token)

    if dep == 'ccomp':
        if is_to_V(token):
            return False

        # Causative detection: the parent is a causative verb placed before
        # the token, the token has its own direct 'nsubj', and every token
        # between the two belongs to the token's subtree (presumably the
        # "let him go" shape -- confirm against parser output).
        parent = right_upper_ancestor(token)
        if (parent is None
                or parent.lemma_ not in causative_verb
                or parent.i >= token.i):
            return True
        if 'nsubj' not in [kid.dep_ for kid in token.children]:
            return True

        gap = set(range(parent.i + 1, token.i))
        gap.difference_update(node.i for node in children_of_children(token))
        # An empty gap means the pattern matched, so the token is only the
        # complement of the causative verb. Previously every path reaching
        # this point fell through and returned None, so the check above could
        # never make a causative-headed clause count.
        return bool(gap)

    return False

def find_main_verbs(doc):
    """Return the tokens of ``doc`` accepted by ``is_main_verb``, in order."""
    return [tok for tok in doc if is_main_verb(tok)]


def is_indep_verb(verb, main_verbs, doc):
    """Decide whether ``verb`` heads an independent clause.

    Args:
        verb: the token to classify.
        main_verbs: accepted for interface compatibility; not used.
        doc: the tokens of the sentence, scanned for cue words.

    Returns:
        ``True`` when
        * ``verb`` is labelled 'ROOT' or 'conj'; or
        * ``verb`` is a 'ccomp' and either a token 'so' hangs from it, or a
          'so' hangs from a ROOT that follows ``verb``, or a token tagged ':'
          lies strictly between ``verb`` and its ROOT parent; or
        * ``verb`` is an 'advcl' with a child 'so', or a child 'for' labelled
          'mark'.
        ``False`` in every other case.
    """
    dep = verb.dep_

    if dep in ('ROOT', 'conj'):
        return True

    if dep == 'ccomp':
        for token in doc:
            if token.text.lower() == 'so':
                so_parent = right_upper_ancestor(token)
                if so_parent is None:
                    continue
                if so_parent.i == verb.i:
                    return True
                if so_parent.dep_ == 'ROOT' and verb.i < so_parent.i:
                    return True

            elif token.tag_ == ':':
                verb_parent = right_upper_ancestor(verb)
                if verb_parent is None:
                    continue
                # The product is positive exactly when token.i lies strictly
                # between verb.i and verb_parent.i, in either direction.
                if (verb_parent.dep_ == 'ROOT'
                        and (verb_parent.i - token.i) * (token.i - verb.i) > 0):
                    return True

        return False

    if dep == 'advcl':
        for token in doc:
            word = token.text.lower()
            if (word == 'for' and token.dep_ == 'mark') or word == 'so':
                # Look the parent up once instead of three times per token.
                cue_parent = right_upper_ancestor(token)
                if cue_parent is not None and cue_parent.i == verb.i:
                    return True

        return False

    return False
                

def find_indep_verbs(doc):
    """Return the main verbs of ``doc`` that ``is_indep_verb`` accepts.

    The result keeps the order of ``find_main_verbs(doc)``.
    """
    candidates = find_main_verbs(doc)
    return [verb for verb in candidates if is_indep_verb(verb, candidates, doc)]

In [None]:
def sentence_type_ratio(docs):
    """Count how many of ``docs`` fall into each sentence type.

    Each doc is classified from its number of independent verbs
    (``find_indep_verbs``) and dependent verbs (the remaining main verbs):

    * "except"   -- no independent verb
    * "simple"   -- one independent verb, no dependent verb
    * "complex"  -- one independent verb, at least one dependent verb
    * "compound" -- several independent verbs, no dependent verb
    * "cc"       -- several independent verbs, at least one dependent verb

    Returns a dict of raw counts keyed by those labels. Despite the name, the
    counts are not normalised.
    """
    tally = dict.fromkeys(("simple", "complex", "compound", "cc", "except"), 0)

    for doc in docs:
        independent = len(find_indep_verbs(doc))
        dependent = len(find_main_verbs(doc)) - independent
        has_dependent = dependent != 0

        if independent == 0:
            label = "except"
        elif independent == 1:
            label = "complex" if has_dependent else "simple"
        else:
            label = "cc" if has_dependent else "compound"

        tally[label] += 1

    return tally

In [None]:
def sub_clause_per_cunit_all_turns(docs):
    """Return the number of subordinate clauses per C-unit over all ``docs``.

    For each doc, the C-units are its independent verbs
    (``find_indep_verbs``), the clauses are its main verbs
    (``find_main_verbs``), and the subordinate clauses are the difference.
    The totals are summed across every doc before dividing.

    Returns:
        ``total_sub_clauses / total_cunits`` as a float, or ``None`` when
        there are no C-units at all (including empty ``docs``).
    """
    num_cunit = 0
    num_sub_clause = 0

    for doc in docs:
        num_cunit_turn = len(find_indep_verbs(doc))
        num_clause_turn = len(find_main_verbs(doc))

        num_cunit += num_cunit_turn
        num_sub_clause += num_clause_turn - num_cunit_turn

    # Explicit zero check instead of a bare ``except:`` around the division,
    # which would also have swallowed unrelated errors.
    if num_cunit == 0:
        return None

    return num_sub_clause / num_cunit