# Grammar dependency notebook
Authors
https://towardsdatascience.com/natural-language-processing-dependency-parsing-cf094bbbe3f7

In [None]:
# !curl https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.2.2.zip -o stanford-corenlp-4.2.2.zip 
# !curl https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.2.2-models-english.jar -o stanford-corenlp-4.2.2-models-english.jar
# Windows (keep the note on its own line: cmd.exe does not treat '#' as a comment):
# !7z x stanford-corenlp-4.2.2.zip
# macOS / Linux:
# !unzip stanford-corenlp-4.2.2.zip

In [None]:
# https://web.stanford.edu/~jurafsky/slp3/14.pdf
# https://stanfordnlp.github.io/CoreNLP/index.html    
from nltk.parse import DependencyGraph,ProjectiveDependencyParser,NonprojectiveDependencyParser
from nltk import word_tokenize
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.corenlp import CoreNLPServer
import tqdm

In [None]:
# Locations of the unzipped CoreNLP jar and of the English models jar,
# relative to the notebook's working directory.
jar_path = 'stanford-corenlp-4.2.2/stanford-corenlp-4.2.2.jar'
models_jar_path = 'stanford-corenlp-4.2.2-models-english.jar'

# Build the Stanford dependency parser from the two jars.
parser = StanfordDependencyParser(path_to_jar=jar_path, path_to_models_jar=models_jar_path)

# Example sentence to parse.
text = 'The monkey is in the tree'

# raw_parse() hands back an iterator of dependency graphs; take the first one
# with the next() builtin instead of calling the __next__ dunder directly.
result = parser.raw_parse(text)
dependency = next(result)

In [None]:
# Tabulate every dependency triple of the parsed sentence.
# dependency.triples() yields items shaped as
# ((head word, head POS), relation, (dependent word, dependent POS)).
triple_row = "{:<15} | {:<10} | {:<10} | {:<15} | {:<10}"

print(triple_row.format('Head', 'Head POS', 'Relation', 'Dependent', 'Dependent POS'))
print("-" * 75)

for (head_word, head_pos), relation, (dep_word, dep_pos) in dependency.triples():
    # str() keeps the column formatting valid even for non-string fields.
    cells = [str(field) for field in (head_word, head_pos, relation, dep_word, dep_pos)]
    print(triple_row.format(*cells))

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Convert the parsed sentence into a networkx graph; its nodes are the
# 1-based token addresses used by the dependency graph.
G = dependency.nx_graph()

# Label each node with the token the parser itself stored at that address.
# Splitting `text` on spaces only lines up with the node ids when the parser's
# tokenization matches whitespace splitting exactly, and it silently breaks
# once another cell rebinds `text` to something other than the parsed sentence.
labels = {address: dependency.nodes[address]['word'] for address in G.nodes}
words = [labels[address] for address in sorted(labels)]  # tokens in sentence order

nx.draw(G, with_labels=True, labels=labels, node_size=2500, node_color='#B5EAD7', font_size=10)

In [None]:
# my contribution

In [None]:
# https://networkx.org/documentation/stable/index.html
    
# Rebuild G as a plain undirected nx.Graph and score every node by
# betweenness centrality.
G = nx.Graph(G)
centrality = nx.betweenness_centrality(G)
print(centrality)

print("-" * 60)
# One row per node: the node's label, then its centrality score.
centrality_row = "{:<10}|{:<10}"
for node, score in centrality.items():
    print(centrality_row.format(labels[node], str(score)))

# Activity

The grammatical complexity of a text can be determined by the degree of dependency between words in the same sentence. Any suggestions on how to calculate a footprint/spectrum of this?

La complejidad gramatical de un texto se puede determinar por el grado de dependencia entre palabras de una misma oración. ¿Alguna sugerencia de cómo poder calcular una huella/un espectro de esto?


Shall we implement it?

In [None]:
import re
import functools

# Read the whole play; the context manager guarantees the file handle is
# closed once the content has been read.
# NOTE: this rebinds `text` to the full file content.
with open('romeojulieta.txt', 'r') as romeojulieta:
    text = romeojulieta.read()

# Split on sentence ends (a period followed by a non-word character) and on
# runs of two or more newlines. The capturing group makes re.split() return
# the separators as well, so they are filtered out right after.
romeojulietaSentences = re.split(r'(\.\W|\n{2,})', text)
# Drop pieces without any word character (empty strings and the separators).
romeojulietaSentences = [piece for piece in romeojulietaSentences if not re.match(r'^\W*$', piece)]
# Drop pieces that consist of a single word.
romeojulietaSentences = [piece for piece in romeojulietaSentences if not re.match(r'^\w+$', piece)]

# Replace every match of the cleanup pattern (runs of non-word characters, or
# line breaks) with one space, then lowercase the sentence.
romeojulietaCleanSentences = [
    re.sub(r'((\W(?!(\w)))+.|\n+)', " ", piece).lower()
    for piece in romeojulietaSentences
]

# Preview the first five cleaned sentences.
for sentence in romeojulietaCleanSentences[0:5]:
    print(sentence)

In [None]:
def getValues(dict):
    """Return the values of the mapping ``dict`` as a list, in iteration order."""
    return [value for _key, value in dict.items()]

def calculateListAverage(list):
    """Return the arithmetic mean of the numbers in ``list``.

    Raises:
        ZeroDivisionError: if ``list`` is empty.
    """
    total = sum(list)
    count = len(list)
    return total / count

def calculateDependencyDegree(text):
    """Return the mean betweenness centrality of the nodes of one parsed sentence.

    The sentence is parsed with the notebook-level ``parser``; the first
    dependency graph it yields is converted to an undirected networkx graph,
    and the betweenness centrality values of all its nodes are averaged.

    Args:
        text: the sentence to parse.

    Returns:
        The average of the per-node betweenness centrality values.

    Raises:
        StopIteration: if the parser yields no graph for ``text``.
        ZeroDivisionError: if the graph has no nodes (raised by
            ``calculateListAverage`` on an empty list).
    """
    parses = parser.raw_parse(text)
    # Take the first parse with the next() builtin rather than calling the
    # __next__ dunder by hand.
    graph = nx.Graph(next(parses).nx_graph())
    centrality = nx.betweenness_centrality(graph)
    return calculateListAverage(getValues(centrality))

In [None]:

# Dependency degree of each of the first 50 cleaned sentences; tqdm reports
# progress while the parser runs.
averages = [
    calculateDependencyDegree(sentence)
    for sentence in tqdm.tqdm(romeojulietaCleanSentences[0:50])
]

# Overall mean across those sentences.
print(calculateListAverage(averages))