# Dependency parsing using spaCy

In [1]:
# Importing libraries
import spacy
from spacy import displacy

In [2]:
# Example sentence; the stanza section later in this notebook parses it again
sentence = "Jack flied high within the sea"

# Load spaCy's `en_core_web_sm` English pipeline
nlp = spacy.load("en_core_web_sm")

# Calling the pipeline returns a Doc: the tokens of the sentence together
# with their linguistic features and dependency relationships
doc = nlp(sentence)


In [19]:
# One row per token: the token, its dependency label, its head, and its
# direct dependents (children).
# NOTE: `doc` is reassigned by several later cells, so run this cell right
# after the cell that parses `sentence` to get the table for that sentence.
row_template = "{:<15} | {:<8} | {:<15} | {:<20}"

print(row_template.format('Token', 'Relation', 'Head', 'Children'))
print("-" * 70)
for token in doc:
    children = list(token.children)
    print(row_template.format(token.text, token.dep_, str(token.head), str(children)))

Token           | Relation | Head            | Children            
----------------------------------------------------------------------
Revenue         | nsubj    | exceeded        | []                  
exceeded        | ROOT     | exceeded        | [Revenue, dollars, ,, with, .]
twelve          | compound | billion         | []                  
billion         | nummod   | dollars         | [twelve]            
dollars         | dobj     | exceeded        | [billion]           
,               | punct    | exceeded        | []                  
with            | prep     | exceeded        | [loss]              
a               | det      | loss            | []                  
loss            | pobj     | with            | [a, of]             
of              | prep     | loss            | [1b]                
$               | nmod     | 1b              | []                  
1b              | pobj     | of              | [$]                 
.               | punct    | exceed

In [16]:
# Draw the dependency arcs of `doc` inline in the notebook; the `distance`
# option controls the horizontal spacing between words
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

#### Noun chunks

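Noun chunks are base noun phrases: flat phrases headed by a noun, i.e. the noun plus the words that describe it (for example, "Autonomous cars" or "insurance liability").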

In [5]:
doc = nlp('Autonomous cars shift insurance liability toward manufacturers')

# For every noun chunk print: the chunk text, its root word, the root's
# dependency label, and the text of the head the root attaches to
for noun_chunk in doc.noun_chunks:
    chunk_root = noun_chunk.root
    print(noun_chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [6]:
from spacy.symbols import nsubj, VERB

doc = nlp('Autonomous cars shift insurance liability toward manufacturers')

# Alternative kept for reference -- finding a verb with a subject "from below"
# (start at each subject and look at its head):
#
# verbs = set()
# for possible_subject in doc:
#     if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
#         verbs.add(possible_subject.head)

# Approach used here -- "from above": keep, in document order, every verb
# token that has at least one nominal-subject child
verbs = [
    candidate
    for candidate in doc
    if candidate.pos == VERB
    and any(child.dep == nsubj for child in candidate.children)
]
print(verbs)

[shift]


#### Iterating around the local tree

In [7]:
doc = nlp('car is flying through the sky for a book')

# Inspect the syntactic children on either side of the token at index 2
pivot = doc[2]
left_children = [child.text for child in pivot.lefts]
right_children = [child.text for child in pivot.rights]

print(left_children)
print(right_children)
print(pivot.n_lefts)
print(pivot.n_rights)



['car', 'is']
['through', 'for']
2
2


In [8]:
doc = nlp("Credit and mortgage account holders must submit their requests")

# Tokens that are their own head, i.e. the root of the parse
root_candidates = [tok for tok in doc if tok.head == tok]
root = root_candidates[0]

# First syntactic child to the left of the root
left_of_root = list(root.lefts)
subject = left_of_root[0]
print(subject)
print('-' * 70)

subtree_texts = [tok.text for tok in subject.subtree]
print(subtree_texts)
print('=' * 70)

# Walk the subject's subtree (the subject itself is part of it)
for node in subject.subtree:
    is_below_subject = subject.is_ancestor(node)
    print(is_below_subject)
    print('-' * 70)

    # Every member of the subtree is either the subject or one of its descendants
    assert subject is node or is_below_subject
    ancestor_texts = [ancestor.text for ancestor in node.ancestors]
    print(node.text, node.dep_, node.n_lefts, node.n_rights, ancestor_texts)

holders
----------------------------------------------------------------------
['Credit', 'and', 'mortgage', 'account', 'holders']
True
----------------------------------------------------------------------
Credit nmod 0 2 ['account', 'holders', 'submit']
True
----------------------------------------------------------------------
and cc 0 0 ['Credit', 'account', 'holders', 'submit']
True
----------------------------------------------------------------------
mortgage conj 0 0 ['Credit', 'account', 'holders', 'submit']
True
----------------------------------------------------------------------
account compound 1 0 ['holders', 'submit']
False
----------------------------------------------------------------------
holders nsubj 1 0 ['submit']


In [9]:
# Optional pipeline components (disabled); by their names they merge entities
# and noun chunks -- presumably into single tokens, confirm before enabling
# nlp.add_pipe("merge_entities")
# nlp.add_pipe("merge_noun_chunks")

TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

for doc in nlp.pipe(TEXTS):
    for token in doc:
        # Debug helpers:
        # print(token.text, token.head, token.dep_)
        # print(token.ent_type_)

        # Only tokens tagged as part of a MONEY entity are of interest
        if token.ent_type_ != 'MONEY':
            continue

        if token.dep_ in ('attr', 'dobj'):
            # Attribute or direct object: pair it with the subject found to
            # the left of its head, if there is one
            subjects = [left for left in token.head.lefts if left.dep_ == 'nsubj']
            if subjects:
                print(subjects[0], "---->", token)
        elif token.dep_ == "pobj" and token.head.dep_ == "prep":
            # Object of a preposition: pair it with the word the
            # preposition attaches to
            print(token.head.head, "-->", token)


income ----> million
year --> million
Revenue ----> dollars
loss --> 1b


# Dependency parsing using NLTK (Stanford CoreNLP)

In [10]:
# from nltk.parse.corenlp import CoreNLPDependencyParser

In [11]:
# Path to the unzipped CoreNLP jar
# jar_path = 'stanford-corenlp-4.2.2/stanford-corenlp-4.2.2/stanford-corenlp-4.2.2.jar'

# # Path to corenlp model jar
# models_jar_path = 'stanford-corenlp-4.2.2-models-english.jar'

# sentence = 'Jack flied high within the sea'

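# # Initialize the CoreNLP dependency parser client.
# # NOTE: `url` must be the address of a running CoreNLP server (NLTK's default is
# # http://localhost:9000); the Jupyter notebook URL used below will not work.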
# parser = CoreNLPDependencyParser(url='http://localhost:8888/notebooks/Machine%20learning/NLP/Libraries/Dependecy%20parsing/')

# result = parser.raw_parse(sentence)

# dep = next(result)

# print(list(dep.triples()))

# Dependency parsing using Stanza

In [21]:
import stanza
# stanza.download('en')

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.5.0-py3-none-any.whl (802 kB)
     -------------------------------------- 802.5/802.5 kB 3.0 MB/s eta 0:00:00
Collecting torch>=1.3.0
  Downloading torch-2.0.1-cp39-cp39-win_amd64.whl (172.4 MB)
     -------------------------------------- 172.4/172.4 MB 2.1 MB/s eta 0:00:00
Collecting sympy
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
     ---------------------------------------- 5.7/5.7 MB 1.8 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.12.1-py3-none-any.whl (10 kB)
Collecting networkx
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 1.5 MB/s eta 0:00:00
Collecting mpmath>=0.19
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ------------------------------------ 536.2/536.2 kB 560.7 kB/s eta 0:00:00
Installing collected packages: mpmath, sympy, networkx, filelock, torch, stanza
Successfully installed filelock-3.12.1 mpmath-1.3.0 networkx-3.1 stanza-1.5

In [22]:
# Build the English stanza pipeline with the processors needed up to
# dependency parsing.
# NOTE: this rebinds `nlp`, which held the spaCy pipeline in the cells above.
nlp = stanza.Pipeline(
    'en',
    processors=','.join(['tokenize', 'mwt', 'pos', 'lemma', 'depparse']),
)

2023-06-10 11:20:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 19.7MB/s]
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/tokenize/combined.pt: 100%|█| 647k/647k 
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/pos/combined.pt: 100%|█| 38.5M/38.5M [00
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/lemma/combined.pt: 100%|█| 4.17M/4.17M [
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/depparse/combined.pt: 100%|█| 145M/145M 
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/forward_charlm/1billion.pt: 100%|█| 22.7
Downloading https://huggingface.co/stanfordnlp/stanza-en/res

In [23]:
# Parse `sentence` (the example string assigned in the spaCy section at the
# top of the notebook) and store all the details in doc.
# `nlp` is expected to be the stanza pipeline created in the previous cell.
doc = nlp(sentence)

# Last expression of the cell: display the parsed sentences
doc.sentences

[[
   {
     "id": 1,
     "text": "Jack",
     "lemma": "Jack",
     "upos": "PROPN",
     "xpos": "NNP",
     "feats": "Number=Sing",
     "head": 2,
     "deprel": "nsubj",
     "start_char": 0,
     "end_char": 4
   },
   {
     "id": 2,
     "text": "flied",
     "lemma": "fly",
     "upos": "VERB",
     "xpos": "VBD",
     "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
     "head": 0,
     "deprel": "root",
     "start_char": 5,
     "end_char": 10
   },
   {
     "id": 3,
     "text": "high",
     "lemma": "high",
     "upos": "ADV",
     "xpos": "RB",
     "feats": "Degree=Pos",
     "head": 2,
     "deprel": "advmod",
     "start_char": 11,
     "end_char": 15
   },
   {
     "id": 4,
     "text": "within",
     "lemma": "within",
     "upos": "ADP",
     "xpos": "IN",
     "head": 6,
     "deprel": "case",
     "start_char": 16,
     "end_char": 22
   },
   {
     "id": 5,
     "text": "the",
     "lemma": "the",
     "upos": "DET",
     "xpos": "DT",
     

In [24]:
# Print the dependency information of the first parsed sentence, one line per word
first_sentence = doc.sentences[0]
first_sentence.print_dependencies()

('Jack', 2, 'nsubj')
('flied', 0, 'root')
('high', 2, 'advmod')
('within', 6, 'case')
('the', 6, 'det')
('sea', 2, 'obl')


In [25]:
# Show the raw dictionary form of the first sentence, then a compact
# word | relation | head table.
parsed_sentence = doc.sentences[0]
sent_dict = parsed_sentence.to_dict()
print(sent_dict)

# Iterate over the sentence's syntactic words rather than indexing the
# to_dict() list by position: the pipeline includes the `mwt` processor, and
# multi-word tokens add extra entries without 'head'/'deprel', which would
# break positional head lookup. Word heads are 1-based; 0 marks the root.
for word in parsed_sentence.words:
    head_text = parsed_sentence.words[word.head - 1].text if word.head > 0 else 'ROOT'
    print("{:<15} | {:<10} | {:<15} ".format(str(word.text), str(word.deprel), str(head_text)))

print('*' * 60)



    

[{'id': 1, 'text': 'Jack', 'lemma': 'Jack', 'upos': 'PROPN', 'xpos': 'NNP', 'feats': 'Number=Sing', 'head': 2, 'deprel': 'nsubj', 'start_char': 0, 'end_char': 4}, {'id': 2, 'text': 'flied', 'lemma': 'fly', 'upos': 'VERB', 'xpos': 'VBD', 'feats': 'Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin', 'head': 0, 'deprel': 'root', 'start_char': 5, 'end_char': 10}, {'id': 3, 'text': 'high', 'lemma': 'high', 'upos': 'ADV', 'xpos': 'RB', 'feats': 'Degree=Pos', 'head': 2, 'deprel': 'advmod', 'start_char': 11, 'end_char': 15}, {'id': 4, 'text': 'within', 'lemma': 'within', 'upos': 'ADP', 'xpos': 'IN', 'head': 6, 'deprel': 'case', 'start_char': 16, 'end_char': 22}, {'id': 5, 'text': 'the', 'lemma': 'the', 'upos': 'DET', 'xpos': 'DT', 'feats': 'Definite=Def|PronType=Art', 'head': 6, 'deprel': 'det', 'start_char': 23, 'end_char': 26}, {'id': 6, 'text': 'sea', 'lemma': 'sea', 'upos': 'NOUN', 'xpos': 'NN', 'feats': 'Number=Sing', 'head': 2, 'deprel': 'obl', 'start_char': 27, 'end_char': 30}]
Jack