# Dependency Parsing

In [1]:
from notebooks.utils import *
import itertools

In [2]:
train_data = read_jsonl('data/spider/nl2code,output_from=true,fs=2,emb=bert,cvlink/enc/train.jsonl')

In [3]:
train_data_questions = [i['raw_question'] for i in train_data]
train_data_qj = [j['question'] for j in train_data]

In [4]:
raw_question = train_data[10]['raw_question']
question = train_data[10]['question']
question_join = ' '.join(question)

print(f'raw_question: {raw_question}')
print(f'question: {question}')
print(f'question_join: {question_join}')

raw_question: How many acting statuses are there?
question: ['how', 'many', 'acting', 'status', '##es', 'are', 'there', '?']
question_join: how many acting status ##es are there ?


In [20]:
import stanza, spacy

nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')
# doc_stanza = nlp_stanza(question_join)

nlp_spacy = spacy.load("en_core_web_sm")

dp_whole_stanza = []
dp_whole_spacy = []


for qj in train_data_qj:
    
    dp_linkage_stanza, dp_linkage_spacy = {}, {}
    doc_stanza = nlp_stanza(' '.join(qj))
    doc_spacy = nlp_spacy(' '.join(qj))

    words_stanza = doc_stanza.sentences[0].words

    for i in range(len(words_stanza)):
        words_stanza[i].id -= 1 
        words_stanza[i].head -= 1

    words_spacy  = []

    for idx, token in enumerate(doc_spacy):

        words_spacy.append({"id": idx,
                            "text": token.text,
                            "dep": token.dep_,
                            "head": token.head.i})

    # print(f'Original Sentence: {qj}')
    # print(f'')

    for i, j in itertools.product(range(len(words_spacy)), repeat=2):
        if i == j:
            continue
        
        # if head's index is 0 > ROOT Node
        elif words_spacy[i]['head'] == words_spacy[i]['id']:
            continue

        elif words_spacy[i]['head'] == j:
            dp_linkage_spacy[f"{j},{i}"] = 'F'
            dp_linkage_spacy[f"{i},{j}"] = 'B'

    dp_link_spacy = {"dp_link": dp_linkage_spacy}

    for i, j in itertools.product(range(len(words_stanza)), repeat=2):
        if i == j:
            continue
        
        # if head's index is 0 > ROOT Node
        elif words_stanza[i].head == -1:
            continue

        elif words_stanza[i].head == j:
            dp_linkage_stanza[f"{j},{i}"] = 'F'
            dp_linkage_stanza[f"{i},{j}"] = 'B'


    dp_link_stanza = {"dp_link": dp_linkage_stanza}

    print(f'words_spacy:')
    for w in words_spacy:
        print(w)
    print('-'*30)
    print(f'dp_link_spacy: {dp_link_spacy}')
    print('='*30)
    print(f'words_stanza: {words_stanza}')
    print('-'*30)
    print(f'dp_link_stanza: {dp_link_stanza}')
    print('='*30)

    dp_whole_spacy.append(dp_link_spacy)
    dp_whole_stanza.append(dp_link_stanza)

    # print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

2022-10-05 01:56:35 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-10-05 01:56:35 INFO: Use device: gpu
2022-10-05 01:56:35 INFO: Loading: tokenize
2022-10-05 01:56:35 INFO: Loading: pos
2022-10-05 01:56:36 INFO: Loading: lemma
2022-10-05 01:56:36 INFO: Loading: depparse
2022-10-05 01:56:36 INFO: Done loading processors!


words_spacy:
{'id': 0, 'text': 'how', 'dep': 'advmod', 'head': 1}
{'id': 1, 'text': 'many', 'dep': 'amod', 'head': 2}
{'id': 2, 'text': 'heads', 'dep': 'nsubj', 'head': 6}
{'id': 3, 'text': 'of', 'dep': 'prep', 'head': 2}
{'id': 4, 'text': 'the', 'dep': 'det', 'head': 5}
{'id': 5, 'text': 'departments', 'dep': 'pobj', 'head': 3}
{'id': 6, 'text': 'are', 'dep': 'ROOT', 'head': 6}
{'id': 7, 'text': 'older', 'dep': 'acomp', 'head': 6}
{'id': 8, 'text': 'than', 'dep': 'prep', 'head': 7}
{'id': 9, 'text': '56', 'dep': 'pobj', 'head': 8}
{'id': 10, 'text': '?', 'dep': 'punct', 'head': 6}
------------------------------
dp_link_spacy: {'dp_link': {'1,0': 'F', '0,1': 'B', '2,1': 'F', '1,2': 'B', '6,2': 'F', '2,6': 'B', '2,3': 'F', '3,2': 'B', '5,4': 'F', '4,5': 'B', '3,5': 'F', '5,3': 'B', '6,7': 'F', '7,6': 'B', '7,8': 'F', '8,7': 'B', '8,9': 'F', '9,8': 'B', '6,10': 'F', '10,6': 'B'}}
words_stanza: [{
  "id": 0,
  "text": "how",
  "lemma": "how",
  "upos": "ADV",
  "xpos": "WRB",
  "feats": "

KeyboardInterrupt: 

In [56]:
for i,j in zip(dp_whole_stanza, dp_whole_spacy):
    if i != j:
        print(f'stanza: {i}')
        print(f'spacy: {j}')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Spacy Example

In [53]:
import spacy
nlp = spacy.load("en_core_web_sm")

dp_whole_spacy = []

for qj in train_data_qj:
    doc = nlp(' '.join(qj))
    words = []
    dp_linkage = {}

    for idx, token in enumerate(doc):

        words.append({"id": idx,
                      "text": token.text,
                      "dep": token.dep_,
                      "head": token.head.i})
                                                
                                                
    for i, j in itertools.product(range(len(words)), repeat=2):
        if i == j:
            continue

        elif words[i]['head'] == j:
            if i < j:
                dp_linkage[f"{i},{j}"] = 'F'
            elif i > j:
                dp_linkage[f"{i},{j}"] = 'B'

        elif words[j]['head'] == i:
            if j < i:
                dp_linkage[f"{j},{i}"] = 'F'
            elif j > i:
                dp_linkage[f"{j},{i}"] = 'B'

        dp_link = {"dp_link": dp_linkage}

    dp_whole_spacy.append(dp_link)

    

In [51]:
len(dp_whole)

1

In [11]:
import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')
doc = nlp(question_join)

print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

2022-10-04 06:05:16 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-10-04 06:05:16 INFO: Use device: gpu
2022-10-04 06:05:16 INFO: Loading: tokenize
2022-10-04 06:05:17 INFO: Loading: pos
2022-10-04 06:05:17 INFO: Loading: lemma
2022-10-04 06:05:17 INFO: Loading: depparse
2022-10-04 06:05:18 INFO: Done loading processors!


id: 1	word: how	head id: 2	head: many	deprel: advmod
id: 2	word: many	head id: 7	head: es	deprel: amod
id: 3	word: acting	head id: 4	head: status	deprel: compound
id: 4	word: status	head id: 7	head: es	deprel: compound
id: 5	word: #	head id: 7	head: es	deprel: compound
id: 6	word: #	head id: 7	head: es	deprel: compound
id: 7	word: es	head id: 9	head: there	deprel: nsubj
id: 8	word: are	head id: 9	head: there	deprel: cop
id: 9	word: there	head id: 0	head: root	deprel: root
id: 10	word: ?	head id: 9	head: there	deprel: punct
