# Getting ready

In [1]:
import stanza

_nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")

_AI_FILE = "data/ai.en/ai.en.txt"
_OUTPUT_FILE = "data/ai.en.depparse.txt"

2022-02-01 07:31:55 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-02-01 07:31:55 INFO: Use device: cpu
2022-02-01 07:31:55 INFO: Loading: tokenize
2022-02-01 07:31:55 INFO: Loading: pos
2022-02-01 07:31:56 INFO: Loading: lemma
2022-02-01 07:31:56 INFO: Loading: depparse
2022-02-01 07:31:57 INFO: Done loading processors!


# 40. Read the parse result (words)

In [2]:
class Word:
    def __init__(self, word: dict) -> None:
        self.id = int(word.id) - 1
        self.text = word.text.lower()
        self.pos = word.xpos
        self.lemma = word.lemma.lower()
        self.head = int(word.head) - 1
        self.dep = word.deprel
        self.children = []
            
    def __str__(self) -> str:
        children = [child.text for child in self.children]
        return f"id: {self.id}\n" \
               f"text: {self.text}\nlemma: {self.lemma}\npos: {self.pos}\n"\
               f"head: {self.head}\ndep: {self.dep}\nchildren: {children}\n"
    
    def add_child(self, word: object) -> None:
        self.children.append(word)

In [3]:
def _read_data(file_object, CHUNK_SIZE: int = -1) -> str:
    while True:
        data = file_object.read(CHUNK_SIZE)
        if not data:
            break
        yield data

_sentences = []

with open(_AI_FILE) as _input_file, open(_OUTPUT_FILE, "w") as _output_file:
    for data in _read_data(_input_file):
        doc = _nlp(data)
        for sentence in doc.sentences:
            current_sent = []
            for word in sentence.words:
                _output_file.write(f"{sentence.id} {word}\n")
                current_sent.append(Word(word))
            _sentences.append(current_sent)

In [4]:
for word in _sentences[0]:
    print(f"{word}\n")

id: 0
text: in
lemma: in
pos: IN
head: 2
dep: case
children: []


id: 1
text: computer
lemma: computer
pos: NN
head: 2
dep: compound
children: []


id: 2
text: science
lemma: science
pos: NN
head: 16
dep: obl
children: []


id: 3
text: ,
lemma: ,
pos: ,
head: 16
dep: punct
children: []


id: 4
text: artificial
lemma: artificial
pos: JJ
head: 5
dep: amod
children: []


id: 5
text: intelligence
lemma: intelligence
pos: NN
head: 16
dep: nsubj
children: []


id: 6
text: (
lemma: (
pos: -LRB-
head: 7
dep: punct
children: []


id: 7
text: ai
lemma: ai
pos: NN
head: 5
dep: appos
children: []


id: 8
text: )
lemma: )
pos: -RRB-
head: 7
dep: punct
children: []


id: 9
text: ,
lemma: ,
pos: ,
head: 11
dep: punct
children: []


id: 10
text: sometimes
lemma: sometimes
pos: RB
head: 11
dep: advmod
children: []


id: 11
text: called
lemma: call
pos: VBN
head: 5
dep: acl
children: []


id: 12
text: machine
lemma: machine
pos: NN
head: 13
dep: compound
children: []


id: 13
text: intelligence
lemma: i

# 41. Read the parse result (dependency)

In [5]:
for sentence in _sentences:
    for word in sentence:
        if word.head > -1:
            sentence[word.head].add_child(word)

In [None]:
def _show_sentence_pairs(sentence: list, sentence_number: int) -> None:
    for word in sentence:
        children = [child.lemma for child in word.children]
        print(f"SENTENCE N°{sentence_number} WORD {word.lemma} --> {children}\n")
        
_show_sentence_pairs(_sentences[0], 0)

# 42. Show root words

In [None]:
_root_words = []
for sentence in _sentences:
    index = 0
    while sentence[index].head != -1:
        index += 1
    _root_words.append(sentence[index])

for word in _root_words:
    print(word)

# 43. Show verb governors and noun dependents

In [None]:
def _show_verb_noun_pairs(sentence: list, sentence_number: int) -> None:
    for word in sentence:
        head_word = sentence[word.head]
        if head_word.pos.startswith("V") and word.pos.startswith("N"):
            print(f"SENTENCE N°{sentence_number} head: {head_word.lemma} --> noun: {word.lemma}\n")

for index in range(0, len(_sentences)):
    _show_verb_noun_pairs(_sentences[index], index)    

# 44. Visualize dependency trees

In [None]:
import pydot

def _get_correct_syntax(word: Word) -> str:
    return word.text.join(["\"", "\""])

def _draw_dependency_tree(FILE_NAME: str, sentence: list) -> str:
    graph = pydot.Dot(FILE_NAME, graph_type="graph")
    
    for word in sentence:
        if word.dep != "punct":
            head = _get_correct_syntax(sentence[word.head])
            tail = _get_correct_syntax(word)
            graph.add_node(pydot.Node(head, xlabel=sentence[word.head].pos))
            graph.add_node(pydot.Node(tail, xlabel=word.pos))
            graph.add_edge(pydot.Edge(head, tail, label=word.dep))

    graph.write_raw("".join([FILE_NAME, ".dot"]))
    return graph.to_string()

# 45. Triple with subject, verb, and direct object

In [6]:
def _find_tuple_element(word: Word, component: list) -> list:
    return component

In [7]:
def _find_tuple(tuples: list, subject_type: str, object_type: str) -> None:
    for sentence in _sentences:
        for word in sentence:
            if word.pos == "VBD":
                _subject_flag = False
                _object_flag = False
                for child in word.children:
                    if child.pos.startswith("N") and child.dep.startswith(subject_type):
                        _subject_flag = True
                        _subject = " ".join(_find_tuple_element(child, [child.text]))
                    elif child.pos.startswith("N") and child.dep.startswith(object_type):
                        _object_flag = True
                        _object = " ".join(_find_tuple_element(child, [child.text]))
                if _subject_flag and _object_flag:
                    tuples.append((_subject, word.text, _object))

In [8]:
_tuples = []
_find_tuple(_tuples, "nsubj", "obj")
_tuples

[('governments', 'cut', 'research'),
 ('project', 'inspired', 'governments'),
 ('development', 'enabled', 'development'),
 ('match', 'defeated', 'champions'),
 ('alphago', 'won', 'match'),
 ('china', 'accelerated', 'funding'),
 ('councilmen', 'refused', 'permit'),
 ('researchers', 'developed', 'algorithms'),
 ('deepmind', 'developed', 'intelligence'),
 ('number', 'explored', 'connection'),
 ('one', 'developed', 'style'),
 ('john', 'named', 'approaches'),
 ('economist', 'studied', 'skills'),
 ('work', 'laid', 'foundations'),
 ('team', 'used', 'results'),
 ('people', 'used', 'algorithms'),
 ('roger', 'described', 'approaches'),
 ('researchers', 'rejected', 'ai'),
 ('work', 'revived', 'point'),
 ('researchers', 'adopted', 'tools'),
 ('language', 'permitted', 'level'),
 ('frank', 'invented', 'perceptron'),
 ('publication', 'introduced', 'way'),
 ('yann', 'applied', 'backpropagation'),
 ('recognition', 'experienced', 'jump'),
 ('google', 'used', 'lstm'),
 ('lstm', 'improved', 'captioning'),

# 46. Expanding subjects and objects

In [9]:
def _find_tuple_element(word: Word, component: list) -> list:
    for child in word.children:
        if child.dep in ("compound", "flat", "amod"):
            component.append(child.text)
            _find_tuple_element(child, component)
    return component

In [10]:
_tuples = []
_find_tuple(_tuples, "nsubj", "obj")
_tuples

[('governments u.s.', 'cut', 'research exploratory'),
 ('project generation fifth computer', 'inspired', 'governments u.s'),
 ('development', 'enabled', 'development'),
 ('match jeopardy quiz show exhibition',
  'defeated',
  'champions greatest jeopardy'),
 ('alphago', 'won', 'match game'),
 ('china', 'accelerated', 'funding government'),
 ('councilmen city', 'refused', 'permit'),
 ('researchers early', 'developed', 'algorithms'),
 ('deepmind', 'developed', 'intelligence generalized artificial'),
 ('number', 'explored', 'connection'),
 ('one', 'developed', 'style own'),
 ('john haugeland', 'named', 'approaches symbolic'),
 ('economist herbert simon', 'studied', 'skills human problem- solving'),
 ('work', 'laid', 'foundations'),
 ('team research', 'used', 'results'),
 ('people', 'used', 'algorithms same'),
 ('roger schank', 'described', 'approaches anti-logic'),
 ('researchers', 'rejected', 'ai symbolic'),
 ('work', 'revived', 'point non-symbolic'),
 ('researchers', 'adopted', 'tools s

# 47. Triple from the passive sentence

In [11]:
def _find_object_element(word: Word, component: list) -> list:
    for child in word.children:
        if child.pos == "IN":
            return component, child.text
        if child.dep in ("compound", "flat", "obl", "nmod"):
            component.append(child.text)
            _find_object_element(child, component)
    return component, ""
                    
def _find_tuple(tuples: list, subject_type: str, object_type: str) -> None:
    for sentence in _sentences:
        for word in sentence:
            if word.pos == "VBD":
                _subject_flag = False
                _object_flag = False
                for child in word.children:
                    if child.dep.startswith(subject_type):
                        _subject_flag = True
                        _subject = " ".join(_find_tuple_element(child, [child.text]))
                    elif child.dep.startswith(object_type):
                        _object_flag = True
                        result, prep = _find_object_element(child, [child.text])
                        _object = " ".join(result)
                    if _subject_flag and _object_flag:
                        tuples.append((_subject, "-".join([word.text, prep]), _object))
                        _object_flag = False

In [12]:
_tuples = []
_find_tuple(_tuples, "nsubj", "obl")
_tuples

[('beings thought capable artificial', 'appeared-as', 'devices'),
 ('study', 'began-with', 'philosophers'),
 ('study', 'began-in', 'antiquity'),
 ('study', 'led-to', 'theory'),
 ('this', 'led-along', 'discoveries'),
 ('press', 'described-as', 'astonishing'),
 ('governments u.s.', 'cut-in', 'response'),
 ('governments u.s.', 'cut-in', 'ai'),
 ('project generation fifth computer', 'inspired-at', 'time'),
 ('development', 'enabled-in', 'form'),
 ('match jeopardy quiz show exhibition', 'defeated-in', '2011'),
 ('match jeopardy quiz show exhibition', 'defeated-by', 'margin'),
 ('that', 'emerged-from', 'research'),
 ('alphago', 'won-in', '2016'),
 ('alphago', 'won-out', 'games'),
 ('alphago', 'won-in', 'future'),
 ('who', 'held-at', 'time'),
 ('who', 'held-for', 'years'),
 ('one', 'reported-in', 'survey'),
 ('china', 'accelerated-around', '2016'),
 ('that', 'worked-in', 'past'),
 ('sun', 'rose-', 'morning'),
 ('sun', 'rose-for', 'days'),
 ('who', 'stated-in', '1988'),
 ('number', 'explored-i

# 48. Extract paths from the root to nouns

In [16]:
def _find_node(word: Word, component: list, paths: list) -> None:
    for child in word.children:
        new_tab = component.copy()
        if child.pos.startswith("N"):
            new_tab.append(child.text)
            paths.append(" -> ".join(new_tab))
        _find_node(child, new_tab, paths)

_paths = []
for sentence in _sentences:
    for word in sentence:
        if word.head == -1:
            path = [word.text]
            _find_node(word, path, _paths)
            
for path in _paths:
    print(path)

intelligence -> science
intelligence -> science -> computer
intelligence -> intelligence
intelligence -> intelligence -> ai
intelligence -> intelligence -> intelligence
intelligence -> intelligence -> intelligence -> machine
intelligence -> machines
intelligence -> contrast
intelligence -> contrast -> intelligence
intelligence -> contrast -> intelligence -> humans
intelligence -> contrast -> intelligence -> humans -> animals
define -> textbooks
define -> field
define -> study
define -> study -> agents
define -> study -> agents -> device
define -> study -> agents -> device -> environment
define -> study -> agents -> device -> actions
define -> study -> agents -> device -> actions -> chance
define -> study -> agents -> device -> actions -> chance -> goals
used -> term
used -> term -> intelligence
used -> machines
used -> machines -> computers
used -> machines -> functions
used -> machines -> functions -> humans
used -> machines -> functions -> mind
used -> machines -> functions -> mind -

combine -> accuracy -> page -> level -> paragraph
combine -> understanding
combine -> understanding -> sentences
scales -> difficulties
scales -> difficulties -> knowledge
scales -> nlp
scales -> applications
scales -> applications -> business
is -> goal
is -> goal -> nlp
is -> nlp
is -> understanding
is -> understanding -> reasoning
ability -> perception
ability -> perception -> machine
ability -> input
ability -> sensors
ability -> sensors -> cameras
ability -> sensors -> cameras -> spectrum
ability -> sensors -> cameras -> microphones
ability -> sensors -> cameras -> signals
ability -> sensors -> cameras -> lidar
ability -> sensors -> cameras -> lidar -> sonar
ability -> sensors -> cameras -> radar
ability -> sensors -> cameras -> sensors
ability -> aspects
ability -> aspects -> world
include -> applications
include -> recognition
include -> recognition -> speech
include -> recognition -> recognition
include -> recognition -> recognition
include -> recognition -> recognition -> obje

includes -> ai
includes -> ai -> behavior
rejected -> researchers
rejected -> researchers -> field
rejected -> researchers -> field -> robotics
rejected -> researchers -> field -> rodney
rejected -> researchers -> field -> rodney -> brooks
rejected -> ai
rejected -> problems
rejected -> problems -> engineering
rejected -> problems -> robots
revived -> work
revived -> point
revived -> point -> view
revived -> point -> view -> researchers
revived -> point -> view -> researchers -> cybernetics
revived -> point -> view -> researchers -> 1950s
revived -> use
revived -> use -> theory
revived -> use -> theory -> control
revived -> ai
coincided -> development
coincided -> development -> thesis
coincided -> development -> thesis -> mind
coincided -> development -> thesis -> field
coincided -> development -> thesis -> field -> science
coincided -> idea
coincided -> idea -> aspects
coincided -> idea -> aspects -> body
coincided -> idea -> aspects -> body -> movement
coincided -> idea -> aspects -

considered -> game -> interpretation
considered -> game -> interpretation -> test
considered -> game -> interpretation -> test -> turing
considered -> game -> interpretation -> test -> computer
considered -> benchmark
test -> derivative
test -> derivative -> test
test -> derivative -> test -> turing
test -> automated
test -> turing
test -> turing -> public
test -> computers
test -> computers -> humans
test -> computers -> captcha
helps -> name
helps -> person
helps -> person -> user
helps -> person -> computer
helps -> person -> computer -> human
administered -> contrast
administered -> contrast -> test
administered -> contrast -> test -> turing
administered -> captcha
administered -> machine
administered -> machine
asks -> computer
asks -> user
asks -> test
asks -> grade
asks -> grade -> test
unable -> computers
unable -> problem
unable -> solutions
unable -> result
unable -> result -> person
unable -> result -> person -> test
test -> type
test -> type -> captcha
test -> typing
test -

led -> concern
led -> concern -> risk
led -> concern -> risk -> intelligence
led -> donations
led -> donations -> profile
led -> donations -> investments
committed -> group
committed -> group -> titans
committed -> group -> titans -> tech
committed -> group -> titans -> peter
committed -> group -> titans -> peter -> thiel
committed -> group -> titans -> peter -> services
committed -> group -> titans -> peter -> services -> amazon
committed -> group -> titans -> peter -> services -> web
committed -> group -> titans -> peter -> musk
committed -> openai
committed -> openai -> company
committed -> openai -> company -> nonprofit
committed -> openai -> company -> development
committed -> openai -> company -> development -> ai
mixed -> opinion
mixed -> opinion -> experts
mixed -> opinion -> experts -> field
mixed -> opinion -> experts -> field -> intelligence
mixed -> fractions
mixed -> fractions -> risk
mixed -> fractions -> risk -> ai
believe -> leaders
believe -> leaders -> industry
believ

# 49. Extract the shortest path between two nouns

In [14]:
import networkx as nx

def _add_node(word: Word, g: object) -> None:
    if word.pos.startswith("N"):
        g.add_node(word.id)
        for child in word.children:
            if child.pos.startswith("N"):
                g.add_edge(word.id, child.id)
            _add_node(child, g)

for sentence in _sentences[:5]:
    graph = nx.Graph()
    for word in sentence:
        _add_node(word, graph)
    paths = dict(nx.all_pairs_shortest_path(graph))
    for begin_node in paths:
        for end_node in paths[begin_node]:
            path_length = len(paths[begin_node][end_node])
            for index in range(0, path_length-1):
                path_node = paths[begin_node][end_node][index]
                next_node = paths[begin_node][end_node][index+1]
                if path_node == begin_node:
                    print("X <- ", end='') 
                elif path_node > next_node:
                    print(f"{sentence[path_node].text} -> ", end='')
                elif path_node < next_node:
                    print(f"{sentence[path_node].text} <- ", end='')
            if path_length > 1:
                print("Y")
    

X <- Y
X <- science <- Y
X <- science <- intelligence -> Y
X <- science <- intelligence -> intelligence <- Y
X <- Y
X <- Y
X <- intelligence -> Y
X <- intelligence -> intelligence <- Y
X <- Y
X <- Y
X <- intelligence -> Y
X <- intelligence -> science -> Y
X <- Y
X <- intelligence <- Y
X <- intelligence <- intelligence -> Y
X <- intelligence <- intelligence -> science -> Y
X <- Y
X <- Y
X <- Y
X <- Y
X <- science -> Y
X <- intelligence <- Y
X <- Y
X <- Y
X <- Y
X <- Y
X <- Y
X <- agents <- Y
X <- Y
X <- Y
X <- Y
X <- agents -> Y
X <- Y
X <- Y
X <- Y
X <- Y
X <- Y
X <- learning <- Y
X <- learning <- solving -> Y
X <- Y
X <- Y
X <- solving -> Y
X <- Y
X <- Y
X <- learning -> Y
X <- Y
X <- solving -> Y
X <- solving -> learning -> Y
X <- Y
X <- Y
X <- Y
X <- definition <- Y
X <- Y
X <- definition <- Y
X <- Y
X <- Y
X <- Y
X <- Y
