In [None]:
# By: Thomas Vugia
# ID# 013580942

import spacy, pandas as pd
from spacy import displacy

# Load the small English pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Sample sentences used by every task in this notebook.
sentences = [
    "Leonard Simon Nimoy was born in Boston.",
    "Nimoy played Spock.",
    "Spock is a character in the Star Trek franchise.",
    "Star Trek was created by the great Eugene Wesley Roddenberry."]

# Parse each sentence and keep the parses so that all of them can be visualised.
docs = []
for sent in sentences:
  doc = nlp(sent)
  docs.append(doc)
  print(doc.text)
  for token in doc:
    # Token text, coarse part-of-speech tag, and dependency label.
    print(token.text, token.pos_, token.dep_)

# Render the dependency tree of every sentence. Rendering the loop variable
# `doc` here would only show the last sentence that was parsed.
displacy.render(docs, style="dep")

Leonard Simon Nimoy was born in Boston.
Leonard PROPN compound
Simon PROPN compound
Nimoy PROPN nsubjpass
was AUX auxpass
born VERB ROOT
in ADP prep
Boston PROPN pobj
. PUNCT punct
Nimoy played Spock.
Nimoy NOUN nsubj
played VERB ROOT
Spock PROPN dobj
. PUNCT punct
Spock is a character in the Star Trek franchise.
Spock NOUN nsubj
is AUX ROOT
a DET det
character NOUN attr
in ADP prep
the DET det
Star PROPN compound
Trek PROPN compound
franchise NOUN pobj
. PUNCT punct
Star Trek was created by the great Eugene Wesley Roddenberry.
Star PROPN compound
Trek PROPN nsubjpass
was AUX auxpass
created VERB ROOT
by ADP agent
the DET det
great ADJ amod
Eugene PROPN compound
Wesley PROPN compound
Roddenberry PROPN pobj
. PUNCT punct


In [None]:
# Task 1 - Entity Pair Extraction
# from spacy.symbols import nsubj, pobj, VERB

# Parts of speech that are dropped when building an entity phrase.
_SKIPPED_POS = ("DET", "ADJ", "ADP", "VERB")


def _entity_phrase(token, other_role):
  """Join the words in `token`'s subtree, skipping modifiers and `other_role` tokens."""
  return " ".join(
      word.text for word in token.subtree
      if word.pos_ not in _SKIPPED_POS and other_role not in word.dep_)


def extract_entity_pairs(sent):
  """Extract the head (subject) and tail (object) phrases of a sentence.

  A token counts as a subject when its dependency label contains "subj"
  (e.g. nsubj, nsubjpass) and as an object when it contains "obj"
  (e.g. dobj, pobj). The phrase for a token is built from its subtree with
  determiners, adjectives, adpositions and verbs removed.

  Args:
    sent: Sentence text to parse.

  Returns:
    A ``(subj, obj)`` tuple. Each element is the space-joined phrase of the
    first subject / first object found, or an empty list when the sentence
    has none.
  """
  # Reuse the pipeline loaded at the top of the notebook instead of reloading
  # the model from disk on every call.
  doc = nlp(sent)

  # Collect every candidate phrase. Rebinding a single list to a string (as
  # the earlier version did) made the next `.append` raise AttributeError as
  # soon as a sentence had two subjects or two objects.
  subjects = []
  objects = []
  for token in doc:
    if "subj" in token.dep_:
      subjects.append(_entity_phrase(token, "obj"))
    elif "obj" in token.dep_:
      objects.append(_entity_phrase(token, "subj"))

  # Keep the first subject/object; fall back to [] to preserve the original
  # "nothing found" return value.
  subj = subjects[0] if subjects else []
  obj = objects[0] if objects else []
  return subj, obj

# Show the extracted subject and object for each sample sentence.
for sent in sentences:
  subj, obj = extract_entity_pairs(sent)
  print("Subject: ", subj)
  print("Object: ", obj)

Subject:  [Nimoy]
Object:  [Boston]
Subject:  [Nimoy]
Object:  [Spock]
Subject:  [Spock]
Object:  [franchise]
Subject:  [Trek]
Object:  [Roddenberry]


In [None]:
# Task 2 - Relation Extraction
# from spacy.symbols import nsubj, VERB
from spacy.matcher import Matcher

def extract_relation(sent):
  """Extract the relation (verb phrase) of a sentence with a token Matcher.

  Args:
    sent: Sentence text to parse.

  Returns:
    The text of the last span returned by the matcher, or an empty string
    when none of the patterns match.
  """
  # Reuse the pipeline loaded at the top of the notebook instead of reloading
  # the model from disk on every call.
  doc = nlp(sent)
  matcher = Matcher(nlp.vocab)

  patterns = [
        [ # An auxiliary, an optional determiner, then one or more nouns.
          # This treats copular phrases such as "is a character" as the
          # relation instead of the bare auxiliary.
            {"POS": "AUX"},
            {"POS": "DET", "OP": "?"},
            {"POS": "NOUN", "OP": "+"}
        ],
        [ # A verb, any number of adverbs, then an optional adposition,
          # e.g. "played", "born in", "created by".
            {"POS": "VERB"},
            {"POS": "ADV", "OP": "*"},
            {"POS": "ADP", "OP": "?"} # 'at', 'by', 'in'
        ],
        [ # An auxiliary that is the root of the sentence, e.g. 'be'
            {"POS": "AUX", "DEP": "ROOT"}
        ]
  ]

  # All patterns share one rule name, so register them in a single call.
  matcher.add("RELATION", patterns)

  relations = [doc[start:end].text for _, start, end in matcher(doc)]

  # Report the last match, as before, but return "" instead of raising
  # IndexError when the sentence matches no pattern at all.
  return relations[-1] if relations else ""

# Show the extracted relation for each sample sentence.
for sent in sentences:
  print(extract_relation(sent))

born in
played
is a character
created by


In [None]:
# Task 3 - Triple Extraction

# Combining previous 2 extraction functions into 1
def extract_triple(sent):
  """Build a (head, relation, tail) triple for one sentence.

  The head and tail come from extract_entity_pairs and the relation from
  extract_relation, called in that order on the same sentence text.
  """
  subject_phrase, object_phrase = extract_entity_pairs(sent)
  return (subject_phrase, extract_relation(sent), object_phrase)

# Show the (head, relation, tail) triple for each sample sentence.
for sent in sentences:
  head, relation, tail = extract_triple(sent)
  print(head, relation, tail)

[Nimoy] born in [Boston]
[Nimoy] played [Spock]
[Spock] is a character [franchise]
[Trek] created by [Roddenberry]


In [None]:
# Optional self challenge 1

# Test sentence whose subject carries a long modifier and whose root is an auxiliary.
ambig = "A wonderland full of neverending adventure can be very dull."
# Print each token's text, part-of-speech tag, and dependency label.
doc = nlp(ambig)
for token in doc:
  print(token.text, token.pos_, token.dep_)
# Run the combined extraction on the same sentence and show the resulting triple.
head, relation, tail = extract_triple(ambig)
print("Head: ", head, "\nRelation: ", relation, "\nTail: ", tail)

A DET det
wonderland NOUN nsubj
full ADJ amod
of ADP prep
neverending VERB compound
adventure NOUN pobj
can AUX aux
be AUX ROOT
very ADV advmod
dull ADJ acomp
. PUNCT punct
Head:  [wonderland] 
Relation:  be 
Tail:  [adventure]


In [None]:
# Optional self challenge 2

# Test sentence with an adverb placed before the verb.
ambig = "The beginning only happens after the end."
# Print each token's text, part-of-speech tag, and dependency label.
doc = nlp(ambig)
for token in doc:
  print(token.text, token.pos_, token.dep_)
# Run the combined extraction on the same sentence and show the resulting triple.
head, relation, tail = extract_triple(ambig)
print("Head: ", head, "\nRelation: ", relation, "\nTail: ", tail)

The DET det
beginning NOUN nsubj
only ADV advmod
happens VERB ROOT
after ADP prep
the DET det
end NOUN pobj
. PUNCT punct
Head:  [beginning] 
Relation:  happens after 
Tail:  [end]


In [None]:
# Optional self challenge 3

# Test sentence with a conjunction between the auxiliary and the noun phrase.
ambig = "Life is but a wondrous gift."
# Print each token's text, part-of-speech tag, and dependency label.
doc = nlp(ambig)
for token in doc:
  print(token.text, token.pos_, token.dep_)
# Run the combined extraction on the same sentence and show the resulting triple.
head, relation, tail = extract_triple(ambig)
print("Head: ", head, "\nRelation: ", relation, "\nTail: ", tail)

Life NOUN nsubj
is AUX ROOT
but CCONJ cc
a DET det
wondrous ADJ amod
gift NOUN attr
. PUNCT punct
Head:  [Life] 
Relation:  is 
Tail:  []


In [None]:
# Optional self challenge 4 - unable to resolve handling multiple subjects

# ambig = "The end is just the beginning, and this story must end."
# doc = nlp(ambig)
# for token in doc:
#   print(token.text, token.pos_, token.dep_)
# head, relation, tail = extract_triple(ambig)
# print("Head: ", head, "\nRelation: ", relation, "\nTail: ", tail)

The DET det
end NOUN nsubj
is AUX ROOT
just ADV advmod
the DET det
beginning NOUN attr
, PUNCT punct
and CCONJ cc
this DET det
story NOUN nsubj
must AUX aux
end VERB conj
. PUNCT punct
Head:  [end, story] 
Relation:  end 
Tail:  []


The first thing I did in this assignment was to have spaCy read and parse the given sentences so I could see how each word is classified (its part-of-speech tag and dependency label). This gave me an initial understanding of the tasks.

The second thing I did was read spaCy's documentation to learn how to parse sentences myself and extract the head/subject and tail/object of any given sentence. This was rather difficult because the spaCy documentation mainly provides examples of navigating the tree with .lefts and .rights rather than filtering by dependency labels and POS tags.

The third thing I did was to learn how the Matcher works, what arguments it takes, and how to use its patterns to pick out the verb(s) in each sentence, both by including and by excluding tokens. The need for this was most apparent in the first self-challenge sentence, which exposed several problems that the initial code for this assignment did not address.

Thankfully, adapting the code to handle the self-challenge sentences did not noticeably change the output for the given sentences, which still matches the expected output.