In [None]:
!pip install treetaggerwrapper
!pip install spacy
!pip install pattern

In [None]:
!pip install stanza


In [None]:
!python -m spacy download en_core_web_lg

In [76]:
import spacy
import en_core_web_lg

# Load the large English spaCy pipeline by its installed package name.
nlp = spacy.load("en_core_web_lg")

In [37]:
import string


def read_text(path_to_text: str) -> str:
    """Return the content of a UTF-8 text file as a single string.

    All newline characters are dropped, so consecutive lines are glued
    together directly without any separator being inserted.
    """
    with open(path_to_text, "r", encoding="utf-8") as handle:
        contents = handle.read()

    # Splitting on newlines and re-joining with nothing removes every newline.
    return "".join(contents.split("\n"))


def read_text_splitted(path_to_text: str) -> list:
    """Read a UTF-8 text file and return its non-trivial lines.

    Args:
        path_to_text: Path of the file to read.

    Returns:
        A list of strings, one per kept line, in file order and with newline
        characters removed. Lines shorter than two characters once the newline
        is removed (blank lines, single stray characters) are skipped.
    """
    with open(path_to_text, "r", encoding="utf-8") as f:
        stripped_lines = (line.replace("\n", "") for line in f)
        # Materialise the list while the file is still open.
        return [line for line in stripped_lines if len(line) >= 2]


def clean_text(text: list) -> tuple:
    """Join sentences and build a punctuation-stripped, contraction-expanded copy.

    Args:
        text: Sequence of sentence strings; they are joined with single spaces.

    Returns:
        A ``(text_full, text_full_clean)`` tuple. ``text_full`` is the plain
        joined text. ``text_full_clean`` is the same text with every ASCII
        punctuation character except ``.`` and ``!`` removed, the right double
        quotation mark removed, and four hard-coded contractions expanded.
    """
    # Built once instead of once per character of the text.
    removable = set(string.punctuation.replace(".", "").replace("!", "") + "”")

    text_full = " ".join(text)
    text_full_clean = "".join(ch for ch in text_full if ch not in removable)

    # The typographic apostrophe is not in string.punctuation, so these
    # contractions survive the filter above and can still be matched here.
    text_full_clean = (
        text_full_clean.replace("That’ll", "That will")
        .replace("Potter’s", "Potter is")
        .replace("Voldy’s", "Voldy has")
        .replace("let’s", "let us")
    )

    return text_full, text_full_clean

In [71]:
# Read the non-trivial lines of the book, then build the raw and the cleaned
# text from its final five lines.
sentences = read_text_splitted("./Harry_en.txt")
last_5_sent_full, last_5_sent_full_clean = clean_text(sentences[-5:])

# Word list of the cleaned text: full stops removed, split on single spaces.
words = last_5_sent_full_clean.replace(".", "").split(" ")

### Pattern Dependency Parsing:
Issue: Pattern seems to recognize only subject (`SBJ`) and object (`OBJ`) relations.

In [10]:
from pattern.en import parse
from pattern.en import pprint

In [63]:
# Run Pattern's parser on the cleaned text, then pretty-print the tagged
# result as a table.
pattern_table = parse(
    last_5_sent_full_clean,
    tokenize=True,     # Split punctuation marks from words?
    tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks=True,       # Parse chunks? (NP, VP, PNP, ...)
    relations=True,    # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata=False,     # Parse lemmata? (ate => eat)
    encoding='utf-8',  # Input string encoding.
    tagset=None,       # Penn Treebank II (default) or UNIVERSAL.
)
pprint(pattern_table)

          WORD   TAG    CHUNK   ROLE   ID     PNP    LEMMA   
                                                             
           The   DT     NP      SBJ    1      -      -       
          bang   NN     NP ^    SBJ    1      -      -       
           was   VBD    VP      -      1      -      -       
          like   IN     PP      -      -      PNP    -       
             a   DT     NP      -      -      PNP    -       
        cannon   NN     NP ^    -      -      PNP    -       
         blast   NN     NP ^    -      -      PNP    -       
           and   CC     -       -      -      -      -       
           the   DT     NP      -      -      -      -       
        golden   JJ     NP ^    -      -      -      -       
        flames   NNS    NP ^    -      -      -      -       
          that   WDT    -       -      -      -      -       
       erupted   VBD    VP      -      -      -      -       
       between   IN     PP      -      -      PNP    -       
        

### Spacy Dependency Parsing

In [78]:
# spaCy labels that are renamed before being stored; every other label is
# stored exactly as spaCy reports it.
# NOTE(review): labels such as `dobj` and `pobj` are passed through unchanged,
# so they can never equal labels like `obj` — confirm this is intended.
SPACY_LABEL_RENAMES = {
    'ROOT': 'root',
    'prep': 'case',
    'auxpass': 'aux:pass',
    'nsubjpass': 'nsubj:pass',
}

doc = nlp(last_5_sent_full_clean)
spc = []
for token in doc:
    raw_label = token.dep_
    spc.append(SPACY_LABEL_RENAMES.get(raw_label, raw_label))
    # The printed label is the raw spaCy one, not the renamed one stored in `spc`.
    print(token.text, raw_label)

The det
previous amod
master nsubjpass
will aux
never neg
have aux
been auxpass
defeated ROOT
. punct
That nsubj
will aux
be ROOT
the det
end attr
of prep
it pobj
. punct
There expl
would aux
be ROOT
time attr
to aux
talk relcl
later amod
hours dobj
and cc
days conj
and cc
maybe advmod
years ROOT
in prep
which pobj
to aux
talk relcl
. punct
We nsubj
did ccomp
it dobj
we nsubj
bashed ROOT
them dobj
wee compound
Potter dobj
is ROOT
the det
one attr
And cc
Voldy nsubj
has aux
gone ROOT
moldy acomp
so advmod
now advmod
let ROOT
us nsubj
have ccomp
fun dobj
! punct
But cc
it nsubj
was ROOT
applause attr
. punct


### Stanza Dependency Parsing


In [None]:
import stanza
stanza.download('en')  # Downloads the English models for the neural pipeline.

# Keep the Stanza pipeline under its own name: rebinding `nlp` here would
# replace the spaCy model loaded earlier, so the spaCy cell would silently run
# Stanza (or fail) if executed after this one.
stanza_nlp = stanza.Pipeline('en')  # Default neural pipeline for English.
doc = stanza_nlp(last_5_sent_full_clean)
# doc.sentences[0].print_dependencies()

In [74]:
# Collect every word and its dependency relation from the Stanza document.
words = []
stanz_rels = []
# Use a separate loop variable: iterating with `for doc in doc.sentences`
# rebinds `doc` to the last sentence, so re-running this cell would fail.
for sentence in doc.sentences:
    for w in sentence.words:
        words.append(w.text)
        stanz_rels.append(w.deprel)
        print(w.text, ":", w.deprel)

The : det
previous : amod
master : nsubj:pass
will : aux
never : advmod
have : aux
been : aux:pass
defeated : root
. : punct
That : nsubj
will : aux
be : cop
the : det
end : root
of : case
it : nmod
. : punct
There : expl
would : aux
be : root
time : nsubj
to : mark
talk : acl
later : amod
hours : obj
and : cc
days : conj
and : cc
maybe : advmod
years : conj
in : case
which : obl
to : mark
talk : acl
. : punct
We : nsubj
did : root
it : obj
we : nsubj
bashed : ccomp
them : obj
wee : mark
Potter : nsubj
is : cop
the : det
one : advcl
And : cc
Voldy : nsubj
has : aux
gone : conj
moldy : xcomp
so : advmod
now : advmod
let : parataxis
us : obj
have : xcomp
fun : obj
! : punct
But : cc
it : nsubj
was : cop
applause : root
. : punct


#### Writing results to file and adding ground truth

In [79]:
import pandas as pd
# One row per token. `ground_truth` starts out as a copy of the Stanza labels.
# NOTE(review): presumably the CSV is corrected by hand afterwards; if so,
# re-running this cell overwrites those manual edits — confirm before re-running.
df = pd.DataFrame(
    {
        "words": words,
        "ground_truth": stanz_rels,
        "stanza_predictions": stanz_rels,
        "spacy_predictions": spc,
    }
)
df.to_csv("./token_dependency_labeled.csv")

### Comparing Results

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
results = pd.read_csv('./token_dependency_labeled.csv')

# Share of tokens whose predicted label equals the ground-truth label (0..1).
stanza_accuracy = np.mean(results.ground_truth == results.stanza_predictions)
spacy_accuracy = np.mean(results.ground_truth == results.spacy_predictions)

# Format as a real percentage; previously the raw 0..1 fraction was printed
# followed by a '%' sign (e.g. "0.90 %" for 90 %).
print("Stanza Accuracy: {:.2%}".format(stanza_accuracy))
print("Spacy Accuracy: {:.2%}".format(spacy_accuracy))

Stanza Accuracy:  0.9047619047619048 %
Spacy Accuracy:  0.6031746031746031 %


#### Stanza Error Analysis:

In [6]:
from collections import Counter
# Ground-truth label of every token that Stanza labelled differently.
errors = [
    gold
    for gold, predicted in zip(results.ground_truth, results.stanza_predictions)
    if gold != predicted
]
errors_counter = Counter(errors)
# Show how often each ground-truth label was missed.
dict(errors_counter)

{'neg': 1, 'obj': 2, 'parataxis': 1, 'relcl': 1, 'root': 1}

#### Spacy Error Analysis

In [7]:
from collections import Counter
# Ground-truth label of every token that spaCy labelled differently.
errors = [
    gold
    for gold, predicted in zip(results.ground_truth, results.spacy_predictions)
    if gold != predicted
]
errors_counter = Counter(errors)
# Show how often each ground-truth label was missed.
dict(errors_counter)

{'acl': 1,
 'advcl': 1,
 'ccomp': 1,
 'conj': 2,
 'cop': 3,
 'mark': 3,
 'nmod': 1,
 'nsubj': 2,
 'obj': 7,
 'parataxis': 1,
 'root': 2,
 'xcomp': 1}