In [None]:
# installations
!pip install dframcy
!python -m spacy download en_core_web_sm
!pip install networkx

# imports
import spacy
from spacy import displacy
import nltk
import en_core_web_sm
from dframcy import DframCy
import networkx as nx
import nltk 
nltk.download('punkt') 
nltk.download('averaged_perceptron_tagger') 
from nltk import pos_tag, word_tokenize, RegexpParser 

Collecting dframcy
  Downloading dframcy-0.1.6-py3-none-any.whl (13 kB)
Collecting tox-travis
  Downloading tox_travis-0.12-py2.py3-none-any.whl (10 kB)
Collecting spacy>=3.0.0
  Downloading spacy-3.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 33.7 MB/s 
Collecting pytest-cov
  Downloading pytest_cov-3.0.0-py3-none-any.whl (20 kB)
Collecting tox
  Downloading tox-3.24.5-py2.py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 4.9 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 53.8 MB/s 
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.1-py3-none-any.whl (7.0 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K   

In [None]:
# load English spacy model and open text as nlp doc
nlp = spacy.load('en_core_web_sm')
text = 'Mr Wonka himself had suddenly become even more excited than usual, and anyone could see that this was the room he loved best of all. He was hopping about among the saucepans and the machines like a child among his Christmas presents, not knowing which thing to look at first. He lifted the lid from a huge pot and took a sniff; then he rushed over and dipped a finger into a barrel of sticky yellow stuff and had a taste; then he skipped across to one of the machines and turned half a dozen knobs this way and that; then he peered anxiously through the glass door of a gigantic oven, rubbing his hands and cackling with delight at what he saw inside. Then he ran over to another machine, a small shiny affair that kept going phut-phut-phut-phut-phut, and every time it went phut, a large green marble dropped out of it into a basket on the floor.'
doc_text = nlp(text)

In [None]:
# open text in Dframcy module 
dframcy = DframCy(nlp)
doc = dframcy.nlp(text)
# create dataframe with dFramcy features
df = dframcy.to_dataframe(doc)

# display Dframcy dataframe
df

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_
0,Mr,0,2,PROPN,NNP,compound,Wonka,PERSON
1,Wonka,3,8,PROPN,NNP,nsubj,become,PERSON
2,himself,9,16,PRON,PRP,appos,Wonka,
3,had,17,20,AUX,VBD,aux,become,
4,suddenly,21,29,ADV,RB,advmod,become,
...,...,...,...,...,...,...,...,...
175,basket,827,833,NOUN,NN,pobj,into,
176,on,834,836,ADP,IN,prep,basket,
177,the,837,840,DET,DT,det,floor,
178,floor,841,846,NOUN,NN,pobj,on,


In [6]:
# path and pathlength of a target-token from the head. 

def shortestPathSentence(df):
  ''' 
  Opens Dframcy df, adds the following columns:
  :length to head: distance between token and its head in numbers
  :path_to_head: path from token to its head in tokens

  :param df: Dframcy dataframe
  '''
  length_list = []
  path_list = []
  new_list = list(zip(df['token_text'], df['token_head']))
  graph = nx.Graph()
  for token, head in new_list:
    graph.add_edge(token, head)
    length_list.append(nx.shortest_path_length(graph, source=token, target=head))
    path_list.append(nx.shortest_path(graph, source=token, target=head))
  df['length_to_head'] = length_list
  df['path_to_head'] = path_list

shortestPathSentence(df)

In [7]:
# children
def extract_children(doc, df):
  ''' 
  Opens Dframcy df, adds the following columns:
  :children: for every token the children are calculated

  :param df: Dframcy dataframe
  :param doc: NLP doc created from our data
  '''
  children = []

  for token in doc:
    c = [child for child in token.children]
    children.append(c)
  
  df['children'] = children

extract_children(doc, df)

In [8]:
# descendants
def extract_descendants(doc, df):
  ''' 
  Opens Dframcy df, adds the following columns:
  :descendants: for every token the descendants are calculated

  :param df: Dframcy dataframe
  :param doc: NLP doc created from our data
  '''
  descendants = []

  for token in doc:
    l = []
    for descendant in token.subtree:
      l.append(descendant)
    descendants.append(l) 
  
  df['descendants'] = descendants

extract_descendants(doc, df)

In [9]:
# ancestors
def extract_ancestors(doc, df):
  ''' 
  Opens Dframcy df, adds the following columns:
  :ancestors: for every token the ancestors are calculated

  :param df: Dframcy dataframe
  :param doc: NLP doc created from our data
  '''
  ancestors = []

  for token in doc:
    l = []
    for ancestor in token.ancestors:
      l.append(ancestor)
    ancestors.append(l) 
  
  df['ancestors'] = ancestors

extract_ancestors(doc, df)

In [10]:
# ngrams
def extract_ngrams(df):
  ''' 
  Opens Dframcy df, adds the following columns:
  :bigram: for every token the bigrams are calculated
  :trigram: for every token the trigrams are calculated

  :param df: Dframcy dataframe
  '''
  token_list = df['token_text'].tolist()

  next_tokenlist = token_list[1:]
  next_tokenlist.append('<EOD>')

  next_next_tokenlist = token_list[2:]
  next_next_tokenlist.append('<EOD>')
  next_next_tokenlist.append('<EOD>')

  bi_zipped = zip(token_list, next_tokenlist)
  bigrams = list(bi_zipped)

  tri_zipped = zip(token_list, next_tokenlist, next_next_tokenlist)
  trigrams = list(tri_zipped)


  df['bigram'] = bigrams
  df['trigram'] = trigrams

extract_ngrams(df)


In [11]:
# 2 previous and 2 following tokens as features
df['token+1'] = df["token_text"].shift(-1)
df['token+2'] = df["token_text"].shift(-2)
df['token-1'] = df["token_text"].shift(1)
df['token-2'] = df["token_text"].shift(2)

In [12]:
#shows the first 30 rows of the dataframe with all features
df.head(30)

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_,length_to_head,path_to_head,children,descendants,ancestors,bigram,trigram,token+1,token+2,token-1,token-2
0,Mr,0,2,PROPN,NNP,compound,Wonka,PERSON,1,"[Mr, Wonka]",[],[Mr],"[Wonka, become]","(Mr, Wonka)","(Mr, Wonka, himself)",Wonka,himself,,
1,Wonka,3,8,PROPN,NNP,nsubj,become,PERSON,1,"[Wonka, become]","[Mr, himself]","[Mr, Wonka, himself]",[become],"(Wonka, himself)","(Wonka, himself, had)",himself,had,Mr,
2,himself,9,16,PRON,PRP,appos,Wonka,,1,"[himself, Wonka]",[],[himself],"[Wonka, become]","(himself, had)","(himself, had, suddenly)",had,suddenly,Wonka,Mr
3,had,17,20,AUX,VBD,aux,become,,1,"[had, become]",[],[had],[become],"(had, suddenly)","(had, suddenly, become)",suddenly,become,himself,Wonka
4,suddenly,21,29,ADV,RB,advmod,become,,1,"[suddenly, become]",[],[suddenly],[become],"(suddenly, become)","(suddenly, become, even)",become,even,had,himself
5,become,30,36,VERB,VBN,ROOT,become,,0,[become],"[Wonka, had, suddenly, excited, ,, and, see]","[Mr, Wonka, himself, had, suddenly, become, ev...",[],"(become, even)","(become, even, more)",even,more,suddenly,had
6,even,37,41,ADV,RB,advmod,more,,1,"[even, more]",[],[even],"[more, excited, become]","(even, more)","(even, more, excited)",more,excited,become,suddenly
7,more,42,46,ADV,RBR,advmod,excited,,1,"[more, excited]",[even],"[even, more]","[excited, become]","(more, excited)","(more, excited, than)",excited,than,even,become
8,excited,47,54,ADJ,JJ,acomp,become,,1,"[excited, become]","[more, than]","[even, more, excited, than, usual]",[become],"(excited, than)","(excited, than, usual)",than,usual,more,even
9,than,55,59,ADP,IN,prep,excited,,1,"[than, excited]",[usual],"[than, usual]","[excited, become]","(than, usual)","(than, usual, ,)",usual,",",excited,more


In [13]:
# print noun chunks
for chunk in doc.noun_chunks:
    print(chunk.text)

Mr Wonka
himself
anyone
this
the room
he
all
He
the saucepans
the machines
a child
his Christmas presents
which thing
He
the lid
a huge pot
a sniff
he
a finger
a barrel
sticky yellow stuff
a taste
he
the machines
half a dozen knobs
he
the glass door
a gigantic oven
his hands
delight
what
he
he
another machine
a small shiny affair
that
phut-phut-phut-phut-phut
it
a large green marble
it
a basket
the floor


In [14]:
# print syntactic constituents
   
# Find all parts of speech in text
tagged = pos_tag(word_tokenize(text)) 
   
#Extract all parts of speech from any text 
chunker = RegexpParser(""" 
                       NP: {<DT>?<JJ>*<NN>}    #To extract Noun Phrases 
                       P: {<IN>}               #To extract Prepositions 
                       V: {<V.*>}              #To extract Verbs 
                       PP: {<P> <NP>}          #To extract Prepostional Phrases 
                       VP: {<V> <NP|PP>*}      #To extarct Verb Phrases 
                       """) 
  
# Print all parts of speech grouped by constituents 
output = chunker.parse(tagged) 
print(output) 

(S
  Mr/NNP
  Wonka/NNP
  himself/PRP
  (VP (V had/VBD))
  suddenly/RB
  (VP (V become/VBN))
  even/RB
  more/RBR
  excited/JJ
  (P than/IN)
  usual/JJ
  ,/,
  and/CC
  (NP anyone/NN)
  could/MD
  (VP (V see/VB))
  (P that/IN)
  this/DT
  (VP (V was/VBD) (NP the/DT room/NN))
  he/PRP
  (VP (V loved/VBD))
  best/JJS
  (P of/IN)
  all/DT
  ./.
  He/PRP
  (VP (V was/VBD))
  (VP (V hopping/VBG))
  (P about/IN)
  (P among/IN)
  the/DT
  saucepans/NNS
  and/CC
  the/DT
  machines/NNS
  (PP (P like/IN) (NP a/DT child/NN))
  (P among/IN)
  his/PRP$
  Christmas/NNP
  presents/NNS
  ,/,
  not/RB
  (VP (V knowing/VBG))
  which/WDT
  (NP thing/NN)
  to/TO
  (VP (V look/VB))
  (P at/IN)
  first/RB
  ./.
  He/PRP
  (VP
    (V lifted/VBD)
    (NP the/DT lid/NN)
    (PP (P from/IN) (NP a/DT huge/JJ pot/NN)))
  and/CC
  (VP (V took/VBD) (NP a/DT sniff/NN))
  ;/:
  then/RB
  he/PRP
  (VP (V rushed/VBD))
  over/RB
  and/CC
  (VP
    (V dipped/VBD)
    (NP a/DT finger/NN)
    (PP (P into/IN) (NP a/DT barr

In [15]:
# print dependency tree
displacy.render(doc, jupyter=True)