https://markgw.github.io/uh-nlp19/day4/

In [1]:
# NLTK plus the grammar, generation, corpus and chart-parsing helpers used below.
import nltk
from nltk import CFG
from nltk import PCFG
from nltk.grammar import CFG  # NOTE(review): CFG is already imported from nltk above
from nltk.parse.generate import generate
from nltk.corpus import treebank
# Fetch the treebank corpus data; treebank.parsed_sents() is called in a later cell.
nltk.download('treebank')
from nltk.parse.chart import BottomUpChartParser

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [2]:
# Toy CFG for simple transitive sentences such as "Alice saw the duck".
cfg_rules = """
S -> NP VP
NP -> Det N | PropN
Det -> PosPro | Art
VP -> Vt NP

Art -> 'the' | 'a'
PropN -> 'Alice'
N -> 'duck' | 'telescope' | 'park'
Vt -> 'saw'
PosPro -> 'my' | 'her'
"""
cfg = nltk.CFG.fromstring(cfg_rules)

# is_flexible_chomsky_normal_form() returns a bool, not a grammar, so the
# result of the check is kept under its own name.
is_flexible_cnf = cfg.is_flexible_chomsky_normal_form()

# `gram` is what gets handed to BottomUpChartParser, which needs the grammar
# object itself rather than the outcome of the normal-form check.
gram = cfg



In [3]:
# Bottom-up chart parser constructed from `gram`.
parser = BottomUpChartParser(gram)

# Print the first parsed sentence of the NLTK treebank corpus.
first_parse = treebank.parsed_sents()[0]
print(first_parse)


(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


### **Grammar Parsing**

https://www.nltk.org/howto/grammar.html

In [4]:
# Small CFG whose NP and VP rules allow recursive PP attachment
# (NP -> NP PP, VP -> VP PP). The rule text is kept in its own string and
# then parsed into a grammar object.
pp_grammar_rules = """
S -> NP VP
PP -> P NP
NP -> Det N | NP PP
VP -> V NP | VP PP
Det -> 'a' | 'the'
N -> 'dog' | 'cat'
V -> 'chased' | 'sat'
P -> 'on' | 'in'
"""
grammar = CFG.fromstring(pp_grammar_rules)

In [5]:
# Display the grammar's repr, which summarises it by its production count.
grammar

<Grammar with 14 productions>

In [6]:
# Display the grammar's start symbol.
grammar.start()

S

In [7]:
# List the productions; each "|" alternative in the rule text is a separate production.
grammar.productions()

[S -> NP VP,
 PP -> P NP,
 NP -> Det N,
 NP -> NP PP,
 VP -> V NP,
 VP -> VP PP,
 Det -> 'a',
 Det -> 'the',
 N -> 'dog',
 N -> 'cat',
 V -> 'chased',
 V -> 'sat',
 P -> 'on',
 P -> 'in']

Probabilistic CFGs:

In [8]:
# Probabilistic CFG: every alternative carries its probability in square
# brackets, and the alternatives for each left-hand side sum to 1.0.
toy_pcfg1_rules = """
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
"""
toy_pcfg1 = PCFG.fromstring(toy_pcfg1_rules)

In [9]:
# Display the PCFG's repr, which summarises it by its production count.
toy_pcfg1

<Grammar with 17 productions>

# **Chomsky Normal Form grammar (Test for bug 474)**

In [10]:
# One-production grammar whose nonterminal names carry ^<...> annotations.
annotated_rule = "VP^<TOP> -> VBP NP^<VP-TOP>"
g = CFG.fromstring(annotated_rule)

In [11]:
# Left-hand side of the first production; the annotated name is displayed in full.
g.productions()[0].lhs()

VP^<TOP>

In [12]:
# Grammar in which B may expand to the empty-string terminal ''.
# The rule text is written without doctest continuation prompts ("... "):
# with them, the cell only parses because IPython strips such prompts from
# cell input, and the same code raises a ValueError in plain Python.
grammar = CFG.fromstring("""
S -> A B
A -> 'a' | 'c'
# An empty string:
B -> 'b' | ''
""")

In [13]:
# Enumerate the sentences the grammar generates; the '' terminal shows up as its own token.
list(generate(grammar))

[['a', 'b'], ['a', ''], ['c', 'b'], ['c', '']]

In [14]:
# Variant of the previous grammar in which B has two non-empty terminal
# alternatives. The rule text is written without doctest continuation
# prompts ("... "), which only parse when IPython strips them from the cell.
grammar2 = CFG.fromstring("""
S -> A B
A -> 'a' | 'c'
# Two terminal alternatives for B (no empty production):
B -> 'b' | 'c'
""")

In [15]:
# Enumerate the sentences grammar2 generates (one per combination of A and B alternatives).
list(generate(grammar2))

[['a', 'b'], ['a', 'c'], ['c', 'b'], ['c', 'c']]