In [1]:
import ast
import pickle
import subprocess
import sys

from nltk import Nonterminal, nonterminals, Production, CFG

In [2]:
# Two sample nonterminal symbols used by the equality checks below.
nt1, nt2 = Nonterminal('NP'), Nonterminal('VP')

In [3]:
# symbol() gives back the label wrapped by the nonterminal; for nt1 the recorded output is 'NP'.
nt1.symbol()

'NP'

In [6]:
# A freshly created Nonterminal('NP') compares equal to nt1 (recorded output: True).
nt1 == Nonterminal('NP')

True

In [7]:
# Nonterminals built from different symbols ('NP' vs 'VP') compare unequal (recorded output: False).
nt1 == nt2

False

Вводим нетерминальные символы — компоненты правил:

In [8]:
# nonterminals() takes one comma-separated string of symbol names and its
# result is unpacked into one variable per name.
S, NP, VP, PP = nonterminals('S, NP, VP, PP')
N, V, P, DT = nonterminals('N, V, P, DT')

Создаём правила, указывая левую и правую части:

In [9]:
# Two productions: S -> NP VP and NP -> DT NP.
prod1 = Production(lhs=S, rhs=[NP, VP])
prod2 = Production(lhs=NP, rhs=[DT, NP])

Запрашиваем части правила (левую и правую):

In [10]:
prod1.lhs() # left-hand side of the rule

S

In [11]:
prod1.rhs() # right-hand side of the rule

(NP, VP)

In [12]:
# A production rebuilt from the same lhs and rhs compares equal to prod1 (recorded output: True).
prod1 == Production(S, [NP, VP])

True

In [13]:
# prod1 (S -> NP VP) and prod2 (NP -> DT NP) are different rules (recorded output: False).
prod1 == prod2

False

In [14]:
# Toy context-free grammar used by the recursive-descent and shift-reduce
# parser examples below.
# The doctest continuation prompts ("... ") that had been pasted into the
# literal are removed: they only worked because IPython strips such prompts
# from cell input; as plain Python they would become part of the grammar text.
grammar = CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> 'the' N | N PP | 'the' N PP
VP -> V NP | V PP | V NP PP
N -> 'cat'
N -> 'dog'
N -> 'rug'
V -> 'chased'
V -> 'sat'
P -> 'in'
P -> 'on'
""")

In [15]:
# Source of a small script that is run in a separate interpreter: it builds a
# Production and prints the repr of its pickled bytes.
# The pasted doctest prompts ("... ") are removed so the string is valid
# Python on its own rather than relying on IPython stripping them from the cell.
cmd = """import pickle
from nltk import Production
p = Production('S', ['NP', 'VP'])
print(pickle.dumps(p))
"""

In [16]:
# Start a subprocess to simulate pickling in another process.
# check=True makes a failing child raise CalledProcessError instead of
# handing empty/garbage output to the parsing step below.
proc = subprocess.run(
    [sys.executable, '-c', cmd],
    stdout=subprocess.PIPE,
    check=True,
)
# The child prints the repr of a bytes object (b'...'). ast.literal_eval parses
# that literal without executing arbitrary expressions the way eval() would.
# Unpickling is acceptable here only because the payload comes from our own
# subprocess; never do this with untrusted data.
p1 = pickle.loads(ast.literal_eval(proc.stdout.decode().strip()))
p2 = Production('S', ['NP', 'VP'])
# Compare the hash of the production unpickled from the other process with
# the hash of an equal production built locally.
print(hash(p1) == hash(p2))

True


In [20]:
# Recursive-descent method (top-down parsing).
from nltk.parse import RecursiveDescentParser
rd = RecursiveDescentParser(grammar)

In [21]:
# Whitespace-tokenised test sentences: a short one and one with a trailing PP.
sentence1, sentence2 = (
    text.split()
    for text in (
        'the cat chased the dog',
        'the cat chased the dog on the rug',
    )
)

In [22]:
# Print every tree the recursive-descent parser yields for sentence1.
for tree in rd.parse(sentence1):
    print(tree)

(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))


In [23]:
# Two trees are printed for sentence2: the PP "on the rug" attaches either
# inside the NP "the dog" or directly under the VP.
for tree in rd.parse(sentence2):
    print(tree)

(S
  (NP the (N cat))
  (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug))))))
(S
  (NP the (N cat))
  (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug)))))


In [24]:
# Shift-reduce parser built over the same `grammar` object.
from nltk.parse import ShiftReduceParser
sr = ShiftReduceParser(grammar)

In [25]:
# Same two test sentences as in the recursive-descent example, re-created here.
sentence1, sentence2 = (
    text.split()
    for text in (
        'the cat chased the dog',
        'the cat chased the dog on the rug',
    )
)

In [26]:
# Print every tree the shift-reduce parser yields for sentence1.
for tree in sr.parse(sentence1):
    print(tree)

(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))


In [27]:
# Nothing is printed here in the recorded run: the shift-reduce parser yields
# no tree for sentence2.
# NOTE(review): presumably it committed to a shift/reduce choice it cannot
# undo and hit a dead end on this ambiguous sentence — confirm.
for tree in sr.parse(sentence2):
    print(tree)

In [28]:
import nltk

In [29]:
# Chart-parser demo with tracing on; choice 2 is reported as the "Bottom-up"
# strategy in the recorded output.
nltk.parse.chart.demo(
    2,
    print_times=False,
    trace=1,
    sent='I saw a dog',
    numparses=1,
)

* Sentence:
I saw a dog
['I', 'saw', 'a', 'dog']

* Strategy: Bottom-up

|.    I    .   saw   .    a    .   dog   .|
|[---------]         .         .         .| [0:1] 'I'
|.         [---------]         .         .| [1:2] 'saw'
|.         .         [---------]         .| [2:3] 'a'
|.         .         .         [---------]| [3:4] 'dog'
|>         .         .         .         .| [0:0] NP -> * 'I'
|[---------]         .         .         .| [0:1] NP -> 'I' *
|>         .         .         .         .| [0:0] S  -> * NP VP
|>         .         .         .         .| [0:0] NP -> * NP PP
|[--------->         .         .         .| [0:1] S  -> NP * VP
|[--------->         .         .         .| [0:1] NP -> NP * PP
|.         >         .         .         .| [1:1] Verb -> * 'saw'
|.         [---------]         .         .| [1:2] Verb -> 'saw' *
|.         >         .         .         .| [1:1] VP -> * Verb NP
|.         >         .         .         .| [1:1] VP -> * Verb
|.         [--------->

In [30]:
# Choice 1 is reported as the "Top-down" strategy in the recorded output
# (48 chart edges, two parses).
nltk.parse.chart.demo(
    1,
    print_times=False,
    trace=0,
    sent='I saw John with a dog',
    numparses=2,
)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Top-down

Nr edges in chart: 48
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))



In [31]:
# Choice 2 is reported as the "Bottom-up" strategy in the recorded output
# (53 chart edges, two parses).
nltk.parse.chart.demo(
    2,
    print_times=False,
    trace=0,
    sent='I saw John with a dog',
    numparses=2,
)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Bottom-up

Nr edges in chart: 53
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))



In [32]:
# Choice 3 is reported as the "Bottom-up left-corner" strategy in the recorded
# output (36 chart edges, two parses).
nltk.parse.chart.demo(
    3,
    print_times=False,
    trace=0,
    sent='I saw John with a dog',
    numparses=2,
)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Bottom-up left-corner

Nr edges in chart: 36
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))



In [33]:
# Choice 4 is reported as the "Filtered left-corner" strategy in the recorded
# output (28 chart edges, two parses).
nltk.parse.chart.demo(
    4,
    print_times=False,
    trace=0,
    sent='I saw John with a dog',
    numparses=2,
)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Filtered left-corner

Nr edges in chart: 28
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))



In [34]:
# Choice 5 is reported as "Stepping (top-down vs bottom-up)" in the recorded
# output; tracing is enabled so each chart edge is printed.
nltk.parse.chart.demo(
    5,
    print_times=False,
    trace=1,
    sent='I saw John with a dog',
    numparses=2,
)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Stepping (top-down vs bottom-up)

*** SWITCH TO TOP DOWN
|[------]      .      .      .      .      .| [0:1] 'I'
|.      [------]      .      .      .      .| [1:2] 'saw'
|.      .      [------]      .      .      .| [2:3] 'John'
|.      .      .      [------]      .      .| [3:4] 'with'
|.      .      .      .      [------]      .| [4:5] 'a'
|.      .      .      .      .      [------]| [5:6] 'dog'
|>      .      .      .      .      .      .| [0:0] S  -> * NP VP
|>      .      .      .      .      .      .| [0:0] NP -> * NP PP
|>      .      .      .      .      .      .| [0:0] NP -> * Det Noun
|>      .      .      .      .      .      .| [0:0] NP -> * 'I'
|[------]      .      .      .      .      .| [0:1] NP -> 'I' *
|[------>      .      .      .      .      .| [0:1] S  -> NP * VP
|[------>      .      .      .      .      .| [0:1] NP -> NP * PP
|.      >      .      .      .      .      .| [1

In [35]:
# Earley chart-parser demo on the same sentence, with edge tracing enabled.
nltk.parse.earleychart.demo(
    print_times=False,
    trace=1,
    sent='I saw John with a dog',
    numparses=2,
)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

|.  I   . saw  . John . with .  a   . dog  .|
|[------]      .      .      .      .      .| [0:1] 'I'
|.      [------]      .      .      .      .| [1:2] 'saw'
|.      .      [------]      .      .      .| [2:3] 'John'
|.      .      .      [------]      .      .| [3:4] 'with'
|.      .      .      .      [------]      .| [4:5] 'a'
|.      .      .      .      .      [------]| [5:6] 'dog'
|>      .      .      .      .      .      .| [0:0] S  -> * NP VP
|>      .      .      .      .      .      .| [0:0] NP -> * NP PP
|>      .      .      .      .      .      .| [0:0] NP -> * Det Noun
|>      .      .      .      .      .      .| [0:0] NP -> * 'I'
|[------]      .      .      .      .      .| [0:1] NP -> 'I' *
|[------>      .      .      .      .      .| [0:1] S  -> NP * VP
|[------>      .      .      .      .      .| [0:1] NP -> NP * PP
|.      >      .      .      .      .      .| [1:1] VP -> * VP PP
|.   

In [36]:
from nltk.corpus import treebank
from itertools import islice
from nltk.grammar import PCFG, induce_pcfg
# First toy probabilistic grammar. The bracketed number after each expansion
# is that expansion's probability; alternatives for one left-hand side are
# separated by "|".
# NOTE(review): toy_pcfg1 is not referenced again in the visible cells.
toy_pcfg1 = PCFG.fromstring("""
     S -> NP VP [1.0]
     NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
     Det -> 'the' [0.8] | 'my' [0.2]
     N -> 'man' [0.5] | 'telescope' [0.5]
     VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
     V -> 'ate' [0.35] | 'saw' [0.65]
     PP -> P NP [1.0]
     P -> 'with' [0.61] | 'under' [0.39]
    """)

In [37]:
# Second toy probabilistic grammar, written with one production per line and
# the probability in brackets. It is the grammar later assigned to `grammar`
# for the probabilistic chart-parser and Viterbi examples.
toy_pcfg2 = PCFG.fromstring("""
     S    -> NP VP         [1.0]
     VP   -> V NP          [.59]
     VP   -> V             [.40]
     VP   -> VP PP         [.01]
     NP   -> Det N         [.41]
     NP   -> Name          [.28]
     NP   -> NP PP         [.31]
     PP   -> P NP          [1.0]
     V    -> 'saw'         [.21]
     V    -> 'ate'         [.51]
     V    -> 'ran'         [.28]
     N    -> 'boy'         [.11]
     N    -> 'cookie'      [.12]
     N    -> 'table'       [.13]
     N    -> 'telescope'   [.14]
     N    -> 'hill'        [.5]
     Name -> 'Jack'        [.52]
     Name -> 'Bob'         [.48]
     P    -> 'with'        [.61]
     P    -> 'under'       [.39]
     Det  -> 'the'         [.41]
     Det  -> 'a'           [.31]
     Det  -> 'my'          [.28]
     """)

In [38]:
# Small PCFG used to inspect a single probabilistic production.
# The pasted doctest prompts ("... ") are removed from the literal: they only
# worked because IPython strips such prompts from cell input; as plain Python
# they would become part of the grammar text.
grammar = PCFG.fromstring("""
A -> B B [.3] | C B C [.7]
B -> B D [.5] | C [.5]
C -> 'a' [.1] | 'b' [0.9]
D -> 'b' [1.0]
""")
# First production of the grammar; displayed as "A -> B B [0.3]" in the
# recorded output.
prod = grammar.productions()[0]
prod

A -> B B [0.3]

In [39]:
# Left-hand side of the selected production (recorded output: A).
prod.lhs()

A

In [40]:
# Right-hand side of the selected production (recorded output: (B, B)).
prod.rhs()

(B, B)

In [41]:
# Probability attached to the selected production.
print(prod.prob())

0.3


In [42]:
# Start symbol of the grammar (recorded output: A, the left-hand side of the first rule).
grammar.start()

A

In [43]:
# All productions of the grammar; the recorded output lists each "|" alternative as its own rule.
grammar.productions()

[A -> B B [0.3],
 A -> C B C [0.7],
 B -> B D [0.5],
 B -> C [0.5],
 C -> 'a' [0.1],
 C -> 'b' [0.9],
 D -> 'b' [1.0]]

In [45]:
# nltk was already imported in an earlier cell; the repeated import is harmless.
import nltk
# Download the 'treebank' corpus package; the recorded output shows it being
# unzipped into the local nltk_data directory and the call returning True.
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\DarkLord\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


True

In [46]:
# Collect the productions of every parsed sentence in the first two treebank
# files into one flat list.
productions = [
    production
    for fileid in treebank.fileids()[:2]
    for parsed_sent in treebank.parsed_sents(fileid)
    for production in parsed_sent.productions()
]

In [47]:
# Induce a PCFG from the collected treebank productions, passing S (the
# Nonterminal created with nonterminals(...) earlier) as the start symbol.
# NOTE(review): this rebinds `grammar`, replacing the grammar from the previous example.
grammar = induce_pcfg(S, productions)
grammar

<Grammar with 71 productions>

In [48]:
# Up to two PP productions of the induced grammar, in sorted order (the recorded output has only one).
sorted(grammar.productions(lhs=Nonterminal('PP')))[:2]

[PP -> IN NP [1.0]]

In [49]:
# First two NNP productions of the induced grammar, in sorted order.
sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2]

[NNP -> 'Agnew' [0.0714286], NNP -> 'Consolidated' [0.0714286]]

In [50]:
# First two JJ productions of the induced grammar, in sorted order.
sorted(grammar.productions(lhs=Nonterminal('JJ')))[:2]

[JJ -> 'British' [0.142857], JJ -> 'former' [0.142857]]

In [51]:
# First two NP productions of the induced grammar, in sorted order.
sorted(grammar.productions(lhs=Nonterminal('NP')))[:2]

[NP -> CD NNS [0.133333], NP -> DT JJ JJ NN [0.0666667]]

In [52]:
# Input sentence for the probabilistic parsers, split on whitespace.
tokens = "Jack saw Bob with my cookie".split()
# Switch the working grammar to toy_pcfg2 and print its productions.
grammar = toy_pcfg2
print(grammar)

Grammar with 23 productions (start state = S)
    S -> NP VP [1.0]
    VP -> V NP [0.59]
    VP -> V [0.4]
    VP -> VP PP [0.01]
    NP -> Det N [0.41]
    NP -> Name [0.28]
    NP -> NP PP [0.31]
    PP -> P NP [1.0]
    V -> 'saw' [0.21]
    V -> 'ate' [0.51]
    V -> 'ran' [0.28]
    N -> 'boy' [0.11]
    N -> 'cookie' [0.12]
    N -> 'table' [0.13]
    N -> 'telescope' [0.14]
    N -> 'hill' [0.5]
    Name -> 'Jack' [0.52]
    Name -> 'Bob' [0.48]
    P -> 'with' [0.61]
    P -> 'under' [0.39]
    Det -> 'the' [0.41]
    Det -> 'a' [0.31]
    Det -> 'my' [0.28]


In [53]:
from nltk.parse import pchart

In [54]:
# InsideChartParser: the recorded run prints both parses of the sentence,
# each with its probability, the more probable one first.
parser = pchart.InsideChartParser(grammar)
for tree in parser.parse(tokens):
    print(tree)

(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)


In [55]:
# RandomChartParser: the recorded run prints the same two parses as the
# InsideChartParser example.
parser = pchart.RandomChartParser(grammar)
for tree in parser.parse(tokens):
    print(tree)

(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)


In [56]:
# UnsortedChartParser: the recorded run prints the same two parses again.
parser = pchart.UnsortedChartParser(grammar)
for tree in parser.parse(tokens):
    print(tree)

(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)


In [57]:
# LongestChartParser: the recorded run prints the same two parses again.
parser = pchart.LongestChartParser(grammar)
for tree in parser.parse(tokens):
    print(tree)

(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
(S
  (NP (Name Jack))
  (VP
    (VP (V saw) (NP (Name Bob)))
    (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)


In [58]:
# InsideChartParser again, this time with beam_size set to len(tokens) + 1.
# NOTE(review): the recorded run printed nothing, presumably because the
# narrow beam pruned edges needed for a complete parse — confirm.
parser = pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1)
for tree in parser.parse(tokens):
    print(tree)

In [59]:
from nltk.parse import ViterbiParser
# tokens and grammar are assigned the same values they were given in the
# earlier chart-parser setup cell.
tokens = "Jack saw Bob with my cookie".split()
grammar = toy_pcfg2

In [60]:
# ViterbiParser: the recorded run prints a single tree (p=6.31607e-06), the
# more probable of the two parses shown by the chart parsers.
parser = ViterbiParser(grammar)
for tree in parser.parse(tokens):
    print(tree)

(S
  (NP (Name Jack))
  (VP
    (V saw)
    (NP
      (NP (Name Bob))
      (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)


In [61]:
from nltk.grammar import FeatStructNonterminal
# Build a feature-structure nonterminal from keyword features; the value of
# `agr` is itself a nested FeatStructNonterminal.
FeatStructNonterminal(
    pos='n', agr=FeatStructNonterminal(number='pl', gender='f'))

[agr=[gender='f', number='pl'], pos='n']

In [62]:
# The same class also accepts a string specification; the recorded repr echoes it back unchanged.
FeatStructNonterminal('VP[+fin]/NP[+pl]')

VP[+fin]/NP[+pl]

In [63]:
# Feature-chart demo: prints the feature grammar first (print_grammar=True),
# then the FeatureChartParser trace for the sentence.
nltk.parse.featurechart.demo(
    print_times=False,
    print_grammar=True,
    parser=nltk.parse.featurechart.FeatureChartParser,
    sent='I saw John with a dog',
)


Grammar with 18 productions (start state = S[])
    S[] -> NP[] VP[]
    PP[] -> Prep[] NP[]
    NP[] -> NP[] PP[]
    VP[] -> VP[] PP[]
    VP[] -> Verb[] NP[]
    VP[] -> Verb[]
    NP[] -> Det[pl=?x] Noun[pl=?x]
    NP[] -> 'John'
    NP[] -> 'I'
    Det[] -> 'the'
    Det[] -> 'my'
    Det[-pl] -> 'a'
    Noun[-pl] -> 'dog'
    Noun[-pl] -> 'cookie'
    Verb[] -> 'ate'
    Verb[] -> 'saw'
    Prep[] -> 'with'
    Prep[] -> 'under'

* FeatureChartParser
Sentence: I saw John with a dog
|.I.s.J.w.a.d.|
|[-] . . . . .| [0:1] 'I'
|. [-] . . . .| [1:2] 'saw'
|. . [-] . . .| [2:3] 'John'
|. . . [-] . .| [3:4] 'with'
|. . . . [-] .| [4:5] 'a'
|. . . . . [-]| [5:6] 'dog'
|[-] . . . . .| [0:1] NP[] -> 'I' *
|[-> . . . . .| [0:1] S[] -> NP[] * VP[] {}
|[-> . . . . .| [0:1] NP[] -> NP[] * PP[] {}
|. [-] . . . .| [1:2] Verb[] -> 'saw' *
|. [-> . . . .| [1:2] VP[] -> Verb[] * NP[] {}
|. [-] . . . .| [1:2] VP[] -> Verb[] *
|. [-> . . . .| [1:2] VP[] -> VP[] * PP[] {}
|[---] . . . .| [0:2] S[] ->