In [1]:
#sample sentences. should be parseable
ss = ["humid in the morning.",
      "windy and mostly cloudy tonight.",
      "heavy drizzle (4 – 6 in.) throughout the day.",
      "windy and windy and possible light flurries (with a chance of 10 – 3 cm. of snow) starting tomorrow, continuing until tonight and this morning.",
      "windy and possible light flurries (with a chance of 10 – 3 cm. of snow) starting tomorrow, continuing until tonight and this morning.",
      "humid until tonight.",
      "possible heavy snow starting later this morning, continuing until tonight."
    ]
# sentence #4 is definitely ambiguous but I think the others are not
# the cky seems to be having trouble with anything with parentheticals

In [2]:
import bigsky.cky as cky
import bigsky.forecast as fc
import bigsky.cfg as cfg
from sacremoses import MosesTokenizer

In [3]:
g = cfg.Cfg.from_file("data/cfgs/weather.cfg")
g = g.binarize()

In [4]:
# make sure the sentences will be tokenized properly/whether they parse
for s in ss:
    print(MosesTokenizer().tokenize(s))

['humid', 'in', 'the', 'morning', '.']
['windy', 'and', 'mostly', 'cloudy', 'tonight', '.']
['heavy', 'drizzle', '(', '4', '–', '6', 'in', '.', ')', 'throughout', 'the', 'day', '.']
['windy', 'and', 'windy', 'and', 'possible', 'light', 'flurries', '(', 'with', 'a', 'chance', 'of', '10', '–', '3', 'cm.', 'of', 'snow', ')', 'starting', 'tomorrow', ',', 'continuing', 'until', 'tonight', 'and', 'this', 'morning', '.']
['windy', 'and', 'possible', 'light', 'flurries', '(', 'with', 'a', 'chance', 'of', '10', '–', '3', 'cm.', 'of', 'snow', ')', 'starting', 'tomorrow', ',', 'continuing', 'until', 'tonight', 'and', 'this', 'morning', '.']
['humid', 'until', 'tonight', '.']
['possible', 'heavy', 'snow', 'starting', 'later', 'this', 'morning', ',', 'continuing', 'until', 'tonight', '.']


In [5]:
ts = []
for s in ss:
    ts += cky.cky_tree(s,g,split_trees=True)
for t in ts:
    print(t)

[('S', ('WEATHER', 'humid'), [('__TIME___"."', [('TIME', ('_"in"', 'in'), [('___"the"__TIMEWORD', ('_"the"', 'the'), ('TIMEWORD', 'morning'))])], ('_"."', '.'))])]
[('S', [('WEATHER', ('WEATHER', 'windy'), [('___"and"__WEATHER', ('_"and"', 'and'), [('WEATHER', ('_"mostly"', 'mostly'), ('_"cloudy"', 'cloudy'))])])], [('__TIME___"."', ('TIME', 'tonight'), ('_"."', '.'))])]
[('S', [('WEATHER', ('PRECIPMODIFIERS', 'heavy'), [('__PRECIPNOUN__PRECIPPAREN', ('PRECIPNOUN', 'drizzle'), [('PRECIPPAREN', ('_"("', '('), [('__MEASURE___")"', [('MEASURE', ('NUM', '4'), [('___"–"__NUM__UNIT', ('_"–"', '–'), [('__NUM__UNIT', ('NUM', '6'), [('UNIT', ('_"in"', 'in'), ('_"."', '.'))])])])], ('_")"', ')'))])])])], [('__TIME___"."', [('TIME', ('_"throughout"', 'throughout'), [('___"the"___"day"', ('_"the"', 'the'), ('_"day"', 'day'))])], ('_"."', '.'))])]
[('S', [('WEATHER', ('PRECIPMODIFIERS', 'heavy'), [('__PRECIPNOUN__PRECIPPAREN', ('PRECIPNOUN', 'drizzle'), [('PRECIPPAREN', ('_"("', '('), [('__MEASURE_

In [10]:
import json
'''
put the resulting string into this syntax tree viewer:
http://ironcreek.net/syntaxtree/
'''
def reformat_tree(t):
    s = json.dumps(t)
    s = s.replace('","', '###')
    s = s.replace('"', '')
    s = s.replace(',','')
    s = s.replace('[[','[')
    s = s.replace(']]',']')
    s = s.replace('_','')
    s = s.replace('###', ',')
    return s


tstrings = {reformat_tree(t) for t in ts}
for tstr in list(tstrings):
    print(tstr)
    print("\n")

[S [WEATHER [PRECIPMODIFIERS heavy] [PRECIPNOUNPRECIPPAREN [PRECIPNOUN drizzle] [PRECIPPAREN [\(\ (] [MEASURE\)\ [MEASURE [NUM 4] [\\u2013\NUMUNIT [\\u2013\ \u2013] [NUMUNIT [NUM 6] [UNIT [\in\ in] [\.\ .]]]]] [\)\ )]]]]] [TIME\.\ [TIME [\throughout\ throughout] [\the\\day\ [\the\ the] [\day\ day]]] [\.\ .]]]


[S [WEATHER humid] [TIME\.\ [TIME [\until\ until] [BTIME tonight]] [\.\ .]]]


[S [WEATHER [PRECIPMODIFIERS [PRECIPMODIFIER possible] [PRECIPMODIFIERS heavy]] [PRECIPNOUN snow]] [TIME\.\ [TIME [\starting\ starting] [BTIME\\\continuing\\until\BTIME [BTIME [\later\ later] [\this\\morning\ [\this\ this] [\morning\ morning]]] [\\\continuing\\until\BTIME [\\ ,] [\continuing\\until\BTIME [\continuing\ continuing] [\until\BTIME [\until\ until] [BTIME tonight]]]]]] [\.\ .]]]


[S [WEATHER [WEATHER [WEATHER windy] [\and\WEATHER [\and\ and] [WEATHER windy]]] [\and\WEATHER [\and\ and] [WEATHER [PRECIPMODIFIERS [PRECIPMODIFIER possible] [PRECIPMODIFIERS light]] [PRECIPNOUNPRECIPPAREN [PRECI

In [7]:
from bigsky.cfg import Terminal, Nonterminal


def orig_cky_parse(sent, grammar):
    """Based on pseudocode in Jurafsky and Martin."""
    words =  MosesTokenizer().tokenize(sent)
    chart = [[set() for i in range(len(words) + 1)] 
             for j in range(len(words) + 1)]
    for j in range(1, 1 + len(words)):
        rules = grammar.get_rules_with_rhs([Terminal(words[j-1])])
        nts = set([rule.lhs for rule in rules])
        chart[j-1][j] = chart[j-1][j] | nts
        for i in range(j-2, -1, -1):
            for k in range(i+1, j):
                nt_pairs = [(x, y) 
                            for x in chart[i][k] 
                            for y in chart[k][j]]
                for nt_pair in nt_pairs:
                    rules = grammar.get_rules_with_rhs(nt_pair)
                    nts = set([rule.lhs for rule in rules])
                    chart[i][j] = chart[i][j] | nts
    return Nonterminal("S") in chart[0][len(words)]    

In [8]:
#quick sanity check against the originally provided cky parser
for s in ss:
    print(orig_cky_parse(s, g), "\t:\t", cky.cky_parse(s, g))

True 	:	 True
True 	:	 True
True 	:	 True
True 	:	 True
True 	:	 True
True 	:	 True
True 	:	 True
