forked from pes10k/cs412-scorer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_order.py
149 lines (117 loc) · 5.89 KB
/
word_order.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import parsers
import essay_utils
import tree_utils
from cmd_utils import cmd_essay_index, cmd_log_level, log
from sentence_tokenizer import parse_sentences
from cache_utils import cache_get, cache_set
def tags_appear_in_order(haystack, needles):
    """Tell whether the needles found in haystack occur in the order given.

    Needles absent from haystack are ignored.  For each remaining needle
    only the position of its first occurrence in haystack is considered.

    Returns True when those positions never decrease while walking through
    needles (trivially True when fewer than two needles are present).
    """
    positions = []
    for tag in needles:
        if tag in haystack:
            positions.append(haystack.index(tag))

    # A sequence equals its sorted self exactly when no element is smaller
    # than the one before it.
    return all(earlier <= later
               for earlier, later in zip(positions, positions[1:]))
def num_tag_order_errors(tree, sub_tree_nodes, tags):
    """Count subtrees whose children contain every tag, but out of order.

    Every subtree of `tree` whose node label is in `sub_tree_nodes` is
    inspected.  A subtree counts as an error when all of `tags` occur among
    the node labels of its direct children and tags_appear_in_order()
    reports that they are not in the given order.

    Returns the number of such subtrees.
    """
    error_count = 0
    for candidate in tree.subtrees(lambda x: x.node in sub_tree_nodes):
        child_labels = [child.node for child in candidate]
        has_every_tag = all(tag in child_labels for tag in tags)
        if has_every_tag and not tags_appear_in_order(child_labels, tags):
            error_count += 1
    return error_count
def num_forbidden_orders(tree, sub_tree_nodes, tags):
    """Count, and flag, subtrees whose children show a forbidden tag order.

    Every subtree of `tree` whose node label is in `sub_tree_nodes` and that
    has not been flagged already is inspected.  A subtree matches when all
    of `tags` occur among the node labels of its direct children and
    tags_appear_in_order() reports them in the given (forbidden) order.

    Each match gets a `_has_error` attribute set on it, so the same node is
    not counted again by a later call; this function therefore mutates
    `tree`.

    Returns the number of subtrees newly flagged by this call.
    """
    def _is_unflagged_candidate(subtree):
        return (subtree.node in sub_tree_nodes
                and not hasattr(subtree, '_has_error'))

    # Collect the candidates up front so the filter runs on every subtree
    # before any flag is set below.
    candidates = list(tree.subtrees(_is_unflagged_candidate))

    flagged = 0
    for candidate in candidates:
        child_labels = [child.node for child in candidate]
        if any(tag not in child_labels for tag in tags):
            continue
        if not tags_appear_in_order(child_labels, tags):
            continue
        # Mark the node so it is skipped by subsequent checks.
        candidate._has_error = True
        flagged += 1
    return flagged
# def tags_appear_in_relative_order(source, tags):
# """Checks to see if the given tags appear in the expected order. Tags can
# be missing, but if they do appear, they must be in the given expected order"""
# pos_transition_rules = dict(
# 'VP'=[],
# 'NP'=['DT', 'CD', ]
#
# )
def issues_in_sentence(sentence, use_cache=True):
    """'Brute force' check for a bunch of possible word ordering issues.

    The sentence is parsed, the first parse tree is simplified in place, and
    a fixed list of forbidden child orderings is checked against it.  A rule
    labelled "A->B in P" fires for every not-yet-flagged P node that has both
    A and B among its direct children with A appearing first (see
    num_forbidden_orders).  Rules are applied in order, and a node flagged by
    one rule is skipped by the rules after it.

    Two extra checks follow the rule list:
      - "Single Child S": a node whose label is in
        tree_utils.semi_tree_roots and that has exactly one child.
      - "No S Root": the first child of the parse tree has a label that is
        not in tree_utils.semi_tree_roots.

    Args:
        sentence: the sentence to check; passed to parsers.parse and used as
            the cache key.
        use_cache: when true, a result cached under 'word_order_issues' is
            returned if present, and a freshly computed result is stored.

    Returns:
        A list of issue labels (strings), one entry per detected issue.
    """
    if use_cache:
        result = cache_get('word_order_issues', sentence)
        if result is not None:
            return result

    tree = parsers.parse(sentence)[0]
    tree_utils.simplify_tree(tree, trim_adjecent_prop_nouns=True,
                             normalize_sent_roots=True,
                             normalize_plural=True,
                             normalize_case=True)

    log("Looking for order issues in: %s" % (sentence,), 1)
    if cmd_log_level() >= 4:
        # Single-argument parenthesised form prints identically whether
        # print is a statement or a function.
        print("Simplified Parse Tree")
        print(tree)

    # (label, parent node labels, child tags in the forbidden order).
    # The sequence matters: a node flagged by an earlier rule is ignored by
    # every later one.
    forbidden_orders = (
        ("VP->NP in S", ("S",), ('VP', 'NP')),
        ("NP->VP in SINV", ('SINV',), ('NP', 'VP')),
        # 'JJ' is the adjective tag; the rule looks for a noun followed by
        # an adjective inside a noun phrase.
        ("NN->JJ in NP", ('NP',), ('NN', 'JJ')),
        ("PP->VB in VP", ('VP',), ('PP', 'VB')),
        ("NP->VP in VP", ('VP',), ('NP', 'VP')),
        ("S->VP in S", ('S',), ('S', 'VP')),
        ("S->VB in VP", ('VP',), ('S', 'VB')),
        # Disabled: ("VB->VP in VP", ('VP',), ('VB', 'VP')),
        ("NP->RBR in ADVP", ('ADVP',), ('NP', 'RBR')),
        ("NN->DT in NP", ('NP',), ('NN', 'DT')),
        ("NNP->DT in NP", ('NP',), ('NNP', 'DT')),
        ("NN->CD in NP", ('NP',), ('NN', 'CD')),
        ("NNP->CD in NP", ('NP',), ('NNP', 'CD')),
        ('PP->NP in S', ('S',), ('PP', 'NP')),
        # Toggle?
        ('NP->VP in NP', ('NP',), ('NP', 'VP')),
        # Seems like it should be VB->ADVP->PP
        ('VB->PP->ADVP in VP', ('VP',), ('VB', 'PP', 'ADVP')),
        ('VB->PP->SBAR in VP', ('VP',), ('VB', 'PP', 'SBAR')),
        ('NP->S in NP', ('NP',), ('NP', 'S')),
        # Seems like the ADJP should be in a NP or somewhere else, not a
        # sibling of a noun phrase
        ('NP->ADJP in S', ('S',), ('NP', 'ADJP')),
    )

    problems = []
    for label, parent_labels, child_tags in forbidden_orders:
        problems += [label] * num_forbidden_orders(tree, parent_labels,
                                                   child_tags)

    # Last, if there is an S w/ only one child, we call it a word order
    # problem.  The node *label* is what has to be looked up in
    # semi_tree_roots, the same way the root check below does it.
    single_child_roots = list(tree.subtrees(
        lambda x: x.node in tree_utils.semi_tree_roots and len(x) == 1))
    problems += ['Single Child S'] * len(single_child_roots)

    if (tree[0].node not in tree_utils.semi_tree_roots
            and not hasattr(tree[0], '_has_error')):
        tree[0]._has_error = True
        problems += ['No S Root']

    log("Found %d order issues" % (len(problems),), 1)
    # Format the message before handing it to log(), as the other calls do;
    # the second argument is the log level.
    log("Issues: %s" % (problems,), 2)

    if use_cache:
        cache_set('word_order_issues', sentence, problems)

    return problems
if __name__ == "__main__":
    # Script entry point: collect the word order issues of every sentence in
    # the essay selected on the command line and print them as one list.
    essay_index = cmd_essay_index()

    for essay in [essay_utils.essays[essay_index]]:
        issues_in_text = []
        for line in essay:
            for sentence in parse_sentences(line):
                # Do not bind the result to the name `issues_in_sentence`:
                # at module level that would replace the function itself and
                # make the call for the next sentence fail.
                issues_in_text += issues_in_sentence(sentence)
        print(issues_in_text)