-
Notifications
You must be signed in to change notification settings - Fork 0
/
earley.cpp
288 lines (232 loc) · 8.04 KB
/
earley.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include "earley.h"
#include "grammar.h"
using std::cout;
using std::endl;
using Earley::S_grammar_type_t;
using Earley::S_set_type_t;
using Earley::S_state_type_t;
using Earley::S_type_t;
inline S_type_t Earley::init(const size_t len_words) {
S_type_t S;
for (unsigned int i = 0; i < len_words + 1; ++i) {
S_set_type_t s_set;
S.push_back(s_set);
}
return S;
}
inline bool Earley::is_nonterminal(const char ch) {
return Earley::nonterminals.find(ch) != Earley::nonterminals.end();
}
inline bool Earley::is_terminal(const char ch) {
return Earley::parts_of_speech.find(ch) != Earley::parts_of_speech.end();
}
inline bool Earley::is_finished(const S_state_type_t &state) {
std::string prod = std::get<1>(state);
return prod.find(Earley::dot) >= prod.size() - 1;
}
// Swap with item in front of the dot by default
std::string Earley::swap_around_dot(const std::string &strng) {
if (strng.find(Earley::dot) >= strng.size() - 1)
return strng;
std::string n_str(strng);
auto dot_idx = strng.find(Earley::dot);
auto next_item = strng.at(dot_idx + 1);
// Swap around a/the lexeme
if (Earley::nonterminals.find(next_item) != Earley::nonterminals.end()) {
std::swap(n_str[dot_idx], n_str[dot_idx + 1]);
return n_str;
} else if (Earley::parts_of_speech.find(next_item) !=
Earley::parts_of_speech.end()) {
std::swap(n_str[dot_idx], n_str[dot_idx + 1]);
return n_str;
}
// Swap around a/the token (take the dot, remove it from the front and append
// to the end)
auto indx = n_str.find(Earley::dot);
n_str.erase(indx, indx + 1);
n_str += Earley::dot;
return n_str;
}
std::string Earley::get_next_element(const S_state_type_t &state) {
auto prod = std::get<1>(state);
auto m_index = prod.find(Earley::dot);
std::string nxt(1, prod.at(m_index + 1));
if (is_nonterminal(nxt.at(0)) || is_terminal(nxt.at(0)))
return nxt; // return the thing(char) that comes just after the Earley::dot
// return the latter portion of the string (everything after the Earley::dot)
return prod.substr(m_index + 1, prod.size() - m_index);
}
std::vector<std::string> Earley::split(const std::string &strng,
const char delim) {
std::vector<std::string> retvec;
std::stringstream ss(strng);
std::string holder;
while (ss) {
std::getline(ss, holder, delim);
retvec.push_back(holder);
}
retvec.pop_back(); // take out the final, extra, element (not needed)
return retvec;
}
std::vector<std::string> Earley::prod_split(const std::string &strng) {
std::smatch sm;
std::regex rgx("(\\b([^|])+\\b([|])*)");
std::string m_string(strng);
std::vector<std::string> retvec;
bool got_match = true;
// Parse through the string, stripping the matched terminal/non-terminal from
// the front this allows for continuous matching against one or more
// terminals/non-terminals between the pipes/spaces
while (got_match) {
got_match = std::regex_search(m_string, sm, rgx);
if (got_match) {
retvec.push_back(sm.str());
m_string = sm.suffix();
}
}
return retvec;
}
bool Earley::predict(S_type_t &S, const unsigned int k,
const std::string &nxt_elem, S_grammar_type_t &grammar) {
bool added = false;
std::string nxt_production = nxt_elem;
std::string grammar_prod = grammar[nxt_production];
auto prod_splits = prod_split(grammar_prod);
for (auto &split : prod_splits) {
std::stringstream ss;
std::string split_nows = std::regex_replace(split, std::regex("\\s"), "");
ss << Earley::dot;
ss << split_nows;
std::string dSplit(ss.str());
Earley::S_set_type_t ¤t_set = S[k];
S_state_type_t set_element = std::make_tuple(nxt_production, dSplit, k);
// if not found: insert the tuple(set_element)
if (std::find(current_set.begin(), current_set.end(), set_element) ==
current_set.end()) {
current_set.push_back(set_element);
added = true;
}
}
return added;
}
bool Earley::scan(S_type_t &S, const unsigned int k, S_state_type_t &state,
const std::string &words) {
bool added = false;
std::string nxt_elem_scanner = get_next_element(state);
if (k >= words.size())
return added;
auto words_k = words.at(k);
{
// Process regex information against the token/word (see if it's a number)
std::string rgx_input(1, words_k);
std::regex rgx("[0-9]");
std::smatch sm;
auto res = std::regex_match(rgx_input, sm, rgx);
if (nxt_elem_scanner == "number" && !res) {
cout << "Not going to add non-number to forward set:";
cout << "Terminal symbol is not a number?";
cout << endl;
return added;
}
}
if (k > words.size() - 1)
return added;
if (Earley::parts_of_speech.find(words.at(k)) !=
Earley::parts_of_speech.end()) {
std::string state_swapped = swap_around_dot(std::get<1>(state));
S_state_type_t new_state =
std::make_tuple(std::get<0>(state), state_swapped, std::get<2>(state));
auto &nxt_set = S[k + 1];
// If the element (new_state) is in the language, described by the grammar,
// then add that completed terminal symbol to S[k + 1]
if (std::find(nxt_set.begin(), nxt_set.end(), new_state) == nxt_set.end()) {
nxt_set.push_back(new_state);
added = true;
}
}
return added;
}
bool Earley::complete(S_type_t &S, const unsigned int k,
S_state_type_t &state) {
bool added = false;
int state_origin = std::get<2>(state);
for (Earley::S_state_type_t &t_state : S[state_origin]) {
std::string t_state_sym = std::get<1>(t_state);
std::string t_state_temp(t_state_sym);
auto fnd = t_state_temp.find(Earley::dot);
t_state_temp.erase(fnd, fnd + 1);
if (Earley::nonterminals.find(t_state_temp.at(0)) !=
Earley::nonterminals.end()) {
std::string swapped_prod = swap_around_dot(t_state_sym);
// assemble t_state with the dot after the nonterminal (completed the
// nonterminal) then add the new, completed, t_state to S[k]
S_state_type_t n_tuple = std::make_tuple(
std::get<0>(t_state), swapped_prod, std::get<2>(t_state));
auto sk = S[k];
// If we didn't already add the completed state to the current state, do
// so now; otherwise,ignore and continue on
if (std::find(sk.begin(), sk.end(), n_tuple) == sk.end()) {
S[k].push_back(n_tuple);
added = true;
}
}
}
return added;
}
bool Earley::check_end_set(const S_type_t &S, const S_state_type_t &expected) {
auto &end_set = S[S.size() - 1];
if (std::find(end_set.begin(), end_set.end(), expected) != end_set.end()) {
return true;
}
return false;
}
bool Earley::earley_parse(const std::string &words,
const S_grammar_type_t &grammar) {
// TBD: Add full support for empty string in grammar/earley
if (words.size() == 0)
return true;
std::string start_p = "S";
start_p += Earley::dot;
S_state_type_t expected = std::make_tuple("P", start_p, 0);
// Create the overarching array of sets (parse table)
S_type_t S = init(words.size());
{
std::string sp(1, Earley::dot);
sp += "S";
S[0].push_back(std::make_tuple("P", sp, 0));
}
bool done = false;
bool complete_parse = false;
for (unsigned int k = 0; k < words.size() + 1; ++k) {
bool added = true;
if (done)
break;
while (added) {
added = false;
for (Earley::S_state_type_t state : S[k]) {
if (check_end_set(S, expected)) {
done = true;
complete_parse = true;
break;
}
bool finished = is_finished(state);
if (!finished) {
auto nxt_elem = get_next_element(state);
if (is_nonterminal(nxt_elem.at(0))) {
added = predict(S, k, nxt_elem,
const_cast<S_grammar_type_t &>(grammar));
} else {
added = scan(S, k, state, words);
}
} else {
added = complete(S, k, state);
}
}
}
}
return complete_parse;
}