forked from lucaong/minisearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove.ts
232 lines (205 loc) · 7.5 KB
/
remove.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import { type SearchIndex } from "./SearchIndex.js";
import { SearchableMap } from "./SearchableMap/SearchableMap.js";
import { removeTerm } from "./term.js";
import { maybeAutoVacuum } from "./vacuum.js";
/**
 * Updates the stored average length of a field to reflect that one document
 * has left the index.
 *
 * @param searchIndex The search index whose `_avgFieldLength` entry is updated
 * @param fieldId The numeric ID of the field
 * @param count The number of documents the current average was computed over
 * (i.e. the count before the removal)
 * @param length The length the removed document contributed for this field
 */
const removeFieldLength = <Document, ID>(
  searchIndex: SearchIndex<Document, ID>,
  fieldId: number,
  count: number,
  length: number,
): void => {
  const averages = searchIndex._avgFieldLength;

  // When the only document is removed there is nothing left to average over,
  // so the average is reset to 0 instead of dividing by zero.
  averages[fieldId] =
    count === 1 ? 0 : (averages[fieldId] * count - length) / (count - 1);
};
/**
 * Marks the document with the given ID as discarded, so that it no longer
 * shows up in search results.
 *
 * The visible effect is the same as [[remove]] (the document stops appearing
 * in searches), but the internal data structures are handled differently:
 *
 *   - [[remove]] needs the full document and strips it from the inverted
 *     index right away.
 *
 *   - [[discard]] needs only the document ID. The current version of the
 *     document is flagged as discarded and is ignored by searches from then
 *     on, while the inverted index itself is left untouched for the moment.
 *     This is faster and more convenient than `remove`; the leftover entries
 *     are cleaned up by vacuuming once enough documents have been discarded,
 *     which allows memory to be released.
 *
 * A new version of a discarded document can be added again, and only that new
 * version will be found by searches: discarding and re-adding behaves exactly
 * like removing and re-adding. [[replace]] can also be used to swap a
 * document for a new version.
 *
 * #### Details about vacuuming
 *
 * Repeated calls to this method leave obsolete document references in the
 * index, invisible to searches. Two mechanisms clean them up:
 *
 *   - During search, whenever a discarded ID is encountered (and skipped),
 *     its references are removed from the inverted index entries of the
 *     search terms, so later searches for the same terms do not have to skip
 *     them again.
 *
 *   - Vacuuming, which by default runs automatically after a certain number
 *     of documents are discarded (see the `autoVacuum` field in [[Options]]).
 *     It walks all terms in the index and drops every reference to discarded
 *     documents. It can also be started manually with [[vacuum]].
 *
 * @param searchIndex The search Index
 * @param id The ID of the document to be discarded
 * @throws Error if no document with the given ID is in the index
 */
export const discard = <Document, ID>(
  searchIndex: SearchIndex<Document, ID>,
  id: ID,
): void => {
  const shortId = searchIndex._idToShortId.get(id);

  if (shortId == null) {
    throw new Error(
      `SlimSearch: cannot discard document with ID ${<string>id}: it is not in the index`,
    );
  }

  // Adjust the per-field averages while `_documentCount` still includes this
  // document. `forEach` is used on purpose: it skips holes left by fields the
  // document did not have.
  const fieldLengths = searchIndex._fieldLength.get(shortId);
  if (fieldLengths) {
    fieldLengths.forEach((fieldLength, fieldId) => {
      removeFieldLength(
        searchIndex,
        fieldId,
        searchIndex._documentCount,
        fieldLength,
      );
    });
  }

  // Forget every per-document record; the inverted index is left as is.
  searchIndex._fieldLength.delete(shortId);
  searchIndex._storedFields.delete(shortId);
  searchIndex._documentIds.delete(shortId);
  searchIndex._idToShortId.delete(id);

  searchIndex._documentCount--;
  // One more stale document now lingers in the inverted index.
  searchIndex._dirtCount++;

  maybeAutoVacuum(searchIndex);
};
/**
 * Discards every document whose ID is listed, so that none of them appears in
 * search results any more.
 *
 * This has the same effect as calling [[discard]] once per ID, except that
 * automatic vacuuming is held back while the IDs are processed and is
 * considered at most once, at the end.
 *
 * Note: to empty the index completely, calling [[removeAll]] without a
 * documents argument is faster and more convenient than passing all IDs here.
 *
 * @param searchIndex The search Index
 * @param ids The IDs of the documents to be discarded
 */
export const discardAll = <Document, ID>(
  searchIndex: SearchIndex<Document, ID>,
  ids: readonly ID[],
): void => {
  const options = searchIndex._options;
  const previousAutoVacuum = options.autoVacuum;

  try {
    // Suppress the per-document vacuum check that `discard` performs.
    options.autoVacuum = false;

    for (const id of ids) {
      discard(searchIndex, id);
    }
  } finally {
    // Restore the caller's setting even if one of the discards threw.
    options.autoVacuum = previousAutoVacuum;
  }

  maybeAutoVacuum(searchIndex);
};
/**
 * Removes the given document from the index.
 *
 * The document passed in must be unchanged since it was indexed: its fields
 * are tokenized and processed again to find the index entries to delete, so a
 * modified document will corrupt the index.
 *
 * Because the full document is supplied (not just its ID), its entries are
 * removed from the inverted index immediately, allowing memory to be
 * released. [[discard]] is a convenient alternative that needs only the
 * document ID and has the same visible effect, but postpones the index
 * clean-up until the next vacuuming.
 *
 * @param searchIndex The search Index
 * @param document The document to be removed
 * @throws Error if the document has no ID field, or if no document with that
 * ID is in the index
 */
export const remove = <Document, ID>(
  searchIndex: SearchIndex<Document, ID>,
  document: Document,
): void => {
  const { tokenize, processTerm, extractField, fields, idField } =
    searchIndex._options;

  const id = <ID>extractField(document, idField);
  if (id == null) {
    throw new Error(`SlimSearch: document does not have ID field "${idField}"`);
  }

  const shortId = searchIndex._idToShortId.get(id);
  if (shortId == null) {
    throw new Error(
      `SlimSearch: cannot remove document with ID ${<string>id}: it is not in the index`,
    );
  }

  for (const field of fields) {
    const value = extractField(document, field);

    if (value != null) {
      const fieldId = searchIndex._fieldIds[field];
      const tokens = tokenize(value.toString(), field);

      // The field length passed on is the number of distinct raw tokens.
      removeFieldLength(
        searchIndex,
        fieldId,
        searchIndex._documentCount,
        new Set(tokens).size,
      );

      for (const token of tokens) {
        const processed = processTerm(token, field);

        // A processor may expand a token into several terms, return a single
        // term, or return a falsy value to drop the token altogether.
        const terms = Array.isArray(processed)
          ? processed
          : processed
            ? [processed]
            : [];

        for (const term of terms) {
          removeTerm(searchIndex, fieldId, shortId, term);
        }
      }
    }
  }

  // Finally drop every per-document record.
  searchIndex._fieldLength.delete(shortId);
  searchIndex._idToShortId.delete(id);
  searchIndex._documentIds.delete(shortId);
  searchIndex._storedFields.delete(shortId);
  searchIndex._documentCount--;
};
/**
 * Removes the given documents from the index. When the documents argument is
 * left out, _all_ documents are removed.
 *
 * @param searchIndex The search Index
 * @param documents The documents to be removed. If omitted, the whole index
 * is emptied. To remove everything, omitting the argument is more efficient
 * than passing every document.
 * @throws Error if a documents argument is passed explicitly but is missing
 * (e.g. `undefined` or `null`), to guard against wiping the index by accident
 */
export const removeAll = function removeAll<Document, ID>(
  searchIndex: SearchIndex<Document, ID>,
  documents?: readonly Document[],
): void {
  if (documents) {
    for (const document of documents) {
      remove(searchIndex, document);
    }
    return;
  }

  // `arguments` tells apart "argument omitted" from "argument passed but
  // empty"; this is why a `function` expression is used instead of an arrow.
  if (arguments.length > 1) {
    throw new Error(
      "Expected documents to be present. Omit the argument to remove all documents.",
    );
  }

  // No documents argument at all: reset the index to its empty state.
  searchIndex._index = new SearchableMap();
  searchIndex._documentCount = 0;
  searchIndex._nextId = 0;
  searchIndex._avgFieldLength = [];
  searchIndex._documentIds = new Map();
  searchIndex._idToShortId = new Map();
  searchIndex._fieldLength = new Map();
  searchIndex._storedFields = new Map();
};