Skip to content
This repository
Newer
Older
100644 82 lines (69 sloc) 3.617 kb
fccc6851 »
2011-06-21 Initial open-source release
1 (*
2 Copyright © 2011 MLstate
3
4 This file is part of OPA.
5
6 OPA is free software: you can redistribute it and/or modify it under the
7 terms of the GNU Affero General Public License, version 3, as published by
8 the Free Software Foundation.
9
10 OPA is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for
13 more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with OPA. If not, see <http://www.gnu.org/licenses/>.
17 *)
18 (*
19 @author Louis Gesbert
20 **)
21
22 (**
23 This module extracts search-words from a text: it's used, for instance, by
24 the database full-search function.
25
26 Filtering is quite naive for now, but can be improved.
27
28 This module uses the Ulex syntax extension
29 *)
30
31 exception Result of int StringMap.t (* internal use, to return result at eof *)
32
33 (* Based on 10002fr.equ file (most common English words on Usenet list), taken
34 from Moby Word Lists by Grady Ward, a public domain package:
35 http://www.gutenberg.org/ebooks/3201. The regular expression contains all
36 the words up to line 70 of the file, excluding "article", "writes", "UUCP",
37 single letter words and capitalized words (all of which are variants of lower
38 case words already on the list).
39
40 Unoptimized regexp:
41 "\\(the\\|to\\|of\\|and\\|is\\|in\\|that\\|it\\|for\\|you\\|on\\|be\\|have\\|are\\|with\\|not\\|this\\|or\\|as\\|was\\|but\\|at\\|from\\|by\\|an\\|if\\|they\\|about\\|would\\|can\\|one\\|my\\|will\\|all\\|do\\|edu\\|has\\|like\\|there\\|me\\|out\\|your\\|what\\|which\\|some\\|so\\|we\\|more\\|who\\|any\\|don't\\|up\\|get\\|am\\|just\\|he\\|no\\|other\\)$"
42
43 Optimized regexp: *)
44 let common_words = Str.regexp "\\(a\\(bout\\|ll\\|n[dy]\\|re\\|[mnst]\\)\\|b\\(ut\\|[ey]\\)\\|can\\|do\\(n't\\)?\\|edu\\|f\\(or\\|rom\\)\\|get\\|h\\(a\\(s\\|ve\\)\\|e\\)\\|i[fnst]\\|just\\|like\\|m\\(ore\\|[ey]\\)\\|not?\\|o\\(ne\\|ther\\|ut\\|[fnr]\\)\\|so\\(me\\)?\\|t\\(h\\(at\\|e\\(re\\|y\\)?\\|is\\)\\|o\\)\\|up\\|w\\(as\\|e\\|h\\(at\\|ich\\|o\\)\\|i\\(ll\\|th\\)\\|ould\\)\\|your?\\)$"
45
46 (** @param map is a map from word to number of its occurences; this function
47 returns the updated map with occurences from s *)
48 let utf8_string map s =
49 let bad_utf8 () =
da5de875 »
2011-06-22 [cleanup] jlog: remove Base.jlog
50 (* Base.jlog ~level:3 "Invalid utf-8 in string, not indexing";*) map in
fccc6851 »
2011-06-21 Initial open-source release
51 let add_word map w =
52 match StringMap.find_opt w map with
53 | Some num -> StringMap.add w (num + 1) map
54 | None -> StringMap.add w 1 map
55 in
56 let extract_word acc = lexer
57 | xml_ideographic ->
58 (* Fixme: splitting words in ideographic languages is non-trivial. See
59 for example http://www.mnogosearch.org/doc33/msearch-cjk.html *)
60 (* add_word acc (Ulexing.utf8_lexeme lexbuf) -- index single ideograms ? *)
61 acc (* we don't index at all, for now *)
62 | xml_letter+ ->
63 if Ulexing.lexeme_length lexbuf <= 1 then
64 acc
65 else
66 let word = Cactutf.lowercase (Ulexing.utf8_lexeme lexbuf) in
67 if String.length word <= 1 || Str.string_match common_words word 0 then
68 acc
69 else
70 add_word acc word
71 | eof -> raise (Result acc)
72 | _ -> acc
73 in
74 let rec lex acc lexbuf = lex (extract_word acc lexbuf) lexbuf
75 in
76 try
77 lex map (Ulexing.from_utf8_string s)
78 with
79 | Result map -> map
80 | Ulexing.Error | Ulexing.InvalidCodepoint _ -> bad_utf8()
81 | Utf8.MalFormed -> bad_utf8() (* that one is undocumented in ulex (?) *)
Something went wrong with that request. Please try again.