Skip to content
This repository
tree: acb09a5171
Fetching contributors…

Cannot retrieve contributors at this time

file 81 lines (69 sloc) 3.619 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
(*
Copyright © 2011 MLstate

This file is part of OPA.

OPA is free software: you can redistribute it and/or modify it under the
terms of the GNU Affero General Public License, version 3, as published by
the Free Software Foundation.

OPA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for
more details.

You should have received a copy of the GNU Affero General Public License
along with OPA. If not, see <http://www.gnu.org/licenses/>.
*)
(*
@author Louis Gesbert
**)

(**
This module extracts search-words from a text: it's used, for instance, by
the database full-search function.

Filtering is quite naive for now, but can be improved.

This module uses the Ulex syntax extension
*)

exception Result of int StringMap.t (* internal use, to return result at eof *)

(* Based on 10002fr.equ file (most common English words on Usenet list), taken
from Moby Word Lists by Grady Ward, a public domain package:
http://www.gutenberg.org/ebooks/3201. The regular expression contains all
the words up to line 70 of the file, excluding "article", "writes", "UUCP",
single letter words and capitalized words (all of which are variants of lower
case words already on the list).

Unoptimized regexp:
"\\(the\\|to\\|of\\|and\\|is\\|in\\|that\\|it\\|for\\|you\\|on\\|be\\|have\\|are\\|with\\|not\\|this\\|or\\|as\\|was\\|but\\|at\\|from\\|by\\|an\\|if\\|they\\|about\\|would\\|can\\|one\\|my\\|will\\|all\\|do\\|edu\\|has\\|like\\|there\\|me\\|out\\|your\\|what\\|which\\|some\\|so\\|we\\|more\\|who\\|any\\|don't\\|up\\|get\\|am\\|just\\|he\\|no\\|other\\)$"

Optimized regexp: *)
let common_words = Str.regexp "\\(a\\(bout\\|ll\\|n[dy]\\|re\\|[mnst]\\)\\|b\\(ut\\|[ey]\\)\\|can\\|do\\(n't\\)?\\|edu\\|f\\(or\\|rom\\)\\|get\\|h\\(a\\(s\\|ve\\)\\|e\\)\\|i[fnst]\\|just\\|like\\|m\\(ore\\|[ey]\\)\\|not?\\|o\\(ne\\|ther\\|ut\\|[fnr]\\)\\|so\\(me\\)?\\|t\\(h\\(at\\|e\\(re\\|y\\)?\\|is\\)\\|o\\)\\|up\\|w\\(as\\|e\\|h\\(at\\|ich\\|o\\)\\|i\\(ll\\|th\\)\\|ould\\)\\|your?\\)$"

(** @param map is a map from word to number of its occurrences; this function
returns the updated map with occurrences from s *)
let utf8_string map s =
  let bad_utf8 () =
    (* Base.jlog ~level:3 "Invalid utf-8 in string, not indexing";*) map in
  let add_word map w =
    match StringMap.find_opt w map with
      | Some num -> StringMap.add w (num + 1) map
      | None -> StringMap.add w 1 map
  in
  let extract_word acc = lexer
    | xml_ideographic ->
        (* Fixme: splitting words in ideographic languages is non-trivial. See
for example http://www.mnogosearch.org/doc33/msearch-cjk.html *)
        (* add_word acc (Ulexing.utf8_lexeme lexbuf) -- index single ideograms ? *)
        acc (* we don't index at all, for now *)
    | xml_letter+ ->
        if Ulexing.lexeme_length lexbuf <= 1 then
          acc
        else
          let word = Cactutf.lowercase (Ulexing.utf8_lexeme lexbuf) in
          if String.length word <= 1 || Str.string_match common_words word 0 then
            acc
          else
            add_word acc word
    | eof -> raise (Result acc)
    | _ -> acc
  in
  let rec lex acc lexbuf = lex (extract_word acc lexbuf) lexbuf
  in
  try
    lex map (Ulexing.from_utf8_string s)
  with
    | Result map -> map
    | Ulexing.Error | Ulexing.InvalidCodepoint _ -> bad_utf8()
    | Utf8.MalFormed -> bad_utf8() (* that one is undocumented in ulex (?) *)
Something went wrong with that request. Please try again.