In [1]:
import os
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse import ChineseAnalyzer

In [2]:
# Analyzer from jieba, used to tokenize the "content" field.
analyzer = ChineseAnalyzer()

# Index layout: every field is stored so it can be read back from a hit.
# Only "content" is run through the jieba analyzer; "title" keeps the TEXT
# default and "path" is an ID field.
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True, analyzer=analyzer),
)

In [3]:
# Make sure the index directory exists. makedirs(exist_ok=True) does the
# existence check and the creation in one call, avoiding the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("tmp", exist_ok=True)

# Create a new index in "tmp" with the schema defined in the previous cell.
# To reuse an index built by an earlier run instead of creating a new one,
# open it with open_dir("tmp").
ix = create_in("tmp", schema)

In [4]:
# Sample corpus to index. Each entry supplies the three schema fields.
# Titles and paths are unique per document so that stored hits can be told
# apart (the last three entries previously shared "/c", and two of them
# shared the title "document4").
documents = [
    {
        "title": "document1",
        "path": "/a",
        "content": "This is the first document we’ve added!",
    },
    {
        "title": "document2",
        "path": "/b",
        "content": "The second one 你 中文测试中文 is even more interesting! 吃水果",
    },
    {
        "title": "document3",
        "path": "/c",
        "content": "买水果然后来世博园。",
    },
    {
        "title": "document4",
        "path": "/d",
        "content": "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
    },
    {
        "title": "document5",
        "path": "/e",
        "content": "咱俩交换一下吧。",
    },
]

# Used as a context manager, the writer commits when the block exits
# normally and cancels the pending changes if an exception escapes, so the
# index write lock is released either way (no explicit commit() needed).
with ix.writer() as writer:
    for doc in documents:
        writer.add_document(**doc)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Jacen\AppData\Local\Temp\jieba.cache
Loading model cost 1.078 seconds.
Prefix dict has been built succesfully.


In [5]:
# Query strings are parsed against the "content" field by default, using
# the schema carried by the index itself.
parser = QueryParser(fieldname="content", schema=ix.schema)

# Searcher over the index; it is reused by the query loop that follows.
searcher = ix.searcher()

In [6]:
# Run each sample query against the index and print the highlighted
# excerpt of the "content" field for every hit, followed by a separator.
sample_queries = ("水果世博园", "你", "first", "中文", "交换机", "交换")

for term in sample_queries:
    print("result of ", term)
    hits = searcher.search(parser.parse(term))
    for match in hits:
        excerpt = match.highlights("content")
        print(excerpt)
    print("=" * 10)

result of  水果世博园
买<b class="match term0">水果</b>然后来<b class="match term1">世博园</b>
result of  你
second one <b class="match term0">你</b> 中文测试中文 is even more interesting
result of  first
<b class="match term0">first</b> document we’ve added
result of  中文
second one 你 <b class="match term0">中文</b>测试<b class="match term0">中文</b> is even more interesting
result of  交换机
干事每月经过下属科室都要亲口交代24口<b class="match term0">交换机</b>等技术性器件的安装工作
result of  交换
咱俩<b class="match term0">交换</b>一下吧
干事每月经过下属科室都要亲口交代24口<b class="match term0">交换</b>机等技术性器件的安装工作


In [7]:
# Show how the analyzer tokenizes a mixed Chinese/English sentence by
# printing the text of each token as it is produced.
sample_text = "我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"

for token in analyzer(sample_text):
    print(token.text)

我
好
朋友
是
李明
我
爱
北京
天安
天安门
ibm
microsoft
dream
intetest
interest
me
lot


In [8]:
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse import ChineseAnalyzer

# Re-open the index written earlier and query it again, this time including
# keywords that match no document.
analyzer = ChineseAnalyzer()

# Same schema as the one used to build the index. Opening the index does
# not take it as an argument; the parser below reads the schema from `ix`.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))

# Open the existing index. The directory is deliberately NOT created here:
# an empty directory contains no index to open, so creating one could only
# leave a stray folder behind. Dropping that step also removes this cell's
# only use of `os`, which the cell's own import block does not provide.
ix = open_dir("tmp")

parser = QueryParser("content", schema=ix.schema)

# The context manager closes the searcher when the loop finishes, even if
# a query raises.
with ix.searcher() as searcher:
    for keyword in ("水果小姐", "你", "first", "中文", "交换机", "交换", "少林", "乔峰"):
        print("result of ", keyword)
        q = parser.parse(keyword)
        results = searcher.search(q)
        for hit in results:
            print(hit.highlights("content"))
        print("=" * 10)

result of  水果小姐
result of  你
second one <b class="match term0">你</b> 中文测试中文 is even more interesting
result of  first
<b class="match term0">first</b> document we’ve added
result of  中文
second one 你 <b class="match term0">中文</b>测试<b class="match term0">中文</b> is even more interesting
result of  交换机
干事每月经过下属科室都要亲口交代24口<b class="match term0">交换机</b>等技术性器件的安装工作
result of  交换
咱俩<b class="match term0">交换</b>一下吧
干事每月经过下属科室都要亲口交代24口<b class="match term0">交换</b>机等技术性器件的安装工作
result of  少林
result of  乔峰
