In [1]:
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys,os
sys.path.append("../")
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse.analyzer import ChineseAnalyzer

# jieba's Whoosh-compatible analyzer; used below to tokenize the "content" field.
analyzer = ChineseAnalyzer()

# Index schema (one entry per document field):
#   title   - TEXT field, stored.
#   path    - ID field, stored. An ID value is indexed as a single unit and is
#             not split into words; typical uses are file paths, URLs, dates
#             and categories.
#   content - TEXT field, stored, tokenized with the ChineseAnalyzer above.
#             TEXT fields index the body text so individual terms can be
#             searched.
# stored=True keeps the original field value in the index so it is available
# on search hits (the search code highlights the stored "content" value).
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True, analyzer=analyzer),
)

# Make sure the index storage directory exists before creating the index.
if not os.path.exists("tmp"):
    os.mkdir("tmp")

# Create a new index in "tmp" laid out according to the schema.
ix = create_in("tmp", schema)

# Write the sample documents into the index.
# The writer is used as a context manager: it commits the added documents on
# normal exit and cancels the write if an exception is raised, so the writer
# is never left open on failure.
# Every document has its own title and path so stored hits can be told apart.
with ix.writer() as writer:
    writer.add_document(
        title="document1",
        path="/a",
        content="This is the first document we’ve added!"
    )

    writer.add_document(
        title="document2",
        path="/b",
        content="The second one 你 中文测试中文 is even more interesting! 吃水果"
    )

    writer.add_document(
        title="document3",
        path="/c",
        content="买水果然后来世博园。"
    )

    writer.add_document(
        title="document4",
        path="/d",
        content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
    )

    writer.add_document(
        title="document5",
        path="/e",
        content="咱俩交换一下吧。"
    )

# Search the index.
# Queries are parsed against the "content" field by default.
parser = QueryParser("content", schema=ix.schema)

# The searcher holds open index files; the context manager closes it once all
# queries have been run.
with ix.searcher() as searcher:
    for keyword in ("水果世博园","你","first","中文","交换机","交换"):
        # keyword is the raw query string handed to the query parser.
        print("result of ",keyword)
        q = parser.parse(keyword)
        results = searcher.search(q)
        for hit in results:
            # Print the matching excerpt of the stored content, with the
            # matched terms wrapped in highlight markup.
            print(hit.highlights("content"))
        print("="*10)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\deftu\AppData\Local\Temp\jieba.cache
Loading model cost 0.985 seconds.
Prefix dict has been built succesfully.


result of  水果世博园
买<b class="match term0">水果</b>然后来<b class="match term1">世博园</b>
result of  你
second one <b class="match term0">你</b> 中文测试中文 is even more interesting
result of  first
<b class="match term0">first</b> document we’ve added
result of  中文
second one 你 <b class="match term0">中文</b>测试<b class="match term0">中文</b> is even more interesting
result of  交换机
干事每月经过下属科室都要亲口交代24口<b class="match term0">交换机</b>等技术性器件的安装工作
result of  交换
咱俩<b class="match term0">交换</b>一下吧
干事每月经过下属科室都要亲口交代24口<b class="match term0">交换</b>机等技术性器件的安装工作
