# 对文档建立索引并实现检索及排序 

@Title: 对文档建立索引并实现检索及排序   
@Author: JJYDXFS   
@Date: 11 July 2021

In [17]:
import jieba
import os
import numpy as np
import time

In [18]:
def read_file(file_path):
    '''
    读文件内容
    '''
    fp = open(file_path,"r",encoding='gbk',errors='ignore')
    content = fp.read()
    fp.close()
    return content

def create_stopwords(file_path):
    '''
    创建停词列表
    '''
    stopwords = [line.strip() for line in open(file_path, 'r', encoding='utf-8').readlines()]
    return stopwords

def preprocess(content):
    '''
    对文本文件预处理
    '''
    content = content.replace("\n", "")
    content = content.replace("\u3000", "")
    
    # content = content.replace(" ", "")
    return content

def create_seq_index(start_doc_id, end_doc_id):
    '''
    建立顺序索引
    '''
    seq_index={}
    for doc_id in range(start_doc_id, end_doc_id, 1):
        raw_content = read_file(corpus_path+str(doc_id)+".txt").strip()
        content = preprocess(raw_content)
        content_seg = jieba.cut(content)    # jieba分词
        
        word_map = {} # 定义单文档词项表
        
        # 去停词 + 计算出现次数
        word_amount = 0
        for word in content_seg:
            word_amount+=1
            if word not in stopwords:
                if word not in word_map.keys():
                    word_map[word]=1
                else:
                    word_map[word]+=1
        # 计算 tf
        for word in word_map:
            # word_map[word]/=word_amount
            word_map[word] = round(word_map[word]/word_amount,8)
        # 存入顺排索引
        seq_index[doc_id]=word_map
        
    return seq_index

def Invert_in_batch(seq_index):
    '''
    对每块文档建立倒排索引
    '''
    
    global word_id_map
    global word_id_counting
    global word_id_table
    
    # 由顺序索引建立倒排索引
    tmp_word_table = {} # 定义 当前文档块 的词项表
    pos_table = {} # 定义 当前文档块 的倒排记录表
    for doc_id in seq_index: # 遍历顺序索引建立倒排索引
        for word in seq_index[doc_id]:
            if word not in word_id_map: # 全局词项表未收录词
                word_id_map[word] =  word_id_counting # 加入全局词项表映射
                word_id_table[word_id_map[word]] = 1 # 加入全局词项表
                word_id_counting += 1 # 序号自增
            
            word_id = word_id_map[word] # 获取词项表对应序号
            
            if word_id not in tmp_word_table: # 局部词项表未收录词
                tmp_word_table[word_id] = True # 加入局部词项表
                pos_table[word_id] = {doc_id:seq_index[doc_id][word]} # 创建倒排记录
            else:
                word_id_table[word_id] +=1
                pos_table[word_id][doc_id] = seq_index[doc_id][word] # 更新倒排记录
    
    return pos_table

## 1. 预处理工作

In [19]:
# 创建停词列表
stopwords_file = "F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\cn_stopwords.txt"
stopwords=create_stopwords(stopwords_file)

## 2. 建立倒排索引

### 2.1 基于内存建立倒排索引

In [20]:
time_start=time.time()

if __name__=="__main__":
    # corpus_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\testdata\\' # 文本语料路径
    corpus_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\data\\Sogou\\' # 文本语料路径
    start_doc_id, end_doc_id = 1, 2001 # 指定文档集范围，左闭右开
    doc_number = end_doc_id - start_doc_id # 文档总数
    
    # 建立顺序索引
    seq_index=create_seq_index(start_doc_id, end_doc_id)
    
    # 由顺序索引建立倒排索引
    word_table = {} # 定义词项表
    invert_table = {} # 定义倒排记录表
    for doc_id in seq_index: # 遍历顺序索引建立倒排索引
        for word in seq_index[doc_id]:
            if word not in word_table: # 未收录词
                word_table[word] = 1 # 加入词项表
                invert_table[word] = {doc_id:seq_index[doc_id][word]} # 创建倒排记录
            else:
                word_table[word] +=1
                invert_table[word][doc_id] = seq_index[doc_id][word]
    
    # 计算 idf
    for word in word_table:
        word_table[word] = round(word_table[word]/doc_number, 8)
                
time_end=time.time()
print('time cost',time_end-time_start,'s')

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\JOYCE\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\JOYCE\AppData\Local\Temp\jieba.cache
Loading model cost 0.810 seconds.
DEBUG:jieba:Loading model cost 0.810 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


time cost 23.336418390274048 s


### 2.2 基于外部磁盘建立倒排索引（分块处理）

In [15]:
time_start=time.time()

if __name__=="__main__":
    # corpus_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\testdata\\' # 文本语料路径
    corpus_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\data\\Sogou\\' # 文本语料路径
    output_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\output\\' # 输出索引文件路径
    start_doc_id, end_doc_id = 10, 2000 # 指定文档集范围，左闭右开
    doc_number = end_doc_id - start_doc_id # 文档总数
    
    batch = 10 # 指定块数
    
    assert (end_doc_id - start_doc_id) % batch == 0 # 保证文档总数是batch的整数倍
    batch_size = int((end_doc_id - start_doc_id)/batch) # 分块进行处理
    
    # 全局变量
    word_id_map ={} # 维护 词项及其序号映射
    word_id_table = {} # 定义词项表
    word_id_counting = 0 # 词项自增序号
    
    tmp_doc_id = start_doc_id # 指定每块起始文档
    batch_count = 0 # 块计数器
    
    while batch_count != batch: # 还有块未处理完时则继续
        # 建立顺序索引
        seq_index = create_seq_index(tmp_doc_id, tmp_doc_id + batch_size)
        # 建立倒排索引
        tmp_pos_table = Invert_in_batch(seq_index)
        # 将块的倒排索引写入磁盘
        np.save(output_path+str(batch_count) + ".npy", tmp_pos_table)
        batch_count += 1
        # 读下一块
        tmp_doc_id += batch_size
        
    # 合并倒排索引
    invert_table = np.load(output_path+"0.npy",allow_pickle=True)[()] # 将nd array转为内置字典
    for table in range(1,batch):
        invert_table_tmp = np.load(output_path + str(table) + ".npy",allow_pickle=True)[()]
        for word_id in invert_table_tmp:
            if word_id in invert_table.keys(): # 若待合并项在原索引表中则更新原表项
                invert_table[word_id].update(invert_table_tmp[word_id])
            else: # 若待合并项不在原索引表中则新增一项
                invert_table[word_id] = invert_table_tmp[word_id]
    
    # 输出合并后的完整倒排索引
    # np.save(output_path+ "result.npy", invert_table)
    
    # 计算 idf
    for word_id in word_id_table:
        word_id_table[word] = round(word_id_table[word_id]/doc_number, 8)
                
time_end=time.time()
print('time cost',time_end-time_start,'s')

time cost 225.83593440055847 s


In [56]:
#print(invert_table_dict[word_id_map['教育']])
#print(sorted(invert_table[word_id_map['孩子']].items(), key = lambda kv:(kv[1], kv[0])))
#print(invert_table[word_id_map['教育']])
#np.save(output_path+ "result.npy", invert_table)

## 3. 布尔检索

In [21]:
def get_record(word):
    '''
    获得词项的倒排记录集合
    异常处理待完善
    '''
    try:
        result = set(list(invert_table[word].keys()))
    except Exception as e:
        raise e
        
    return result

def get_precede(op1, op2):
    '''
    返回两运算符优先级关系
    @param
    op1: 栈顶元素
    op2: 栈外元素
    @return
    优先级关系: > < = !
    '''
    
    if op2 == '#': return '>'
    
    precede_map={
        '(':{'(':'<', ')':'=', 'AND':'<', 'OR':'<', 'ANDNOT':'<'},
        ')':{'(':'!', ')':'>', 'AND':'>', 'OR':'>', 'ANDNOT':'>'},
        'AND':{'(':'<', ')':'>', 'AND':'>', 'OR':'>', 'ANDNOT':'>'},
        'OR':{'(':'<', ')':'>', 'AND':'<', 'OR':'>', 'ANDNOT':'<'},
        'ANDNOT':{'(':'<', ')':'>', 'AND':'>', 'OR':'>', 'ANDNOT':'>'}
    }

    return precede_map[op1][op2]

def is_op(op):
    '''
    判断是否为运算符
    '''
    return (op == '(' or op == ')' or op == 'AND' or op == 'OR' or op == 'ANDNOT')

def calc_record(record1, record2, op):
    '''
    按运算符对倒排记录执行计算
    '''
    if op == 'AND':
        return record1 & record2
    elif op == 'OR':
        return record1 | record2
    elif op == 'ANDNOT':
        return record2 - record1
    else:
        # 非法输入
        return

In [23]:
def calc_bool_exp(exp):
    '''
    计算布尔表达式
    @param 
    exp: 布尔表达式的列表形式
    @return
    result[-1]: 总体交集
    word_set: 涉及的词项集合
    '''
    result_stack=[] # 预算结果栈
    op_stack=[] # 运算符栈
    word_set = set([]) # 词项集合
    i = 0 # 计数器i
    elen = len(exp) # 表达式长度

    while i < elen or len(op_stack) != 0:

        if i < elen and not is_op(exp[i]): # 词项入栈
            result_stack.append(get_record(exp[i]))
            word_set.add(exp[i]) # 词项入集合
            i += 1
        elif i<elen and len(op_stack) == 0: # 第一个运算符入栈
            op_stack.append(exp[i])
            i += 1
        else:
            op1 = op_stack[-1] # 取运算符栈顶元素
            # 取当前运算符，若表达式结束，则返回 '#'
            c = exp[i] if i < elen else '#'
            # 判断栈顶和当前运算符的优先级
            precede = get_precede(op1, c)
            if precede == '<':
                op_stack.append(c)
                i += 1
            elif precede == '=':
                op_stack.pop()
                i += 1
            elif precede == '>':
                op_stack.pop() # 栈顶运算符出栈
                record1 = result_stack.pop() # 中间结果1出栈
                record2 = result_stack.pop() # 中间结果2出栈
                result = calc_record(record1, record2, op1) 
                result_stack.append(result) # 计算结果入栈
            else:
                # 优先级错误
                return

    return result_stack[-1], word_set

In [8]:
# 测试对照程序
edu = set(list(invert_table['教育'].keys()))
stu = set(list(invert_table['学生'].keys()))
child = set(list(invert_table['孩子'].keys()))

In [12]:
print(invert_table['教育'])

{45: 0.00060938, 57: 0.00065445, 65: 0.00134409, 94: 0.00120627, 95: 0.01577287, 101: 0.00282943, 124: 0.00332226, 129: 0.01470588, 209: 0.00098765, 228: 0.00043403, 234: 0.00058514, 486: 0.00167785, 534: 0.00410959, 564: 6.389e-05, 630: 0.00757576, 736: 0.00157233, 745: 0.00136612, 752: 0.00059737, 774: 0.00055741, 816: 0.00352113, 841: 0.00060901, 859: 0.00257732, 860: 0.00296736, 888: 0.00441501, 917: 0.004662, 1227: 0.00266667, 1241: 0.00409836, 1358: 4.241e-05, 1469: 0.00091241, 1531: 0.00088496, 1567: 0.00093721, 1625: 0.00396825, 1652: 0.00063816, 1673: 0.00070621, 1677: 0.00158479, 1726: 0.00059102, 1757: 0.00108932, 1776: 0.00062696, 1856: 0.00359712, 1876: 0.00114416, 1969: 0.00483092}


In [9]:
len((stu - child) | (edu - child))

40

In [24]:
# 测试驱动：输入表达式字符串
# exp_str = '孩子'
exp_str = input('请输入检索表达式：')
exp=exp_str.split(' ')
# 运行计算程序
search_result, word_set = calc_bool_exp(exp)

请输入检索表达式：北京 AND 企业


In [25]:
print(search_result)

{7, 1040, 17, 1043, 1044, 21, 1046, 1047, 536, 1048, 1049, 1050, 1051, 541, 1053, 1054, 1056, 546, 553, 1065, 555, 1067, 45, 1068, 1069, 564, 565, 566, 1588, 1591, 74, 1099, 1101, 1615, 80, 1104, 83, 1623, 90, 94, 95, 101, 1640, 115, 1652, 1154, 1668, 133, 1672, 1677, 1684, 1191, 1201, 1203, 1718, 1211, 188, 1730, 714, 1738, 1228, 1230, 1751, 1755, 231, 1768, 1259, 1776, 753, 1268, 1269, 770, 1290, 1291, 1293, 1808, 1819, 285, 1828, 810, 1835, 1836, 1840, 1329, 309, 1853, 831, 832, 1347, 839, 840, 1866, 339, 1373, 1384, 882, 884, 391, 906, 916, 918, 1455, 1976, 955, 957, 479, 487, 1517}


In [61]:
print(word_set)

{'北京', '企业'}


## 4. 基于TF/IDF排序对检索结果进行排序

In [26]:
sorted_result = {}

for doc in search_result: # 计算每个文档的 tfidf 值
    tfidf = 0
    for word in word_set:
        # 词没有对应文档倒排记录的，tf置为 0 
        tf = 0 if doc not in invert_table[word].keys() else invert_table[word][doc]
        idf = word_table[word]
        tfidf += round(tf*idf, 10)
    sorted_result[doc] = tfidf

# 降序排列
sorted(sorted_result.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)

[(840, 0.010137323299999999),
 (882, 0.009528383999999999),
 (309, 0.009528383999999999),
 (479, 0.008985073699999999),
 (918, 0.0079369735),
 (1677, 0.0065237731),
 (45, 0.005826629),
 (810, 0.0058165882),
 (1268, 0.005719511300000001),
 (80, 0.0053162696),
 (1976, 0.005103905),
 (1623, 0.0046229836),
 (1384, 0.0041428552),
 (884, 0.00401729),
 (916, 0.0040020754),
 (285, 0.0039490093),
 (1672, 0.003940246200000001),
 (1853, 0.003939862),
 (536, 0.0038124299),
 (1768, 0.0037738892),
 (17, 0.0033751975),
 (95, 0.0032492103000000003),
 (1828, 0.0030818182),
 (1684, 0.0030108006),
 (839, 0.0027259411),
 (94, 0.0026005218),
 (714, 0.0024463999),
 (1373, 0.0024260204),
 (101, 0.002250808),
 (906, 0.0019665522),
 (133, 0.0018590383),
 (1228, 0.0018456083),
 (1230, 0.0018403963),
 (1099, 0.0018152969),
 (1259, 0.0017940805000000002),
 (115, 0.0017681380999999999),
 (83, 0.0017565233),
 (1154, 0.0017339074),
 (566, 0.0016666652),
 (1776, 0.0016498427000000001),
 (90, 0.0016480000000000002),
 

In [39]:
str(tuple(search_result))

'(1984, 1992, 1998, 1943, 243, 1940, 788, 1941, 951, 1942, 348)'

In [37]:
search_result

{243, 348, 788, 951, 1940, 1941, 1942, 1943, 1984, 1992, 1998}

## 将摘要内容存入数据库

In [60]:
import pymysql

class MySQLDB:
    def __new__(cls, host, user, password, database):
        """ test connection """
        try:
            connect = pymysql.connect(host = host,
                            user = user,
                            password = password,
                            database = database,
                            charset = 'utf8')

        except Exception as e:
            print("Failed to connect database: ", e)
            return None

        else:
            connect.close()
            return super().__new__(cls)

    def __init__(self, host, user, password, database):
        """ initialize args """
        self.host, self.user, self.password, self.database = host, user, password, database

    def query(self,sqls):
        """ execute the sqls """
        assert isinstance(sqls,list), 'sqls must be a list'
        connect = pymysql.connect(host = self.host,
                            user = self.user,
                            password = self.password,
                            database = self.database,
                            cursorclass = pymysql.cursors.DictCursor)
        cursor = connect.cursor()
        try:
            for sql in sqls:
                cursor.execute(sql)

            data = cursor.fetchall()
        except Exception as e:
            print("Failed to query: ", e)
            data = None
        finally:
            connect.close()
            return data

In [61]:
db_mysql = MySQLDB(host='localhost',
                user='root',
                password='618618',
                database='test')

# print(db_mysql.query([sql]))

In [62]:
corpus_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\data\\Sogou\\' # 文本语料路径
doc_abstract = []
for doc_id in range(1, 17901, 1):
    raw_content = read_file(corpus_path+str(doc_id)+".txt").strip()
    content = preprocess(raw_content)
    doc_abstract.append(content[:70])

## 生成前端所需检索结果

In [63]:
def get_url(doc):
    server_path = "http://localhost:5000/news={}".format(doc)
    return server_path

In [64]:
sql="""
select * from detail
where doc_id in {search_set}
""".format(search_set = tuple(search_result))

In [65]:
rlist = db_mysql.query([sql])

In [66]:
doc_detail = {}
for item in rlist:
    doc_detail[item['doc_id']] = item['detail']

In [67]:
# 生成前端所需要的形式
result_data=[]
for doc in search_result:
    result_data.append(["文档"+str(doc),get_url(doc), doc_detail[doc]])

In [68]:
result_data

[['文档7',
  'http://localhost:5000/news=7',
  '本报记者王珍发自广州河北沧州献县的家电经销商王凤恩，2005年年中将自己的小店改为“幸福树电器连锁店”。“大卖场步步紧逼，员工素质低，信息闭'],
 ['文档1040',
  'http://localhost:5000/news=1040',
  'G捷利(行情,论坛)(000996)：中国物流领域的黑马连续的拉升和成交天量，个股的大面积崛起，正在逐步印证着牛市已经扑面而来，追逐强势品种'],
 ['文档17',
  'http://localhost:5000/news=17',
  '本报记者边长勇发自北京谈论起董事会话题，ColinB.Carter滔滔不绝，作为波士顿咨询的公司治理议题资深专家，ColinB.Carter'],
 ['文档1043',
  'http://localhost:5000/news=1043',
  'G捷利(行情,论坛)(000996)：中国物流领域的黑马连续的拉升和成交天量，个股的大面积崛起，正在逐步印证着牛市已经扑面而来，追逐强势品种'],
 ['文档1044',
  'http://localhost:5000/news=1044',
  'G捷利(行情,论坛)(000996)：中国物流领域的黑马连续的拉升和成交天量，个股的大面积崛起，正在逐步印证着牛市已经扑面而来，追逐强势品种'],
 ['文档21',
  'http://localhost:5000/news=21',
  '继上月永乐与大中两巨头宣布开始为期一年的合并计划后，又一个行业并购案也处于进行之中。昨天记者获悉，全球最大家电连锁巨头美国百思买将以1.2亿'],
 ['文档1046',
  'http://localhost:5000/news=1046',
  'G捷利(行情,论坛)(000996)：中国物流领域的黑马连续的拉升和成交天量，个股的大面积崛起，正在逐步印证着牛市已经扑面而来，追逐强势品种'],
 ['文档1047',
  'http://localhost:5000/news=1047',
  'G捷利(行情,论坛)(000996)：中国物流领域的黑马连续的拉升和成交天量，个股的大面积崛起，正在逐步印证着牛市已经扑面而来，追逐强势品种'],
 ['文档5