In [2]:
import jieba
import os
import numpy as np
import time

In [3]:
def read_file(file_path):
    '''
    读文件内容
    '''
    fp = open(file_path,"r",encoding='gbk',errors='ignore')
    content = fp.read()
    fp.close()
    return content

def create_stopwords(file_path):
    '''
    创建停词列表
    '''
    stopwords = [line.strip() for line in open(file_path, 'r', encoding='utf-8').readlines()]
    return stopwords

def preprocess(content):
    '''
    对文本文件预处理
    '''
    content = content.replace("\n", "")
    content = content.replace("\u3000", "")
    
    # content = content.replace(" ", "")
    return content

def create_seq_index(start_doc_id, end_doc_id):
    '''
    建立顺序索引
    '''
    seq_index={}
    for doc_id in range(start_doc_id, end_doc_id, 1):
        raw_content = read_file(corpus_path+str(doc_id)+".txt").strip()
        content = preprocess(raw_content)
        content_seg = jieba.cut(content)    # jieba分词
        
        word_map = {} # 定义单文档词项表
        
        # 去停词 + 计算出现次数
        word_amount = 0
        for word in content_seg:
            word_amount+=1
            if word not in stopwords:
                if word not in word_map.keys():
                    word_map[word]=1
                else:
                    word_map[word]+=1
        # 计算 tf
        for word in word_map:
            # word_map[word]/=word_amount
            word_map[word] = round(word_map[word]/word_amount
        # 存入顺排索引
        seq_index[doc_id]=word_map
        
    return seq_index

def Invert_in_batch(seq_index):
    '''
    对每块文档建立倒排索引
    '''
    
    global word_id_map
    global word_id_counting
    global word_id_table
    
    # 由顺序索引建立倒排索引
    tmp_word_table = {} # 定义 当前文档块 的词项表
    pos_table = {} # 定义 当前文档块 的倒排记录表
    for doc_id in seq_index: # 遍历顺序索引建立倒排索引
        for word in seq_index[doc_id]:
            if word not in word_id_map: # 全局词项表未收录词
                word_id_map[word] =  word_id_counting # 加入全局词项表映射
                word_id_table[word_id_map[word]] = 1 # 加入全局词项表
                word_id_counting += 1 # 序号自增
            
            word_id = word_id_map[word] # 获取词项表对应序号
            
            if word_id not in tmp_word_table: # 局部词项表未收录词
                tmp_word_table[word_id] = True # 加入局部词项表
                pos_table[word_id] = {doc_id:seq_index[doc_id][word]} # 创建倒排记录
            else:
                word_id_table[word_id] +=1
                pos_table[word_id][doc_id] = seq_index[doc_id][word] # 更新倒排记录
    
    return pos_table

## 预处理工作

In [4]:
# 创建停词列表
stopwords_file = "F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\cn_stopwords.txt"
stopwords=create_stopwords(stopwords_file)

## 基于内存建立倒排索引

In [4]:
time_start=time.time()

if __name__=="__main__":
    corpus_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\testdata\\' # 文本语料路径
    start_doc_id, end_doc_id = 10, 2000 # 指定文档集，左闭右开
    
    # 建立顺序索引
    seq_index=create_seq_index(start_doc_id, end_doc_id)
    
    # 由顺序索引建立倒排索引
    word_table = {} # 定义词项表
    pos_table = {} # 定义倒排记录表
    for doc_id in seq_index: # 遍历顺序索引建立倒排索引
        for word in seq_index[doc_id]:
            if word not in word_table: # 未收录词
                word_table[word] = 1 # 加入词项表
                pos_table[word] = {doc_id:seq_index[doc_id][word]} # 创建倒排记录
            else:
                word_table[word] +=1
                pos_table[word][doc_id] = seq_index[doc_id][word]
                
time_end=time.time()
print('time cost',time_end-time_start,'s')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\JOYCE\AppData\Local\Temp\jieba.cache
Loading model cost 0.610 seconds.
Prefix dict has been built succesfully.


time cost 26.87848210334778 s


## 基于外部磁盘建立倒排索引（基于块处理）

In [5]:
time_start=time.time()

if __name__=="__main__":
    corpus_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\testdata\\' # 文本语料路径
    output_path = 'F:\\OneDrive\\Documents\\ThirdYear\\MediaDataAnalysis\\SearchEngine\\output\\' # 输出索引文件路径
    start_doc_id, end_doc_id = 10, 2000 # 指定文档集，左闭右开
    
    batch = 10 # 指定块数
    
    assert (end_doc_id - start_doc_id) % batch == 0 # 保证文档总数是batch的整数倍
    batch_size = int((end_doc_id - start_doc_id)/batch) # 分块进行处理
    
    # 全局变量
    word_id_map ={} # 维护 词项及其序号映射
    word_id_table = {} # 定义词项表
    word_id_counting = 0 # 词项自增序号
    
    tmp_doc_id = start_doc_id # 指定每块起始文档
    batch_count = 0 # 块计数器
    
    while batch_count != batch: # 还有块未处理完时则继续
        # 建立顺序索引
        seq_index = create_seq_index(tmp_doc_id, tmp_doc_id + batch_size)
        # 建立倒排索引
        tmp_pos_table = Invert_in_batch(seq_index)
        # 将块的倒排索引写入磁盘
        np.save(output_path+str(batch_count) + ".npy", tmp_pos_table)
        batch_count += 1
        # 读下一块
        tmp_doc_id += batch_size
        
    # 合并倒排索引
    invert_table = np.load(output_path+"0.npy",allow_pickle=True)[()] # 将nd array转为内置字典
    for table in range(1,batch):
        invert_table_tmp = np.load(output_path + str(table) + ".npy",allow_pickle=True)[()]
        for word_id in invert_table_tmp:
            if word_id in invert_table.keys(): # 若待合并项在原索引表中则更新原表项
                invert_table[word_id].update(invert_table_tmp[word_id])
            else: # 若待合并项不在原索引表中则新增一项
                invert_table[word_id] = invert_table_tmp[word_id]
                
time_end=time.time()
print('time cost',time_end-time_start,'s')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\JOYCE\AppData\Local\Temp\jieba.cache
Loading model cost 0.612 seconds.
Prefix dict has been built succesfully.


time cost 28.827686309814453 s


In [21]:
print(invert_table_dict[word_id_map['教育']])

{12: 0.005763688760806916, 13: 0.0008203445447087777, 14: 0.024324324324324326, 19: 0.0023752969121140144, 20: 0.015056461731493099, 21: 0.0035971223021582736, 27: 0.0045351473922902496, 28: 0.002824858757062147}


In [10]:
# 合并倒排索引
invert_table = np.load(output_path+"0.npy",allow_pickle=True)[()] # 将nd array转为内置字典
for table in range(1,batch):
    invert_table_tmp = np.load(output_path + str(table) + ".npy",allow_pickle=True)[()]
    for word_id in invert_table_tmp:
        if word_id in invert_table.keys(): # 若待合并项在原索引表中则更新原表项
            invert_table[word_id].update(invert_table_tmp[word_id])
        else: # 若待合并项不在原索引表中则新增一项
            invert_table[word_id] = invert_table_tmp[word_id]

In [10]:
print(sorted(invert_table[word_id_map['孩子']].items(), key = lambda kv:(kv[1], kv[0])))

[(187, 7.639419404125286e-05), (1160, 7.810669374365383e-05), (1157, 8.163931749530574e-05), (1204, 0.00017262213015708613), (719, 0.00018254837531945966), (1070, 0.00021146119687037428), (1636, 0.00021992522542335605), (423, 0.00022983222247759135), (1089, 0.00023529411764705883), (306, 0.00032229473853839335), (1104, 0.0003553028957186001), (1429, 0.000419639110365086), (220, 0.0004210526315789474), (422, 0.00044296788482834997), (1273, 0.00044662795891022776), (1759, 0.00047214353163361664), (1264, 0.0005082592121982211), (982, 0.0005263157894736842), (1229, 0.0005509641873278236), (1566, 0.0005530973451327434), (584, 0.0005574136008918618), (30, 0.0006097560975609756), (1244, 0.0006176652254478073), (1369, 0.0006180469715698393), (62, 0.0006234413965087282), (312, 0.00065359477124183), (1804, 0.0006618133686300463), (1007, 0.0006648936170212766), (567, 0.0006655574043261231), (882, 0.0006666666666666666), (1091, 0.0006837606837606838), (672, 0.0006839945280437756), (651, 0.00069396

In [11]:
print(invert_table[word_id_map['孩子']].items())

dict_items([(15, 0.01589825119236884), (21, 0.009592326139088728), (30, 0.0006097560975609756), (31, 0.0037243947858473), (34, 0.003215434083601286), (42, 0.00315955766192733), (45, 0.008383233532934131), (54, 0.0009354536950420954), (55, 0.0014326647564469914), (57, 0.0009302325581395349), (62, 0.0006234413965087282), (67, 0.017391304347826087), (68, 0.0011547344110854503), (81, 0.01310615989515072), (91, 0.0022598870056497176), (102, 0.003239241092086997), (112, 0.006802721088435374), (113, 0.0017497812773403325), (125, 0.0037313432835820895), (130, 0.0011918951132300357), (136, 0.0029411764705882353), (137, 0.0011350737797956867), (148, 0.0010845986984815619), (150, 0.007978723404255319), (164, 0.011019283746556474), (167, 0.03731343283582089), (170, 0.01764705882352941), (171, 0.011573350015639663), (172, 0.02549246813441483), (173, 0.0026827632461435278), (187, 7.639419404125286e-05), (207, 0.0017985611510791368), (211, 0.02403846153846154), (215, 0.0395882818685669), (220, 0.0004

In [6]:
np.save(output_path+ "result.npy", invert_table)