In [1]:
# 对dataset当中的文本进行处理, 基本的处理方式:
# 1. 将字母转化为小写(注: 作者名不进行转化);
# 2. 将标点符号去掉;
# 3. 将单词还原为原型;

import os
import shutil
import string
import math
import numpy as np 
import pandas as pd

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# 获取单词的词性
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first letter of the tag is inspected: ``J`` maps to adjective,
    ``V`` to verb, ``N`` to noun and ``R`` to adverb.

    Args:
        tag: POS tag string, such as the second element of the tuples
            returned by ``pos_tag``.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with none of the four letters (callers substitute a default).
    """
    prefix_to_attr = (("J", "ADJ"), ("V", "VERB"), ("N", "NOUN"), ("R", "ADV"))
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            # Looked up lazily so that only the matching constant is touched.
            return getattr(wordnet, attr_name)
    return None


In [2]:
# Preprocess every raw document in `input_path` and write the result to
# `output_path`.  Output layout: line 1 holds the author field, line 2 holds
# the whole body as one space-separated sequence of lemmatized tokens.
input_path = "..//dataset"
output_path = ".//newdata//"

# Create the output directory once, instead of re-checking it for every file.
os.makedirs(output_path, exist_ok=True)
# A single lemmatizer instance is reused for all lines of all documents.
wnl = WordNetLemmatizer()
# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans("", "", string.punctuation)

for doc in os.listdir(input_path):
    # The output file is named after the lower-cased input name without its
    # extension (identical to the former `doc[:-4]` for ".txt" inputs).
    name = os.path.splitext(doc)[0].lower()
    new_doc_path = os.path.join(output_path, name)

    # Read the source directly; the context manager closes the handle.
    with open(os.path.join(input_path, doc)) as src:
        raw_lines = src.readlines()

    author = None
    body_words = []
    for line in raw_lines:
        # Strip punctuation and surrounding whitespace, then lower-case.
        line = line.translate(punctuation_table).strip().lower()
        if author is None:
            # The first line is the author field; the first 7 characters are
            # dropped (presumably an "author " label -- confirm against the
            # dataset format).
            author = line[7:]
            continue
        # split() without an argument drops the empty tokens that removed
        # punctuation leaves behind (e.g. "a - b" -> "a  b").
        words = line.split()
        if not words:
            continue
        # Tag each line on its own, then lemmatize with the tagged POS
        # (noun when the tag has no WordNet counterpart).
        for word, tag in pos_tag(words):
            wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
            body_words.append(wnl.lemmatize(word, pos=wordnet_pos))

    with open(new_doc_path, "w") as f:
        if author is not None:
            f.write(author + "\n")
            # Join with spaces so the last word of one source line does not
            # fuse with the first word of the next one.
            f.write(" ".join(body_words))


In [3]:
# 下面进行基于域的倒排索引记录表的建立
# 先构建对应的词典类
class IdMap:
    """Two-way mapping between strings and consecutive integer ids.

    Ids are handed out in first-seen order starting at 0.  Indexing with a
    string returns (and, if needed, allocates) its id; indexing with an
    integer returns the string stored under that id.
    """

    def __init__(self):
        self.id_to_str = []
        self.str_to_id = {}

    def __len__(self):
        """Return how many strings have been registered."""
        return len(self.id_to_str)

    def _get_str(self, i):
        """Return the string stored under id `i`.

        Raises:
            IndexError: if `i` is not smaller than the number of stored strings.
        """
        if i >= len(self):
            raise IndexError
        return self.id_to_str[i]

    def _get_id(self, s):
        """Return the id of `s`, registering it under the next free id if unseen."""
        known = self.str_to_id
        if s not in known:
            # The next free id equals the current number of stored strings.
            known[s] = len(self.id_to_str)
            self.id_to_str.append(s)
        return known[s]

    def __getitem__(self, key):
        """Dispatch on the exact type of `key`: str -> id, int -> string.

        Raises:
            TypeError: if `key` is neither exactly `str` nor exactly `int`.
        """
        key_type = type(key)
        if key_type is str:
            return self._get_id(key)
        if key_type is int:
            return self._get_str(key)
        raise TypeError

In [4]:
# 回忆倒排索引的建立过程: 
# 1.首先找到单个文本中的单词文档对 单个文本中文档有域属性
# 2.需要统计每个单词在每个域中出现的次数
class RegionIndex:
    """Region-aware inverted index with tf-idf ranking over a document folder.

    Every file in `data_dir` is one document made of three regions: the file
    name ("head"), the first line ("author") and all following lines
    ("body").  Postings are strings of the form ``"<doc_id>_<region>_<count>"``.

    Typical use: ``parse_index(parse_pairs())`` once, then any of the
    retrieval methods.  Document ids are the ids assigned by `doc_id_map`
    while parsing, so the number of documents is taken from that map.
    """

    # Bit flags for the regions; they can be combined (values 1..7).
    head = 1
    author = 2
    body = 4

    # (flag, region label used inside postings, slot inside a query_list)
    _REGIONS = ((1, "head", 0), (2, "author", 1), (4, "body", 2))

    def __init__(self, data_dir):
        """Create an empty index for the documents stored in `data_dir`."""
        self.term_id_map = IdMap()
        self.doc_id_map = IdMap()
        self.data_dir = data_dir
        self.index = []
        self.doc_length = []

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _doc_count(self):
        """Return the number of documents that have been assigned an id."""
        return len(self.doc_id_map)

    def _region_name(self, region):
        """Return the posting label for a single-region flag, else None."""
        for flag, name, _ in self._REGIONS:
            if region == flag:
                return name
        return None

    def _normalize_query(self, query):
        """Turn a raw query string into a list of lemmatized terms.

        Mirrors the document preprocessing order (strip punctuation and
        lower-case first, lemmatize afterwards) so that query terms take the
        same form as the indexed terms.
        """
        cleaned = query.translate(str.maketrans("", "", string.punctuation)).lower()
        words = cleaned.split()
        if not words:
            return []
        wnl = WordNetLemmatizer()
        return [
            wnl.lemmatize(word, pos=get_wordnet_pos(tag) or wordnet.NOUN)
            for word, tag in pos_tag(words)
        ]

    def _vector_lengths(self, region_name=None):
        """Return the Euclidean length of every document's count*idf vector.

        Only postings of `region_name` are counted; ``None`` counts all
        regions.  NOTE(review): lengths are built from raw term counts while
        scoring uses log-scaled counts -- kept as in the original formula.
        """
        idf = self.cal_idf()
        squared = [0.0] * self._doc_count()
        for position, (_, postings) in enumerate(self.index):
            per_doc = {}
            for post in postings:
                doc_id, post_region, count = post.split("_")
                if region_name is None or post_region == region_name:
                    per_doc[int(doc_id)] = per_doc.get(int(doc_id), 0) + int(count)
            for doc_id, count in per_doc.items():
                squared[doc_id] += math.pow(count * idf[position], 2)
        return [round(math.sqrt(value), 4) for value in squared]

    def _score(self, terms, doc_lengths, region_name=None):
        """Score every document against the normalized query `terms`.

        Per query term the contribution is
        ``log10(query_count + 1) * log10(doc_count + 1) * idf``; the sum is
        divided by the document's vector length.  Documents whose length is
        zero score 0.0.
        """
        idf = self.cal_idf()
        query_tf = {}
        for term in terms:
            query_tf[term] = query_tf.get(term, 0) + 1

        scores = [0.0] * self._doc_count()
        for term, query_count in query_tf.items():
            # Read-only lookup: unknown query words must not be added to the
            # vocabulary as a side effect of searching.
            term_id = self.term_id_map.str_to_id.get(term)
            if term_id is None or term_id >= len(self.index):
                continue
            query_weight = round(math.log10(query_count + 1), 4)
            per_doc = {}
            # self.index is addressed by term id (entry i belongs to term i).
            for post in self.index[term_id][1]:
                doc_id, post_region, count = post.split("_")
                if region_name is None or post_region == region_name:
                    per_doc[int(doc_id)] = per_doc.get(int(doc_id), 0) + int(count)
            for doc_id, count in per_doc.items():
                doc_weight = round(math.log10(count + 1), 4) * idf[term_id]
                scores[doc_id] += doc_weight * query_weight
        return [
            score / length if length else 0.0
            for score, length in zip(scores, doc_lengths)
        ]

    def _print_ranking(self, query_doc_sim):
        """Print all documents ordered by descending similarity."""
        sorted_id = sorted(range(len(query_doc_sim)), key=lambda k: query_doc_sim[k], reverse=True)
        for rank, doc_id in enumerate(sorted_id):
            print("排名第"+str(rank+1)+"位的是文档D"+str(doc_id+1)+", 与Query余弦相似度SC(Q, D"+str(doc_id+1)+") :",query_doc_sim[doc_id])

    # ------------------------------------------------------------------
    # index construction
    # ------------------------------------------------------------------
    def parse_pairs(self):
        """Read every document and return its (term_id, "<doc_id>_<region>") pairs.

        The file name supplies the "head" terms, the first line the "author"
        terms and every later line the "body" terms.  Whitespace-only tokens
        are skipped.
        """
        td_pairs = []
        for doc in os.listdir(self.data_dir):
            new_path = os.path.join(self.data_dir, doc)
            doc_id = str(self.doc_id_map[new_path])
            for term in doc.split():
                td_pairs.append((self.term_id_map[term], doc_id + "_head"))
            # The context manager closes the file once it has been read.
            with open(new_path) as handle:
                for line_no, line in enumerate(handle):
                    doc_key = doc_id + ("_author" if line_no == 0 else "_body")
                    for term in line.split():
                        td_pairs.append((self.term_id_map[term], doc_key))
        return td_pairs

    def parse_index(self, td_pairs):
        """Collapse term/document pairs into the inverted index.

        Resulting format (also stored in `self.index`):
        [(0, ['0_head_1', '2_body_3', '3_body_2', '4_body_1', '6_body_1']),
         (1, ['0_head_1']),
         (2, ['0_head_1']),
         (3, ['0_author_1', '1_author_1', '2_author_1', '7_author_1']),
         ...
        ]
        An empty `td_pairs` yields an empty index.
        """
        counts = {}
        for pair in td_pairs:
            key = (pair[0], pair[1])
            counts[key] = counts.get(key, 0) + 1

        index = []
        for (term_id, doc_key), count in sorted(counts.items()):
            if not index or index[-1][0] != term_id:
                index.append((term_id, []))
            index[-1][1].append(doc_key + "_" + str(count))
        self.index = index
        return self.index

    # ------------------------------------------------------------------
    # retrieval
    # ------------------------------------------------------------------
    def region_retrieve(self, query_list, region):
        """Rank documents for a per-region query and print the ranking.

        Args:
            query_list: ``[head_query, author_query, body_query]``; only the
                entries whose region is selected are used.
            region: sum of the region flags to search (1 head, 2 author,
                4 body, hence 1..7).  Any other value falls back to
                `all_retrieve` over the non-empty queries joined by spaces.

        The scores of the selected regions are added with equal weight.
        """
        if region not in range(1, 8):
            if isinstance(query_list, str):
                full_query = query_list
            else:
                full_query = " ".join(part for part in query_list if part)
            self.all_retrieve(full_query)
            return

        query_doc_sim = [0.0] * self._doc_count()
        for flag, _, slot in self._REGIONS:
            if int(region) & flag:
                region_sim = self.single_region_retrieve(query_list[slot], flag)
                query_doc_sim = [a + b for a, b in zip(query_doc_sim, region_sim)]
        self._print_ranking(query_doc_sim)

    def single_region_retrieve(self, query, region):
        """Return the similarity of `query` to every document within one region.

        Args:
            query: raw query string.
            region: exactly one of the flags 1 (head), 2 (author), 4 (body).

        Returns:
            A list with one score per document id, or None for any other
            `region` value.
        """
        region_name = self._region_name(region)
        if region_name is None:
            return None
        terms = self._normalize_query(query)
        doc_region_length = self.cal_doc_region_length(region)
        return self._score(terms, doc_region_length, region_name)

    def all_retrieve(self, query):
        """Rank documents for `query` over all regions and print the ranking.

        Term frequency is dampened as ``log10(count + 1)``; each document
        score is the sum over query terms of query tf times document tf*idf,
        divided by the document vector length from `cal_doc_length`.
        """
        terms = self._normalize_query(query)
        doc_length = self.cal_doc_length()
        self._print_ranking(self._score(terms, doc_length))

    # ------------------------------------------------------------------
    # statistics
    # ------------------------------------------------------------------
    def cal_idf(self):
        """Return ``round(log10(N / df), 4)`` for every index entry, in index order.

        N is the number of documents and df the number of distinct documents
        in which the term occurs (in any region).
        """
        doc_num = self._doc_count()
        term_idf = []
        for _, postings in self.index:
            doc_freq = len({post.split("_")[0] for post in postings})
            term_idf.append(round(math.log10(doc_num / doc_freq), 4))
        return term_idf

    def cal_doc_length(self):
        """Compute, store in `self.doc_length` and return all vector lengths.

        Counts from all regions of a document are combined.
        """
        self.doc_length = self._vector_lengths()
        return self.doc_length

    def cal_doc_region_length(self, region):
        """Return the vector length of every document restricted to one region.

        Args:
            region: exactly one of the flags 1 (head), 2 (author), 4 (body).

        Returns:
            A list with one length per document id, or None for any other
            `region` value.
        """
        region_name = self._region_name(region)
        if region_name is None:
            return None
        return self._vector_lengths(region_name)


In [5]:
# Quick manual smoke test of RegionIndex on the preprocessed documents.
data_path = ".//newdata//"
regionIndex = RegionIndex(data_path)
# regionIndex.parse_pairs()
# Build the inverted index straight from the freshly parsed pairs.
regionIndex.parse_index(regionIndex.parse_pairs())
# The returned idf list is discarded here; the call only checks that it runs.
regionIndex.cal_idf()
# regionIndex.cal_doc_length()
# Per-region queries in the order [head, author, body].
query_list = ["song","dd",""]
# Region 3 = head (1) + author (2): query_list[0] is searched in the head
# region and query_list[1] in the author region; the body query is unused.
regionIndex.region_retrieve(query_list,3)
# regionIndex.region_retrieve("Wine comes", 7)
# regionIndex.cal_doc_length()


排名第1位的是文档D1, 与Query余弦相似度SC(Q, D1) : 0.06294704689092763
排名第2位的是文档D2, 与Query余弦相似度SC(Q, D2) : 0.0
排名第3位的是文档D3, 与Query余弦相似度SC(Q, D3) : 0.0
排名第4位的是文档D4, 与Query余弦相似度SC(Q, D4) : 0.0
排名第5位的是文档D5, 与Query余弦相似度SC(Q, D5) : 0.0
排名第6位的是文档D6, 与Query余弦相似度SC(Q, D6) : 0.0
排名第7位的是文档D7, 与Query余弦相似度SC(Q, D7) : 0.0
排名第8位的是文档D8, 与Query余弦相似度SC(Q, D8) : 0.0
排名第9位的是文档D9, 与Query余弦相似度SC(Q, D9) : 0.0


In [6]:
# 这是主函数
if __name__ == "__main__":
    # Interactive console front end: build the index once, then answer
    # region queries until the user quits or enters an unknown option.
    data_path = ".//newdata//"
    myIndex = RegionIndex(data_path)
    # Raw string: the banner is full of backslashes that are not meant as
    # escape sequences (a plain string triggers invalid-escape warnings).
    print(r'''
 _____           _                _____      _        _                 _ 
|  __ \         | |              |  __ \    | |      (_)               | |
| |__) |__   ___| |_ _ __ _   _  | |__) |___| |_ _ __ _  _____   ____ _| |
|  ___/ _ \ / _ \ __| '__| | | | |  _  // _ \ __| '__| |/ _ \ \ / / _` | |
| |  | (_) |  __/ |_| |  | |_| | | | \ \  __/ |_| |  | |  __/\ V / (_| | |
|_|   \___/ \___|\__|_|   \__, | |_|  \_\___|\__|_|  |_|\___| \_/ \__,_|_|
                           __/ |                                          
                          |___/                                           
    ''')
    print("-----------------------------正在进行倒排索引构建-------------------------------")
    pairs = myIndex.parse_pairs()
    myIndex.parse_index(pairs)
    print("-----------------------------正在构建倒排索引的IDF-------------------------------")
    myIndex.cal_idf()
    print("-----------------------------正在计算倒排索引文档向量长度-------------------------------")
    myIndex.cal_doc_length()
    print("-----------------------------构建完成, 下面进行按域检索-------------------------------")

    # Prompt label of each query_list slot: 0 = title, 1 = poet, 2 = content.
    region_labels = ("诗歌名", "诗人", "诗歌内容")
    # Menu option -> (region flag sum passed to region_retrieve, slots to ask for).
    menu = {
        "1": (1, (0,)),
        "2": (2, (1,)),
        "3": (4, (2,)),
        "4": (3, (0, 1)),
        "5": (5, (0, 2)),
        "6": (6, (1, 2)),
        "7": (7, (0, 1, 2)),
    }

    while True:
        print("请选择你要检索的域: ")
        print("1.诗歌名 2.诗人 3.诗歌内容 4.诗歌名和诗人 5.诗歌名和诗歌内容 6.诗人和诗歌内容 7.诗歌名和诗人和诗歌内容 8.全文本检索 9.退出")
        a = input("请选择选项: ")
        if a in menu:
            region, slots = menu[a]
            # Regions that are not asked for keep an empty query.
            query_list = ["", "", ""]
            for slot in slots:
                label = region_labels[slot]
                query_list[slot] = input("请输入" + label + "域查询语句: ")
                print(label + "域查询语句: ", query_list[slot])
            myIndex.region_retrieve(query_list, region)
        elif a == "8":
            query = input("请输入查询语句: ")
            myIndex.all_retrieve(query)
        elif a == "9":
            print("-----------------------------感谢您的使用,程序退出-------------------------------")
            print("-----------------------------byebye q(≧▽≦q)-------------------------------")
            break
        else:
            # Any unrecognised option ends the program, as before.
            print("-----------------------------您输入错误啦~~~~ ::>_<::-------------------------------")
            print("-----------------------------程序退出-------------------------------")
            break
        print("-----------------------------当前查询完毕, 下面新一轮查询-------------------------------")


 _____           _                _____      _        _                 _ 
|  __ \         | |              |  __ \    | |      (_)               | |
| |__) |__   ___| |_ _ __ _   _  | |__) |___| |_ _ __ _  _____   ____ _| |
|  ___/ _ \ / _ \ __| '__| | | | |  _  // _ \ __| '__| |/ _ \ \ / / _` | |
| |  | (_) |  __/ |_| |  | |_| | | | \ \  __/ |_| |  | |  __/\ V / (_| | |
|_|   \___/ \___|\__|_|   \__, | |_|  \_\___|\__|_|  |_|\___| \_/ \__,_|_|
                           __/ |                                          
                          |___/                                           
    
-----------------------------正在进行倒排索引构建-------------------------------
-----------------------------正在构建倒排索引的IDF-------------------------------
-----------------------------正在计算倒排索引文档向量长度-------------------------------
-----------------------------构建完成, 下面进行按域检索-------------------------------
请选择你要检索的域: 
1.诗歌名 2.诗人 3.诗歌内容 4.诗歌名和诗人 5.诗歌名和诗歌内容 6.诗人和诗歌内容 7.诗歌名和诗人和诗歌内容 8.全文本检索 9.退出
诗歌名域查询语句:  fr