Skip to content

Commit

Permalink
v2-0
Browse files Browse the repository at this point in the history
  • Loading branch information
Huangtuzhi committed Jan 11, 2016
1 parent 9af7799 commit 5df879c
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 6 deletions.
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,37 @@ mongod --dbpath =/opt/mongodb-data --logpath=/opt/mongodb-data/mongodb.log
* 运行 indexer.py 构建倒排索引
* 运行 indexSearcher.py 进行检索

## 文档排名——计算 TF-IDF

搜索引擎检索出文档之后,需要选择和查询最相关的文档返回给用户,因此需要对文档进行评估。一般有下列方法:

* TF-IDF 词频-逆文档频率
* 余弦相似度
* Okapi BM25

看一下 TF-IDF 的计算

```
def caculate_TFIDF(self, word):
    """Score every document containing `word` by TF-IDF, sort by score, and print."""
    score_dictionary = {}
    for posting in self.word_dictionary[word]:
        # posting is (DocID, frequency) — presumably built by indexer.py; verify.
        DocID = posting[0]
        freq = posting[1]
        # idf = log(total documents / documents containing the word);
        # 100 is an assumed corpus size — TODO confirm against the index.
        idf = math.log(float(100) / len(self.word_dictionary[word]))
        # Sublinear term frequency: 1 + log(freq) when the word occurs at all.
        tf = 1 + math.log(int(freq)) if freq > 0 else 0
        tfidf_score = tf * idf
        score_dictionary[DocID] = tfidf_score
    # Sort (DocID, score) pairs by score, best first (Python 2 iteritems).
    score = sorted(score_dictionary.iteritems(), key=lambda d:d[1], \
    reverse = True)
    print score
```

idf 是文档总数和该词元出现过的文档总数之商的对数。TF-IDF 可以作为衡量“词元在文档集合中是否特殊”的一个指标。

将算得的 TF-IDF 分数存储在字典中,最后按值进行排序。

## 参考

[https://www.coursera.org/course/textretrieval](https://www.coursera.org/course/textretrieval)
Expand Down
66 changes: 60 additions & 6 deletions indexSearcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ def __init__(self):
# 计算每个文档的 TF-IDF 值,进行排序
def caculate_TFIDF(self, word):
score_dictionary = {}

if not self.word_dictionary.has_key(word):
return 0

for posting in self.word_dictionary[word]:
DocID = posting[0]
freq = posting[1]
Expand All @@ -26,9 +30,60 @@ def caculate_TFIDF(self, word):
score = sorted(score_dictionary.iteritems(), key=lambda d:d[1], reverse = True)
return score

# 计算 BM25
def caculate_BM25(self, word):
pass
def get_wordcount_in_document(self, word, content):
    """Return how many space-separated tokens of `content` equal `word` exactly."""
    tokens = content.split(' ')
    return tokens.count(word)

def DocID2Doc(self, DocID):
    """Resolve a DocID to the URL stored for it in MongoDB."""
    collection = documentManager().connect_mongo()
    document = collection.find_one({"DocID": DocID})
    return document["url"]

# 计算 BM25,设定 c(w,q) 为 1,即查询中每个词出现一次
def caculate_BM25(self, query_words):
manager = documentManager()
collection = manager.connect_mongo()

score_dictionary = {}
b = 0.5 #参数调节因子
k = 10 # 调节因子
avdl = 800 # 文档平均长度

# query_words 中至少一个单元词出现的所有文档
DocId_of_query_words = set([])
for word in query_words.split(' '):

if not self.word_dictionary.has_key(word):
continue

for posting in self.word_dictionary[word]:
DocID = posting[0]
DocId_of_query_words.add(DocID)

for id in DocId_of_query_words:
BM25_score = 0
for word in query_words.split(' '):
content = collection.find_one({"DocID": int(id)})["content"]
freq = self.get_wordcount_in_document(word ,content)

doc_len = len(self.word_dictionary[word])
idf = math.log(float(100) / doc_len)
normalizer = 1 - b + b * (doc_len / avdl)

BM25_score += (float)((k + 1) * freq) / (freq + k * normalizer) * idf
# 计算某个文档对 Query 的 BM25 分数
score_dictionary[id] = BM25_score

score = sorted(score_dictionary.iteritems(), key=lambda d:d[1], reverse = True)

for i in score:
print self.DocID2Doc(int(i[0]))


def retrive_word(self, word):
# 找出 DocID 对应的 url
Expand Down Expand Up @@ -61,6 +116,5 @@ def perform_query(self, query_input):

if __name__ == '__main__':
    searcher = indexSearcher()
    # perform search
    # print searcher.retrive_word('good')
    print searcher.perform_query("literature science")
    # print searcher.perform_query("literature science")
    searcher.caculate_BM25("literature science")

0 comments on commit 5df879c

Please sign in to comment.