Skip to content

Commit

Permalink
v2-0
Browse files Browse the repository at this point in the history
  • Loading branch information
Huangtuzhi committed Jan 11, 2016
1 parent 9af7799 commit 5df879c
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 6 deletions.
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,37 @@ mongod --dbpath =/opt/mongodb-data --logpath=/opt/mongodb-data/mongodb.log
* 运行 indexer.py 构建倒排索引
* 运行 indexSearcher.py 进行检索

## 文档排名——计算 TF-IDF

搜索引擎检索出文档之后,需要选择和查询最相关的文档返回给用户,因此需要对文档进行评估。一般有下列方法:

* TF-IDF 词频-逆文档频率
* 余弦相似度
* Okapi BM25

看一下 TF-IDF 的计算

```
def caculate_TFIDF(self, word):
    """Score every document containing `word` by TF-IDF, sort by score, and print."""
    score_dictionary = {}
    for posting in self.word_dictionary[word]:
        # posting is (DocID, frequency) — presumably built by indexer.py; verify.
        DocID = posting[0]
        freq = posting[1]
        # idf = log(total documents / documents containing the word);
        # 100 is an assumed corpus size — TODO confirm against the index.
        idf = math.log(float(100) / len(self.word_dictionary[word]))
        # Sublinear term frequency: 1 + log(freq) when the word occurs at all.
        tf = 1 + math.log(int(freq)) if freq > 0 else 0
        tfidf_score = tf * idf
        score_dictionary[DocID] = tfidf_score
    # Sort (DocID, score) pairs by score, best first (Python 2 iteritems).
    score = sorted(score_dictionary.iteritems(), key=lambda d:d[1], \
    reverse = True)
    print score
```

idf 是文档总数和该词元出现过的文档总数之商的对数。TF-IDF 可以作为衡量“词元在文档集合中是否特殊”的一个指标。

将算得的 TF-IDF 分数存储在字典中,最后按值进行排序。

## 参考

[https://www.coursera.org/course/textretrieval](https://www.coursera.org/course/textretrieval)
Expand Down
66 changes: 60 additions & 6 deletions indexSearcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ def __init__(self):
# 计算每个文档的 TF-IDF 值,进行排序
def caculate_TFIDF(self, word):
score_dictionary = {}

if not self.word_dictionary.has_key(word):
return 0

for posting in self.word_dictionary[word]:
DocID = posting[0]
freq = posting[1]
Expand All @@ -26,9 +30,60 @@ def caculate_TFIDF(self, word):
score = sorted(score_dictionary.iteritems(), key=lambda d:d[1], reverse = True)
return score

# 计算 BM25
def caculate_BM25(self, word):
pass
def get_wordcount_in_document(self, word, content):
    """Return how many space-separated tokens of `content` equal `word` exactly."""
    tokens = content.split(' ')
    return tokens.count(word)

def DocID2Doc(self, DocID):
    """Resolve a DocID to the URL stored for it in MongoDB."""
    collection = documentManager().connect_mongo()
    document = collection.find_one({"DocID": DocID})
    return document["url"]

# 计算 BM25,设定 c(w,q) 为 1,即查询中每个词出现一次
def caculate_BM25(self, query_words):
manager = documentManager()
collection = manager.connect_mongo()

score_dictionary = {}
b = 0.5 #参数调节因子
k = 10 # 调节因子
avdl = 800 # 文档平均长度

# query_words 中至少一个单元词出现的所有文档
DocId_of_query_words = set([])
for word in query_words.split(' '):

if not self.word_dictionary.has_key(word):
continue

for posting in self.word_dictionary[word]:
DocID = posting[0]
DocId_of_query_words.add(DocID)

for id in DocId_of_query_words:
BM25_score = 0
for word in query_words.split(' '):
content = collection.find_one({"DocID": int(id)})["content"]
freq = self.get_wordcount_in_document(word ,content)

doc_len = len(self.word_dictionary[word])
idf = math.log(float(100) / doc_len)
normalizer = 1 - b + b * (doc_len / avdl)

BM25_score += (float)((k + 1) * freq) / (freq + k * normalizer) * idf
# 计算某个文档对 Query 的 BM25 分数
score_dictionary[id] = BM25_score

score = sorted(score_dictionary.iteritems(), key=lambda d:d[1], reverse = True)

for i in score:
print self.DocID2Doc(int(i[0]))


def retrive_word(self, word):
# 找出 DocID 对应的 url
Expand Down Expand Up @@ -61,6 +116,5 @@ def perform_query(self, query_input):

if __name__ == '__main__':
    searcher = indexSearcher()
    # perform search
    # print searcher.retrive_word('good')
    print searcher.perform_query("literature science")
    # print searcher.perform_query("literature science")
    searcher.caculate_BM25("literature science")

0 comments on commit 5df879c

Please sign in to comment.