### 用了 tf-idf 来编码文档

In [1]:
import json
data_pair = [json.loads(data) for data in open('static/dataset/data_pair.json', 'r')]
print len(data_pair)
print data_pair[0].keys()

19207
[u'website', u'title', u'url', u'poster', u'detail', u'keywords', u'sentence_id', u'images']


In [2]:
import re
import jieba
jieba.dt.tmp_dir = './'
jieba.initialize()

class Dictionary(object):
    def __init__(self):
        self.index2word = []
        self.word2index = {}
        self.wordcount = {}
        self.stopwords = [word.strip().decode('utf-8') for word in open('static/dataset/stopwords.txt', 'r')]
        
    def update(self, words):
        words = re.sub(u'[\r\t\n ]', u' ', words)
        words = list(jieba.cut_for_search(words))
        words = filter(lambda x: x not in self.stopwords, words)
        for word in words:
            if self.word2index.has_key(word):
                self.wordcount[word] += 1
            else:
                self.wordcount[word] = 1
                self.index2word.append(word)
                self.word2index[word] = len(self.index2word) - 1
                
    def shrink(self, min_count=2):
        index2word, word2index, wordcount = [], {}, {}
        for word in self.index2word:
            if self.wordcount[word] >= min_count:
                index2word.append(word)
                word2index[word] = len(index2word) - 1
                wordcount[word] = self.wordcount[word]
        self.index2word = index2word
        self.word2index = word2index
        self.wordcount = wordcount

dic = Dictionary()             
for data in data_pair:
    dic.update(data['title'])
print 'have %d words before shrink' %(len(dic.index2word))
dic.shrink(min_count=5)
print 'have %d words after shrink' %(len(dic.index2word))

Building prefix dict from the default dictionary ...
Loading model from cache ./jieba.cache
Loading model cost 0.193 seconds.
Prefix dict has been built succesfully.


have 23640 words before shrink
have 2721 words after shrink


In [30]:
import numpy as np

class Tfidf(object):
    def __init__(self, dic):
        self.dic = dic
        self.term_freq = []
        self.inverse_doc_freq = {key:0 for key, value in dic.wordcount.iteritems()}
        self.n_docs = 0
    
    def _clean(self, words):
        words = re.sub('[\r\t\n ]', ' ', words)
        words = list(jieba.cut_for_search(words))
        words = filter(lambda x: (x not in self.dic.stopwords) and (x in self.dic.wordcount), words)
        return words
    
    def update(self, words):
        words = self._clean(words)
        #assert len(words) > 0, 'detect useless doc'
        doc_tf = {word:0 for word in words}
        for word in words:
            doc_tf[word] += 1
        self.term_freq.append(doc_tf)
        for term in doc_tf:
            self.inverse_doc_freq[term] += 1
        self.n_docs += 1
    
    def parse(self, words):
        words = self._clean(words)
        doc_tf = {word:0 for word in words}
        for word in words:
            doc_tf[word] += 1
        return self._get_tfidf(doc_tf)
        
    def __getitem__(self, index):
        assert index < self.n_docs, 'out of range'
        return self._get_tfidf(self.term_freq[index])
    
    def __iter__(self):
        for i in range(self.n_docs):
            yield self[i]
        
    def _get_tfidf(self, tf):
        tfidf = {}
        for word, freq in tf.iteritems():
            tfidf[word] = freq * np.log(self.n_docs / (1e-6 + self.inverse_doc_freq[word]))
        tfidf = sorted(tfidf.iteritems(), key=lambda x: x[1], reverse=True)
        tfidf = {k:v for k, v in tfidf} # list to dict
        return tfidf
    
    def to_numpy(self, tfidf):
        doc = np.zeros(len(self.dic.wordcount))
        for word, score in tfidf.iteritems():
            doc[self.dic.word2index[word]] = score
        return doc
    
    def numpy(self):
        return np.array([self.to_numpy(tfidf) for tfidf in self])
    
tfidf = Tfidf(dic)
for data in data_pair:
    tfidf.update(data['title'])
    
docs = tfidf.numpy()
print docs.shape

(19207, 2721)


In [54]:
norm = (docs ** 2).sum(1, keepdims=True)
print norm.shape

normalized_docs = docs / (norm + 1e-6)

(19207, 1)


In [78]:
np.save('docs_tfidf', normalized_docs)

In [79]:
query = u'酒店'
query_tfidf = tfidf.to_numpy(tfidf.parse(query))
print query_tfidf.shape

scores = np.dot(normalized_docs, query_tfidf)
index = scores.argsort()[::-1]
scores = scores[index]
print index[0:10]
print scores[0:10]

(2721,)
[  704 14624  6770  5170 15806 18794  7868 15517 10205  1232]
[ 0.99999991  0.99999991  0.99999991  0.99999991  0.99999991  0.99999991
  0.99999991  0.99999991  0.99999991  0.99999991]


In [80]:
for i in index[0:10]:
    print i, data_pair[i]['title']

704 皮克林宾乐雅酒店
14624 酒店
6770 POD酒店
5170 Floreasca酒店
15806 Hyderbad帕克酒店
18794 帕西托尼酒店
7868 酒店
15517 艾尔酒店
10205 寒舍艾丽酒店
1232 坚果酒店
