# 2.1 Python 手动实现

In [None]:
corpus = ['this is the first document',
        'this is the second second document',
        'and the third one',
        'is this the first document']
words_list = list()
for i in range(len(corpus)):
    words_list.append(corpus[i].split(' '))
print(words_list)
[['this', 'is', 'the', 'first', 'document'],
['this', 'is', 'the', 'second', 'second', 'document'],
['and', 'the', 'third', 'one'],
['is', 'this', 'the', 'first', 'document']]

In [2]:
# 统计词语数量
from collections import Counter
count_list = list()
for i in range(len(words_list)):
    count = Counter(words_list[i])
    count_list.append(count)
print(count_list)
[Counter({'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}),
Counter({'second': 2, 'this': 1, 'is': 1, 'the': 1, 'document': 1}),
Counter({'and': 1, 'the': 1, 'third': 1, 'one': 1}),
Counter({'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1})]

[Counter({'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}), Counter({'second': 2, 'this': 1, 'is': 1, 'the': 1, 'document': 1}), Counter({'and': 1, 'the': 1, 'third': 1, 'one': 1}), Counter({'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1})]


[Counter({'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}),
 Counter({'second': 2, 'this': 1, 'is': 1, 'the': 1, 'document': 1}),
 Counter({'and': 1, 'the': 1, 'third': 1, 'one': 1}),
 Counter({'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1})]

In [4]:
# 定义函数
import math
def tf(word, count):
    return count[word] / sum(count.values())


def idf(word, count_list):
    n_contain = sum([1 for count in count_list if word in count])
    return math.log(len(count_list) / (1 + n_contain))


def tf_idf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)

# 输出结果
for i, count in enumerate(count_list):
    print("第 {} 个文档 TF-IDF 统计信息".format(i + 1))
    scores = {word : tf_idf(word, count, count_list) for word in count}
    sorted_word = sorted(scores.items(), key = lambda x : x[1], reverse=True)
    for word, score in sorted_word:
        print("\tword: {}, TF-IDF: {}".format(word, round(score, 5)))
"""
第 1 个文档 TF-IDF 统计信息
	word: first, TF-IDF: 0.05754
	word: this, TF-IDF: 0.0
	word: is, TF-IDF: 0.0
	word: document, TF-IDF: 0.0
	word: the, TF-IDF: -0.04463
第 2 个文档 TF-IDF 统计信息
	word: second, TF-IDF: 0.23105
	word: this, TF-IDF: 0.0
	word: is, TF-IDF: 0.0
	word: document, TF-IDF: 0.0
	word: the, TF-IDF: -0.03719
第 3 个文档 TF-IDF 统计信息
	word: and, TF-IDF: 0.17329
	word: third, TF-IDF: 0.17329
	word: one, TF-IDF: 0.17329
	word: the, TF-IDF: -0.05579
第 4 个文档 TF-IDF 统计信息
	word: first, TF-IDF: 0.05754
	word: is, TF-IDF: 0.0
	word: this, TF-IDF: 0.0
	word: document, TF-IDF: 0.0
	word: the, TF-IDF: -0.04463
"""

# 2.2 使用 gensim 算法包实现
# 使用和 2.1 节相同的语料库 corpus，过程如下

In [None]:
# 获取每个词语的 id 和词频
from gensim import corpora
# 赋给语料库中每个词(不重复的词)一个整数id
dic = corpora.Dictionary(words_list)
new_corpus = [dic.doc2bow(words) for words in words_list]
# 元组中第一个元素是词语在词典中对应的id，第二个元素是词语在文档中出现的次数
print(new_corpus)
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
[(0, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
[(3, 1), (6, 1), (7, 1), (8, 1)],
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]

In [7]:
# 查看每个词语对应的 id
print(dic.token2id)

{'document': 0, 'first': 1, 'is': 2, 'the': 3, 'this': 4, 'second': 5, 'and': 6, 'one': 7, 'third': 8}


In [8]:
# 训练gensim模型并且保存它以便后面的使用
# 训练模型并保存
from gensim import models
tfidf = models.TfidfModel(new_corpus)
tfidf.save("tfidf.model")
# 载入模型
tfidf = models.TfidfModel.load("tfidf.model")
# 使用这个训练好的模型得到单词的tfidf值
tfidf_vec = []
for i in range(len(corpus)):
    string = corpus[i]
    string_bow = dic.doc2bow(string.lower().split())
    string_tfidf = tfidf[string_bow]
    tfidf_vec.append(string_tfidf)
# 输出 词语id与词语tfidf值
print(tfidf_vec)
[[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)],
[(0, 0.10212329019650272), (2, 0.10212329019650272), (4, 0.10212329019650272), (5, 0.9842319344536239)],
[(6, 0.5773502691896258), (7, 0.5773502691896258), (8, 0.5773502691896258)],
[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]]

[[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)], [(0, 0.10212329019650272), (2, 0.10212329019650272), (4, 0.10212329019650272), (5, 0.9842319344536239)], [(6, 0.5773502691896258), (7, 0.5773502691896258), (8, 0.5773502691896258)], [(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]]


[[(0, 0.33699829595119235),
  (1, 0.8119707171924228),
  (2, 0.33699829595119235),
  (4, 0.33699829595119235)],
 [(0, 0.10212329019650272),
  (2, 0.10212329019650272),
  (4, 0.10212329019650272),
  (5, 0.9842319344536239)],
 [(6, 0.5773502691896258), (7, 0.5773502691896258), (8, 0.5773502691896258)],
 [(0, 0.33699829595119235),
  (1, 0.8119707171924228),
  (2, 0.33699829595119235),
  (4, 0.33699829595119235)]]

In [9]:
# 句子测试
# 测试一个句子
test_words = "i is the first one"
string_bow = dic.doc2bow(string.lower().split())
string_tfidf = tfidf[string_bow]
print(string_tfidf)
[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]

[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]


[(0, 0.33699829595119235),
 (1, 0.8119707171924228),
 (2, 0.33699829595119235),
 (4, 0.33699829595119235)]

## 2.3 使用 sklearn 算法包实现
## 调包

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus)
# 得到语料库所有不重复的词
print(tfidf_vec.get_feature_names())
# 得到每个单词对应的id值
print(tfidf_vec.vocabulary_)
# 得到每个句子所对应的向量，向量里数字的顺序是按照词语的id顺序来的
print(tfidf_matrix.toarray())