In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: with "all", every top-level expression in a cell
# is echoed, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# TF-IDF

对于固定的语料库，每个词的IDF值是不变的；因此同一个词在不同文档中的TF-IDF差异只取决于它在该文档中的TF值。

## 语料库 

In [2]:
# Toy corpus: each string is one document. Documents 1 and 4 contain the same
# words in a different order.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document'
]

## 语料库分词

In [3]:
# Tokenize every document by splitting on single spaces
word_list = [sentence.split(' ') for sentence in corpus]
print(word_list)

[['this', 'is', 'the', 'first', 'document'], ['this', 'is', 'the', 'second', 'second', 'document'], ['and', 'the', 'third', 'one'], ['is', 'this', 'the', 'first', 'document']]


## 算法 

### 利用python手动实现计算TF-IDF 

$$TF = \frac{某个词在文章中出现次数}{文章总词数}$$
$$IDF = \log(\frac{语料库的文档总数}{包含该词的文档数+1})$$
分母之所以要加1，是为了避免分母为0（即所有文档都不包含该词）。注意：加1之后，如果某个词出现在所有文档中，分母会大于分子，IDF为负值（例如下面结果中的the）；如果只有一篇文档不包含该词，IDF恰好为0（例如this、is、document）。

- 统计词频

In [4]:
from collections import Counter
# One Counter (token -> occurrences) per tokenized document
countlist = [Counter(tokens) for tokens in word_list]
countlist

[Counter({'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}),
 Counter({'this': 1, 'is': 1, 'the': 1, 'second': 2, 'document': 1}),
 Counter({'and': 1, 'the': 1, 'third': 1, 'one': 1}),
 Counter({'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1})]

- 定义计算tfidf公式的函数

In [5]:
# Each word is a key of a per-document Counter; each Counter comes from countlist.

# count[word] is the word's frequency in the document and sum(count.values())
# is the total number of tokens in that document.
def tf(word, count):
    """Return the term frequency of ``word`` within a single document.

    Args:
        word: The token to look up.
        count: Mapping of token -> number of occurrences for one document
            (e.g. a ``collections.Counter``).

    Returns:
        ``count[word]`` divided by the total number of tokens in the document,
        or ``0.0`` when the document contains no tokens.
    """
    total_tokens = sum(count.values())
    # An empty document has no terms; avoid dividing by zero.
    if total_tokens == 0:
        return 0.0
    return count[word] / total_tokens

# Number of documents in which the word occurs
def n_containing(word, count_list):
    """Count how many per-document mappings in ``count_list`` contain ``word``.

    Args:
        word: The token to look for.
        count_list: Iterable of per-document token -> count mappings.

    Returns:
        The number of mappings that have ``word`` as a key.
    """
    matches = 0
    for doc_counts in count_list:
        if word in doc_counts:
            matches += 1
    return matches
 
# len(count_list) is the total number of documents and n_containing(word, count_list)
# is the number of documents containing the word; 1 is added to the denominator
# so it can never be zero.
def idf(word, count_list):
    """Return the inverse document frequency of ``word``.

    Computes ``log(N / (1 + df))`` where ``N`` is ``len(count_list)`` and
    ``df`` is the number of documents containing ``word``. Because of the
    ``+ 1``, the result is zero or negative once ``df >= N - 1``.

    Args:
        word: The token to score.
        count_list: List of per-document token -> count mappings.

    Returns:
        The natural-log IDF value as a float.

    Raises:
        ValueError: If ``count_list`` is empty (the logarithm is undefined).
    """
    # Local import: the notebook only imports math in a later cell, so this
    # keeps the function usable as soon as its own cell has run.
    import math

    n_docs = len(count_list)
    if n_docs == 0:
        raise ValueError("idf() requires a non-empty count_list")
    return math.log(n_docs / (1 + n_containing(word, count_list)))

# TF-IDF is the product of the term frequency and the inverse document frequency
def tfidf(word, count, count_list):
    """Return the TF-IDF score of ``word`` for one document.

    Args:
        word: The token to score.
        count: Token -> count mapping of the document being scored.
        count_list: List of token -> count mappings for the whole corpus.

    Returns:
        ``tf(word, count) * idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_document_frequency = idf(word, count_list)
    return term_frequency * inverse_document_frequency

- 计算每个单词的tfidf值

In [6]:
import math
# Score every word of every document and list them from highest to lowest TF-IDF
for doc_number, doc_counts in enumerate(countlist, start=1):
    print("Top words in document {}".format(doc_number))
    scores = {}
    for word in doc_counts:
        scores[word] = tfidf(word, doc_counts, countlist)
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for word, score in ranked:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
	Word: first, TF-IDF: 0.05754
	Word: this, TF-IDF: 0.0
	Word: is, TF-IDF: 0.0
	Word: document, TF-IDF: 0.0
	Word: the, TF-IDF: -0.04463
Top words in document 2
	Word: second, TF-IDF: 0.23105
	Word: this, TF-IDF: 0.0
	Word: is, TF-IDF: 0.0
	Word: document, TF-IDF: 0.0
	Word: the, TF-IDF: -0.03719
Top words in document 3
	Word: and, TF-IDF: 0.17329
	Word: third, TF-IDF: 0.17329
	Word: one, TF-IDF: 0.17329
	Word: the, TF-IDF: -0.05579
Top words in document 4
	Word: first, TF-IDF: 0.05754
	Word: is, TF-IDF: 0.0
	Word: this, TF-IDF: 0.0
	Word: document, TF-IDF: 0.0
	Word: the, TF-IDF: -0.04463


### 用gensim库计算TF-IDF 

特点：
- gensim返回的结果是 (词的id, tfidf值) 的元组列表：左边是词的id，右边是该词的tfidf值
- tfidf值为0的词不会出现在结果中：例如the出现在全部4篇文档中，其IDF为 $\log_2(4/4)=0$，因此被过滤掉（这并不是停用词过滤）
- 不在词典中的词会被doc2bow忽略，比如测试句子中的单个字母i（训练语料中没有出现过）
- 同样，没有被训练到的词也会被忽略，比如name
- 所以通过gensim得到的结果并不包含每个单词的tfidf值

- 得到每个词的id值及词频

In [7]:
from gensim import corpora
# Assign an integer id to every distinct token of the corpus
dictionary = corpora.Dictionary(word_list)
# In each (id, count) tuple the first element is the token's dictionary id and
# the second is the number of times the token occurs in that document
new_corpus = list(map(dictionary.doc2bow, word_list))
print("(id,frequency)", new_corpus)

(id,frequency) [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (2, 1), (3, 1), (4, 1), (5, 2)], [(3, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [8]:
 # Show which id the dictionary assigned to each token of the corpus
token_to_id = dictionary.token2id
print(token_to_id)

{'document': 0, 'first': 1, 'is': 2, 'the': 3, 'this': 4, 'second': 5, 'and': 6, 'one': 7, 'third': 8}


- 训练gensim模型并且保存它以便后面的使用

In [9]:
# Train the TF-IDF model on the bag-of-words corpus and save it to disk.
# NOTE: binding the model to `tfidf` rebinds the name of the hand-written
# tfidf() function defined earlier in this notebook.
from gensim import models
tfidf = models.TfidfModel(new_corpus)
tfidf.save("my_model.tfidf")

# Load the saved model back
tfidf = models.TfidfModel.load("my_model.tfidf")

# Score every sentence of the corpus with the trained model.
# doc2bow converts a token list into (token_id, token_count) tuples.
tfidf_vec = [
    tfidf[dictionary.doc2bow(sentence.lower().split())]
    for sentence in corpus
]
print(tfidf_vec)

[[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)], [(0, 0.10212329019650272), (2, 0.10212329019650272), (4, 0.10212329019650272), (5, 0.9842319344536239)], [(6, 0.5773502691896258), (7, 0.5773502691896258), (8, 0.5773502691896258)], [(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]]


#### 测试集 

In [10]:
# Score a new sentence that mixes known and unknown tokens
string = 'the i first second name'
tokens = string.lower().split()
string_bow = dictionary.doc2bow(tokens)
string_tfidf = tfidf[string_bow]
print(string_tfidf)

[(1, 0.4472135954999579), (5, 0.8944271909999159)]


### 用sklearn库计算TF-IDF 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vectorizer on the raw sentences and build the TF-IDF matrix
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus)

# All distinct terms of the corpus.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older versions.
# .tolist() keeps the printed form a plain Python list, as before.
if hasattr(tfidf_vec, "get_feature_names_out"):
    feature_names = tfidf_vec.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vec.get_feature_names()
print(feature_names)

# Mapping from each term to its id (column index)
print(tfidf_vec.vocabulary_)

# One vector per sentence; the values are ordered by term id
print(tfidf_matrix.toarray())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


### 用jieba.analyse库计算TF-IDF 

缺点：
基于该框架的TF-IDF效果一般，在垂直领域效果不一定好，适用于通用领域。
- 该框架使用的是默认IDF值的文件（不是针对项目文件计算的IDF）
- IDF本身是先有词才能计算，而该框架是先提供IDF值才能计算最终TF-IDF值。

In [12]:
import jieba.analyse

In [13]:
# Sample Chinese sentence passed to jieba.analyse.extract_tags in the cells below
content = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本有7000万元增加至10000万元"

- jieba.analyse.TFIDF(idf_path=None) 新建 TFIDF 实例， idf_path 为 IDF 频率文件
    - idf默认为jieba内置语料库计算而得
    - idf_path在实际使用时，需要替换对应语料库的路径

In [14]:
jieba.analyse.set_idf_path('./word_dict/word.txt') # the argument is the path to a custom corpus (IDF) file

In [15]:
# Extract up to 20 keywords, without weights and without part-of-speech filtering
tags = jieba.analyse.extract_tags(
    content,
    topK=20,
    withWeight=False,
    allowPOS=(),
)
tags

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\13416\AppData\Local\Temp\jieba.cache
Loading model cost 0.426 seconds.
Prefix dict has been built successfully.


['吉林',
 '增资',
 '欧亚',
 '置业',
 '此外',
 '全资',
 '有限公司',
 '4.3',
 '注册资本',
 '7000',
 '增加',
 '10000',
 '万元',
 '子公司',
 '亿元',
 '公司']

In [16]:
# The stop-word list used for keyword extraction can be switched to a custom
# file by passing its path
jieba.analyse.set_stop_words('./word_dict/stopwords.txt')

- sentence 为待提取文本
- topK 提取前多少个关键字
- withWeight 是否返回每个关键词的权重
- allowPOS 是允许提取的词性，例如 allowPOS=('ns', 'n', 'vn', 'v') 表示只提取地名、名词、动名词、动词；默认为空，即不筛选

In [17]:
# Print each extracted keyword together with its TF-IDF weight
weighted_tags = jieba.analyse.extract_tags(content, withWeight=True)
for keyword, keyword_weight in weighted_tags:
    print("%s %s" % (keyword, keyword_weight))

吉林 1.1385492859904762
增资 1.1385492859904762
欧亚 0.8169207307466667
置业 0.7291915001247619
此外 0.5692746429952381
全资 0.5692746429952381
有限公司 0.5692746429952381
4.3 0.5692746429952381
注册资本 0.5692746429952381
7000 0.5692746429952381
增加 0.5692746429952381
10000 0.5692746429952381
万元 0.5062157353885715
子公司 0.32667339341714285
亿元 0.21423289597285716
公司 0.1668319020752381
