# TF-IDF 词频-逆文本频率

### TF (Term Frequency)
衡量一个term在文档中出现得有多频繁。  
TF(t) = (t出现在文档中的次数) / (文档中的term总数).

###  IDF: Inverse Document Frequency
衡量一个term有多重要。

有些词出现得很多，但明显没有什么用，比如 'is'、'the'、'and' 之类的。
为了平衡，我们把罕见词的重要性（weight）调高，
把常见词的重要性调低。
IDF(t) = log_e(文档总数 / 含有t的文档总数).  

### TF-IDF = TF * IDF

# 举例

一个文档有100个单词，其中单词baby出现了3次。  
那么，TF(baby) = (3/100) = 0.03.  

现在我们如果有10M（一千万）个文档， baby出现在其中的1000个文档中。  
那么，IDF(baby) = log_e(10,000,000 / 1,000) = log_e(10,000) ≈ 9.21  


所以， TF-IDF(baby) = TF(baby) * IDF(baby) = 0.03 * 9.21 ≈ 0.276  

## python实现TF-IDF

In [43]:
from collections import defaultdict
import math
import operator


def loadDataSet():
    """Create the toy corpus used by the TF-IDF example.

    Returns:
        dataset: list of documents, each already split into a list of words.
        classVec: list of integer class labels, one per document.
    """
    # Each sample pairs a class label with the tokens of one document.
    # NOTE(review): the original comment read "1 = good, 0 = bad", yet the
    # documents labelled 1 are exactly the ones containing 'stupid' or
    # 'worthless'; presumably 1 marks the abusive posts -- confirm.
    samples = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    dataset = [tokens for _, tokens in samples]
    classVec = [label for label, _ in samples]
    return dataset, classVec


def feature_select(list_words):
    """Score every distinct word of a tokenised corpus with TF-IDF.

    The term frequency used here is corpus-wide: a word's total number of
    occurrences divided by the total number of word occurrences in all
    documents (it is not computed per document). The inverse document
    frequency is ``log(doc_num / (docs_containing_word + 1))``; because of
    the ``+ 1`` in the denominator, a word present in all but one document
    scores 0 and a word present in every document scores below 0.

    Parameters:
        list_words: list of documents, each a list of (hashable) words.

    Returns:
        A list of ``(word, tf_idf)`` tuples sorted by score, highest first.
        Words with equal scores keep the order in which they first appear
        in the corpus. An empty corpus yields an empty list.
    """
    # Total occurrences of each word across the whole corpus.
    term_count = defaultdict(int)
    # Number of documents that contain each word at least once.
    doc_count = defaultdict(int)
    for word_list in list_words:
        for word in word_list:
            term_count[word] += 1
        # De-duplicate so a document is counted once per word.
        for word in set(word_list):
            doc_count[word] += 1

    # Computed once instead of once per word.
    total_terms = sum(term_count.values())
    doc_num = len(list_words)

    # Iterate in first-appearance order so the stable sort below breaks
    # ties the same way for every run.
    word_tf_idf = {}
    for word, count in term_count.items():
        tf = count / total_terms
        idf = math.log(doc_num / (doc_count[word] + 1))
        word_tf_idf[word] = tf * idf

    # Highest score first.
    return sorted(word_tf_idf.items(), key=operator.itemgetter(1), reverse=True)
 
if __name__ == "__main__":
    # Build the sample corpus; the labels are loaded but not used below.
    data_list, label_list = loadDataSet()
    # TF-IDF score of every distinct word, highest score first.
    features = feature_select(data_list)
    print(features)
    print(len(features))


[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
[('to', 0.0322394037469742), ('stop', 0.0322394037469742), ('worthless', 0.0322394037469742), ('my', 0.028288263356383563), ('dog', 0.028288263356383563), ('him', 0.028288263356383563), ('stupid', 0.028288263356383563), ('has', 0.025549122992281622), ('flea', 0.025549122992281622), ('problems', 0.025549122992281622), ('help', 0.025549122992281622), ('please', 0.025549122992281622), ('maybe', 0.025549122992281622), ('not', 0.025549122992281622), ('take', 0.025549122992281622), ('park', 0.025549122992281622), ('dalmation', 0.025549122992281622), ('is', 0.025549122992281622), ('so', 0.025549122992281622), ('cute', 0.0255491229922

## NLTK实现TF-IDF

In [30]:
from nltk.text import TextCollection
import nltk

# Three short single-sentence documents.
sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']

# Tokenise every sentence into a list of words.
sents = [nltk.word_tokenize(sent) for sent in sents]
print(sents)

# Wrap the tokenised sentences in a TextCollection to get tf / idf / tf_idf.
corpus = TextCollection(sents)

# Compute IDF
corpus.idf('this') # log_e(3/3) = 0
corpus.idf('three') # log_e(3/1)=1.0986

# Compute TF
corpus.tf('three', nltk.word_tokenize('one two three, go')) # 1/5
corpus.tf('three', 'one two three, go')  # 1/17 -- wrong: the text was not tokenised, so 17 is the raw string's character count

# Compute TF-IDF
corpus.tf_idf('three', nltk.word_tokenize('one two three, go')) # 1/5 * log_e(3/1)


[['this', 'is', 'sentence', 'one'], ['this', 'is', 'sentence', 'two'], ['this', 'is', 'sentence', 'three']]


0.21972245773362198

## scikit-learn实现TF-IDF

### 基于TfidfVectorizer

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with IDF smoothing explicitly enabled.
tfidf = TfidfVectorizer(smooth_idf=True)
# print(tfidf)

corpus_en = ['this is sentence one', 'this is sentence two', 'this is sentence three']

# Chinese corpus that has already been segmented into space-separated words.
# It is defined here but not used by the code below.
corpus_cn = ['我 来到 北京大学', '他 来到 了 网易 行研 大厦', '小明 硕士 毕业 于 中国 科学院', '我 爱 北京天安门']


# TF-IDF matrix, one row per document.
# NOTE: despite its name, result_cn is computed from the English corpus
# (corpus_en); the name is kept because it is a notebook-level variable.
result_cn = tfidf.fit_transform(corpus_en).toarray()
print(result_cn)

# Vocabulary terms in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# list() keeps the printed form a plain Python list as before.
if hasattr(tfidf, 'get_feature_names_out'):
    word = list(tfidf.get_feature_names_out())
else:
    word = tfidf.get_feature_names()
print(word)

stopwords = tfidf.get_stop_words()

# Print every vocabulary term together with its column index in the matrix.
for k, v in tfidf.vocabulary_.items():
    print(k, v)


[[0.41285857 0.69903033 0.41285857 0.41285857 0.         0.        ]
 [0.41285857 0.         0.41285857 0.41285857 0.         0.69903033]
 [0.41285857 0.         0.41285857 0.41285857 0.69903033 0.        ]]
['is', 'one', 'sentence', 'this', 'three', 'two']
this 3
is 0
sentence 2
one 1
two 5
three 4


### 基于CountVectorizer和TfidfTransformer

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Alternative toy corpus; it is defined here but not used by the code below.
corpus = ['aaa ccc aaa aaa', 
          'aaa aaa', 
          'aaa aaa aaa', 
          'aaa aaa aaa aaa',
          'aaa bbb aaa bbb aaa',
          'ccc aaa aaa ccc aaa'
         ]

corpus_en = ['this is sentence one', 'this is sentence two', 'this is sentence three']

# Step 1: turn the documents into a sparse term-count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus_en)

# All terms of the bag-of-words model, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# list() keeps the printed form a plain Python list as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    word = list(vectorizer.get_feature_names_out())
else:
    word = vectorizer.get_feature_names()
print(word)

# Number of times each term occurs in each row (document).
counts = X.toarray()
print(counts)

# Step 2: re-weight the raw counts into TF-IDF values.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
# Per the original note, fitting on the dense `counts` array instead of X
# gives exactly the same result.
print(tfidf.toarray())


['is', 'one', 'sentence', 'this', 'three', 'two']
[[1 1 1 1 0 0]
 [1 0 1 1 0 1]
 [1 0 1 1 1 0]]
[[0.41285857 0.69903033 0.41285857 0.41285857 0.         0.        ]
 [0.41285857 0.         0.41285857 0.41285857 0.         0.69903033]
 [0.41285857 0.         0.41285857 0.41285857 0.69903033 0.        ]]


## jieba实现TF-IDF

jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())  
sentence 为待提取的文本  
topK 为返回几个 TF/IDF 权重最大的关键词，默认值为 20  
withWeight 为是否一并返回关键词权重值，默认值为 False  
allowPOS 仅包括指定词性的词，默认值为空，即不筛选  


In [44]:
import jieba.analyse

# Sample Chinese passage to extract keywords from. The trailing backslashes
# sit inside the string literal and join the three source lines into one
# string without inserting newlines.
text = '关键词是能够表达文档中心内容的词语，常用于计算机系统标引论文内容特征、\
信息检索、系统汇集以供读者检阅。关键词提取是文本挖掘领域的一个分支，是文本检索、\
文档比较、摘要生成、文档分类和聚类等文本挖掘研究的基础性工作'

# Ask for the top 5 keywords, without their weights and with no
# part-of-speech filter (empty allowPOS).
keywords=jieba.analyse.extract_tags(text, topK=5, withWeight=False, allowPOS=())
print(keywords)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86153\AppData\Local\Temp\jieba.cache
Loading model cost 0.647 seconds.
Prefix dict has been built successfully.


['文档', '文本', '关键词', '挖掘', '文本检索']
