# 词向量构建与表示

## TF-IDF

In [1]:
import pandas as pd
import numpy as np 
import jieba.posseg
import jieba.analyse
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Record the interpreter, machine, and package versions this notebook was run with.
%load_ext watermark
%watermark -m -v -p numpy,pandas,jieba,sklearn

CPython 3.7.3
IPython 7.6.1

numpy 1.16.4
pandas 0.24.2
jieba 0.40
sklearn 0.21.2

compiler   : MSC v.1915 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
CPU cores  : 4
interpreter: 64bit


In [3]:
def data_preprocess(text, pos=('n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd')):
    """Segment ``text`` with jieba and keep only words with an allowed POS flag.

    Parameters
    ----------
    text : str
        Raw text to segment.
    pos : iterable of str, optional
        Part-of-speech flags to keep. The default is the fixed selection this
        function has always used, so existing one-argument calls are unchanged.

    Returns
    -------
    list of str
        The non-empty words whose flag is in ``pos``, in segmentation order.
    """
    allowed_flags = frozenset(pos)  # constant-time membership test per token
    return [
        token.word
        for token in jieba.posseg.cut(text)  # word segmentation with POS tagging
        if token.word and token.flag in allowed_flags  # POS filter
    ]

In [4]:
def get_keywords_tfidf(data, top_k):
    """Print the ``top_k`` highest TF-IDF terms of every document in ``data``.

    Each row's title and abstract are joined, segmented and POS-filtered by
    ``data_preprocess``, and re-joined with spaces. The resulting corpus is
    vectorized with ``CountVectorizer`` and weighted with ``TfidfTransformer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``'title'`` and ``'abstract'`` columns. Rows are read
        positionally, so the frame's index labels do not matter.
    top_k : int
        Number of highest-weighted terms printed per document.

    Returns
    -------
    list of str
        The preprocessed corpus: one space-separated string per row.
        The keywords themselves are only printed, not returned.
    """
    # Build the corpus; sklearn expects whitespace-separated tokens.
    corpus = [
        ' '.join(data_preprocess('%s。 %s' % (title, abstract)))
        for title, abstract in zip(data['title'], data['abstract'])
    ]
    if not corpus:
        # Nothing to vectorize; return early instead of failing inside sklearn.
        return corpus

    # Build the term-count matrix.
    # NOTE(review): CountVectorizer's default token_pattern appears to keep only
    # tokens of two or more characters, which would drop single-character words
    # from the vocabulary -- confirm this is intended for this corpus.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)

    # Compute the tf-idf weight of every term.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)

    # Vocabulary of the bag-of-words model. get_feature_names() is gone in
    # newer scikit-learn, so prefer get_feature_names_out() when it exists.
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = vectorizer.get_feature_names_out()
    else:
        word = vectorizer.get_feature_names()

    for i in range(tfidf.shape[0]):
        print('---------这里输出第', i+1, '篇文本的词语 tf-idf--------')
        # Densify one row at a time rather than the whole matrix.
        row_weights = tfidf[i].toarray().ravel()
        df = pd.DataFrame(row_weights, index=word, columns=['tfidf'])
        df = df.sort_values(by=['tfidf'], ascending=False)
        top_k_arr = np.array(df[:top_k].reset_index())
        for j in top_k_arr:
            print(j)
    return corpus

In [5]:
# Sample corpus file (1103 entries according to the original note).
data_path = './data/summary_sample.csv'

# Read with explicit column names, turn the index into a regular 'index'
# column, then keep only the first 10 rows.
data = (
    pd.read_csv(data_path, names=['title', 'abstract'])
    .reset_index()
    .iloc[:10]
)
data.head()

Unnamed: 0,index,title,abstract
0,0,可穿戴技术十大设计原则,本文总结了十个可穿戴产品的设计原则，而这些原则，同样也是笔者认为是这个行业最吸引人的地方：1...
1,1,经济学人：智能手机将成为“真正的个人电脑”,2007年乔布斯向人们展示iPhone并宣称“它将会改变世界”，还有人认为他在夸大其词，然而...
2,2,雅虎宣布剥离阿里巴巴股份,雅虎发布2014年第四季度财报，并推出了免税方式剥离其持有的阿里巴巴集团15％股权的计划，打...
3,3,51信用卡管家，预计2015年放贷额度远超30亿,2014年，51信用卡管家跟宜信等P2P公司合作，推出线上信贷产品“瞬时贷”，其是一种纯在线...
4,4,如何选择正确的编程语言进行学习,目前世界上有着几百种编程语言，我应该学哪个?如何选择“正确”的编程语言进行学习?我所学的语言...


In [6]:
# Print the 10 highest tf-idf terms per document; the return value is the preprocessed corpus.
raw_data = get_keywords_tfidf(data, top_k=10)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\mfx66\AppData\Local\Temp\jieba.cache
Loading model cost 0.894 seconds.
Prefix dict has been built successfully.


---------这里输出第 1 篇文本的词语 tf-idf--------


NameError: name 'wt' is not defined

------------

**作者：** Daniel Meng

**GitHub：** [LibertyDream](https://github.com/LibertyDream)

**博客：** [明月轩](https://libertydream.github.io/)