## 读取文件

In [47]:
import os
from os.path import join

# Input files live in the "Data" folder under the current working directory.
data_path = join(os.getcwd(), "Data")

# Read the whole document as UTF-8 text. The context manager closes the
# handle even if read() raises, which the manual open()/close() pair did not.
with open(join(data_path, '2020年中央一号文件.txt'), "r", encoding="utf-8") as f:
    t = f.read()

## 分词与词频统计

In [48]:
import jieba
import jieba.analyse as analyse

# Register the custom term BEFORE segmenting. suggest_freq tunes jieba's
# dictionary, so it can only influence segmentation calls made afterwards;
# calling it after lcut() left `ls`, `txt` and the counts unaffected.
jieba.suggest_freq('十九大', True)

# Segment the document into a list of tokens.
ls = jieba.lcut(t)
# Space-joined, unfiltered token stream (used later as word-cloud input).
txt = " ".join(ls)

# Drop single-character tokens and stop words.
stopwords = ['2020']
ls = [x for x in ls if len(x) > 1 and x not in stopwords]

# Keyword extraction based on the TF-IDF algorithm (top 20, weights omitted).
kw = "  ".join(analyse.extract_tags(t, topK=20, withWeight=False, allowPOS=()))

# Word-frequency count. `ls` no longer contains one-character tokens, so no
# additional length check is needed inside the loop.
counts = {}
for word in ls:
    counts[word] = counts.get(word, 0) + 1
# (word, count) pairs, most frequent first; the sort is stable, so ties keep
# their first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

In [49]:
# 导出词频结果
import numpy as np
import pandas as pd

# Create the output folder if needed so the export also works on a fresh
# checkout; to_csv does not create missing parent directories.
os.makedirs("./Result", exist_ok=True)

# One row per word, in the same order as `items`.
df = pd.DataFrame(items, columns=['关键词', '频次'])
# index=False omits the row index column; utf_8_sig prepends a BOM
# (presumably so spreadsheet tools detect the encoding - confirm).
df.to_csv("./Result/01_词频统计结果.csv", index=False, encoding='utf_8_sig')

In [50]:
# Preview the first ten rows of the word-frequency table.
df.head(10)

Unnamed: 0,关键词,频次
0,农村,80
1,建设,48
2,农业,44
3,乡村,43
4,加强,40
5,工作,39
6,推进,33
7,脱贫,31
8,服务,25
9,全面,24


## wordcloud 库绘制词云图

In [52]:
import wordcloud

# Configure the static word cloud. The SimHei font file is passed explicitly
# (presumably so that Chinese glyphs render - confirm the font ships with the repo).
w = wordcloud.WordCloud(
    font_path="./Font/simhei.ttf",
    width=1000,
    height=700,
    background_color="white",
)
# Build the cloud from the space-separated token text and save it as a PNG.
w.generate(txt).to_file("./Result/02_wordcloud.png")

<wordcloud.wordcloud.WordCloud at 0x1c74734f1c8>

## Pyecharts 绘制动态词云

In [53]:
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType


def wordcloud_diamond(words=None, top_n: int = 100) -> WordCloud:
    """Build a diamond-shaped pyecharts word cloud.

    Args:
        words: Sequence of ``(word, count)`` pairs to draw. When omitted, the
            notebook-level ``items`` list is used, so the original
            zero-argument call keeps working.
        top_n: Number of leading pairs to plot (default 100).

    Returns:
        The configured ``WordCloud`` chart; the caller decides where to
        render it.
    """
    if words is None:
        # Fall back to the frequency list computed earlier in the notebook.
        words = items
    words = list(words)[:top_n]
    c = (
        WordCloud()
        .add("", words, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
        .set_global_opts(title_opts=opts.TitleOpts(title="2020中央一号文件词云"))
    )
    return c


# Write the interactive chart to an HTML file; the cell output shows the
# path string returned by render().
wordcloud_diamond().render('./Result/03_2020中央一号文件词云图.html')

'C:\\Users\\mudaozi\\Documents\\WeChatPlatform\\2020中央一号文件说了啥？\\Result\\03_2020中央一号文件词云图.html'