# 爬取弹幕`xml`文件

In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
def crawl_danmaku(cid, timeout=10):
    """Download a danmaku XML document and return the text of its comments.

    Args:
        cid: Value substituted into ``https://comment.bilibili.com/{cid}.xml``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        A list holding the text of every ``<d>`` element in the document,
        in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    url = 'https://comment.bilibili.com/{}.xml'.format(cid)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    wb_data = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were danmaku.
    wb_data.raise_for_status()

    # Hand BeautifulSoup the raw bytes rather than ``wb_data.text``: when the
    # response has a text/* content type without a charset, requests decodes
    # ``.text`` as ISO-8859-1, which garbles non-ASCII comments. With bytes,
    # BeautifulSoup detects the encoding from the document itself.
    soup = BeautifulSoup(wb_data.content, 'lxml')

    return [element.get_text() for element in soup.select('d')]

# 获取关键词

In [4]:
import jieba
import jieba.analyse

In [5]:
def extract_keywords(danmaku_list, top=200):
    """Return the highest-weighted keywords found in a list of danmaku.

    Args:
        danmaku_list: Iterable of comment strings.
        top: Maximum number of keywords to keep.

    Returns:
        A dict mapping each keyword to the weight reported by
        ``jieba.analyse.extract_tags``.
    """
    # Segment every comment on its own, then merge everything into a single
    # space-separated text that the keyword extractor can rank as a whole.
    corpus = ' '.join(' '.join(jieba.cut(comment)) for comment in danmaku_list)

    weighted_tags = jieba.analyse.extract_tags(corpus, topK=top, withWeight=True)
    return dict(weighted_tags)

# 画出词云

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud, ImageColorGenerator

In [7]:
def gen_wordcloud(tags, mask, font, scale=4):
    """Build a word cloud shaped and coloured after a mask image.

    Args:
        tags: Mapping of word to frequency/weight, passed to
            ``WordCloud.generate_from_frequencies``.
        mask: Path of the image read with ``plt.imread``; it is used both as
            the cloud's mask and as the source of its colours.
        font: Path of the font file used to render the words.
        scale: Scaling factor forwarded to ``WordCloud``.

    Returns:
        The generated and recoloured ``WordCloud`` instance.
    """
    # NOTE(review): plt.imread may return a float array for PNG files, while
    # wordcloud documents masks as 0-255 integer images; confirm mask format.
    mask_pixels = plt.imread(mask)
    palette = ImageColorGenerator(mask_pixels)

    cloud = WordCloud(
        background_color="white",
        mask=mask_pixels,
        font_path=font,
        random_state=50,  # fixed seed so repeated runs give the same layout
        max_words=3000,
        scale=scale,
    )
    cloud.generate_from_frequencies(tags)

    # Repaint each word with the colour of the mask image underneath it.
    cloud.recolor(color_func=palette)
    return cloud

In [8]:
def save_wordcloud(wordCloud, output):
    """Write the rendered word cloud to the image file at ``output``.

    Args:
        wordCloud: Object exposing ``to_file`` (a generated ``WordCloud``).
        output: Destination path handed to ``to_file``.
    """
    destination = output
    wordCloud.to_file(destination)

In [9]:
def draw_wordcloud(wordCloud, dpi=200):
    """Display the word cloud in a new matplotlib figure without axes.

    Args:
        wordCloud: Image-like object accepted by ``plt.imshow``.
        dpi: Resolution of the figure that is created for the preview.
    """
    # A dedicated figure keeps the preview independent of any open plot.
    plt.figure(dpi=dpi)
    plt.imshow(wordCloud)
    # Axes and ticks carry no meaning for a word cloud, so hide them.
    plt.axis("off")
    plt.show()

# 爬取弹幕实践：bilibili拜年祭

In [10]:
def crawl_and_plot(cid, output, mask, font='C://Windows/Fonts/msyh.ttc', scale=4, top_words=1000, preview=False):
    """Crawl the danmaku for ``cid`` and save a word cloud of its keywords.

    Runs the whole pipeline: download the comments, extract the top keywords,
    generate the word cloud, optionally preview it, and write it to disk.
    A progress message is printed before and after every step.

    Args:
        cid: Identifier passed to ``crawl_danmaku``.
        output: Path of the image file the word cloud is saved to.
        mask: Path of the mask image passed to ``gen_wordcloud``.
        font: Path of the font file used to render the words.
        scale: Scaling factor passed to ``gen_wordcloud``.
        top_words: Number of keywords to extract from the comments.
        preview: When true, show the word cloud before saving it.
    """
    # flush=True makes each "..." message visible while its step is still
    # running; without a newline a line-buffered stdout would hold it back
    # until the matching "Done" is printed.
    print('Crawling danmaku list...', end=' ', flush=True)
    danmaku_list = crawl_danmaku(cid)
    print('Done')

    print('Extracting top {} words...'.format(top_words), end=' ', flush=True)
    tags_sentence = extract_keywords(danmaku_list, top_words)
    print('Done')

    print('Generating wordcloud...', end=' ', flush=True)
    wordCloud = gen_wordcloud(tags_sentence, mask, font, scale)
    print('Done')

    print('Drawing and saving wordcloud...', end=' ', flush=True)
    if preview:
        draw_wordcloud(wordCloud)
    save_wordcloud(wordCloud, output)
    print('Done')