# 爬取弹幕`xml`文件

In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
def crawl_danmaku(cid, timeout=10):
    """Download the danmaku XML for one ``cid`` and return the comment texts.

    Parameters
    ----------
    cid : str or int
        Identifier substituted into ``https://comment.bilibili.com/{cid}.xml``.
    timeout : float, optional
        Seconds handed to ``requests.get`` so that a stalled connection cannot
        block the notebook indefinitely.

    Returns
    -------
    list of str
        ``get_text()`` of every ``<d>`` element, in the order returned by
        ``soup.select('d')``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    requests.Timeout
        If the request does not complete within ``timeout`` seconds.
    """
    url = 'https://comment.bilibili.com/{}.xml'.format(cid)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page into an
    # (apparently valid) empty comment list.
    response.raise_for_status()

    # Pass the raw bytes rather than ``response.text``: ``.text`` is decoded
    # with the charset requests infers from the HTTP headers, which may not be
    # the encoding the XML itself declares. Given bytes, BeautifulSoup detects
    # the encoding from the document.
    soup = BeautifulSoup(response.content, 'lxml')
    return [node.get_text() for node in soup.select('d')]

# 获取关键词

In [4]:
import jieba
import jieba.analyse

In [5]:
def extract_keywords(danmaku_list, top=500):
    """Return the top keywords of a danmaku list as one space-separated string.

    Each comment is segmented with ``jieba.cut``; the segments of all comments
    are joined with single spaces into one text, which is handed to
    ``jieba.analyse.extract_tags``.

    Parameters
    ----------
    danmaku_list : iterable of str
        Comment texts.
    top : int, optional
        Passed as ``topK`` to ``extract_tags``.

    Returns
    -------
    str
        The extracted keywords joined by single spaces.
    """
    corpus = ' '.join(
        ' '.join(jieba.cut(comment)) for comment in danmaku_list
    )
    keywords = jieba.analyse.extract_tags(corpus, topK=top)
    return ' '.join(keywords)

In [6]:
def extract_keywords_ww(danmaku_list, top=500):
    """Return the top keywords of a danmaku list together with their weights.

    Works like ``extract_keywords`` but calls ``jieba.analyse.extract_tags``
    with ``withWeight=True`` and converts the resulting pairs into a dict.

    Parameters
    ----------
    danmaku_list : iterable of str
        Comment texts.
    top : int, optional
        Passed as ``topK`` to ``extract_tags``.

    Returns
    -------
    dict
        Mapping built with ``dict(...)`` from the (keyword, weight) pairs
        returned by ``extract_tags``.
    """
    segmented = []
    for comment in danmaku_list:
        segmented.append(' '.join(jieba.cut(comment)))
    corpus = ' '.join(segmented)

    weighted_pairs = jieba.analyse.extract_tags(corpus, topK=top, withWeight=True)
    return dict(weighted_pairs)

# 画出词云

In [7]:
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud, ImageColorGenerator

In [8]:
def gen_wordcloud(tags_sentence, mask, font, scale=4):
    """Build a word cloud from a keyword string, shaped and coloured by an image.

    Parameters
    ----------
    tags_sentence : str
        Text handed to ``WordCloud.generate``.
    mask : str
        Path of the image read with ``plt.imread``; the resulting array is used
        both as the ``mask`` of the cloud and as the colour source for
        ``ImageColorGenerator``.
    font : str
        Passed as ``font_path`` to ``WordCloud``.
    scale : int, optional
        Passed as ``scale`` to ``WordCloud``.

    Returns
    -------
    The object returned by ``WordCloud(...).generate``, after ``recolor`` has
    been called on it with the image-based colour function.
    """
    mask_pixels = plt.imread(mask)
    palette = ImageColorGenerator(mask_pixels)

    # random_state / max_words are fixed here rather than exposed as parameters.
    settings = {
        "background_color": "white",
        "mask": mask_pixels,
        "font_path": font,
        "random_state": 50,
        "max_words": 3000,
        "scale": scale,
    }
    cloud = WordCloud(**settings).generate(tags_sentence)
    cloud.recolor(color_func=palette)
    return cloud

In [9]:
def gen_wordcloud_ww(tags, mask, font, scale=4):
    """Build a word cloud from keyword weights, shaped and coloured by an image.

    Parameters
    ----------
    tags : dict
        Keyword-to-weight mapping handed to
        ``WordCloud.generate_from_frequencies``.
    mask : str
        Path of the image read with ``plt.imread``; the resulting array is used
        both as the ``mask`` of the cloud and as the colour source for
        ``ImageColorGenerator``.
    font : str
        Passed as ``font_path`` to ``WordCloud``.
    scale : int, optional
        Passed as ``scale`` to ``WordCloud``.

    Returns
    -------
    The object returned by ``WordCloud(...).generate_from_frequencies``, after
    ``recolor`` has been called on it with the image-based colour function.
    """
    mask_pixels = plt.imread(mask)
    palette = ImageColorGenerator(mask_pixels)

    # random_state / max_words are fixed here rather than exposed as parameters.
    settings = {
        "background_color": "white",
        "mask": mask_pixels,
        "font_path": font,
        "random_state": 50,
        "max_words": 3000,
        "scale": scale,
    }
    cloud = WordCloud(**settings).generate_from_frequencies(tags)
    cloud.recolor(color_func=palette)
    return cloud

In [10]:
def save_wordcloud(wordCloud, output):
    """Write a word cloud to disk by calling ``wordCloud.to_file(output)``.

    Parameters
    ----------
    wordCloud
        Object exposing ``to_file``; in this notebook it is the value returned
        by ``gen_wordcloud`` / ``gen_wordcloud_ww``.
    output
        Destination handed unchanged to ``to_file`` (the notebook passes
        ``.png`` file names).

    Returns
    -------
    None
    """
    wordCloud.to_file(output)

In [11]:
def draw_wordcloud(wordCloud, dpi=200):
    """Show a word cloud in a new matplotlib figure with the axes hidden.

    Parameters
    ----------
    wordCloud
        Image-like object handed to ``plt.imshow``; in this notebook it is the
        value returned by ``gen_wordcloud`` / ``gen_wordcloud_ww``.
    dpi : int, optional
        Resolution passed to ``plt.figure``.

    Returns
    -------
    None
    """
    # A fresh figure per call keeps successive previews from sharing a canvas.
    plt.figure(dpi=dpi)
    plt.imshow(wordCloud)
    # Hide ticks and frame: only the picture itself is of interest.
    plt.axis("off")
    plt.show()

# 爬取弹幕实践：bilibili拜年祭

In [12]:
def crawl_and_plot(cid, output, mask, font='C://Windows/Fonts/msyh.ttc',
                   with_weight=True, scale=4, top_words=1000, preview=False):
    """Run the whole pipeline for one ``cid``: crawl, extract, render, save.

    A progress line is printed before each stage and completed with ``Done``
    once the stage has finished.

    Parameters
    ----------
    cid
        Passed to ``crawl_danmaku``.
    output
        Passed to ``save_wordcloud`` as the destination of the image.
    mask
        Passed to the word-cloud generator as the mask image path.
    font : str, optional
        Passed to the word-cloud generator as the font path; the default is a
        Windows system font location.
    with_weight : bool, optional
        When true, ``extract_keywords_ww`` and ``gen_wordcloud_ww`` are used;
        otherwise ``extract_keywords`` and ``gen_wordcloud``.
    scale : int, optional
        Passed to the word-cloud generator.
    top_words : int, optional
        Number of keywords requested from the extractor.
    preview : bool, optional
        When true, ``draw_wordcloud`` is called before the image is saved.

    Returns
    -------
    None
    """
    print('Crawling danmaku list...', end=' ')
    comments = crawl_danmaku(cid)
    print('Done')

    print('Extracting top {} words...'.format(top_words), end=' ')
    extract = extract_keywords_ww if with_weight else extract_keywords
    keywords = extract(comments, top_words)
    print('Done')

    print('Generating wordcloud...', end=' ')
    # The renderer must match the extractor: weights dict vs. plain string.
    render = gen_wordcloud_ww if with_weight else gen_wordcloud
    cloud = render(keywords, mask, font, scale)
    print('Done')

    print('Drawing and saving wordcloud...', end=' ')
    if preview:
        draw_wordcloud(cloud)
    save_wordcloud(cloud, output)
    print('Done')

2018年拜年祭弹幕`cid`：

- 1P: `32005501`
- 2P: `32005515`
- 3P: `32005584`
- 4P: `32005717`
- 5P: `32005720`

In [13]:
from PIL import Image

In [29]:
# Split the source picture into four quadrant images, saved as
# ganbei_1.jpg (top-left), ganbei_2.jpg (top-right),
# ganbei_3.jpg (bottom-left) and ganbei_4.jpg (bottom-right).
with Image.open('干杯2.jpg') as im:
    width, height = im.size

    # Integer midpoints keep every crop box on exact pixel boundaries instead
    # of relying on how fractional coordinates get rounded.
    mid_x, mid_y = width // 2, height // 2

    # (left, upper, right, lower) boxes. The right and bottom tiles run to the
    # real image edge, so no pixel is dropped when a dimension is odd.
    quadrants = [
        (0, 0, mid_x, mid_y),
        (mid_x, 0, width, mid_y),
        (0, mid_y, mid_x, height),
        (mid_x, mid_y, width, height),
    ]

    for index, box in enumerate(quadrants, start=1):
        im.crop(box).save('ganbei_{}.jpg'.format(index))

In [32]:
# Render one unweighted word cloud per part of the 2018 festival video.
# The four lists are parallel: part number, part title, comment id, mask image.
pnums = range(1, 6)
pnames = ['汪', '王', '财', '往', '旺']
cids = ['32005501', '32005515', '32005584', '32005717', '32005720']
masks = ['ganbei_1.jpg', 'ganbei_2.jpg', 'bilibili22.jpg', 'ganbei_3.jpg', 'ganbei_4.jpg']

for pnum, pname, cid, mask in zip(pnums, pnames, cids, masks):
    # The label serves both as the progress line and as the output file stem.
    pstring = 'P{}({}, cid={})'.format(pnum, pname, cid)
    print(pstring)
    crawl_and_plot(
        cid,
        output='{}.png'.format(pstring),
        mask=mask,
        with_weight=False,
        scale=8,
        top_words=500,
    )

P1(汪, cid=32005501)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P2(王, cid=32005515)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P3(财, cid=32005584)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P4(往, cid=32005717)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P5(旺, cid=32005720)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done


In [33]:
# Render one weighted word cloud per part of the 2018 festival video; the
# output files carry a "WW_" prefix.
# The four lists are parallel: part number, part title, comment id, mask image.
pnums = range(1, 6)
pnames = ['汪', '王', '财', '往', '旺']
cids = ['32005501', '32005515', '32005584', '32005717', '32005720']
masks = ['ganbei_1.jpg', 'ganbei_2.jpg', 'bilibili22.jpg', 'ganbei_3.jpg', 'ganbei_4.jpg']

for pnum, pname, cid, mask in zip(pnums, pnames, cids, masks):
    # The label serves both as the progress line and as the output file stem.
    pstring = 'P{}({}, cid={})'.format(pnum, pname, cid)
    print(pstring)
    crawl_and_plot(
        cid,
        output='WW_{}.png'.format(pstring),
        mask=mask,
        with_weight=True,
        scale=8,
        top_words=500,
    )

P1(汪, cid=32005501)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P2(王, cid=32005515)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P3(财, cid=32005584)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P4(往, cid=32005717)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
P5(旺, cid=32005720)
Crawling danmaku list... Done
Extracting top 500 words... Done
Generating wordcloud... Done
Drawing and saving wordcloud... Done
