# 0. Web Crawler

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
base_url = 'https://weibo.com/liuxiaolingtong'

In [None]:
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2727.400'}

In [None]:
response = requests.get(base_url, headers=headers)

In [None]:
response.status_code

In [None]:
response.text

## Selenium

In [None]:
from selenium import webdriver

In [None]:
chrome_options = webdriver.ChromeOptions()
# Request headless mode through a command-line switch: the `headless` attribute
# setter is deprecated/removed in recent Selenium releases, whereas
# add_argument() is available in both Selenium 3 and 4.
chrome_options.add_argument('--headless')

In [None]:
# Raw string: the Windows path contains backslashes (\C, \D, \c) that are not
# valid escape sequences and trigger an invalid-escape warning in a normal
# string literal. The resulting path value is unchanged.
driver = webdriver.Chrome(executable_path=r'G:\Code Python\Data Science\chromedriver.exe', options=chrome_options)

In [None]:
driver.get(base_url)

In [None]:
driver.current_url

In [None]:
html_doc = driver.page_source

In [None]:
soup = BeautifulSoup(html_doc, 'html5lib')

In [None]:
feeds = soup.find_all(class_='WB_feed_detail')

In [None]:
len(feeds)

## Mobile

In [None]:
mobile_url = 'https://m.weibo.cn/u/1211441627'

In [None]:
driver.get(mobile_url)

In [None]:
html_doc = driver.page_source

In [None]:
soup = BeautifulSoup(html_doc, 'lxml')

In [None]:
# Count the elements that carry BOTH the "card" and "m-panel" classes.
# Passing a string as `attrs` together with `class_` does not combine the two
# filters (the `class_` keyword takes precedence), so a CSS selector is used
# to require both classes explicitly.
len(soup.select('.card.m-panel'))

In [None]:
# JavaScript run through driver.execute_async_script(): it issues the XHR for
# page 3 of the timeline from inside the browser session.
# execute_async_script() passes a completion callback as the LAST argument and
# blocks until it is invoked, so the script must call it once the request
# finishes (with the response body, or null on a network error); otherwise the
# call can only end with a script timeout.
js_load_page = '''
var done = arguments[arguments.length - 1];
xmlhttp = new XMLHttpRequest();
xmlhttp.open("GET","https://m.weibo.cn/api/container/getIndex?type=uid&value=1211441627&containerid=1076031211441627&page=3",true);
xmlhttp.setRequestHeader("Accept", "application/json, text/plain, */*");
xmlhttp.setRequestHeader("MWeibo-Pwa", "1");
xmlhttp.setRequestHeader("X-Requested-With", "XMLHttpRequest");
xmlhttp.onload = function () { done(xmlhttp.responseText); };
xmlhttp.onerror = function () { done(null); };
xmlhttp.send(null);
'''

In [None]:
driver.execute_async_script(script=js_load_page)

## Mobile + XHR + JSON

In [None]:
xhr_url = 'https://m.weibo.cn/api/container/getIndex'

params = {'type':'uid',
          'value':'1211441627',
          'containerid':'1076031211441627',
          'page':1}

In [None]:
response = requests.get(xhr_url, params=params)

In [None]:
feed_dict = response.json()

In [None]:
feed_dict['ok']

In [None]:
sample_feed = feed_dict['data']['cards'][2]

In [None]:
sample_feed['mblog']['text']

In [None]:
# A notebook cell only displays its last expression, so collect the three
# counters into one dict instead of evaluating them on separate lines.
{key: sample_feed['mblog'][key]
 for key in ('reposts_count', 'comments_count', 'attitudes_count')}

In [None]:
from tqdm import tqdm_notebook

In [None]:
posts = []

In [None]:
# Walk through the timeline pages and collect one row per regular post into
# `posts` as [id, created_at, text, reposts, comments, attitudes].
for page in tqdm_notebook(range(1, 1000)):
    params['page'] = page
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(xhr_url, params=params, timeout=10)
    feed_dict = response.json()

    print("\n>>>> PAGE: {} <<<<".format(page))

    # Stop at the first response whose 'ok' flag is not 1
    # (presumably the end of the timeline -- confirm against the API).
    if feed_dict['ok'] != 1:
        break

    # Iterate over the cards of the current page directly.
    for card in feed_dict['data']['cards']:
        # card_type 9 marks an ordinary post; every other card is skipped.
        if card['card_type'] != 9:
            continue

        mblog = card['mblog']
        mblog_id = mblog['id']
        date = mblog['created_at']
        text = mblog['text']
        reposts_count = mblog['reposts_count']
        comments_count = mblog['comments_count']
        attitudes_count = mblog['attitudes_count']

        print(">>>> {} {}<<<<".format(date, text[:30]))
        posts.append([mblog_id, date, text, reposts_count, comments_count, attitudes_count])

# 1. Data Cleaning

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Name the columns at construction time: assigning `.columns` afterwards raises
# a length-mismatch error when `posts` is empty (the frame then has 0 columns).
mblog_df = pd.DataFrame(posts, columns=['id', 'date', 'text', 'reposts', 'comments', 'like'])

In [None]:
mblog_df.head()

In [None]:
# Overwrite the date of the first three rows with one positional assignment.
# Chained indexing (df[:3]['date'] = ...) assigns to an intermediate slice and
# may leave mblog_df itself unchanged (pandas emits SettingWithCopyWarning).
mblog_df.iloc[:3, mblog_df.columns.get_loc('date')] = '11-14'

In [None]:
def complete_date(date, year=2018):
    """Prefix a year to date strings that come without one.

    Args:
        date: Date string taken from a post, e.g. '11-14' or '2017-11-14'.
        year: Year to prepend when `date` is shorter than six characters.
            Defaults to 2018, the value that used to be hard-coded.

    Returns:
        '<year>-<date>' for strings shorter than six characters, otherwise
        `date` unchanged.
    """
    # Strings shorter than six characters cannot hold a four-digit year plus
    # a month and day, so they are treated as lacking the year.
    if len(date) < 6:
        return '{}-{}'.format(year, date)

    return date

In [None]:
mblog_df['date'] = mblog_df['date'].apply(complete_date)

In [None]:
mblog_df['date'] = pd.to_datetime(mblog_df['date'])

In [None]:
#mblog_df = pd.read_csv('six_little_child_weibo_181115.csv', index_col='id')

In [None]:
# 'id' is already a regular column; also writing the default integer index
# under the label 'id' would put two 'id' columns in the CSV header.
mblog_df.to_csv('six_little_child_weibo_181115.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
soup = BeautifulSoup(mblog_df.iloc[1]['text'], 'html.parser')

In [None]:
at_dict = {}

In [None]:
def get_at(text):
    """Extract the accounts mentioned (@user) in one post.

    Side effect: increments the per-user counter in the module-level
    `at_dict` for every mention found.

    Args:
        text: HTML string of the post body.

    Returns:
        List of mentioned user names (without the leading '@'), in document
        order; empty when the post has no mention links.
    """
    at_list = []
    soup = BeautifulSoup(text, 'lxml')

    # Walk over every link in the post; a post without links yields no iterations.
    for a in soup.find_all('a'):
        link_text = a.text
        # Only links whose text STARTS with '@' are mentions. Testing with
        # `'@' in text` and then dropping the first character would mangle
        # links that merely contain an '@' somewhere else.
        if not link_text.startswith('@'):
            continue

        at_user = link_text[1:]
        at_list.append(at_user)
        at_dict[at_user] = at_dict.get(at_user, 0) + 1

    return at_list

In [None]:
mblog_df['at'] = mblog_df['text'].apply(get_at)

In [None]:
img_response = requests.get('http:'+'//h5.sinaimg.cn/m/emoticon/icon/others/h_woshou-9ec25c8391.png')

In [None]:
with open('woshou.png', 'wb') as f:
    f.write(img_response.content)

In [None]:
emoji_dict = {}

def get_emoji(text):
    """Extract the emoji used in one post and download unseen emoji images.

    Side effects: updates the per-emoji counter in the module-level
    `emoji_dict`; the first time an emoji is seen its image is fetched over
    HTTP and saved as 'emoji/<emoji_id>.png'.

    Args:
        text: HTML string of the post body.

    Returns:
        List of emoji identifiers (the image `alt` text without its first and
        last character), in document order.
    """
    import os

    emoji_list = []
    soup = BeautifulSoup(text, 'lxml')

    # Walk over the icon spans; a post without icons yields no iterations.
    for span in soup.find_all('span', class_='url-icon'):
        img = span.img
        # Only icons that wrap an <img> with an `alt` attribute are emoji;
        # a span without any <img> is skipped instead of raising.
        if img is None or 'alt' not in img.attrs:
            continue

        emoji_id = img['alt'][1:-1]
        emoji_list.append(emoji_id)

        if emoji_id in emoji_dict:
            emoji_dict[emoji_id] += 1
            continue

        emoji_dict[emoji_id] = 1

        # Add the scheme only when the source URL does not already carry one.
        emoji_src = img['src']
        if not emoji_src.startswith(('http://', 'https://')):
            emoji_src = 'http:' + emoji_src

        # Make sure the target directory exists before writing the image.
        os.makedirs('emoji', exist_ok=True)
        img_response = requests.get(emoji_src)
        with open('emoji/{}.png'.format(emoji_id), 'wb') as f:
            f.write(img_response.content)

    return emoji_list

In [None]:
mblog_df['emoji'] = mblog_df['text'].apply(get_emoji)

In [None]:
mblog_df.head()

In [None]:
at_df = pd.DataFrame.from_dict(columns=['counts'], orient='index', data=at_dict).sort_values('counts', ascending=False)

In [None]:
at_df.head()

In [None]:
emoji_df = pd.DataFrame.from_dict(columns=['counts'], orient='index', data=emoji_dict).sort_values('counts', ascending=False)

In [None]:
emoji_df.head(10)

# 2. Visualization

In [None]:
from pyecharts import Bar, Scatter

In [None]:
bar = Bar("六小龄童最常用的表情(Top 10)")

In [None]:
bar.add('Emoji', emoji_df[:10].index, emoji_df['counts'][:10], 
        xaxis_interval=0, xaxis_rotate=90, xaxis_label_textsize=18)

In [None]:
scatter = Scatter("六小龄童最常用的表情")

In [None]:
scatter.add('Emoji', emoji_df.index, emoji_df['counts'],
            xaxis_type='category', is_visualmap=True, visual_type='size', 
            visual_dimension=1,visual_range=[1,1600])

In [None]:
bar_at = Bar("六小龄童最常@的账号(Top 20)")

In [None]:
bar_at.add('@', at_df.index[:20], at_df['counts'][:20], 
            xaxis_interval=0, xaxis_rotate=30, xaxis_name_size=10)

# 3. WordCloud

In [None]:
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Strip the HTML markup from every post. Iterating over the column directly
# avoids one label lookup per row and also works when index labels repeat.
six_weibo_list = [BeautifulSoup(html, 'html.parser').text for html in mblog_df['text']]

In [None]:
# Join the posts with a newline so that the last word of one post cannot be
# merged with the first word of the next one during word segmentation.
six_weibo_texts = '\n'.join(six_weibo_list)

In [None]:
seg_gen = jieba.cut(six_weibo_texts, cut_all=False)

In [None]:
seg_list = [i for i in seg_gen]

In [None]:
seg_texts = ' '.join(seg_list)

In [None]:
from PIL import Image

In [None]:
monkey_mask=np.array(Image.open("images/wukong.jpg"))

In [None]:
stopwords=set(STOPWORDS) | {'网页', '链接', '微博', '博文'}

In [None]:
# Raw string for the Windows font path so that \W, \F and \s are not parsed as
# (invalid) escape sequences; the resulting path value is unchanged.
wordcloud = WordCloud(background_color='white', font_path=r'C:\Windows\Fonts\simhei.ttf',
                      max_words=200, collocations=False, mask=monkey_mask,
                      stopwords=stopwords)

In [None]:
wordcloud.generate(text=seg_texts)

In [None]:
wordcloud.to_file('six_wordcloud.jpg')

In [None]:
# Draw the word cloud on a single 10x10 axes, recoloured with the colours of
# the mask image, and hide the axis decorations.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))

image_colors = ImageColorGenerator(monkey_mask)
recoloured_cloud = wordcloud.recolor(color_func=image_colors)
ax.imshow(recoloured_cloud)
ax.axis("off")
plt.show()

# 4. Text Generation

In [None]:
from gensim.models import KeyedVectors