# 1. 数据采集

In [None]:
import requests

In [None]:
import pandas as pd
import json

In [None]:
import datetime
from tqdm.notebook import tqdm

In [None]:
from fake_useragent import UserAgent

In [None]:
ua = UserAgent()

In [None]:
headers = {"User-Agent": ua.random}

In [None]:
headers

### 1.1 数据来源

https://www.enlightent.cn/research/rank/weiboSearchRank

In [None]:
hotsearch_url = "https://www.enlightent.cn/research/top/getWeiboHotSearchDayAggs.do"

In [None]:
# Form fields posted to `hotsearch_url`.
# NOTE(review): "t" and "accessToken" look like values captured from a browser
# session — confirm whether they expire before relying on them.
data = {
    "type": "realTimeHotSearchList",
    "t": "1366368047",
    "accessToken": "lgYyzbiLF/SBZo+HvxKEDKCN87th6Bfb9pnwZUGPfcj8P/W4TH2Z+BdxMuj2JqLcAliCQVgnsYfQcvfnehhhBA==",
    "date": "2019/01/01",
}

In [None]:
# One-off request against the endpoint using the sample `data` payload.
# The timeout matters: without one, `requests` waits indefinitely on a
# stalled connection.
response = requests.post(
    hotsearch_url,
    headers=headers,
    data=data,
    timeout=30,
)

In [None]:
response.status_code

In [None]:
# Crawl window: start of 2019 through today's date.
date_start = datetime.date(year=2019, month=1, day=1)
# For a quick trial run, use date_end = datetime.date(2019, 1, 2) instead.
date_end = datetime.date.today()

In [None]:
# Fetch the hot-search list for every day from date_start to date_end
# (inclusive) and store the decoded JSON body under its "YYYY/MM/DD" key.
raw_response_dict = {}

for i in tqdm(range((date_end - date_start).days + 1)):

    date = date_start + datetime.timedelta(days=i)
    date_str = date.strftime("%Y/%m/%d")
    # Build the payload per request from the `data` template so the template
    # itself is never mutated.
    payload = dict(data, date=date_str)

    # A fresh random User-Agent per request; the timeout keeps one stalled
    # connection from hanging the whole crawl.
    response = requests.post(
        hotsearch_url,
        headers={"User-Agent": ua.random},
        data=payload,
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of storing an error page as data.
    response.raise_for_status()
    raw_response_dict[date_str] = response.json()
    

# 2. 数据整理

In [None]:
# Flatten the per-day responses into one list of records, tagging each record
# with the date key it was stored under.
# assumes each day's response is a list of dicts — TODO confirm against the API
response_list = []

for date_str, day_records in raw_response_dict.items():
    for record in day_records:
        record["date"] = date_str
        response_list.append(record)

In [None]:
len(response_list)

In [None]:
hot_df = pd.DataFrame(response_list)

In [None]:
# Parse the "YYYY/MM/DD" strings in a single vectorised call rather than
# invoking pd.to_datetime once per row.
hot_df["date"] = pd.to_datetime(hot_df["date"], format="%Y/%m/%d")

In [None]:
hot_df.head()

In [None]:
# Save the table; index=False (the documented boolean form) leaves the row
# index out of the file.
hot_df.to_csv("weibo_hotsearch_2019_with_date.csv", index=False)

In [None]:
hot_df = pd.read_csv("weibo_hotsearch_2019_with_date.csv")

In [None]:
hot_df

# 3. 文本分析

### 3.1 人名提取

In [None]:
import jieba
import jieba.posseg as pseg

In [None]:
# Segment a sample headline and collect the pieces yielded by jieba.cut into a list.
seg_list = list(jieba.cut("张若昀唐艺昕咋还不结婚"))

In [None]:
seg_list

In [None]:
# Part-of-speech tagging of the same sample: one (word, tag) tuple per token.
pseg_list = [(word, flag) for word, flag in pseg.cut("张若昀唐艺昕咋还不结婚")]

In [None]:
# POS tags — nr: person name, r: pronoun, d: adverb, v: verb
pseg_list

In [None]:
jieba.add_word("李现", tag="nr")

In [None]:
jieba.add_word("华晨宇", tag="nr")

In [None]:
#jieba.add_word("", tag="")

In [None]:
def take_nr(text):
    """Return the words in *text* tagged "nr" (person name) that are longer than one character.

    The text is tagged with ``pseg.cut``; matches are returned in the order
    they occur.
    """
    names = []
    for word, flag in pseg.cut(text):
        if flag == "nr" and len(word) > 1:
            names.append(word)
    return names

In [None]:
take_nr("张若昀唐艺昕咋还不结婚")

In [None]:
# First person name found in each keyword, or None when there is none.
# next() with a default lets take_nr run once per row instead of twice.
hot_df["cele_0"] = hot_df["keyword"].apply(lambda x: next(iter(take_nr(x)), None))

In [None]:
def _second_or_none(names):
    """Return the second element of *names*, or None if it has fewer than two."""
    return names[1] if len(names) > 1 else None


# Second person name found in each keyword; take_nr is evaluated once per row
# and the resulting list is then inspected.
hot_df["cele_1"] = hot_df["keyword"].apply(take_nr).apply(_second_or_none)

In [None]:
hot_df.head()

![支付宝账号](zhifubao_id.jpeg)

In [None]:
hot_df.info()

In [None]:
# Save the table with the extracted names; index=False (the documented boolean
# form) leaves the row index out of the file.
hot_df.to_csv("weibo_hotsearch_2019_names_extracted.csv", index=False)

In [None]:
# Reload the saved table, parsing the "date" column back into datetimes.
# (The table has no "count" column at this point; "date" is the column that
# holds dates.)
hot_df = pd.read_csv("weibo_hotsearch_2019_names_extracted.csv", parse_dates=["date"])

### 3.2 动作提取

In [None]:
def take_v(text):
    """Return the words in *text* that ``pseg.cut`` tags exactly "v" (verb), in order."""
    return [word for word, flag in pseg.cut(text) if flag == "v"]

In [None]:
verbs_list = [take_v(x) for x in hot_df["keyword"]]

In [None]:
verbs_list

In [None]:
# Join each keyword's verbs with spaces, then join all of those strings into
# one space-separated text.
hotsearch_list = [" ".join(verbs) for verbs in verbs_list]
hotsearch_text = " ".join(hotsearch_list)

In [None]:
hotsearch_text[:500]

#### 3.2.1 词云绘制

In [None]:
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

In [None]:
import numpy as np

# Load the logo image as an array; it is passed to WordCloud as the mask.
weibo_mask = np.array(Image.open("weibo_logo.jpg"))

In [None]:
# The Windows font path is a raw string so its backslashes are never read as
# escape sequences; the resulting path is unchanged.
wordcloud = WordCloud(
    background_color='white',
    font_path=r'C:\Windows\Fonts\simhei.ttf',
    max_words=100,
    collocations=False,
    mask=weibo_mask,
    stopwords=STOPWORDS,
)

In [None]:
wordcloud.generate(text=hotsearch_text)

In [None]:
wordcloud.to_file('weibo_hotsearch_2019_keywords.jpg')

# 4. 2019“司图杯”微博热搜颁奖典礼

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc("font",family="SimHei",size="10")

### 4.1 上榜次数最多

In [None]:
hot_df["cele_0"].value_counts().head()

In [None]:
# "明星" is a generic word, not a person's name, so null it out.
# Series.mask sets the matching rows to NaN on every pandas version, whereas
# replace(x, None) on older pandas releases pads from the previous row instead.
hot_df["cele_0"] = hot_df["cele_0"].mask(hot_df["cele_0"] == "明星")

In [None]:
# Number of rows per name in cele_0. The output columns are pinned to
# "index" (name) and "cele_0" (count) because the labels produced by a bare
# reset_index() differ between pandas 1.x and 2.x.
hot_counts_df = (
    hot_df["cele_0"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="cele_0")
)

In [None]:
hot_counts_df.head()

In [None]:
# Bar chart of the counts in hot_counts_df, limited to rows whose count exceeds 100.
count_chart = sns.barplot(x="index", y="cele_0", data=hot_counts_df[hot_counts_df["cele_0"]>100])
# Re-apply the current x tick labels, rotated by 45 degrees.
count_chart.set_xticklabels(count_chart.get_xticklabels(), rotation=45)
# Bare string expression: it is not attached to the figure as a title.
"2019微博热搜上榜频率"

### 4.2 热搜指数最高

In [None]:
hot_df[hot_df["cele_0"].notnull()].groupby("cele_0")["searchCount"].sum().sort_values(ascending=False).head()

In [None]:
# Sum of searchCount per non-null cele_0 name, largest first, with the name
# restored as an ordinary column.
hot_searchCount_df = (
    hot_df[hot_df["cele_0"].notnull()]
    .groupby("cele_0")["searchCount"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of summed searchCount for the first 20 rows of hot_searchCount_df.
search_chart = sns.barplot(x="cele_0", y="searchCount", data=hot_searchCount_df[:20])
# Re-apply the current x tick labels, rotated by 45 degrees.
search_chart.set_xticklabels(search_chart.get_xticklabels(), rotation=45)
# Bare string expression: it is not attached to the figure as a title.
"2019微博热搜搜索指数"

### 4.3 平均排名最高

In [None]:
groupby_rank = hot_df[hot_df["cele_0"].notnull()].groupby("cele_0")["rank"]

In [None]:
# Restrict to names with more than 100 hot-search rows
# (boolean Series indexed by name, used to filter the per-name means).
idx_morethan100 = groupby_rank.count() > 100

In [None]:
groupby_rank.mean()[idx_morethan100].sort_values().head()

In [None]:
hot_rank_df = groupby_rank.mean()[idx_morethan100].sort_values().reset_index()

In [None]:
# Bar chart of the mean rank per name in hot_rank_df.
rank_chart = sns.barplot(x="cele_0", y="rank", data=hot_rank_df)
# Re-apply the current x tick labels, rotated by 45 degrees.
rank_chart.set_xticklabels(rank_chart.get_xticklabels(), rotation=45)
# Bare string expression: it is not attached to the figure as a title.
"2019微博热搜平均排名（100次热搜以上人物）"

### 4.4 最佳助攻（作为后出现的名字）

In [None]:
hot_df["cele_1"].value_counts().head()

In [None]:
# "辟谣" and "晋级" are ordinary words, not person names, so null them out.
# Series.mask sets the matching rows to NaN on every pandas version, whereas
# replace([...], None) on older pandas releases pads from the previous row.
hot_df["cele_1"] = hot_df["cele_1"].mask(hot_df["cele_1"].isin(["辟谣", "晋级"]))

In [None]:
# Number of rows per name in cele_1. The output columns are pinned to
# "index" (name) and "cele_1" (count) because the labels produced by a bare
# reset_index() differ between pandas 1.x and 2.x.
partner_df = (
    hot_df["cele_1"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="cele_1")
)

In [None]:
# Bar chart of the counts for the first 20 rows of partner_df.
partner_chart = sns.barplot(x="index", y="cele_1", data=partner_df[:20])
# Re-apply the current x tick labels, rotated by 45 degrees.
partner_chart.set_xticklabels(partner_chart.get_xticklabels(), rotation=45)
# Bare string expression: it is not attached to the figure as a title.
"2019微博热搜最佳助攻"

### 4.5 热搜同框

In [None]:
# Rows in which a second name was extracted, reduced to the two name columns.
# A single .loc selection plus .copy() yields an independent frame, so adding
# columns to cp_df afterwards does not raise SettingWithCopyWarning.
cp_df = hot_df.loc[hot_df["cele_1"].notnull(), ["cele_0", "cele_1"]].copy()

In [None]:
cp_df["count"] = 1

In [None]:
cp_df.head()

In [None]:
cp_df = cp_df.groupby(["cele_0", "cele_1"]).sum()

In [None]:
hotcp_dict = cp_df[cp_df["count"] > 5].to_dict()

In [None]:
hotcp_dict

In [None]:
# Turn each (first name, second name) -> count entry into a from/to/value record.
hotcp_list = [
    {"from": pair[0], "to": pair[1], "value": weight}
    for pair, weight in hotcp_dict["count"].items()
]
   

In [None]:
hotcp_list