#### Reference Link: 
Google translate:  
https://py-googletrans.readthedocs.io/en/latest/   
https://blog.csdn.net/weixin_38819889/article/details/103602436  
https://zhuanlan.zhihu.com/p/390801784  
NPL sentiment anslysis: https://github.com/isnowfy/snownlp   
Altair tutorial: https://altair-viz.github.io/gallery/index.html  
Word cloud tutorial: https://www.datacamp.com/tutorial/wordcloud-python

In [1]:
from datetime import datetime
from googletrans import Translator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from snownlp import SnowNLP
import time
from matplotlib.colors import LinearSegmentedColormap

## 1. read and explore the data

In [2]:
# read the data
df = pd.read_csv('weibo_hot_search.csv')

df.head()

Unnamed: 0,Rank,Time,Hot Word in Chinese,Hot Word in English,Type Info in Chinese,Type Info in English,Hotness,Href
0,1,4/8/23 23:15,胃病转向胃癌有5个信号,Gastric disease has 5 signals to turn to gastr...,健康,health,1308773,https://s.weibo.com/weibo?q=%23%E8%83%83%E7%97...
1,2,4/8/23 23:15,林俊杰买虚拟地产浮亏91%,Lin Junjie bought a virtual real estate floati...,明星,star,1000365,https://s.weibo.com/weibo?q=%23%E6%9E%97%E4%BF...
2,3,4/8/23 23:15,这是现实版人在画中游,This is the real version of the person in the ...,其他,other,820829,https://s.weibo.com/weibo?q=%23%E8%BF%99%E6%98...
3,4,4/8/23 23:15,被闹钟惊醒对身体危害大,Wake up by the alarm clock is harmful to the body,健康,health,662430,https://s.weibo.com/weibo?q=%23%E8%A2%AB%E9%97...
4,5,4/8/23 23:15,罗云熙下海动图,Luo Yunxi goes down the sea,剧集,Drama,638454,https://s.weibo.com/weibo?q=%23%E7%BD%97%E4%BA...


In [3]:
df_type = df[['Type Info in English','Href']].groupby('Type Info in English').count()
df_type

Unnamed: 0_level_0,Href
Type Info in English,Unnamed: 1_level_1
Ceremony,69
Drama,248
Movie,88
Variety show,133
games,90
health,299
music,130
other,544
sports,116
star,411


In [4]:
len(df)

2173

In [5]:
df.dtypes

Rank                    object
Time                    object
Hot Word in Chinese     object
Hot Word in English     object
Type Info in Chinese    object
Type Info in English    object
Hotness                 object
Href                    object
dtype: object

In [6]:
# select the columns that related to sentiment analysis
df_sentiments = df[['Rank','Time','Hot Word in Chinese','Hot Word in English','Type Info in English','Hotness']]

df_sentiments.head()

Unnamed: 0,Rank,Time,Hot Word in Chinese,Hot Word in English,Type Info in English,Hotness
0,1,4/8/23 23:15,胃病转向胃癌有5个信号,Gastric disease has 5 signals to turn to gastr...,health,1308773
1,2,4/8/23 23:15,林俊杰买虚拟地产浮亏91%,Lin Junjie bought a virtual real estate floati...,star,1000365
2,3,4/8/23 23:15,这是现实版人在画中游,This is the real version of the person in the ...,other,820829
3,4,4/8/23 23:15,被闹钟惊醒对身体危害大,Wake up by the alarm clock is harmful to the body,health,662430
4,5,4/8/23 23:15,罗云熙下海动图,Luo Yunxi goes down the sea,Drama,638454


In [7]:
# Define functions to get the sentiment score and keywords of the text
def get_sentiment_and_keywords(text):
    s = SnowNLP(text)
    sentiment = s.sentiments
    words = s.words 
    keywords = s.keywords(3)
    return pd.Series([sentiment, words, keywords], index=['Sentiment Score', 'Words', 'Keywords'])

In [8]:
# Apply the get_sentiment_and_keywords function to each text in the Hot Word in Chinese column
df_sentiments = df_sentiments.copy()
df_sentiments[['Sentiment Score', 'Words','Keywords']] = df_sentiments['Hot Word in Chinese'].apply(lambda x: get_sentiment_and_keywords(x))

df_sentiments.head(10)

Unnamed: 0,Rank,Time,Hot Word in Chinese,Hot Word in English,Type Info in English,Hotness,Sentiment Score,Words,Keywords
0,1,4/8/23 23:15,胃病转向胃癌有5个信号,Gastric disease has 5 signals to turn to gastr...,health,1308773,0.523398,"[胃病, 转向, 胃癌, 有, 5, 个, 信号]","[信号, 转向, 胃癌]"
1,2,4/8/23 23:15,林俊杰买虚拟地产浮亏91%,Lin Junjie bought a virtual real estate floati...,star,1000365,0.863452,"[林, 俊杰, 买, 虚拟, 地产, 浮, 亏, 91%]","[虚拟, 地产, 买]"
2,3,4/8/23 23:15,这是现实版人在画中游,This is the real version of the person in the ...,other,820829,0.926068,"[这, 是, 现实, 版, 人, 在, 画, 中, 游]","[画, 版, 中]"
3,4,4/8/23 23:15,被闹钟惊醒对身体危害大,Wake up by the alarm clock is harmful to the body,health,662430,0.659852,"[被闹, 钟, 惊醒, 对, 身体, 危害, 大]","[身体, 危害, 钟]"
4,5,4/8/23 23:15,罗云熙下海动图,Luo Yunxi goes down the sea,Drama,638454,0.997661,"[罗, 云, 熙, 下海, 动, 图]","[云, 动, 下海]"
5,6,4/8/23 23:15,原来胃是这样被弄坏的,It turns out that the stomach is damaged like ...,health,544211,0.152585,"[原来, 胃, 是, 这样, 被, 弄坏, 的]","[弄坏, 胃]"
6,7,4/8/23 23:15,再就业男团红到什么水平了,What is the level of re -employment men's grou...,Variety show,514613,0.72904,"[再, 就业, 男团, 红, 到, 什么, 水平, 了]","[就业, 红, 男团]"
7,8,4/8/23 23:15,你觉得工资越高越幸福吗,"Do you think the higher the salary, the happie...",vote,510240,0.960481,"[你, 觉得, 工资, 越, 高, 越, 幸福, 吗]","[幸福, 高, 越]"
8,9,4/8/23 23:15,懒羊羊在第一季第十一集出门忘带羊角,Lazy sheep goes out in the eleventh episode of...,Drama,449985,0.96402,"[懒, 羊羊, 在, 第一, 季, 第十一, 集, 出门, 忘, 带, 羊, 角]","[第十一, 出门, 集]"
9,10,4/8/23 23:15,王宝强回归综艺效果,Wang Baoqiang returns to variety show,Variety show,440473,0.989406,"[王, 宝强, 回归, 综艺, 效果]","[回归, 宝强, 效果]"


## 2. data cleaning

In [9]:
# drop NaN data
print(len(df_sentiments))
df_sentiments = df_sentiments.dropna()
print(len(df_sentiments))

2173
2172


In [10]:
# convert the 'Time' column to timestap
df_sentiments.Time[0]

'4/8/23 23:15'

In [11]:
# Convert Time column to pandas timestamp type
df_sentiments['Time'] = pd.to_datetime(df_sentiments['Time'])
df_sentiments.Time[0]

Timestamp('2023-04-08 23:15:00')

In [12]:
# If there is an empty string or other value in the Hotness column that cannot be converted to an integer, 
# you can try to convert it using the pd.to_numeric method.
# For example, you can use the following code to replace a value that cannot be converted to an integer with NaN:
df_sentiments['Hotness'] = pd.to_numeric(df_sentiments['Hotness'], errors='coerce').astype('Int64')

df_sentiments.Hotness[0]

1308773

## 3. Aggregate data by morning or afternoon of a day

In [13]:
# Use Grouper to group by time
grouped_time = df_sentiments.groupby(pd.Grouper(key='Time', freq='12H'))

# Aggregate the grouped data
grouped_morning_evening = grouped_time.agg({'Hotness': 'sum', 'Sentiment Score': 'mean'})
grouped_morning_evening.columns = ['Sum of Hotness', 'Average Sentiment Score']

# View Results
grouped_morning_evening

Unnamed: 0_level_0,Sum of Hotness,Average Sentiment Score
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-08 12:00:00,15865590,0.649828
2023-04-09 00:00:00,15470662,0.629696
2023-04-09 12:00:00,3059240,0.64194
2023-04-10 00:00:00,20275393,0.554922
2023-04-10 12:00:00,14280924,0.608875
2023-04-11 00:00:00,20647420,0.613309
2023-04-11 12:00:00,16254567,0.728962
2023-04-12 00:00:00,16547939,0.618528
2023-04-12 12:00:00,18175698,0.597663
2023-04-13 00:00:00,16880988,0.659408


In [14]:
grouped_time = df_sentiments.groupby(pd.Grouper(key='Time', freq='12H'))
# Aggregate the grouped data
grouped_morning_evening = grouped_time.agg({'Hotness': 'sum', 'Sentiment Score': 'mean','Hot Word in English':'sum'})
grouped_morning_evening.columns = ['Sum of Hotness', 'Average Sentiment Score', 'Sum of Hot Word in English']

# View Results
grouped_morning_evening = grouped_morning_evening.reset_index()
grouped_morning_evening

Unnamed: 0,Time,Sum of Hotness,Average Sentiment Score,Sum of Hot Word in English
0,2023-04-08 12:00:00,15865590,0.649828,Gastric disease has 5 signals to turn to gastr...
1,2023-04-09 00:00:00,15470662,0.629696,Chen MengChangyue Embers Makeup ArtistThese tr...
2,2023-04-09 12:00:00,3059240,0.64194,Li Kuncheng's deathZhang Yimou's first filming...
3,2023-04-10 00:00:00,20275393,0.554922,Are you willing to work overtime on May 1st?1 ...
4,2023-04-10 12:00:00,14280924,0.608875,SandstormZheng Xiaolong commented on Zhao Liyi...
5,2023-04-11 00:00:00,20647420,0.613309,The first -generation running man reunitedWang...
6,2023-04-11 12:00:00,16254567,0.728962,"After 00, the net red Ma Ruoshan died in a car..."
7,2023-04-12 00:00:00,16547939,0.618528,SeparateDon't wear shoes in the fitting roomCh...
8,2023-04-12 12:00:00,18175698,0.597663,Dig frequent nostrils or increase the risk of ...
9,2023-04-13 00:00:00,16880988,0.659408,Chen Yanxi asked who Chen Xiao is the murderer...


In [15]:
def get_time_of_day(ts):
    hour = ts.hour
    if hour < 12:
        return str(ts.date()) + ' a.m.'
    else:
        return str(ts.date()) + ' p.m.'

# Convert the data in the Time column to a string representation
grouped_morning_evening['Time'] = grouped_morning_evening['Time'].apply(get_time_of_day)

# View Results
grouped_morning_evening.head()

Unnamed: 0,Time,Sum of Hotness,Average Sentiment Score,Sum of Hot Word in English
0,2023-04-08 p.m.,15865590,0.649828,Gastric disease has 5 signals to turn to gastr...
1,2023-04-09 a.m.,15470662,0.629696,Chen MengChangyue Embers Makeup ArtistThese tr...
2,2023-04-09 p.m.,3059240,0.64194,Li Kuncheng's deathZhang Yimou's first filming...
3,2023-04-10 a.m.,20275393,0.554922,Are you willing to work overtime on May 1st?1 ...
4,2023-04-10 p.m.,14280924,0.608875,SandstormZheng Xiaolong commented on Zhao Liyi...


In [16]:
min(grouped_morning_evening['Sum of Hotness'])

1132291

In [17]:
import altair as alt
import seaborn as sns

source = grouped_morning_evening
brush = alt.selection(type='interval', encodings=['x'])

color_scale = alt.Scale(
    domain=(min(grouped_morning_evening['Sum of Hotness']), max(grouped_morning_evening['Sum of Hotness'])),
    scheme='blues',
    type="linear",
    nice=False,
    clamp=True
)

bars = alt.Chart().mark_bar().encode(
    x='Time:O',
    y=alt.Y('mean(Sum of Hotness):Q', axis=alt.Axis(title='Sum of Hotness')),
    color=alt.Color('Sum of Hotness:Q', scale=color_scale, legend=alt.Legend(title="Hotness", orient='top', labelFontSize=9, titleFontSize=10,symbolSize=100)),
    opacity=alt.condition(brush, alt.OpacityValue(1), alt.OpacityValue(0.3)),
).add_selection(
    brush
)

line = alt.Chart().mark_rule(color='firebrick').encode(
    y='mean(Sum of Hotness):Q',
    size=alt.SizeValue(3)
).transform_filter(
    brush
)

chart2 = alt.layer(bars, line, data=source, title={'text': "Overall Hotness Over Time", 'fontSize': 20})
chart2


In [18]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

out = widgets.Output()

# Define word cloud generation functions
def generate_wordcloud(text, time):
    """生成词云图"""
    fig, ax = plt.subplots(figsize=(15, 8))
    wordcloud = WordCloud(background_color='white', width=800, height=400).generate(text)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(time, fontsize=15)
    ax.axis('off')
    fig.suptitle("Word Cloud Over Time", fontsize=22)
    plt.show()

# Define sliders and event handlers
slider = widgets.IntSlider(min=0, max=grouped_morning_evening.shape[0]-1, step=1, value=0, 
                           description='Time Period', 
                           style={'handle_color': 'blue', 'font_size': '16pt'},
                           layout=widgets.Layout(width='80%', height='30px'),
                           continuous_update=True,
                           value_readout_format='d')

time_output = widgets.Output()
with time_output:
    print(grouped_morning_evening['Time'][slider.value])

def on_value_change(change):
    row_index = change['new']
    text = grouped_morning_evening['Sum of Hot Word in English'][row_index]
    time = grouped_morning_evening['Time'][row_index]
    with time_output:
        clear_output(wait=True)
        print(time)
    with out:
        clear_output(wait=True)
        generate_wordcloud(text, time)

slider.observe(on_value_change, names='value')

# Put the slider and output in the same container and style them to be horizontally centered and vertically aligned
container = widgets.VBox([time_output, slider, out], layout=widgets.Layout(justify_content='center', align_items='center'))
style = '<style>.widget-box{display: flex; justify-content: center;}</style>'
display(HTML(style))
display(container)

# Show initial word cloud map
on_value_change({'new': 0})


VBox(children=(Output(), IntSlider(value=0, description='Time Period', layout=Layout(height='30px', width='80%…

In [19]:
# Use Grouper to group by time and Type Info in English
grouped_time_type = df_sentiments.groupby([pd.Grouper(key='Time', freq='12H'), 'Type Info in English'])

# Aggregate the grouped data
grouped_morning_evening_type = grouped_time_type.agg({'Hotness': 'sum', 'Sentiment Score': 'mean'})
grouped_morning_evening_type.columns = ['Sum of Hotness', 'Average Sentiment Score']

# View Results
grouped_morning_evening_type

Unnamed: 0_level_0,Unnamed: 1_level_0,Sum of Hotness,Average Sentiment Score
Time,Type Info in English,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-04-08 12:00:00,Drama,2596649,0.783628
2023-04-08 12:00:00,Movie,451057,0.861646
2023-04-08 12:00:00,Variety show,1099451,0.604811
2023-04-08 12:00:00,games,155449,0.500000
2023-04-08 12:00:00,health,3072247,0.475543
...,...,...,...
2023-04-29 12:00:00,health,402773,0.539417
2023-04-29 12:00:00,music,507501,0.712462
2023-04-29 12:00:00,other,1164428,0.572278
2023-04-29 12:00:00,sports,346206,0.618609


In [20]:
grouped_morning_evening_type = grouped_morning_evening_type.reset_index()
grouped_morning_evening_type['Time'] = grouped_morning_evening_type['Time'].apply(get_time_of_day)
grouped_morning_evening_type

Unnamed: 0,Time,Type Info in English,Sum of Hotness,Average Sentiment Score
0,2023-04-08 p.m.,Drama,2596649,0.783628
1,2023-04-08 p.m.,Movie,451057,0.861646
2,2023-04-08 p.m.,Variety show,1099451,0.604811
3,2023-04-08 p.m.,games,155449,0.500000
4,2023-04-08 p.m.,health,3072247,0.475543
...,...,...,...,...
413,2023-04-29 p.m.,health,402773,0.539417
414,2023-04-29 p.m.,music,507501,0.712462
415,2023-04-29 p.m.,other,1164428,0.572278
416,2023-04-29 p.m.,sports,346206,0.618609


In [21]:
grouped_morning_evening_type.Time[:30]

0     2023-04-08 p.m.
1     2023-04-08 p.m.
2     2023-04-08 p.m.
3     2023-04-08 p.m.
4     2023-04-08 p.m.
5     2023-04-08 p.m.
6     2023-04-08 p.m.
7     2023-04-08 p.m.
8     2023-04-08 p.m.
9     2023-04-08 p.m.
10    2023-04-09 a.m.
11    2023-04-09 a.m.
12    2023-04-09 a.m.
13    2023-04-09 a.m.
14    2023-04-09 a.m.
15    2023-04-09 a.m.
16    2023-04-09 a.m.
17    2023-04-09 a.m.
18    2023-04-09 a.m.
19    2023-04-09 a.m.
20    2023-04-09 p.m.
21    2023-04-09 p.m.
22    2023-04-09 p.m.
23    2023-04-09 p.m.
24    2023-04-09 p.m.
25    2023-04-09 p.m.
26    2023-04-09 p.m.
27    2023-04-09 p.m.
28    2023-04-09 p.m.
29    2023-04-09 p.m.
Name: Time, dtype: object

In [22]:
hotness_sum = grouped_morning_evening_type['Sum of Hotness']
value_to_fill = max(hotness_sum) + min(hotness_sum)*50
# value_to_fill = 10000000
grouped_morning_evening_type = grouped_morning_evening_type.assign(bar_y=value_to_fill)
grouped_morning_evening_type

Unnamed: 0,Time,Type Info in English,Sum of Hotness,Average Sentiment Score,bar_y
0,2023-04-08 p.m.,Drama,2596649,0.783628,13633128
1,2023-04-08 p.m.,Movie,451057,0.861646,13633128
2,2023-04-08 p.m.,Variety show,1099451,0.604811,13633128
3,2023-04-08 p.m.,games,155449,0.500000,13633128
4,2023-04-08 p.m.,health,3072247,0.475543,13633128
...,...,...,...,...,...
413,2023-04-29 p.m.,health,402773,0.539417,13633128
414,2023-04-29 p.m.,music,507501,0.712462,13633128
415,2023-04-29 p.m.,other,1164428,0.572278,13633128
416,2023-04-29 p.m.,sports,346206,0.618609,13633128


In [23]:
import altair as alt

source = grouped_morning_evening_type.reset_index()

#cmap = colors.TwoSlopeNorm(vmin=0, vcenter=0.5, vmax=1)
color_scale = alt.Scale(
    domain=(0, 0.5, 1),
    range=["red", "yellow", "green"], # white
    type="linear",
    nice=False,
    clamp=True
)

bars = alt.Chart().mark_bar(opacity=0.6).encode(
    x=alt.X('Time:O', axis=alt.Axis(grid=True, gridDash=[4, 2]), scale=alt.Scale(paddingInner=0)),
    y=alt.Y('mean(bar_y):Q', axis=alt.Axis(title='Hotness')),
    color=alt.Color('mean(Average Sentiment Score):Q', scale=color_scale, legend=alt.Legend(title="Sentiment Score", orient='top', labelFontSize=9, titleFontSize=10,symbolSize=100)),
)

line = alt.Chart(source).mark_line().encode(
    x='Time:O',
    y='Sum of Hotness:Q',
    color='Type Info in English:N',
)

points = alt.Chart(source).mark_circle(size=50).encode(
    x='Time:O',
    y='Sum of Hotness:Q',
    color='Type Info in English:N',
)


chart1 = alt.layer(bars,line+points, data=source).properties(
    title={
        "text": ["Mood Changes Over Time"],
        "fontSize": 20,
        "anchor": "middle",
    }
)

# 定义config对象

chart1

## 4. Put the 3 figures above into one figure

In [24]:
config = alt.Config(
    view=alt.ViewConfig(
        width=800,
        height=200
    ),
    title=alt.TitleConfig(
        fontSize=20
    )
)    

combined_chart = alt.vconcat(chart2, chart1,spacing=30, config=config)
combined_chart.display()

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

out = widgets.Output()

# Define word cloud generation functions
def generate_wordcloud(text, time):
    """生成词云图"""
    fig, ax = plt.subplots(figsize=(15, 8))
    wordcloud = WordCloud(background_color='white', width=800, height=400).generate(text)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(time, fontsize=15)
    ax.axis('off')
    fig.suptitle("Word Cloud Over Time", fontsize=22)
    plt.show()

# Define sliders and event handlers
slider = widgets.IntSlider(min=0, max=grouped_morning_evening.shape[0]-1, step=1, value=0, 
                           description='Time Period', 
                           style={'handle_color': 'blue', 'font_size': '16pt'},
                           layout=widgets.Layout(width='80%', height='30px'),
                           continuous_update=True,
                           value_readout_format='d')

time_output = widgets.Output()
with time_output:
    print(grouped_morning_evening['Time'][slider.value])

def on_value_change(change):
    row_index = change['new']
    text = grouped_morning_evening['Sum of Hot Word in English'][row_index]
    time = grouped_morning_evening['Time'][row_index]
    with time_output:
        clear_output(wait=True)
        print(time)
    with out:
        clear_output(wait=True)
        generate_wordcloud(text, time)

slider.observe(on_value_change, names='value')

on_value_change({'new': 0}) # Add this line of code to display the word cloud for the first time period by default

# Put the slider and output in the same container and style them to be horizontally centered and vertically aligned
container = widgets.VBox([time_output, slider, out], layout=widgets.Layout(justify_content='center', align_items='center'))

# Display the container in Jupyter Notebook
display(container)


VBox(children=(Output(), IntSlider(value=0, description='Time Period', layout=Layout(height='30px', width='80%…

In [25]:
# combined_chart.save('combined_chart.html')


# from IPython.display import IFrame
# IFrame(src='./combined_chart.html', width=900, height=400)
