# 阿根廷夺冠的舆情分析

通过本实验要求学生理解舆情分析的流程，熟悉在Python中进行舆情分析（数据获取、趋势分析、热帖分析、热门用户分析）的实现方法。

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("./data/nlp_spider_2.csv")
data

Unnamed: 0,评论发表时间,评论内容,评论发表地点,点赞数,评论数,转发数
0,Wed Dec 21 17:27:43 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",广西,2,0,2
1,Wed Dec 21 11:41:03 +0800 2022,迪巴拉：<br /><br />“当我在决赛中出场时，我知道我是上去踢点球的。我本来打算打一...,北京,41,4,41
2,Tue Dec 20 21:12:33 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",江苏,44,1,44
3,Wed Dec 21 09:16:03 +0800 2022,🇦🇷⭐️⭐️⭐️<br /> 三星美图✌<br /><...,广东,205,19,205
4,Wed Dec 21 09:48:31 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",河北,10,1,10
...,...,...,...,...,...,...
1018,Mon Dec 19 02:36:01 +0800 2022,"啊啊啊啊啊啊啊啊啊好深情的看向奖杯🏆🏆🏆🏆🏆🏆🏆<a href=""https://m.we...",山东,590,39,590
1019,Mon Dec 19 07:22:59 +0800 2022,8年前，梅西距离大力神杯，这么近却那么远…<br /><br />8年后，梅西笑意盈盈的，手...,山东,60,0,60
1020,Mon Dec 19 02:58:52 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",湖北,352,29,352
1021,Mon Dec 19 08:07:00 +0800 2022,"今日心情<span class=""url-icon""><img alt=[嘻嘻] src=""...",四川,117,10,117


In [3]:
# 假设有一个名为 "dates" 的列，其中包含时间字符串
data['timestamps'] = pd.to_datetime(data['评论发表时间'], format="%a %b %d %H:%M:%S %z %Y")
data

Unnamed: 0,评论发表时间,评论内容,评论发表地点,点赞数,评论数,转发数,timestamps
0,Wed Dec 21 17:27:43 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",广西,2,0,2,2022-12-21 17:27:43+08:00
1,Wed Dec 21 11:41:03 +0800 2022,迪巴拉：<br /><br />“当我在决赛中出场时，我知道我是上去踢点球的。我本来打算打一...,北京,41,4,41,2022-12-21 11:41:03+08:00
2,Tue Dec 20 21:12:33 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",江苏,44,1,44,2022-12-20 21:12:33+08:00
3,Wed Dec 21 09:16:03 +0800 2022,🇦🇷⭐️⭐️⭐️<br /> 三星美图✌<br /><...,广东,205,19,205,2022-12-21 09:16:03+08:00
4,Wed Dec 21 09:48:31 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",河北,10,1,10,2022-12-21 09:48:31+08:00
...,...,...,...,...,...,...,...
1018,Mon Dec 19 02:36:01 +0800 2022,"啊啊啊啊啊啊啊啊啊好深情的看向奖杯🏆🏆🏆🏆🏆🏆🏆<a href=""https://m.we...",山东,590,39,590,2022-12-19 02:36:01+08:00
1019,Mon Dec 19 07:22:59 +0800 2022,8年前，梅西距离大力神杯，这么近却那么远…<br /><br />8年后，梅西笑意盈盈的，手...,山东,60,0,60,2022-12-19 07:22:59+08:00
1020,Mon Dec 19 02:58:52 +0800 2022,"<a href=""https://m.weibo.cn/search?containeri...",湖北,352,29,352,2022-12-19 02:58:52+08:00
1021,Mon Dec 19 08:07:00 +0800 2022,"今日心情<span class=""url-icon""><img alt=[嘻嘻] src=""...",四川,117,10,117,2022-12-19 08:07:00+08:00


In [None]:
data

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023 entries, 0 to 1022
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype                                
---  ------      --------------  -----                                
 0   评论发表时间      1023 non-null   object                               
 1   评论内容        1023 non-null   object                               
 2   评论发表地点      1019 non-null   object                               
 3   点赞数         1023 non-null   int64                                
 4   评论数         1023 non-null   int64                                
 5   转发数         1023 non-null   int64                                
 6   timestamps  1023 non-null   datetime64[ns, pytz.FixedOffset(480)]
dtypes: datetime64[ns, pytz.FixedOffset(480)](1), int64(3), object(3)
memory usage: 56.1+ KB


In [7]:
comment_data = data['评论内容']
import re
links = []
for i in comment_data:
    link = re.findall(r'[\u4e00-\u9fa5]+', i)
    for txt in link:
        links.append(txt)

sentences = ""
for i in links:
    sentences = sentences+i+","

In [8]:
import jieba
import jieba.analyse
 
keywords=jieba.analyse.extract_tags(sentences,topK=10,withWeight=True,allowPOS=('n','nr','ns'))
#withweight:是否同时返回每个关键词的权重;allowPOS:词性过滤，为空表示不过滤，若提供则仅返回符合词性要求的关键词  默认为allowPOS=‘ns’, ‘n’, ‘vn’, ‘v’，提取地名、名词、动名词、动词
#nr  人名
#参考对应词性  https://blog.csdn.net/zhuzuwei/article/details/79029904
print('TF-IDF result')
for item in keywords:
    print(item[0],item[1])
print('=========================')
keywords_1=jieba.analyse.textrank(sentences,topK=10,withWeight=True,allowPOS=('n','nr','ns'))
print('TextRank result')
for item in keywords_1:
    print(item[0], item[1])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\12973\AppData\Local\Temp\jieba.cache
Loading model cost 0.658 seconds.
Prefix dict has been built successfully.


TF-IDF result
啊啊啊 2.40038725561875
梅西 1.4169660368125
奖杯 0.70385127015
点球 0.67016398885
决赛 0.583520591281875
出场 0.55159398979625
深情 0.5308441175325
笑意 0.523951547219375
阿根廷 0.496877193711875
评论 0.423562804173125
TextRank result
啊啊啊 1.0
梅西 0.8711485203934533
出场 0.6115194422642928
评论 0.6115194422642928
决赛 0.6073908445796297
内容 0.6073908445796297
深情 0.5616529782021603
奖杯 0.442700724707974
距离 0.3218010884684084
笑意 0.3218010884684084


只保留在中国的地点

In [21]:
# 假设有一个名为 "location" 的列，其中包含地点信息
# 使用 isin() 函数过滤出所有在中国的地点
china_locations = data.loc[data['评论发表地点'].isin(['北京', '上海', '天津', '重庆', '河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南', '广东', '广西', '海南', '四川', '贵州', '云南', '西藏', '陕西', '甘肃', '青海', '宁夏', '新疆']), :]

data = data[data['评论发表地点'].isin(china_locations['评论发表地点'])]


# 分组并求和
df_sum = df.groupby('评论发表地点')['点赞数'].sum()

# 将结果转换为数据帧
df_sum = pd.DataFrame({'评论发表地点': df_sum.index, '点赞数': df_sum.values})


from pyecharts.charts import Map  # 注意这里与老版本pyecharts调用的区别
from pyecharts import options as opts
import random
data_province =  [list(z) for z in zip(df_sum['评论发表地点'], df_sum['点赞数'])]
china_province = (
Map()
        .add('', data_province, 'china')
        .set_global_opts(
        title_opts=opts.TitleOpts(title='Provinces of China'),
        visualmap_opts=opts.VisualMapOpts(
            min_=0,
            max_=100000,
            is_piecewise=True)
    )
        .render(path='中国省级地图.html')
)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 848 entries, 0 to 1022
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype                                
---  ------      --------------  -----                                
 0   评论发表时间      848 non-null    object                               
 1   评论内容        848 non-null    object                               
 2   评论发表地点      848 non-null    object                               
 3   点赞数         848 non-null    int32                                
 4   评论数         848 non-null    int64                                
 5   转发数         848 non-null    int64                                
 6   timestamps  848 non-null    datetime64[ns, pytz.FixedOffset(480)]
dtypes: datetime64[ns, pytz.FixedOffset(480)](1), int32(1), int64(2), object(3)
memory usage: 49.7+ KB


In [20]:
data_province

[['上海', 32248],
 ['云南', 2],
 ['内蒙古', 191],
 ['北京', 748003],
 ['吉林', 30233],
 ['四川', 16310],
 ['天津', 271],
 ['宁夏', 3693],
 ['安徽', 1480],
 ['山东', 129747],
 ['山西', 259],
 ['广东', 198243],
 ['广西', 90],
 ['新疆', 0],
 ['江苏', 92760],
 ['江西', 3415],
 ['河北', 7212],
 ['河南', 88913],
 ['浙江', 5795],
 ['海南', 208],
 ['湖北', 591],
 ['湖南', 6055],
 ['甘肃', 1],
 ['福建', 6702],
 ['西藏', 1],
 ['贵州', 1895],
 ['辽宁', 1215],
 ['重庆', 10992],
 ['陕西', 1530],
 ['青海', 0],
 ['黑龙江', 6488]]