# 获取话题数据

In [30]:
'''
在Jupyter中安装第三方库和在本地安装类似，
区别在于，Jupyter要在开头加一个感叹号。
格式：！pip install 某包
'''
# 导入需要用到的库：
import os
import re # 正则表达式提取文本
from jsonpath import jsonpath # 解析json数据   
import requests # 发送请求
import pandas as pd  #存取CSV文件
import datetime

In [17]:
# 定义一个转换时间字符串的函数，因为爬取到的时间戳是GMT格式的，
# 需要转换成标准格式：
def trans_time(v_str):
    # """"转换GMT时间为标准格式""""""
    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
    timeArray = datetime.datetime.strptime(v_str,GMT_FORMAT)
    ret_time = timeArray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time

def get_weibo_list(v_keyword, v_max_page):
    # ""
    # 爬取微博内容列表
    # ：param v_keyword: 搜索关键字
    # ：param v_max_page: 爬取前几页
    # ：return: None
    # """"""""""""
    #请求头
    headers1 = {
        "User-Agent": "Mozila/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Geoko) Chrome/99.0.4844.51 Mobile Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip,deflate,br",
    }

    for page in range(1, v_max_page + 1):
        print('===开始爬取第{}页微博==='.format(page))
        # 请求地址
        url =  'https://m.weibo.cn/api/container/getIndex'
        #请求参数
        params = {
            "containerid": "100103type=1&q={}".format(v_keyword),
            "page_type": "searchall",
            "page": page
        }
        # 发送请求
        r = requests.get(url, headers=headers1, params=params)
        '''
        r = requests.get(url, headers=headers, params=params) 出现“'list' object has no attribute 'items'”
        解决办法：这里的headers被主函数里的headers覆盖，需换一个
        '''
        print(r.status_code)
        # print(r.json)
        # 解析json数据
        cards = r.json()["data"]["cards"]
        # 微博内容
        text_list = jsonpath(cards, '$..mblog.text')
        # 微博内容-正则表达式数据清洗
        dr = re.compile(r'<[^>]+>', re.S)
        text2_list = []
        print('text_list is:')
        print(text_list)
        if not text_list: #如果没有获取微博内容，进入下一轮循环
            continue
        if type(text_list) == list and len(text_list) > 0:
            for text in text_list:
                text2 = dr.sub('',text) #正则表达式提取微博内容
                print(text2)
                text2_list.append(text2)
        #微博创建时间
        time_list = jsonpath(cards,'$..mblog.created_at')
        time_list = [trans_time(v_str=i) for i in time_list]
        # 微博作者
        author_list = jsonpath(cards, '$..mblog.user.screen_name')
        # 微博ID
        id_list = jsonpath(cards, '$..mblog.id')
        #微博bid
        bid_list = jsonpath(cards,'$..mblog.bid') 
        # 关注数
        follow_count_list = jsonpath(cards,'$..mblog.user.follow_count')
        # 粉丝数
        followers_count_list = jsonpath(cards,'$..mblog.user.followers_count')
        # 转发数
        reposts_count_list = jsonpath(cards, '$..mblog.reposts_count')
        # 评论数
        comments_count_list = jsonpath(cards,'$..mblog.comments_count')
        # 点赞数
        attitudes_count_list = jsonpath(cards, '$..mblog.attitudes_count')
        #把数据保存成DataFrame数据
        df = pd.DataFrame(
                {
                    '话题': search_keyword,
                    '页码': [page] * len(id_list),
                    '微博id': id_list,
                    '微博bid': bid_list,
                    '微博作者': author_list,
                    '关注数': follow_count_list,
                    '粉丝数': followers_count_list,
                    '发布时间': time_list,
                    '微博内容': text2_list,
                    '转发数': reposts_count_list,
                    '评论数': comments_count_list,
                    '点赞数': attitudes_count_list,
                }
        )
        # 表头
        if os.path.exists(v_weibo_file):
            headers = None
        else:
            headers = ['话题','页码','微博id','微博bid','微博作者','关注数','粉丝数','发布时间','微博内容','转发数','评论数','点赞数']
        # 保存到csv文件
        df.to_csv(v_weibo_file, mode='a+', index=False, header=headers, encoding='utf_8_sig')
        print('csv保存成功：{}'.format(v_weibo_file))

if __name__=='__main__':
    # 爬取前几页
    max_search_page = 100 #爬前n页
    # 利用循环语句爬取多个话题
    search_list = ['关注俄罗斯乌克兰局势','俄罗斯乌克兰','俄罗斯天然气','俄乌局势','俄乌冲突','俄乌战争','俄乌局势观察团',
    '俄乌局势最新进展','俄军','美向乌派遣网络攻击部队','俄方称乌克兰放弃加入军事联盟','乌克兰国民卫队司令部被摧毁','欧洲议员质问北约为哪里带来了和平',
    '乌军事设施遭导弹袭击','白俄外长称解决俄乌冲突主要障碍是西方','俄军登陆行动开始于黑海和亚速海','乌克兰丘古耶夫机场遭袭画面','俄方称美方没有保持沉默的空间',
    '外交部回应俄乌局势有关问题','现场直击俄乌局势','俄乌在切尔诺贝利交火','俄乌外长会谈','乌政府向基辅居民发放超1万支自动步枪','拜登重申美军不会参与俄乌冲突',
    '乌总统称西方已完全放弃了乌克兰','中俄元首通电话','汪文斌就俄乌问题反问美方是不是心虚','王毅阐述中方对乌克兰问题五点立场','110秒看清乌克兰局势','军事新闻',
    '俄方称乌方拒绝和谈','俄乌问题该如何结束','乌克兰不接受最后通牒','俄军在乌克兰进入静默状态']
    for search_keyword in search_list:
        # 保存文件名
        v_weibo_file = '微博清单_{}_前{}页.csv'.format(search_keyword,max_search_page)
        # 如果csv文件存在，先删除之
         if os.path.exists(v_weibo_file):
            os.remove(v_weibo_file)
            print('微博清单存在，已删除:{}'.format(v_weibo_file))    
        # 调用爬取微博数据函数
        get_weibo_list(v_keyword=search_keyword,v_max_page=max_search_page)  
        # 数据清洗-去重
        df = pd.read_csv(v_weibo_file)
        #删除重复数据
        df.drop_duplicates(subset=['微博id'],inplace=True, keep='first')
        #再次保存csv文件
        df.to_csv(v_weibo_file,index=False,encoding='utf-8-sig') 

    print('数据清洗完成')


# 获取评论数据

In [29]:
# 导入需要用到的库：
import requests as r # 发送请求
import os
import re  # 正则表达式提取文本
from lxml import etree
import pprint # pprint()模块打印出来的数据结构更加完整，每行为一个数据结构，更加方便阅读打印输出结果

# weibo_id = input('输入微博ID')
# Cookie = input('请输入您的Cookie')

#####获取微博的信息：id、Cookie
#1、人民日报：【现场直击：乌克兰#基辅实时画面#】
weibo_id = 4740676987913565
Cookie = 'SUB=_2A25P39x3DeRhGeFL6loZ8y3EzTmIHXVtI-Q_rDV6PUJbkdAKLUuhkW1NQnS_I2wd2YWbF5oKTVQA_PcRWFVbpKFQ; WEIBOCN_FROM=1110106030; _T_WM=25406783168; XSRF-TOKEN=a9ca44; MLOGIN=1; mweibo_short_token=d59789a2ff; M_WEIBOCN_PARAMS=uicode=20000061&fid=4740676987913565&oid=4740676987913565'

base_url = 'https://m.weibo.cn/comments/hotflow?id={weibo_id}&mid={weibo_id}&max_id={max_id}&max_id_type={max_type}'
headers = {
    'cookie':'WEIBOCN_FROM=1110106030; loginScene=102003; SUB=_2A25P39x3DeRhGeFL6loZ8y3EzTmIHXVtI-Q_rDV6PUJbkdAKLUuhkW1NQnS_I2wd2YWbF5oKTVQA_PcRWFVbpKFQ; MLOGIN=1; _T_WM=75062629474; XSRF-TOKEN=c41b36; M_WEIBOCN_PARAMS=oid=4794387181601045&luicode=20000061&lfid=4794387181601045&uicode=20000061&fid=4794387181601045; mweibo_short_token=975880d3ba',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
}
# 获取用户信息
user_url = 'https://weibo.cn/{user_id}/info' 
rows = []

# 爬取第一页,，没有max_id
url = 'https://m.weibo.cn/comments/hotflow?id={weibo_id}&mid={weibo_id}&max_id_type=0'.format(weibo_id = weibo_id)
response = r.get(url,headers=headers,timeout=10)
jsondata = response.json()
# pprint.pprint(jsondata)

comments = jsondata['data']
# pprint.pprint(comments)
max_id = comments['max_id']
# pprint.pprint(max_id)
max_type = comments['max_id_type']
comments1 = comments['data']
# pprint.pprint(comments1)

# for comment in comments1:
#     text = comment['text']
#     id = comment['user']['id']
#     user_name = comment['user']['screen_name']
#     gender = comment['user']['gender']
#     fans = comment['user']['followers_count']
#     # print(text)
#     # print(user_name)

#     # 另一些用户数据
#     url2 = user_url.format(user_id = id)
#     # print(url2)
#     userinfo = r.get(url2, headers=headers, timeout=10)
#     root = etree.HTML(userinfo.content)
#     location = root.xpath("//div[7]/text()[3]")
#     if not location:
#         location = '/'
#     else:
#         location = str(location)
#     user_bir = root.xpath("//div[7]/text()[4]")
#     if not user_bir:
#         user_bir = '/'
#     else:
#         user_bir = str(user_bir)
#     row = [user_name,text,gender,user_bir,location,fans]
#     rows.append(row)
# # pprint.pprint(rows)
# print('测试完毕!')

######爬取所有数据
dest_filename = 'C:/Users/Maose/Desktop/测试.xlsx'
i = 1
# page = input('想要爬几页：')
page = 2
for i in range(1,page):
    i = i+1
    print('正在爬取第%i'%i)
    url = base_url.format(max_id=max_id, weibo_id = weibo_id, max_type=max_type)
    response = r.get(url,headers=headers,timeout=10)
    jsondata = response.json()
    # pprint.pprint(jsondata)

    comments = jsondata['data']
    # pprint.pprint(comments)
    max_id = comments['max_id']
    # pprint.pprint(max_id)
    max_type = comments['max_id_type']
    comments1 = comments['data']
    # pprint.pprint(comments1)

    for comment in comments1:
        text = comment['text']
        id = comment['user']['id']
        user_name = comment['user']['screen_name']
        gender = comment['user']['gender']
        fans = comment['user']['followers_count']
        # print(text)
        # print(user_name)

        # 另一些用户数据
        url2 = user_url.format(user_id = id)
        # print(url2)
        userinfo = r.get(url2, headers=headers, timeout=10)
        root = etree.HTML(userinfo.content)
        location = root.xpath("//div[7]/text()[3]")
        if not location:
            location = '/'
        else:
            location = str(location)
        user_bir = root.xpath("//div[7]/text()[4]")
        if not user_bir:
            user_bir = '/'
        else:
            user_bir = str(user_bir)
        row = [user_name,text,gender,user_bir,location,fans]
        rows.append(row)
# pprint.pprint(rows)

正在爬取第2


In [43]:
import pandas as pd
headers = ['用户名','评论','性别','生日','地址','粉丝数']
comment_weibo = '人民日报;[现场直击：乌克兰#基辅实时画面#]'
comment_weibo_file = '微博评论—{}.csv'.format(comment__weibo)
data3 = pd.DataFrame(rows,columns=headers)
data3.to_csv(comment_weibo_file,encoding='utf-8-sig')
print('数据写入完成')

NameError: name 'comment__weibo' is not defined

# 获取微博转发数据

In [49]:
# 导入需要用到的库：
import requests as r # 发送请求
import os
import re  # 正则表达式提取文本
from lxml import etree
import pprint # pprint()模块打印出来的数据结构更加完整，每行为一个数据结构，更加方便阅读打印输出结果
import datetime
from jsonpath import jsonpath # 解析json数据  
import pandas as pd #用于写入csv文件

# 时间转换成标准格式：
def trans_time(v_str):
    # """"转换GMT时间为标准格式""""""
    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
    timeArray = datetime.datetime.strptime(v_str,GMT_FORMAT)
    ret_time = timeArray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time


# 爬取前几页
max_repost_page = 20
#####获取微博的信息：id、Cookie
'''
####1、人民日报：【现场直击：乌克兰#基辅实时画面#】
# link：https://m.weibo.cn/2803301701/4740676987913565
repost_weibo = '人民日报:[现场直击：乌克兰#基辅实时画面#]'
weibo_id = 4740676987913565
Cookie = 'SUB=_2A25P39x3DeRhGeFL6loZ8y3EzTmIHXVtI-Q_rDV6PUJbkdAKLUuhkW1NQnS_I2wd2YWbF5oKTVQA_PcRWFVbpKFQ; WEIBOCN_FROM=1110106030; _T_WM=25406783168; MLOGIN=1; M_WEIBOCN_PARAMS=oid=4740676987913565&luicode=20000061&lfid=4740676987913565&uicode=20000061&fid=4740676987913565; XSRF-TOKEN=963550; mweibo_short_token=91695bf85c'
base_url3 = 'https://m.weibo.cn/api/statuses/repostTimeline?id={weibo_id}&page={page}'
user_url = 'https://weibo.cn/{user_id}/info' 
# 请求头
header3 = {
    'cookie':'SUB=_2A25P39x3DeRhGeFL6loZ8y3EzTmIHXVtI-Q_rDV6PUJbkdAKLUuhkW1NQnS_I2wd2YWbF5oKTVQA_PcRWFVbpKFQ; WEIBOCN_FROM=1110106030; _T_WM=25406783168; MLOGIN=1; M_WEIBOCN_PARAMS=oid=4740676987913565&luicode=20000061&lfid=4740676987913565&uicode=20000061&fid=4740676987913565; XSRF-TOKEN=963550; mweibo_short_token=91695bf85c',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62',
    'accept-encoding':'gzip, deflate, br'
}
'''

###2、央视新闻：【王毅阐述#中方对当前乌克兰问题的五点立场#】
# link：https://m.weibo.cn/2656274875/4740951744185390
repost_weibo = '王毅阐述#中方对当前乌克兰问题的五点立场#'
weibo_id = 4740951744185390
Cookie = 'vSUB=_2A25P39x3DeRhGeFL6loZ8y3EzTmIHXVtI-Q_rDV6PUJbkdAKLUuhkW1NQnS_I2wd2YWbF5oKTVQA_PcRWFVbpKFQ; WEIBOCN_FROM=1110106030; _T_WM=25406783168; MLOGIN=1; XSRF-TOKEN=66141d; M_WEIBOCN_PARAMS=oid=4740951744185390&luicode=20000061&lfid=4740951744185390&uicode=20000061&fid=4740951744185390'
base_url3 = 'https://m.weibo.cn/api/statuses/repostTimeline?id={weibo_id}&page={page}'
user_url = 'https://weibo.cn/{user_id}/info' 
# 请求头
header3 = {
    'cookie': 'SUB=_2A25P39x3DeRhGeFL6loZ8y3EzTmIHXVtI-Q_rDV6PUJbkdAKLUuhkW1NQnS_I2wd2YWbF5oKTVQA_PcRWFVbpKFQ; WEIBOCN_FROM=1110106030; _T_WM=25406783168; MLOGIN=1; XSRF-TOKEN=66141d; M_WEIBOCN_PARAMS=oid=4740951744185390&luicode=20000061&lfid=4740951744185390&uicode=20000061&fid=4740951744185390',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62',
    'accept-encoding':'gzip, deflate, br'
}

rows3 = [] # 用于存数据
page = 1 #从第一页开始爬取
for page in range(1,max_repost_page+1):
    print('===开始爬取第{}页微博转发'.format(page))
    # 请求地址
    url3 = base_url3.format(weibo_id = weibo_id, page=page)
    # 发送请求
    response = r.get(url3, headers=header3, timeout=5)
    jsondata = response.json()
    # pprint.pprint(jsondata)
    reposts = jsondata['data']['data']
    # pprint.pprint(reposts)
    for repost in reposts:
        user_name = repost['user']['screen_name'] #用户名
        id = repost['user']['id']  #用户id
        fans = repost['user']['followers_count'] #用户的粉丝数
        follower = repost['user']['follow_count'] #用户关注数
        time_list = repost['created_at'] #转发时间
        time_list = trans_time(time_list) #调用时间转化函数
        text = repost['text'] #转发的文本
        row = [user_name,id,fans,follower,time_list,text]
        rows3.append(row)
# pprint.pprint(rows)
headers = ['用户名','用户id','粉丝数','关注数','转发时间','转发内容']
# repost_weibo_file = '微博转发前20页-人民日报.csv'  #爬取转发数据：人民日报：【现场直击：乌克兰#基辅实时画面#】
# repost_weibo_file = '微博转发_{}.csv'.format(repost_weibo)
repost_weibo_file = '微博转发前20页-央视新闻.csv' ##爬取转发数据：央视新闻：【王毅阐述#中方对当前乌克兰问题的五点立场#】
data3 = pd.DataFrame(rows3,columns=headers)
data3.to_csv(repost_weibo_file,encoding='utf-8-sig')
print('数据写入完成')


===开始爬取第1页微博转发
===开始爬取第2页微博转发
===开始爬取第3页微博转发
===开始爬取第4页微博转发
===开始爬取第5页微博转发
===开始爬取第6页微博转发
===开始爬取第7页微博转发
===开始爬取第8页微博转发
===开始爬取第9页微博转发
===开始爬取第10页微博转发
===开始爬取第11页微博转发
===开始爬取第12页微博转发
===开始爬取第13页微博转发
===开始爬取第14页微博转发
===开始爬取第15页微博转发
===开始爬取第16页微博转发
===开始爬取第17页微博转发
===开始爬取第18页微博转发
===开始爬取第19页微博转发
===开始爬取第20页微博转发
数据写入完成


In [36]:
rows3

[['乐乐Happy21', 5519472056, '69', 1164, '2022-03-27 16:04:39', '转发微博'],
 ['珠溪语文', 5892834576, '5483', 651, '2022-03-22 20:59:46', '转发微博'],
 ['三分中锋考辛斯', 5675785432, '68', 768, '2022-03-10 12:54:35', '转发微博'],
 ['那所有的快乐都专属于你', 7744778518, '1059', 39, '2022-02-27 12:19:26', '转发微博'],
 ['江湖一甩一条街', 6074671610, '159', 677, '2022-02-27 09:20:33', '早点结束！'],
 ['知返aaa', 6537169432, '49', 1425, '2022-02-27 09:04:16', '转发微博'],
 ['开车慢点开车', 7310898282, '40', 532, '2022-02-26 21:50:55', '转发微博'],
 ['江南VB江南', 5774177130, '33', 170, '2022-02-26 20:44:08', '转发微博'],
 ['浮苼若夢i',
  2241305685,
  '930',
  697,
  '2022-02-26 15:20:15',
  '<a  href="https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%9F%BA%E8%BE%85%E5%AE%9E%E6%97%B6%E7%94%BB%E9%9D%A2%23&extparam=%23%E5%9F%BA%E8%BE%85%E5%AE%9E%E6%97%B6%E7%94%BB%E9%9D%A2%23" data-hide=""><span class="surl-text">#基辅实时画面#</span></a><a  href="https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E4%BF%84%E4%B9%8C%E5%B1%80%E5%8A%