In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from lxml import etree
import random
import re
import asyncio
from pyppeteer import launch
import nest_asyncio
nest_asyncio.apply()

In [29]:
async def fetch_webpage(url):
    """
    Fetch the HTML of a page with Pyppeteer (headless browser).

    Args:
        url: address of the page to open.

    Returns:
        The page markup as returned by ``page.content()``.

    Raises:
        Whatever Pyppeteer raises while opening the tab, navigating, or
        reading the content. The browser is closed in every case.
    """
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()
        await page.goto(url)
        return await page.content()
    finally:
        # Always shut the browser down, otherwise a failed navigation
        # leaves the launched browser process running.
        await browser.close()

In [30]:
def get_random_headers()->dict:
    """
    Build browser-like HTTP request headers with a randomly picked User-Agent.

    Returns:
        dict: header names mapped to header values; only "User-Agent"
        differs between calls.
    """
    # Candidate User-Agent strings. The Firefox entry is listed twice, so it
    # is drawn twice as often as each of the others.
    agent_pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    )
    chosen_agent = random.choice(agent_pool)

    request_headers = {"User-Agent": chosen_agent}
    request_headers.update(
        {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",  # Do Not Track request header
            "Accept-Encoding": "gzip, deflate, br",
        }
    )
    return request_headers

In [31]:
def get_url_list(origin_url:str,date:str)->list:
    """
    Collect the article links shown in the news list of a listing page.

    The page is rendered through ``fetch_webpage`` and every
    ``<li>`` under the element with id ``newslist`` is inspected.

    Args:
        origin_url (str): URL of the CCTV news home page or of one of its
            category listing pages.
        date (str): date string such as "2024/05/22". NOTE: this value is
            currently not used; no filtering by date is performed.
            TODO: confirm whether date filtering was intended.

    Returns:
        list: one dict per list item, in page order:
            [{"page_url": str, "image_url": str or None}, ...]
        Items without an article link are skipped; ``image_url`` is None
        when the ``<img>`` carries no ``data-echo`` attribute.
    """
    content = asyncio.get_event_loop().run_until_complete(fetch_webpage(origin_url))
    html = etree.HTML(content,parser=etree.HTMLParser())
    if html is None:
        # Nothing parseable came back, so there are no list items to read.
        return []

    url_list = []
    for node in html.xpath('//*[@id="newslist"]/li'):
        page_urls = node.xpath('div[@class="image"]/a/@href')
        if not page_urls:
            # An entry without a link cannot be parsed later; skip it
            # instead of failing the whole listing with IndexError.
            continue
        image_urls = node.xpath('div[1]/a/img/@data-echo')
        url_list.append({
            'page_url': page_urls[0],
            'image_url': image_urls[0] if image_urls else None,
        })
    return url_list
# Smoke-run the listing scraper against the news home page.
# The `date` argument is passed but get_url_list does not read it.
origin_url = "https://news.cctv.com/"
url_list = get_url_list(origin_url,date="2024/05/22")

  gc.collect()


In [32]:
def get_all_kind_page(origin_url_dict:dict,date:str)->dict:
    """Collect the article list of every news category.

    Each listing URL is handed to ``get_url_list`` together with ``date``.

    Args:
        origin_url_dict (dict): {"category name": listing_url, ...}
        date (str): year/month/day, forwarded unchanged to ``get_url_list``.

    Returns:
        dict: {"category name": <result of get_url_list for that URL>, ...},
        with the same keys and key order as ``origin_url_dict``.
    """
    return {
        category: get_url_list(origin_url=listing_url, date=date)
        for category, listing_url in origin_url_dict.items()
    }

In [33]:
# Listing pages to crawl, keyed by the category label.
origin_url_dict = {
    # channels hosted under news.cctv.com
    "首页": "https://news.cctv.com",
    "国内": "https://news.cctv.com/china",
    "国际": "https://news.cctv.com/world",
    # economy channel lives on its own host
    "经济": "https://jingji.cctv.com/",
    "社会": "https://news.cctv.com/society",
    "法治": "https://news.cctv.com/law",
    "文娱": "https://news.cctv.com/ent",
    "科技": "https://news.cctv.com/tech",
    "生活": "https://news.cctv.com/life",
    # channels on dedicated hosts
    "军事": "https://military.cctv.com",
    "人物": "https://people.cctv.com",
}

In [34]:
# Try the listing scraper on the tech category page and display the result.
# This rebinds `origin_url` and `url_list` from the earlier home-page run.
origin_url = "https://news.cctv.com/tech"
url_list = get_url_list(origin_url,date="2024/05/22")
url_list

[{'page_url': 'https://news.cctv.com/2024/05/23/ARTI8lVsmu4u2SLo7PzCrezW240523.shtml',
  'image_url': 'https://p3.img.cctvpic.com/photoworkspace/2024/05/23/2024052312050076590.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/23/ARTIbrAY2tgPhDsuoJXuXWFb240523.shtml',
  'image_url': 'https://p4.img.cctvpic.com/photoworkspace/2024/05/23/2024052311453871103.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/23/ARTIz8Z2Ykh0a693fVSvsAow240523.shtml',
  'image_url': 'https://p5.img.cctvpic.com/photoworkspace/2024/05/23/2024052310480612460.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/23/ARTIznns2ETeYzOWMCl21YbA240523.shtml',
  'image_url': 'https://p5.img.cctvpic.com/photoworkspace/2024/05/23/2024052307052831119.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/22/ARTIa5pZ8DGJ1gNXqEs3gTkL240522.shtml',
  'image_url': 'https://p2.img.cctvpic.com/photoworkspace/2024/05/22/2024052218325560813.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/22/ARTI6KnUmHCDpGYglzMBj8OS240522.shtml'

In [36]:
def page_parse(page_url:str)->dict:
    """
    Download one news article page and extract its data.

    Extracted fields: headline, source, publication time, body content and
    editor credits.

    Args:
        page_url (str): URL of the news article page.

    Returns:
        dict: data of the article
        {"title": headline (str),
         "tag": unique id of the article, i.e. the last URL path segment
                without its extension (str),
         "time": publication time text (str),
         "source": text of the source link in the title area (str),
         "content": [
                       {
                        "type": "text" | "text-blod" (bold text; spelling kept
                                for compatibility) | "img_url" | "img_desc",
                        "data": text or URL (str)
                        },
                        ...
                    ],
         "author": editor credit strings (list of str)
         }

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
        IndexError: when the title, time or source element is not found.
    """
    # A timeout keeps one stalled connection from blocking the caller forever.
    response = requests.get(page_url,headers=get_random_headers(),timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'
    html = etree.HTML(response.text,parser=etree.HTMLParser())

    title = html.xpath('//*[@id="title_area"]/h1/text()')[0]
    tag = page_url.split('/')[-1].split('.')[0]
    raw_time = html.xpath('//*[@id="title_area"]/div[1]/text()[2]')[0]
    # The first whitespace-separated token is dropped; the rest is the time text.
    publish_time = ' '.join(raw_time.split()[1:])
    source = html.xpath('//*[@id="title_area"]/div[1]/a/text()')[0]

    # Classify the body paragraphs: image holders, image captions, plain text.
    news_img_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and img]')
    news_img_desc_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and not(*)]')
    news_content_nodes = html.xpath('//*[@id="content_area"]/p')

    content = []
    for node in news_content_nodes:
        if node in news_img_nodes:
            # Image paragraph: take the image link.
            sources = node.xpath('img/@src')
            if not sources:
                continue
            img_url = sources[0]
            if img_url.startswith('//'):
                # Protocol-relative link; only these need a scheme prepended.
                img_url = 'https:' + img_url
            content.append({'type':'img_url','data':img_url})
            continue

        texts = node.xpath('text()')
        if node in news_img_desc_nodes:
            # Caption paragraph; skip it when it has no text node at all.
            if texts:
                content.append({'type':'img_desc','data':texts[0].strip()})
            continue

        text = texts[0].strip() if texts else ""
        if text:
            content.append({'type':'text','data':text})
            continue
        # No direct text: the paragraph may consist of a bold heading.
        strong_texts = node.xpath('strong/text()')
        if strong_texts:
            content.append({'type':'text-blod','data':strong_texts[0]})
        # Otherwise the paragraph is empty and is left out.

    author = html.xpath('//*[@id="page_body"]/div[1]/div[3]/div[1]/span/text()')

    news_info = dict()
    news_info['title'] = title
    news_info['tag'] = tag
    news_info['time'] = publish_time
    news_info['source'] = source
    news_info['content'] = content
    news_info['author'] = author
    return news_info
# Parse one sample article; `info` is what the last cell writes to sample.json.
info = page_parse('https://news.cctv.com/2024/05/22/ARTIyvrd1nW2ptdpz1rveHzK240522.shtml')
    
    
    

In [37]:
# Scratch: fetch the home page markup through the Pyppeteer helper.
content = asyncio.get_event_loop().run_until_complete(fetch_webpage('https://news.cctv.com'))

In [38]:
# Scratch: parse the fetched markup into an lxml tree for XPath queries.
html = etree.HTML(content,parser=etree.HTMLParser())

In [39]:
# Scratch: try the list-item XPaths on the second <li> (index 1).
# Both results are lists here, since no [0] is applied.
res = html.xpath('//*[@id="newslist"]/li')
page_url = res[1].xpath('div[@class="image"]/a/@href')
image_url = res[1].xpath('div[1]/a/img/@data-echo')

In [40]:
# Scratch: download one article with requests and show the raw markup.
# Rebinds `page_url` (a list in the previous cell) to a str, and `content`.
page_url = 'https://news.cctv.com/2024/05/22/ARTIyvrd1nW2ptdpz1rveHzK240522.shtml'
response = requests.get(page_url,headers=get_random_headers())
# Force UTF-8 decoding of the body before reading `.text`.
response.encoding = 'utf-8'
content = response.text
content

'         <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\r\n<meta name="viewport" content="width=device-width,initial-scale=1.0, minimun-scale=1.0,maximum-scale=1.0,user-scalable=no">\r\n<meta name="filetype" content="1"> \r\n<meta name="publishedtype" content="1"> \r\n<meta name="pagetype" content="1"> \r\n\r\n    <meta name="author" content="甄涛"> \r\n    <meta name="source" content="人民网"> \r\n\r\n\r\n\r\n<title>新疆：“黄金通道”释放“黄金价值”_新闻频道_央视网(cctv.com)</title>\r\n<meta name="catalogs" content="PAGE1354759511941341"> \r\n<meta name="contentid" content="ARTIyvrd1nW2ptdpz1rveHzK240522">\r\n<meta name=keywords content="黄金通道 新疆外贸 阿拉山口站 新疆大学 新三样">\r\n<meta name="spm-id" content="C73544894212">\r\n<meta name=description content="“今年公司试水了货运包机出口业务，这将成为我们外贸的主要增长点。”新疆喀什中顺电子商务有限公司负责人张琪“五一”假期后，马不停蹄地在

In [41]:
# Scratch: parse the article markup; rebinds `html` from the home-page tree.
html = etree.HTML(content,parser=etree.HTMLParser())


In [42]:
# Headline: first text node of the <h1> inside the title area.
title = html.xpath('//*[@id="title_area"]/h1/text()')[0]
title

'新疆：“黄金通道”释放“黄金价值”'

In [43]:
# Article id: last URL path segment with everything after the first dot removed.
tag = page_url.split('/')[-1].split('.')[0]
tag

'ARTIyvrd1nW2ptdpz1rveHzK240522'

In [44]:
# Publication time: second text node of the first <div> in the title area.
time = html.xpath('//*[@id="title_area"]/div[1]/text()[2]')[0]
# Drop the first whitespace-separated token (presumably a separator — confirm)
# and re-join the remainder with single spaces.
time = ' '.join(time.split()[1:])
time

'2024年05月22日 20:49:40'

In [45]:
# Source name: text of the link inside the first <div> of the title area.
source = html.xpath('//*[@id="title_area"]/div[1]/a/text()')[0]
source

'人民网'

In [46]:
# Scratch: flat list of the direct text nodes of every <p> in the content area.
news_content = html.xpath('//*[@id="content_area"]/p/text()')
# Trim surrounding whitespace, then drop entries that became empty.
news_content = [s.strip() for s in news_content]
news_content = [s for s in news_content if s]
news_content

['“今年公司试水了货运包机出口业务，这将成为我们外贸的主要增长点。”新疆喀什中顺电子商务有限公司负责人张琪“五一”假期后，马不停蹄地在疆外出差，学经验、请人才。下半年，张琪准备大展手脚，至少完成50个班次货运包机业务，力争公司外贸额翻一番。',
 '4月26日，乌鲁木齐—萨莱诺中欧班列顺利开行，这标志着新疆中欧班列境外南通道跨“两海”线路成功运行。吴奇摄',
 '外贸企业创新、引才、拓市场，口岸货畅、人忙、贸易兴，是新疆外贸形势持续向好的缩影。前4月，新疆外贸进出口总值1379亿元人民币，同比增长49.5%，增速居全国第2位，连续30个月保持正增长，开放优势不断释放，再现着“使者相望于道，商旅不绝于途”的古丝路繁华。',
 '5月15日，一辆运载着香蕉、柑橘等水果的冷藏厢式货车抵达霍尔果斯公路口岸，在“口岸直通”作业模式下，从装运上车到通关出境仅用1小时，当天就能出现在哈萨克斯坦阿拉木图市民的餐桌上。',
 '“口岸直通”模式由乌鲁木齐海关创新推出，去年11月，该模式在阿拉山口公路口岸正式运行，车辆平均通行时间从34.5小时压缩到5小时。当前，这项改革正逐渐推广到全疆。',
 '霍尔果斯公路口岸，等待通关的汽车。人民网 李欣洋摄',
 '霍尔果斯海关副关长陈鹏德介绍，通过推出提前申报和集中查验，配套数据自动提取、多国车牌智能识别等科技手段，从根本上提高了公路口岸出口货物的时效性。据统计，前3月，霍尔果斯公路口岸出口果蔬11.2万吨，同比增长6.9倍。',
 '不仅仅是通关时效，该模式也给企业带来了真金白银的甜头。',
 '“启用新模式后，每车运输成本可减少1000元左右。”说起该项改革的好处，新疆乐果果业有限公司负责人张朋亮算了一笔“经济账”，“目前已累计为企业节约成本超160万元，实现效益和效率的双向奔赴。”',
 '5月10日，一辆满载苹果的冷链运输车从乌鲁木齐市发车前往哈萨克斯坦阿拉木图市。这是新疆首次实现果蔬冷链卡航直达中亚市场。图为乌昌海关关员对冷链卡航模式出口鲜苹果进行验放。杨逸萌摄',
 '口岸是外贸发展的生命线。通关效能的提升，让经贸走得更远。前四月，新疆与197个国家和地区产生贸易往来，对东盟、RCEP其他成员国、非洲、拉美均实现3倍以上的出口增速，贸易伙伴更趋多元。',
 '今年以来，中国新疆持续巩固与周边国家互访交流成果，以中亚五国为重点，积

In [47]:
# Paragraphs whose class contains "photo", whose style contains "text",
# and that have an <img> child: image holders.
news_img_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and img]')
# Same class/style condition but with no child element: image captions.
news_img_desc_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and not(*)]')
# Every paragraph of the content area, in document order.
news_content_nodes = html.xpath('//*[@id="content_area"]/p')


In [54]:
# Scratch version of the body-content loop (no bold-text handling).
# Rebinds `content` from the markup string to a list of typed entries.
content = []
for node in news_content_nodes:
    if node in news_img_nodes: # extract the image URL
        img_url = 'https:' + node.xpath('img/@src')[0]
        content.append({'type':'img_url','data':img_url})
    elif node in news_img_desc_nodes:
        img_desc = node.xpath('text()')[0].strip()
        content.append({'type':'img_desc','data':img_desc})
    else:
        # Only the first direct text node is used; a blank one yields an
        # entry with empty 'data'.
        text = node.xpath('text()')[0].strip()
        content.append({'type':'text','data':text})

        


In [55]:
# Display the typed content entries built in the previous cell.
content

[{'type': 'text',
  'data': '“今年公司试水了货运包机出口业务，这将成为我们外贸的主要增长点。”新疆喀什中顺电子商务有限公司负责人张琪“五一”假期后，马不停蹄地在疆外出差，学经验、请人才。下半年，张琪准备大展手脚，至少完成50个班次货运包机业务，力争公司外贸额翻一番。'},
 {'type': 'img_url',
  'data': 'https://p3.img.cctvpic.com/photoworkspace/contentimg/2024/05/22/2024052220435240761.png'},
 {'type': 'img_desc',
  'data': '4月26日，乌鲁木齐—萨莱诺中欧班列顺利开行，这标志着新疆中欧班列境外南通道跨“两海”线路成功运行。吴奇摄'},
 {'type': 'text',
  'data': '外贸企业创新、引才、拓市场，口岸货畅、人忙、贸易兴，是新疆外贸形势持续向好的缩影。前4月，新疆外贸进出口总值1379亿元人民币，同比增长49.5%，增速居全国第2位，连续30个月保持正增长，开放优势不断释放，再现着“使者相望于道，商旅不绝于途”的古丝路繁华。'},
 {'type': 'text', 'data': ''},
 {'type': 'text', 'data': ''},
 {'type': 'text',
  'data': '5月15日，一辆运载着香蕉、柑橘等水果的冷藏厢式货车抵达霍尔果斯公路口岸，在“口岸直通”作业模式下，从装运上车到通关出境仅用1小时，当天就能出现在哈萨克斯坦阿拉木图市民的餐桌上。'},
 {'type': 'text',
  'data': '“口岸直通”模式由乌鲁木齐海关创新推出，去年11月，该模式在阿拉山口公路口岸正式运行，车辆平均通行时间从34.5小时压缩到5小时。当前，这项改革正逐渐推广到全疆。'},
 {'type': 'img_url',
  'data': 'https://p5.img.cctvpic.com/photoworkspace/contentimg/2024/05/22/2024052220435223190.png'},
 {'type': 'img_desc', 'data': '霍尔果斯公路口岸，等待通关的汽车。人民网 

In [168]:
# Editor credits: text of the <span> elements reached through a purely
# positional path under page_body; the result is a list of strings.
author = html.xpath('//*[@id="page_body"]/div[1]/div[3]/div[1]/span/text()')
author

['编辑：甄涛', '责任编辑：刘亮']

In [67]:
# Write the parsed sample article (`info`, produced by page_parse) to disk.
# ensure_ascii=False emits the Chinese text as-is, so the file is opened
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open('sample.json','w',encoding='utf-8') as f:
    json.dump([info],f,ensure_ascii=False)