In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from lxml import etree
import random
import re
import asyncio
from pyppeteer import launch
import nest_asyncio
nest_asyncio.apply()

In [2]:
response = requests.get('http://api.uomg.com/api/rand.qinghua')

In [3]:
async def fetch_webpage(url):
    """
    使用 Pyppeteer 获取网页内容
    """
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.goto(url)
    content = await page.content()
    await browser.close()
    return content

In [4]:
def get_random_headers()->dict:
    """
    随机生成一个header，用于伪造浏览器

    Returns:
        dict: 返回一个浏览器header
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
    ]

    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh-Hans;q=0.9",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",  # Do Not Track Request Header
        "Accept-Encoding": "gzip, deflate, br"
    }

    return headers

In [5]:
def get_url_list(origin_url:str,date:str)->list:
    """
    根据源页面的url获取源页面所有新闻的url,获取的新闻的url格式为：
    https://news.cctv.com/year/month/day/abcdfjafa.shtml
        \n year 例如：2024
        \n month 例如：05
        \n day 例如：22
    Args:
        origin_url (str): 央视新闻网首页以及其各种分类界面
        date (str): 日期字符串，例如 "2024/05/22"
    Returns:
        list: 包含所有新闻界面url和首页图片url的一个列表 \n
            [{"page_url":page_url,"img_url": img_url},...]
    """
    content = asyncio.get_event_loop().run_until_complete(fetch_webpage(origin_url))
    html = etree.HTML(content,parser=etree.HTMLParser())
    node_list = html.xpath('//*[@id="newslist"]/li')
    url_list = []
    for node in node_list:
        page_url = node.xpath('div[@class="image"]/a/@href')[0]
        image_url = node.xpath('div[1]/a/img/@data-echo')[0]
        url_list.append({'page_url':page_url,'img_url':image_url})
    return url_list
origin_url = "https://news.cctv.com/"
url_list = get_url_list(origin_url,date="2024/05/22")

In [6]:
def page_parse(page_url:str)->dict:
    """
    对每个页面进行解析，获取数据
    需要的数据有：新闻标题、来源、时间、内容、作者信息

    Args:
        page_url (str): 新闻页面URL

    Returns:
        dict: 每个新闻界面的数据
        {"title":新闻标题(str),
         "tag":新闻唯一标识(str),
         "time":新闻时间(str),
         "content": [   
                       {
                        "type": text(文本) |text-blod(粗体文本) | imge_url(图片链接) | img_desc(图片描述),
                        "data": text | url
                        },
                        ...
                    ],
          "author": 编辑信息(str)
         }
    """
    response = requests.get(page_url,headers=get_random_headers())
    response.encoding = 'utf-8'
    content = response.text
    html = etree.HTML(content,parser=etree.HTMLParser())
    title = html.xpath('//*[@id="title_area"]/h1/text()')[0]
    tag = page_url.split('/')[-1].split('.')[0]
    time = html.xpath('//*[@id="title_area"]/div[1]/text()[2]')[0]
    time = ' '.join(time.split()[1:])
    source = html.xpath('//*[@id="title_area"]/div[1]/a/text()')[0]
    # 处理news content
    news_img_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and img]')
    news_img_desc_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and not(*)]')
    news_content_nodes = html.xpath('//*[@id="content_area"]/p')
    content = []
    for node in news_content_nodes:
        if node in news_img_nodes: #取出图片链接
            img_url = 'https:' + node.xpath('img/@src')[0]
            content.append({'type':'img_url','data':img_url})
        elif node in news_img_desc_nodes:
            img_desc = node.xpath('text()')[0].strip()
            content.append({'type':'img_desc','data':img_desc})
        else:
            text = node.xpath('text()')[0].strip()
            if text == "":
                text = node.xpath('strong/text()')[0]
                content.append({'type':'text-blod','data':text})
            else:
                content.append({'type':'text','data':text})
    author = html.xpath('//*[@id="page_body"]/div[1]/div[3]/div[1]/span/text()')
    news_info = dict()
    news_info['title'] = title
    news_info['tag'] = tag
    news_info['time'] = time
    news_info['content'] = content
    news_info['author'] = author
    return news_info
info = page_parse('https://news.cctv.com/2024/05/22/ARTIyvrd1nW2ptdpz1rveHzK240522.shtml')
    
    
    

In [7]:
def get_all_kind_page(origin_url_dict:dict,date:str)->dict:
    """获得所有分类的新闻的URL，分类包括：国内、国际、经济、社会、法治、文娱、科技、生活、军事

    Args:
        origin_url_dict (dict): {"新闻种类":origin_url,...}
        date (str): year/month/day

    Returns:
        dict: {"新闻种类1":["url1","url2",...],
               "新闻种类1":["url1","url2",...],
                ....}
    """
    res = dict()
    for news_kind, origin_url in origin_url_dict.items():
        url_list = get_url_list(origin_url=origin_url,date=date)
        for url_dict in url_list:
            page_url = url_dict['page_url']
            img_url = url_dict['img_url']
            details = page_parse(page_url)
            sample = dict()
            sample['page_url'] = page_url
            sample['item_img_url'] = img_url
            sample['details'] = details
            res.append(sample)
            
    return res

In [8]:
origin_url_dict = {
    "首页":"https://news.cctv.com",
    "国内":"https://news.cctv.com/china",
    "国际":"https://news.cctv.com/world",
    "经济":"https://jingji.cctv.com/",
    "社会":"https://news.cctv.com/society",
    "法治":"https://news.cctv.com/law",
    "文娱":"https://news.cctv.com/ent",
    "科技":"https://news.cctv.com/tech",
    "生活":"https://news.cctv.com/life",
    "军事":"https://military.cctv.com",
    "人物":"https://people.cctv.com",
    
}

In [9]:
origin_url = "https://news.cctv.com/tech"
url_list = get_url_list(origin_url,date="2024/05/22")
url_list

[{'page_url': 'https://news.cctv.com/2024/05/24/ARTIwvO10s0078MLfeWj97Gk240524.shtml',
  'img_url': 'https://p5.img.cctvpic.com/photoworkspace/2024/05/24/2024052416040299383.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/24/ARTIWjGpxxjtZv3qjMwFPmAw240524.shtml',
  'img_url': 'https://p5.img.cctvpic.com/photoworkspace/2024/05/24/2024052415345915170.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/24/ARTIWomvyMuz38J8kOOqZ4ty240524.shtml',
  'img_url': 'https://p2.img.cctvpic.com/photoworkspace/2024/05/24/2024052414374711718.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/24/ARTIBO9Jy6W1g393TvSuSm5X240524.shtml',
  'img_url': 'https://p1.img.cctvpic.com/photoworkspace/2024/05/24/2024052414144673685.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/24/ARTIG33zkkyZxiFvj20ap5AC240524.shtml',
  'img_url': 'https://p3.img.cctvpic.com/photoworkspace/2024/05/24/2024052410201812324.jpg'},
 {'page_url': 'https://news.cctv.com/2024/05/24/ARTIBrtiRdggHbeKzGUNPJJC240524.shtml',
  'img_u

In [10]:
content = asyncio.get_event_loop().run_until_complete(fetch_webpage('https://news.cctv.com'))

In [11]:
html = etree.HTML(content,parser=etree.HTMLParser())

In [12]:
res = html.xpath('//*[@id="newslist"]/li')
page_url = res[1].xpath('div[@class="image"]/a/@href')
image_url = res[1].xpath('div[1]/a/img/@data-echo')

In [13]:
page_url = 'https://news.cctv.com/2024/05/24/ARTI1t3LV1vj3Hi3s863pEVN240524.shtml'
response = requests.get(page_url,headers=get_random_headers())
response.encoding = 'utf-8'
content = response.text
content

'         <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\r\n<meta name="viewport" content="width=device-width,initial-scale=1.0, minimun-scale=1.0,maximum-scale=1.0,user-scalable=no">\r\n<meta name="filetype" content="1"> \r\n<meta name="publishedtype" content="1"> \r\n<meta name="pagetype" content="1"> \r\n\r\n    <meta name="author" content="刘洁"> \r\n    <meta name="source" content="央视网"> \r\n\r\n\r\n\r\n<title>缩短从“枝头”到“舌尖”的距离 “高铁+”助力农产品抢“鲜”上市_新闻频道_央视网(cctv.com)</title>\r\n<meta name="catalogs" content="PAGE9moqzeXHoOre7Psm9zbv161009"> \r\n<meta name="contentid" content="ARTI1t3LV1vj3Hi3s863pEVN240524">\r\n<meta name=keywords content="樱桃 高铁">\r\n<meta name="spm-id" content="C73544894212">\r\n<meta name=description content="近期，山西省运城、临汾等地的樱桃陆续成熟上市，为了缩短新鲜樱桃从产地到消费者手中的时间，太原铁路部门推出樱桃高铁快运服务

In [14]:
html = etree.HTML(content,parser=etree.HTMLParser())


In [15]:
title = html.xpath('//*[@id="title_area"]/h1/text()')[0]
title

'缩短从“枝头”到“舌尖”的距离 “高铁+”助力农产品抢“鲜”上市'

In [16]:
tag = page_url.split('/')[-1].split('.')[0]
tag

'ARTI1t3LV1vj3Hi3s863pEVN240524'

In [19]:
time = html.xpath('//*[@id="title_area"]/div[2]/text()')[0]
time = ' '.join(time.split()[-2:])
time

'2024年05月24日 17:05:34'

In [21]:
source = html.xpath('//*[@id="title_area"]/div[1]/text()')[0].split()[0]
source

'来源：央视网'

In [22]:
news_content = html.xpath('//*[@id="content_area"]/p/text()')
news_content = [s.strip() for s in news_content]
news_content = [s for s in news_content if s]
news_content

['近期，山西省运城、临汾等地的樱桃陆续成熟上市。为了缩短新鲜樱桃从产地到消费者手中的时间，太原铁路部门推出樱桃高铁快运服务，实现生鲜“当日达”。',
 '在运城市临猗县的一处樱桃种植基地，果农们忙着采摘。樱桃果皮娇嫩、怕磕碰、不耐储存，对运输时效和运输环境有很高的要求。为此，农户和铁路部门合作，通过高铁快运让消费者可以品尝到新鲜樱桃。',
 '樱桃种植户李林红表示，早上采摘，下午客户就能吃上樱桃，非常鲜、非常美。',
 '太原铁路部门在临汾、侯马、运城三地的高铁站开辟上站绿色通道，利用不载客的高铁确认车、预留车厢和日常有快递业务的50余趟高铁动车组进行樱桃快运。同时，各高铁车站安排专人负责，确保樱桃快速转运、快速安检、快速装车。',
 '5月以来，已有10余吨山西运城、临汾等地的樱桃通过高铁快速送达北京、天津、上海、成都等地区。']

In [23]:
news_img_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and img]')
news_img_desc_nodes = html.xpath('//*[@id="content_area"]/p[contains(@class,"photo") and contains(@style,"text") and not(*)]')
news_content_nodes = html.xpath('//*[@id="content_area"]/p')


[<Element p at 0x12f2e8800>,
 <Element p at 0x12f2eb4c0>,
 <Element p at 0x12f2e83c0>]

In [28]:
content = []
for node in news_content_nodes:
    if node in news_img_nodes: #取出图片链接
        img_url = 'https:' + node.xpath('img/@src')[0]
        content.append({'type':'img_url','data':img_url})
    elif node in news_img_desc_nodes:
        img_desc = node.xpath('text()')[0].strip()
        content.append({'type':'img_desc','data':img_desc})
        print("img_desc",img_desc)
    else:
        text = node.xpath('text()')
        if text:
            text = text[0].strip()
            content.append({'type':'text','data':text})

IndexError: list index out of range

In [55]:
content

[{'type': 'text',
  'data': '“今年公司试水了货运包机出口业务，这将成为我们外贸的主要增长点。”新疆喀什中顺电子商务有限公司负责人张琪“五一”假期后，马不停蹄地在疆外出差，学经验、请人才。下半年，张琪准备大展手脚，至少完成50个班次货运包机业务，力争公司外贸额翻一番。'},
 {'type': 'img_url',
  'data': 'https://p3.img.cctvpic.com/photoworkspace/contentimg/2024/05/22/2024052220435240761.png'},
 {'type': 'img_desc',
  'data': '4月26日，乌鲁木齐—萨莱诺中欧班列顺利开行，这标志着新疆中欧班列境外南通道跨“两海”线路成功运行。吴奇摄'},
 {'type': 'text',
  'data': '外贸企业创新、引才、拓市场，口岸货畅、人忙、贸易兴，是新疆外贸形势持续向好的缩影。前4月，新疆外贸进出口总值1379亿元人民币，同比增长49.5%，增速居全国第2位，连续30个月保持正增长，开放优势不断释放，再现着“使者相望于道，商旅不绝于途”的古丝路繁华。'},
 {'type': 'text', 'data': ''},
 {'type': 'text', 'data': ''},
 {'type': 'text',
  'data': '5月15日，一辆运载着香蕉、柑橘等水果的冷藏厢式货车抵达霍尔果斯公路口岸，在“口岸直通”作业模式下，从装运上车到通关出境仅用1小时，当天就能出现在哈萨克斯坦阿拉木图市民的餐桌上。'},
 {'type': 'text',
  'data': '“口岸直通”模式由乌鲁木齐海关创新推出，去年11月，该模式在阿拉山口公路口岸正式运行，车辆平均通行时间从34.5小时压缩到5小时。当前，这项改革正逐渐推广到全疆。'},
 {'type': 'img_url',
  'data': 'https://p5.img.cctvpic.com/photoworkspace/contentimg/2024/05/22/2024052220435223190.png'},
 {'type': 'img_desc', 'data': '霍尔果斯公路口岸，等待通关的汽车。人民网 

In [168]:
author = html.xpath('//*[@id="page_body"]/div[1]/div[3]/div[1]/span/text()')
author

['编辑：甄涛', '责任编辑：刘亮']

In [2]:
from scrapy import Scrapy

scr = Scrapy(None)
scr.page_parse('https://news.cctv.com/2024/05/25/ARTIVdnhTY7Pv5Wigb0WoN8A240525.shtml')

{'title': '中欧班列累计开行突破9万列！',
 'tag': 'ARTIVdnhTY7Pv5Wigb0WoN8A240525',
 'time': '2024年05月25日 08:49:33',
 'content': [{'type': 'text-blod', 'data': '央视网消息'},
  {'type': 'text',
   'data': '：2024年5月25日8时40分，随着X8157次中欧班列（西安—马拉舍维奇）从西安国际港站开出，至此中欧班列累计开行突破9万列，发送货物超870万标箱、货值超3800亿美元，保持安全稳定畅通运行。'},
  {'type': 'img_url',
   'data': 'https://p5.img.cctvpic.com/photoworkspace/contentimg/2024/05/25/2024052509095154229.jpg'},
  {'type': 'text-blod', 'data': '开行规模不断扩大'},
  {'type': 'text',
   'data': '。中欧班列是新型国际运输组织方式，具有便利快捷、安全稳定、绿色经济等显著优势，已成为广受欢迎的国际公共产品，开行数量保持强劲增长态势，特别是2016年统一品牌后，中欧班列迎来了规范开行、快速发展新阶段。2016年至2023年，中欧班列年开行数量由1702列增加到超1.7万列，增长近10倍，年均增长39.5%。开行万列所需时间由开行之初的90个月缩短为现在的7个月。自2020年5月起，中欧班列已连续48个月单月开行数量保持在千列以上。中欧班列年运输货值由2016年80亿美元增长到2023年的567亿美元。货物品类由开行初期的笔记本电脑、打印机等IT产品，逐步扩大到服装鞋帽、汽车及配件、日用百货、食品、木材、家具、化工品、机械设备等53大类5万余种。2023年以来，中国制造“新三样”——新能源汽车、锂离子电池产品和光伏产品正在成为中欧班列运量新的增长点，市场需求旺盛，目前中欧班列综合重箱率稳定在100%。'},
  {'type': 'text-blod', 'data': '通道能力持续增强'},
  {'type': 'text',
   'data': '。在国内，先后实施了兰新铁路精河至阿拉山口段增

In [37]:
response = requests.get('https://news.cctv.com/2024/05/25/ARTIVdnhTY7Pv5Wigb0WoN8A240525.shtml')
response.encoding = 'utf-8'
html = etree.HTML(response.text,etree.HTMLParser())
html.xpath('//*[@id="content_area" or @id="text_area"]/p[contains(@class,"photo") and contains(@style,"text") and not(*)]')



[]