In [45]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from lxml import etree
import random
import re
import asyncio
from pyppeteer import launch
import nest_asyncio

In [46]:
async def fetch_webpage(url):
    """
    Fetch the HTML of a web page using a headless Pyppeteer browser.

    A new browser is launched for every call, the URL is opened in a new
    page, and the page's HTML content is read back.

    Args:
        url: Address of the page to load.

    Returns:
        The value returned by ``page.content()`` after navigation.

    Raises:
        Any exception raised by Pyppeteer while opening the page, navigating
        or reading the content is propagated to the caller.
    """
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()
        await page.goto(url)
        return await page.content()
    finally:
        # Close the browser even when navigation fails; otherwise every failed
        # fetch would leave a browser instance open.
        await browser.close()

In [47]:
def get_random_headers()->dict:
    """
    Build browser-like request headers with a randomly chosen User-Agent.

    Returns:
        dict: Request headers. Every field is fixed except "User-Agent",
            which is drawn at random from a built-in pool of desktop
            browser strings.
    """
    # Pool of candidate User-Agent strings. The Firefox 89 entry is listed
    # twice, so it is twice as likely to be picked as the others.
    ua_pool = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.48",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"
    ]
    chosen_ua = random.choice(ua_pool)

    # Start with the randomised field so it stays first in the mapping,
    # then append the fixed fields in their original order.
    request_headers = {"User-Agent": chosen_ua}
    request_headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh-Hans;q=0.9",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",  # Do Not Track request header
        "Accept-Encoding": "gzip, deflate, br",
    })
    return request_headers

In [146]:
def get_url_list(origin_url:str,date:str)->list:
    """
    Collect the news entries listed on a CCTV news index page.

    The page is fetched with ``fetch_webpage`` and every ``<li>`` directly
    under the element whose id is ``newslist`` is inspected. For each item
    the article link is read from ``div[@class="image"]/a/@href`` and the
    thumbnail from the ``data-echo`` attribute of ``div[1]/a/img``.

    Args:
        origin_url (str): URL of the CCTV news home page or one of its
            category pages.
        date (str): Date string such as "2024/05/22". Currently unused: the
            results are NOT filtered by date. The parameter is kept so that
            existing callers keep working.

    Returns:
        list: ``[[page_url, image_url], ...]``. Items without an article
            link are skipped; ``image_url`` is ``None`` when the item has no
            ``data-echo`` attribute. An empty list is returned when nothing
            matches.
    """
    # run_until_complete() cannot be nested inside an already running event
    # loop (as in a Jupyter kernel) unless asyncio has been patched by
    # nest_asyncio. apply() is a no-op when the patch is already in place.
    nest_asyncio.apply()
    content = asyncio.get_event_loop().run_until_complete(fetch_webpage(origin_url))

    html = etree.HTML(content,parser=etree.HTMLParser())
    if html is None:
        # Defensive guard for a page that yields no element tree at all.
        return []

    url_list = []
    for node in html.xpath('//*[@id="newslist"]/li'):
        page_urls = node.xpath('div[@class="image"]/a/@href')
        if not page_urls:
            # List item without an article link: nothing useful to record.
            continue
        image_urls = node.xpath('div[1]/a/img/@data-echo')
        image_url = image_urls[0] if image_urls else None
        url_list.append([page_urls[0],image_url])
    return url_list
# Try the function on the CCTV news home page.
# NOTE(review): `date` is passed here, but get_url_list does not use it to
# filter the results.
origin_url = "https://news.cctv.com/"
url_list = get_url_list(origin_url,date="2024/05/22")

In [9]:
def get_all_kind_page(origin_url_dict:dict,date:str)->dict:
    """Collect the news entries of every category page.

    Each category's index URL is handed to ``get_url_list`` together with
    ``date``, and the results are gathered under the category name.

    Args:
        origin_url_dict (dict): {"category name": origin_url, ...}
        date (str): year/month/day, forwarded unchanged to ``get_url_list``.

    Returns:
        dict: {"category 1": <result of get_url_list for category 1>,
               "category 2": <result of get_url_list for category 2>,
               ...}
    """
    return {
        category: get_url_list(origin_url=category_url,date=date)
        for category, category_url in origin_url_dict.items()
    }

In [10]:
# Mapping of news category name -> index page URL of that category.
# The keys are the Chinese category labels, in order: home page, domestic,
# international, economy, society, law, entertainment, technology, life,
# military, people.
origin_url_dict = {
    "首页":"https://news.cctv.com",
    "国内":"https://news.cctv.com/china",
    "国际":"https://news.cctv.com/world",
    "经济":"https://jingji.cctv.com/",
    "社会":"https://news.cctv.com/society",
    "法治":"https://news.cctv.com/law",
    "文娱":"https://news.cctv.com/ent",
    "科技":"https://news.cctv.com/tech",
    "生活":"https://news.cctv.com/life",
    "军事":"https://military.cctv.com",
    "人物":"https://people.cctv.com",
    
}

In [57]:
# Fetch the entries of the technology category page and display them.
# NOTE(review): the saved output under this cell is a flat list of URL
# strings, which does not match the [page_url, image_url] pairs that
# get_url_list builds — re-run the cell to refresh it.
origin_url = "https://news.cctv.com/tech"
url_list = get_url_list(origin_url,date="2024/05/22")
url_list

['https://news.cctv.com/2024/05/22/ARTI6KnUmHCDpGYglzMBj8OS240522.shtml',
 'https://news.cctv.com/2024/05/22/ARTIMifs5JW2FINegykcMevM240522.shtml',
 'https://news.cctv.com/2024/05/22/ARTIqD698Rmtv7xca7nEs6OI240522.shtml',
 'https://news.cctv.com/2024/05/22/ARTIa5pZ8DGJ1gNXqEs3gTkL240522.shtml']

In [None]:
def page_parse(page_url:str)->dict:
    """
    Parse a single news page and extract its data.

    The data wanted are: news title, source, time, content and author
    information.

    NOTE: not implemented yet — the body consists only of this docstring,
    so calling the function currently returns None rather than a dict.

    Args:
        page_url (str): URL of the news page.

    Returns:
        dict: The data of the news page (intended behaviour; see NOTE above).
    """
    

In [61]:
# Fetch the home page HTML into `content` for the XPath experiments below.
# NOTE(review): inside a running Jupyter event loop this presumably relies on
# nest_asyncio.apply() having been called; no such call is visible in this
# notebook — confirm.
content = asyncio.get_event_loop().run_until_complete(fetch_webpage('https://news.cctv.com'))

  content = asyncio.get_event_loop().run_until_complete(fetch_webpage('https://news.cctv.com'))


In [62]:
# Parse the fetched HTML string into an lxml element tree.
html = etree.HTML(content,parser=etree.HTMLParser())

In [141]:
# Scratch cell: try the XPath expressions used by get_url_list on one item.
# All <li> items directly under the element with id "newslist".
res = html.xpath('//*[@id="newslist"]/li')
# Probe the second item (index 1); raises IndexError if fewer than two items matched.
# Both results are lists of attribute values (no [0] is applied here).
page_url = res[1].xpath('div[@class="image"]/a/@href')
image_url = res[1].xpath('div[1]/a/img/@data-echo')