In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import json
import os
import time
import logging

In [2]:
# Archive listing endpoint. The trailing '?' lets a urlencoded query string
# be appended directly.
BASE_URL = 'https://cn.reuters.com/news/archive/topic-cn-top-news?'

# Request headers: a desktop Chrome user-agent string.
HEADERS = dict([
    (
        'user-agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.97 Safari/537.36',
    ),
])

# Query-string template. 'page' holds a '{!s}' placeholder; a concrete page
# number is substituted before the query string is built.
params_base = dict(view='page', page='{!s}', pageSize=10)

In [4]:
# Build the query for page 1 from a *copy* of the template. Plain assignment
# would alias `params_base`, and setting 'page' would then overwrite the
# template's placeholder for every later use.
params = dict(params_base, page=1)
url = BASE_URL + urlencode(params)
# Without a timeout, requests waits indefinitely on a stalled connection and
# the cell would hang; 10 seconds is ample for a single listing page.
resp = requests.get(url, headers=HEADERS, timeout=10)

In [5]:
# Echo the response object; its repr shows the HTTP status code.
resp

<Response [200]>

In [6]:
# Parse the response body into a navigable tree using the lxml parser.
html = BeautifulSoup(markup=resp.text, features='lxml')

In [7]:
# Every <article> element nested inside the headline-list container.
articles = html.select(
    selector='div.news-headline-list article',
)

In [8]:
# First <h3 class="story-title"> tag of each article, kept as a tag object
# (text is extracted later).
titles = list(
    map(lambda article: article.select('h3.story-title')[0], articles)
)

In [9]:
# Inspect the extracted title tags (raw tags; surrounding whitespace not yet stripped).
titles

[<h3 class="story-title">
 								中国10月银行结售汇逆差扩至44亿美元 外汇市场预期更趋平稳</h3>, <h3 class="story-title">
 								易纲称央行要发挥好LPR引导作用 促进实际贷款利率下行</h3>, <h3 class="story-title">
 								中国10月财政收入同比增速创近一年半新高 支出为近一年低点（更新版）</h3>, <h3 class="story-title">
 								焦点：华为称美国再度延长许可没意义 重申受到不公待遇</h3>, <h3 class="story-title">
 								滴滴将在东京使用奔驰、特斯拉等车型提供高端打车服务</h3>, <h3 class="story-title">
 								谈判前景转淡人民币缩量收跌至半个月新低 短期料难出方向</h3>, <h3 class="story-title">
 								中国政府坚定支持香港特首林郑月娥--外交部</h3>, <h3 class="story-title">
 								中国三季度消费趋势指数保持高位 三线城市更追求潮流时尚--尼尔森</h3>, <h3 class="story-title">
 								中国股市收涨近1%创两周最大升幅 市场寄望当局采取经济刺激措施</h3>, <h3 class="story-title">
 								焦点：日本考虑发行50年期公债以支撑收益率</h3>]

In [15]:
# First <span class="timestamp"> tag of each article, in listing order.
datss = [
    article.select('span.timestamp')[0]
    for article in articles
]

In [16]:
# Inspect the extracted timestamp tags.
datss

[<span class="timestamp">19:29 BJT</span>,
 <span class="timestamp">18:58 BJT</span>,
 <span class="timestamp">18:40 BJT</span>,
 <span class="timestamp">18:08 BJT</span>,
 <span class="timestamp">17:44 BJT</span>,
 <span class="timestamp">17:37 BJT</span>,
 <span class="timestamp">16:37 BJT</span>,
 <span class="timestamp">16:33 BJT</span>,
 <span class="timestamp">16:09 BJT</span>,
 <span class="timestamp">15:53 BJT</span>]

In [10]:
# First <p> inside each article's story-content block, kept as a tag object.
contents = [
    paragraphs[0]
    for paragraphs in (
        article.select('div.story-content p') for article in articles
    )
]

In [11]:
class Page:
    """One scraped listing page holding article title/content pairs.

    Articles are stored in order under the keys ``article_0``, ``article_1``,
    ... and are retrieved by integer position with ``page[i]``. Each instance
    records a sequence number in ``count``, taken from a class-wide counter
    that can be reset with :meth:`reset_page_count`.
    """

    # Class-wide counter: number of pages created since the last reset.
    _count = 0

    def __init__(self, titles: list = None, contents: list = None):
        """Build a page from parallel lists of titles and contents.

        Args:
            titles: Article titles.
            contents: Article contents, paired with ``titles`` by position.

        If either argument is missing or empty, the page holds no articles.
        The lists are paired with ``zip``, so surplus items in the longer
        list are dropped.
        """
        self._data = dict()
        if titles and contents:
            for i, (t, c) in enumerate(zip(titles, contents)):
                self._data[f'article_{i}'] = dict(
                    title=t,
                    content=c
                )
        # Take the current counter value as this page's number, then advance it.
        self.count = type(self)._count
        type(self)._count += 1

    @classmethod
    def reset_page_count(cls):
        """Reset the page counter so the next page created is numbered 0."""
        cls._count = 0

    def __getitem__(self, index: int):
        """Return the ``{'title': ..., 'content': ...}`` dict at ``index``.

        Raises:
            IndexError: If ``index`` is negative or is not smaller than the
                number of stored articles.
        """
        # Valid positions are 0 .. len-1. The check must be `>=`: with `>`,
        # index == len slipped through and failed with KeyError in the dict
        # lookup, which also broke iteration (`for a in page`), since the
        # sequence protocol stops only on IndexError.
        if index < 0 or index >= len(self._data):
            raise IndexError(f'Index "{index}" out of range')
        return self._data[f'article_{index}']

    def __str__(self):
        """Return ``<ClassName>_<count>(article_count=<n>)``."""
        name = type(self).__name__
        return f'{name}_{self.count}(article_count={len(self._data)})'

    __repr__ = __str__

In [12]:
# Restart numbering so this page is Page_0, then build it from the
# whitespace-stripped text of the scraped title and content tags.
Page.reset_page_count()
p1 = Page(
    titles=[node.text.strip() for node in titles],
    contents=[node.text.strip() for node in contents],
)

In [13]:
# Display the page through its __repr__ (page number and article count).
p1

Page_0(article_count=10)

In [14]:
# Look up the first article's title/content dict by position.
p1[0]

{'title': '中国10月银行结售汇逆差扩至44亿美元 外汇市场预期更趋平稳',
 'content': '路透北京11月19日 - 中国国家外汇管理局周二公布数据显示，10月银行结售汇逆差为44亿美元，虽高于上月的34亿美元逆差规模，但低于前九个月平均水平；1-10月累计结售汇逆差526亿美元。'}