In [3]:
%load_ext lab_black

In [4]:
# from __future__ import print_function
import bs4 as bs  # import BeautifulSoup as bs
import re
import requests
import pandas as pd
import json
from datetime import date
import os
import random
import time

In [3]:
# from IPython.display import display

from six.moves.urllib import parse

In [9]:
def _extract_article_text(apollo_state):
    """Collect the article paragraphs found in an APOLLO_STATE script string.

    Returns the paragraphs concatenated into one string, each followed by a
    single space, or "" when the end-of-article marker is not present.
    """
    # Everything between a "{" and the next "}" (non-greedy, may span lines).
    bracket_content = re.findall(
        r"\{(.*?)\}", apollo_state, re.MULTILINE | re.DOTALL
    )

    # Only the chunks up to and including this marker are scanned for paragraphs.
    end_marker = '"type":"reading-100-percent-completion-tracker"'
    if end_marker not in bracket_content:
        print("Cannot locate the end of the article; article text left empty.")
        return ""
    article_end_index = bracket_content.index(end_marker)

    # The text following "type":"text","data": in a paragraph node is the sentence.
    sen_code = r'(?<="type":"p","children":\[{"type":"text","data":).*$'
    sentences = []
    for content in bracket_content[: article_end_index + 1]:
        found = re.findall(sen_code, content)
        if found and found != ['"\\n"']:  # skip empty results and bare \n nodes
            sentences.extend(found)

    # Drop the surrounding quotes; every sentence is followed by one space.
    return "".join(sentence.strip('"') + " " for sentence in sentences)


def news_parser(news_url, timeout=30):
    """Download one news page and return its article as a one-row dataframe.

    Parameters
    ----------
    news_url : str
        URL of the article; it must contain a 7-digit article id.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row with the columns news_id, title, article, release_time,
        collecting_date and URL. ``article`` is "" when the page could not be
        parsed and ``release_time`` is "cannot parse" when no timestamp is found.

    Raises
    ------
    ValueError
        If ``news_url`` contains no 7-digit article id.
    """

    print("Parsing news at\n %s" % news_url)
    id_match = re.search(r"\d{7}", news_url)
    if id_match is None:
        raise ValueError("No 7-digit article id found in URL: %s" % news_url)
    news_id = id_match.group(0)
    news_title = news_url.split("/")[-1]

    news_page = requests.get(news_url, timeout=timeout)
    news_page.encoding = "utf-8"
    news = bs.BeautifulSoup(news_page.text, "lxml")

    # Stays empty when no script mentions APOLLO_STATE or the page cannot be parsed.
    article = ""
    for script in news.find_all("script"):
        script_text = script.string
        # `in` also catches a match at position 0, which `find(...) > 0` missed.
        if script_text is not None and "APOLLO_STATE" in script_text:
            article = _extract_article_text(script_text)
            break  # only the first matching script is used

    # record date and build a dataframe
    time_tag = news.find("time")
    if time_tag is None:
        news_time = "cannot parse"
    else:
        news_time = time_tag.get("datetime", "cannot parse")

    news_df = pd.DataFrame(
        data={
            "news_id": news_id,
            "title": news_title,
            "article": [article],
            "release_time": news_time,
            "collecting_date": date.today(),
            "URL": news_url,
        }
    )

    # Random pause of up to 5 seconds between requests.
    time.sleep(random.random() * 5)
    return news_df

In [5]:
def comments_parser(comments_url, timeout=30):
    """Download a comments popup page and return its comments as a dataframe.

    Parameters
    ----------
    comments_url : str
        URL of the comments page; it is expected to end with the news id.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per comment with the columns user_name, comment_raw,
        comment_cleaned, date, news_id, is_reply and collecting_date.
        The frame is empty (same columns) when the page has no comments.
    """

    print("Parsing comments at\n %s" % comments_url)
    comments_page = requests.get(comments_url, timeout=timeout)
    comments_page.encoding = "utf-8"
    comments = bs.BeautifulSoup(comments_page.text, "lxml")

    # A comment containing this marker is treated as a reply.
    reply_marker = "@******"

    user_name = []
    comment_raw = []
    comment_cleaned = []
    comment_time = []
    is_reply = []
    # every "card-content-action" element is one person's comment block
    for block in comments.find_all(class_="card-content-action"):
        author_tag = block.find(attrs={"class": "comment-author-name"})
        content_tag = block.find(attrs={"class": "comment-content"})
        if author_tag is None or content_tag is None:
            # Not a complete comment block; skip it instead of crashing.
            continue
        time_tag = block.find("time")

        raw = content_tag.text.strip().replace("\n", " ")
        replied = reply_marker in raw

        user_name.append(author_tag.text)
        comment_raw.append(raw)
        # For replies keep the text that follows the first marker.
        comment_cleaned.append(raw.split(reply_marker)[1].strip() if replied else raw)
        comment_time.append(time_tag.get("datetime") if time_tag is not None else None)
        is_reply.append(replied)

    # Trailing digits of the URL are the news id; fall back to the last 7 characters.
    id_match = re.search(r"(\d+)/?$", comments_url)
    news_id = id_match.group(1) if id_match else comments_url[-7:]

    row_count = len(user_name)
    comments_df = pd.DataFrame(
        data={
            "user_name": user_name,
            "comment_raw": comment_raw,
            "comment_cleaned": comment_cleaned,
            "date": comment_time,
            "news_id": [news_id] * row_count,
            "is_reply": is_reply,
            "collecting_date": [date.today()] * row_count,
        }
    )

    # Random pause of up to 3 seconds between requests.
    time.sleep(random.random() * 3)
    return comments_df

## Collect all URLs of the articles
**Method**  
Since SCMP uses infinite scroll to load older articles, I captured the URL of every request and worked out its pattern. Some short pages use offsets (0, 20, 40, 60) as their page identifier, while other pages use timestamps. I collected those timestamps manually.
I also tried downloading the scrolled-down pages as MHTML, but `bs.find_all("a", href=True)` and `bs.find_all(class_=...)` do not work on them the way they do on the online pages.

In [36]:
# Accumulates the article paths ("urlAlias" values) appended by the cells below;
# re-running this cell resets the collection.
url_list = []

https://www.scmp.com/coronavirus/greater-china

In [37]:
# Section queue for coronavirus/greater-china: 4 pages of 20 items (offsets 0, 20, 40, 60).
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for offset in range(0, 80, 20):
    url = "https://apigw.scmp.com/content-delivery/v1?operationName=QueueById&variables=%7B%22itemLimit%22%3A20%2C%22offset%22%3A{offset}%2C%22name%22%3A%22section_top_505356%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%223f30ddb476061fe29f0dba291c35330b61a58fb0aaf08096d985f6b81e107840%22%7D%7D".format(
        offset=offset
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    result = response.json()["data"]["queue"]["items"]
    url_list.extend(item["urlAlias"] for item in result)

https://www.scmp.com/coronavirus/asia

In [38]:
# Section queue for coronavirus/asia: 4 pages of 20 items (offsets 0, 20, 40, 60).
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for offset in range(0, 80, 20):
    url = "https://apigw.scmp.com/content-delivery/v1?operationName=QueueById&variables=%7B%22itemLimit%22%3A20%2C%22offset%22%3A{offset}%2C%22name%22%3A%22section_top_505326%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%223f30ddb476061fe29f0dba291c35330b61a58fb0aaf08096d985f6b81e107840%22%7D%7D".format(
        offset=offset
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    result = response.json()["data"]["queue"]["items"]
    url_list.extend(item["urlAlias"] for item in result)

https://www.scmp.com/coronavirus/europe

In [39]:
# Section queue for coronavirus/europe: 4 pages of 20 items (offsets 0, 20, 40, 60).
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for offset in range(0, 80, 20):
    url = "https://apigw.scmp.com/content-delivery/v1?operationName=QueueById&variables=%7B%22itemLimit%22%3A20%2C%22offset%22%3A{offset}%2C%22name%22%3A%22section_top_505325%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%223f30ddb476061fe29f0dba291c35330b61a58fb0aaf08096d985f6b81e107840%22%7D%7D".format(
        offset=offset
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    result = response.json()["data"]["queue"]["items"]
    url_list.extend(item["urlAlias"] for item in result)

https://www.scmp.com/coronavirus/us-canada

In [40]:
# Section queue for coronavirus/us-canada: 4 pages of 20 items (offsets 0, 20, 40, 60).
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for offset in range(0, 80, 20):
    url = "https://apigw.scmp.com/content-delivery/v1?operationName=QueueById&variables=%7B%22itemLimit%22%3A20%2C%22offset%22%3A{offset}%2C%22name%22%3A%22section_top_505354%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%223f30ddb476061fe29f0dba291c35330b61a58fb0aaf08096d985f6b81e107840%22%7D%7D".format(
        offset=offset
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    result = response.json()["data"]["queue"]["items"]
    url_list.extend(item["urlAlias"] for item in result)

https://www.scmp.com/coronavirus/health-medicine

In [41]:
# Section queue for coronavirus/health-medicine: 4 pages of 20 items (offsets 0, 20, 40, 60).
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for offset in range(0, 80, 20):
    url = "https://apigw.scmp.com/content-delivery/v1?operationName=QueueById&variables=%7B%22itemLimit%22%3A20%2C%22offset%22%3A{offset}%2C%22name%22%3A%22section_top_505355%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%223f30ddb476061fe29f0dba291c35330b61a58fb0aaf08096d985f6b81e107840%22%7D%7D".format(
        offset=offset
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    result = response.json()["data"]["queue"]["items"]
    url_list.extend(item["urlAlias"] for item in result)

https://www.scmp.com/coronavirus/economic-impact

In [42]:
# Section queue for coronavirus/economic-impact: 4 pages of 20 items (offsets 0, 20, 40, 60).
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for offset in range(0, 80, 20):
    url = "https://apigw.scmp.com/content-delivery/v1?operationName=QueueById&variables=%7B%22itemLimit%22%3A20%2C%22offset%22%3A{offset}%2C%22name%22%3A%22section_top_505328%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%223f30ddb476061fe29f0dba291c35330b61a58fb0aaf08096d985f6b81e107840%22%7D%7D".format(
        offset=offset
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    result = response.json()["data"]["queue"]["items"]
    url_list.extend(item["urlAlias"] for item in result)

https://www.scmp.com/coronavirus/analysis-opinion

In [43]:
# Section queue for coronavirus/analysis-opinion: 4 pages of 20 items (offsets 0, 20, 40, 60).
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for offset in range(0, 80, 20):
    url = "https://apigw.scmp.com/content-delivery/v1?operationName=QueueById&variables=%7B%22itemLimit%22%3A20%2C%22offset%22%3A{offset}%2C%22name%22%3A%22section_top_505327%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%223f30ddb476061fe29f0dba291c35330b61a58fb0aaf08096d985f6b81e107840%22%7D%7D".format(
        offset=offset
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    result = response.json()["data"]["queue"]["items"]
    url_list.extend(item["urlAlias"] for item in result)

https://www.scmp.com/topics/coronavirus-china

In [44]:
# Manually collected "after" cursors (epoch milliseconds, kept as strings) for the
# coronavirus-china topic feed. All values have the same number of digits, so the
# string sort below is also chronological.
timestamp_list_COVCHINA = sorted(
    [
        "1579925153000",  # oldest cursor in the list (2020-01-25 UTC)
        "1581608419000",
        "1584493217000",
        "1584581236000",
        "1584768613000",
        "1584874812000",
        "1585132217000",
        "1585229419000",
        "1585400413000",
        "1585528208000",
        "1585735387000",
        "1585896342000",
        "1585954821000",
        "1586317395000",
        "1586361914000",
        "1586527218000",
        "1586610083000",
        "1586842439000",
        "1586913303000",
        "1587036700000",
        "1587117610000",
        "1587295804000",
        "1587436199000",
        "1587558821000",
        "1587770106000",
        "1588063310000",
        "1588345217000",
        "1588673798000",
        "1588939218000",
        "1589274011000",
        "1589540408000",
    ]
)
timestamp_list_COVCHINA

['1579925153000',
 '1581608419000',
 '1584493217000',
 '1584581236000',
 '1584768613000',
 '1584874812000',
 '1585132217000',
 '1585229419000',
 '1585400413000',
 '1585528208000',
 '1585735387000',
 '1585896342000',
 '1585954821000',
 '1586317395000',
 '1586361914000',
 '1586527218000',
 '1586610083000',
 '1586842439000',
 '1586913303000',
 '1587036700000',
 '1587117610000',
 '1587295804000',
 '1587436199000',
 '1587558821000',
 '1587770106000',
 '1588063310000',
 '1588345217000',
 '1588673798000',
 '1588939218000',
 '1589274011000',
 '1589540408000']

In [45]:
# Topic feed for topics/coronavirus-china: one request per manually collected cursor.
# NOTE(review): the API key is hard-coded; consider loading it from an environment variable.
headers = {"apikey": "MyYvyg8M9RTaevVlcIRhN5yRIqqVssNY"}
for timestamp in timestamp_list_COVCHINA:
    # The cursor goes into the "after" field of the percent-encoded variables.
    url = (
        "https://apigw.scmp.com/content-delivery/v1?operationName=gettopicbyentityuuid&variables=%7B%22latestContentsLimit%22%3A30%2C%22latestOpinionsLimit%22%3A30%2C%22entityUuid%22%3A%22c7985da3-2f6d-4540-a1c2-875c2d7881b6%22%2C%22articleTypeId%22%3A%22012d7708-2959-4b2b-9031-23e3d025a08d%22%2C%22applicationIds%22%3A%5B%222695b2c9-96ef-4fe4-96f8-ba20d0a020b3%22%5D%2C%22after%22%3A%22"
        + timestamp
        + "%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22a78c49ca1280c93d31533f172e1a837d7e8aebf1215195364cb7fd85c76d2e0e%22%7D%7D"
    )
    response = requests.get(url=url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    items = response.json()["data"]["topic"]["latestContentsWithCursor"]["items"]
    url_list.extend(item["urlAlias"] for item in items)

## Append new urls and save them in a csv

In [46]:
# Merge the newly collected article paths into the saved URL list.
# One path is used for the existence check, the read and the write, so a later
# run extends the same file that was written by the previous run.
news_url_csv = "../data/news_url.csv"

if os.path.isfile(news_url_csv):
    # there is already a news url list file: start from its contents
    print("File already exist.")
    news_url_list = list(pd.read_csv(news_url_csv)["news_url"])
else:
    # create the news url list for the first time
    print("New file created.")
    news_url_list = []
    os.makedirs(os.path.dirname(news_url_csv), exist_ok=True)

news_url_list.extend("https://www.scmp.com" + item for item in url_list)
news_url_list = list(dict.fromkeys(news_url_list))  # drop duplicates, keep order
news_url_df = pd.DataFrame(data={"news_url": news_url_list})
news_url_df.to_csv(news_url_csv, encoding="utf-8-sig", header=True, index=False)

File already exist.


## Test

In [56]:
# Load the saved URL table; the working list keeps each URL once, in file order.
news_url_df = pd.read_csv("../data/news_url.csv")
url_column = news_url_df["news_url"]
news_url_list = url_column.unique().tolist()
print("We have collected %d news URLs in all." % len(url_column))

We have collected 1177 news URLs in all.


After 386 loops, the connection breaks.

In [58]:
# Scrape every article and its comments, appending the results to the CSV files.
# NOTE(review): the start index is hard-coded; presumably it is the resume point
# after an interrupted run - set it to 0 to process the whole list.
START_INDEX = 1174

counter = 0
# news_url_list is the list of all news URLs
for url in news_url_list[START_INDEX:]:
    id_match = re.search(r"\d{7}", url)
    if id_match is None:
        continue  # no 7-digit article id in this URL, nothing to scrape

    news_id = id_match.group(0)
    news_df = news_parser(url)  # parse news article
    comments_id = news_id
    comments_url = "https://www.scmp.com/scmp_comments/popup/" + comments_id
    comments_df = comments_parser(comments_url)  # parse comments

    # The article cell is "" when the page could not be parsed; skip those rows.
    if news_df["article"][0] != "":
        news_df.to_csv(
            "../data/news_new.csv",
            encoding="utf-8-sig",
            header=False,
            index=False,
            mode="a",
        )
    # An article without comments yields an empty frame; indexing row 0 would fail.
    if not comments_df.empty:
        comments_df.to_csv(
            "../data/comments_new.csv",
            encoding="utf-8-sig",
            header=False,
            index=False,
            mode="a",
        )
    counter += 1
    print("Loop %d finished." % counter)

Parsing news at
 https://www.scmp.com/news/hong-kong/politics/article/3084984/coronavirus-labour-minister-expects-more-90-cent-hong-kong
Parsing comments at
 https://www.scmp.com/scmp_comments/popup/3084984
Loop 1 finished.
Parsing news at
 https://www.scmp.com/business/companies/article/3084969/international-air-travel-starting-creep-back-complete-patchy
Parsing comments at
 https://www.scmp.com/scmp_comments/popup/3084969
Loop 2 finished.
Parsing news at
 https://www.scmp.com/comment/opinion/article/3084828/why-chinas-post-lockdown-recovery-wont-work-test-case-reopening
Parsing comments at
 https://www.scmp.com/scmp_comments/popup/3084828
Loop 3 finished.
