In [28]:
import os
from collections import defaultdict

import htmldate
import lazynlp
import newspaper
import numpy as np
import timeout_decorator
from newspaper import news_pool
from tqdm import tqdm

In [3]:
# Seed homepages for each language. Lists mainly from
# https://github.com/yavuz/news-feed-list-of-countries

# Sources crawled under the "zh" language code
cn_urls = [
  "http://news.baidu.com/",
  "http://people.com.cn/",
  "https://www.chinanews.com.cn/",
  "http://www.cnr.cn/",
  "https://cn.chinadaily.com.cn",
]

# Sources crawled under the "en" language code
en_urls = [
  "https://www.cnn.com/",
  "https://www.nytimes.com/",
  "https://www.foxnews.com/",
  "https://www.usatoday.com/",
  "https://abcnews.go.com/",
]

# Sources crawled under the "ru" language code.
# Many articles were removed from https://novayagazeta.ru/ (the last major
# independent paper in Russia to shut down...) after journalists were murdered
# in response to their coverage of the war in Ukraine.
rs_urls = [
  "https://news.mail.ru/",
  "https://novayagazeta.ru/",
  "https://ria.ru/",
  "https://www.ntv.ru/",
  "http://rbc.ru/",
]

In [9]:
# Build one newspaper source per seed URL, grouped by the language code of
# the list it came from
lang_urls = [cn_urls, en_urls, rs_urls]
lang_abvs = ["zh", "en", "ru"]
built_papers = defaultdict(list)

tqdm.write("Building papers...")
for i, urls in enumerate(lang_urls):
  # lang_abvs is index-aligned with lang_urls
  abv = lang_abvs[i]
  for url in tqdm(urls):
    # memoize_articles=False: download all recent articles on every run
    paper = newspaper.build(url, memoize_articles=False)
    built_papers[abv].append(paper)

Building papers...


100%|██████████| 5/5 [02:29<00:00, 29.85s/it]
100%|██████████| 5/5 [00:21<00:00,  4.21s/it]]
100%|██████████| 5/5 [01:21<00:00, 16.37s/it] 
100%|██████████| 3/3 [04:12<00:00, 84.06s/it]


In [10]:
# Method 1: Parse articles using purely newspaper3k
articles = defaultdict(list)

# Only article.parse() runs under the 5 second limit. The checkpoint write in
# saveArticle is deliberately kept outside of it: articles.npy grows with every
# article, and a timeout firing mid-write would leave a truncated file behind
# while being reported as a parse timeout.
@timeout_decorator.timeout(5, timeout_exception=StopIteration)
def _parseArticle(article):
  """Parse `article` in place; raises StopIteration after 5 seconds."""
  article.parse()

def saveArticle(article, abv):
  """Parse one article, record it under its language and checkpoint to disk.

  Args:
    article: object exposing parse(), url, title, authors, publish_date and
      text (presumably an already-downloaded newspaper article).
    abv: language abbreviation used as the key into the `articles` dict.

  Raises:
    StopIteration: if parsing takes longer than 5 seconds. Nothing is
      appended or saved for that article in this case.
  """
  _parseArticle(article)
  articles[abv].append({
    "url": article.url,
    "title": article.title,
    "authors": article.authors,
    "date": article.publish_date,
    "text": article.text,
  })
  # Checkpoint after every article so an interrupted run keeps its progress.
  # NOTE: this rewrites the whole dict each time, so the cost grows with the
  # number of articles collected so far.
  np.save("articles.npy", articles)

In [None]:
# Downloading with multiple threads is faster... Here we only use 2 per paper because we don't
# want to spam the news sites

# Download the papers we just built. NOTE: takes ~15 minutes for RU, ~2 minutes for EN,
# and > 2hrs for ZH. Maybe some of the chinese news sites have extra DDos protection
# we're running into?
tqdm.write("Downloading...")
# news_pool.set() expects a flat list of Source/Article objects. built_papers
# maps language code -> list of papers, and iterating a dict yields its string
# keys, so flatten the per-language lists into one list first.
all_papers = [paper for papers in built_papers.values() for paper in papers]
news_pool.set(all_papers, threads_per_source=2) # 2 threads per paper: (15*2) = 30 threads total
news_pool.join()
tqdm.write("Done!")

In [None]:
# Parse all the articles we just downloaded with newspaper3k. Takes about a day...
for abv, papers in built_papers.items():
  for paper in papers:
    for article in tqdm(paper.articles):
      try:
        saveArticle(article, abv)
      except StopIteration:
        # saveArticle reports a timeout by raising StopIteration
        tqdm.write("Timed out while parsing article")
      except newspaper.article.ArticleException as err:
        # parse() raises this when the article's download failed or never
        # happened; skip it rather than abort the whole (day-long) run.
        tqdm.write(f"Skipping {article.url}: {err}")

In [11]:
# Method 2: Build a list of urls + article metadata using newspaper3k.
# Then, use lazynlp to download + parse + clean the article text
lang_urls = [cn_urls, en_urls, rs_urls]
lang_abvs = ["zh", "en", "ru"]
built_papers = defaultdict(list)

tqdm.write("Building papers...")
for i, urls in enumerate(lang_urls):
  # lang_abvs is index-aligned with lang_urls
  abv = lang_abvs[i]
  for url in tqdm(urls):
    # memoize_articles=False: download all recent articles on every run
    paper = newspaper.build(url, memoize_articles=False)
    built_papers[abv].append(paper)

# Collect the URL of every article discovered in the built papers, grouped
# by language code
article_urls = defaultdict(list)
for abv, papers in built_papers.items():
  for paper in papers:
    for article in paper.articles:
      article_urls[abv].append(article.url)

# np.save does not create missing directories, so make sure ./data exists
# before writing (otherwise the build work above is lost to FileNotFoundError)
os.makedirs("./data", exist_ok=True)
np.save("./data/article_urls.npy", article_urls)
print(len(article_urls["zh"]))
print(len(article_urls["ru"]))
print(len(article_urls["en"]))

# Reload the URL lists from disk and actually bind the result, so the scrape
# below works from the saved copy. The dict is stored pickled inside a 0-d
# object array, hence allow_pickle=True and .item().
# Only load files written by this notebook: unpickling untrusted data can
# execute arbitrary code.
article_urls = np.load("./data/article_urls.npy", allow_pickle=True).item()

# For every URL, look up a publish date with htmldate and fetch the page with
# lazynlp, grouped by language code
lazy_parsed_articles = defaultdict(list)
for abv, abv_urls in article_urls.items():
  for url in tqdm(abv_urls):
    publish_date = htmldate.find_date(url)
    # NOTE(review): lazynlp.download_page appears to return a
    # (status_code, page) tuple of raw HTML rather than cleaned text --
    # confirm, and unpack/clean the page here if so.
    article_text = lazynlp.download_page(url, timeout=5)
    lazy_parsed_articles[abv].append({"url": url, "date": publish_date, "text": article_text})

# Results are written once, after every URL has been processed
np.save("lazy_articles.npy", lazy_parsed_articles)

Building papers...


100%|██████████| 5/5 [02:18<00:00, 27.78s/it]
100%|██████████| 5/5 [00:14<00:00,  2.85s/it]
100%|██████████| 5/5 [01:21<00:00, 16.29s/it]
