In [22]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
     ------------------------------------- 211.1/211.1 kB 12.6 MB/s eta 0:00:00
Collecting feedfinder2>=0.0.4
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting feedparser>=5.2.1
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
     ---------------------------------------- 81.3/81.3 kB 4.7 MB/s eta 0:00:00
Collecting jieba3k>=0.35.1
  Downloading jieba3k-0.35.1.zip (7.4 MB)
     ---------------------------------------- 7.4/7.4 MB 16.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tinysegmenter==0.3
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 k

In [32]:
from bs4 import BeautifulSoup
import urllib.request
import newspaper
import requests

# Saving docs

In [71]:
# Countries whose BBC search results are scraped, in processing order.
COUNTRIES = [
    "Somalia",
    "Ethiopia",
    "Kenya",
    "Uganda",
    "South Sudan",
]

In [72]:
from datetime import datetime, timedelta

# Window boundaries as "YYYY-MM-DD" strings; filled in further down this cell
# from the output of get_dates_between.
DATES = []

def get_dates_between(start_date, end_date, interval_months):
    """Return evenly spaced dates from ``start_date`` up to ``end_date``.

    A "month" is approximated as a fixed 30 days, so consecutive entries are
    ``30 * interval_months`` days apart and drift away from calendar-month
    boundaries (e.g. 2015-01-01 is followed by 2015-06-30, not 2015-07-01).

    Parameters
    ----------
    start_date : datetime
        First date of the sequence (always included when <= ``end_date``).
    end_date : datetime
        Inclusive upper bound; no returned date is later than this.
    interval_months : int or float
        Step size in 30-day "months". Must be positive.

    Returns
    -------
    list of datetime
        The generated dates in ascending order; empty if
        ``start_date > end_date``.

    Raises
    ------
    ValueError
        If ``interval_months`` is not positive (the loop would never advance).
    """
    if interval_months <= 0:
        raise ValueError("interval_months must be positive")

    step = timedelta(days=30 * interval_months)
    dates_list = []
    current_date = start_date

    while current_date <= end_date:
        dates_list.append(current_date)
        current_date += step

    return dates_list

# Build the boundary dates from 2015-01-01 up to at most 2024-06-26 and store
# them in DATES as "YYYY-MM-DD" strings.
interval_months = 6
start_date, end_date = datetime(2015, 1, 1), datetime(2024, 6, 26)

dates_between = get_dates_between(start_date, end_date, interval_months)
for date in dates_between:
    DATES += [date.strftime("%Y-%m-%d")]


In [114]:
import requests
from bs4 import BeautifulSoup
from dateparser import parse
from datetime import datetime
import pandas as pd

# Browser-like User-Agent handed to newspaper for article downloads.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

# The notebook imports `newspaper` as a module but never imports the bare name
# `Config`, so the class is reached through the module to avoid a NameError on
# a fresh kernel.
config = newspaper.Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 30  # seconds

def _bbc_published_date(article_html):
    """Return an article's publication date as ``YYYY-MM-DD``, or ``None``.

    The date is read from the ``datePublished`` field of the page's first
    ``application/ld+json`` script tag. ``None`` is returned when the tag is
    missing, its content is not a JSON object, the field is absent, or the
    timestamp matches none of the supported layouts.
    """
    import json  # local import: no notebook-level cell imports json

    ld_tag = BeautifulSoup(article_html, 'html.parser').find("script", {"type": "application/ld+json"})
    if ld_tag is None:
        return None
    try:
        metadata = json.loads("".join(ld_tag.contents))
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(metadata, dict):
        return None

    raw_date = metadata.get('datePublished')
    if not isinstance(raw_date, str):
        return None
    # Accept timestamps both with and without fractional seconds.
    for date_format in ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ'):
        try:
            return datetime.strptime(raw_date, date_format).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return None


def get_bbc_articles(country):
    """Scrape BBC search results for ``country`` and return the article texts.

    Walks search result pages 1-34 (stopping early on the first non-200
    response), follows every promo link whose URL contains ``"news"``,
    downloads and parses it with newspaper using the module-level ``config``,
    and keeps the articles for which a publication date can be extracted.
    Articles that fail to download/parse or have no usable date are skipped
    so that one bad page does not discard everything collected so far.

    Parameters
    ----------
    country : str
        Search term inserted into the BBC search URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``"articles"`` (``"Title: <title>\\n<text>"`` strings) and
        ``"dates"`` (publication dates as ``YYYY-MM-DD`` strings).

    Raises
    ------
    requests.RequestException
        If a search-page request fails or exceeds ``config.request_timeout``.
    """
    base_url = "https://www.bbc.co.uk"
    search_url = f"{base_url}/search?q={country}&seqId=a2619180-339d-11ef-b51b-05b0b4d9a140&d=NEWS_PS"

    articles = []
    dates = []

    # Iterate over search result pages
    for page in range(1, 35):  # Adjust range as needed to cover more pages
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(search_url + f"&page={page}", timeout=config.request_timeout)
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.text, 'html.parser')
        # Adjust the selector based on the actual structure
        for link in soup.find_all('a', class_="ssrcss-its5xf-PromoLink exn3ah91"):
            article_url = link.get("href")
            if not article_url or "news" not in article_url:
                continue
            # Article is reached through the imported `newspaper` module; the
            # bare name is never imported in this notebook.
            article = newspaper.Article(article_url, config=config)
            try:
                article.download()
                article.parse()
            except newspaper.ArticleException as exc:
                print(f"Skipping {article_url}: {exc}")
                continue
            date_published = _bbc_published_date(article.html)
            if date_published is None:
                continue
            articles.append("Title: " + article.title + "\n" + article.text)
            dates.append(date_published)

    return pd.DataFrame(data={"articles": articles, "dates": dates})


In [115]:
import numpy as np

def write_doc(list_text, country, start_date, end_date, chunk_size=10):
    """Write article texts to the working directory in batched ``.txt`` files.

    The texts are split into consecutive batches of at most ``chunk_size``
    items; each batch is joined with seven newlines and written to
    ``<country>_<start_date>_<end_date>_<batch index>.txt`` (UTF-8). Batches
    that contain only whitespace are skipped, but still consume their index.

    Parameters
    ----------
    list_text : list of str
        Article texts to save.
    country : str
        First component of the file name.
    start_date, end_date : str
        Second and third components of the file name.
    chunk_size : int, optional
        Maximum number of texts per file (default 10).

    Returns
    -------
    None
    """
    separator = "\n\n\n\n\n\n\n"
    for i, offset in enumerate(range(0, len(list_text), chunk_size)):
        text_to_save = separator.join(list_text[offset:offset + chunk_size])
        # Skip batches with no visible content instead of creating blank files.
        if not text_to_save.strip():
            continue
        filename = "_".join([country, start_date, end_date, str(i)]) + ".txt"
        # The with-block closes the file; no explicit close() is required.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text_to_save)
        print(f"Saving {filename}")

In [116]:
def doc_pipeline():
    """Scrape each country in COUNTRIES and save its articles per date window.

    For every country the BBC articles are fetched once with
    ``get_bbc_articles``; then, for each pair of consecutive entries in DATES,
    the articles dated after the first and on or before the second are passed
    to ``write_doc``. Windows without any article are skipped.
    """
    for country in COUNTRIES:
        print(f"Running for country {country}")
        articles_df = get_bbc_articles(country)
        # Consecutive (start, end) pairs of the boundary list.
        for start_date, end_date in zip(DATES, DATES[1:]):
            print(f"Running for {country} between {start_date} and {end_date}")
            after_start = articles_df['dates'] > start_date
            up_to_end = articles_df['dates'] <= end_date
            subset_articles_df = articles_df.loc[after_start & up_to_end]
            if subset_articles_df.empty:
                continue
            write_doc(subset_articles_df["articles"].to_list(), country, start_date, end_date)

In [117]:
# Run the full scrape-and-save pipeline for every country in COUNTRIES.
# This issues network requests and writes .txt files to the working directory.
doc_pipeline()

Running for country Somalia
Running for Somalia between 2015-01-01 and 2015-06-30
Running for Somalia between 2015-06-30 and 2015-12-27
Saving Somalia_2015-06-30_2015-12-27_0.txt
Saving Somalia_2015-06-30_2015-12-27_1.txt
Running for Somalia between 2015-12-27 and 2016-06-24
Saving Somalia_2015-12-27_2016-06-24_0.txt
Saving Somalia_2015-12-27_2016-06-24_1.txt
Saving Somalia_2015-12-27_2016-06-24_2.txt
Saving Somalia_2015-12-27_2016-06-24_3.txt
Running for Somalia between 2016-06-24 and 2016-12-21
Saving Somalia_2016-06-24_2016-12-21_0.txt
Saving Somalia_2016-06-24_2016-12-21_1.txt
Saving Somalia_2016-06-24_2016-12-21_2.txt
Running for Somalia between 2016-12-21 and 2017-06-19
Saving Somalia_2016-12-21_2017-06-19_0.txt
Saving Somalia_2016-12-21_2017-06-19_1.txt
Saving Somalia_2016-12-21_2017-06-19_2.txt
Running for Somalia between 2017-06-19 and 2017-12-16
Saving Somalia_2017-06-19_2017-12-16_0.txt
Running for Somalia between 2017-12-16 and 2018-06-14
Saving Somalia_2017-12-16_2018-06-1

Running for South Sudan between 2015-01-01 and 2015-06-30
Saving South Sudan_2015-01-01_2015-06-30_0.txt
Saving South Sudan_2015-01-01_2015-06-30_1.txt
Running for South Sudan between 2015-06-30 and 2015-12-27
Saving South Sudan_2015-06-30_2015-12-27_0.txt
Saving South Sudan_2015-06-30_2015-12-27_1.txt
Saving South Sudan_2015-06-30_2015-12-27_2.txt
Saving South Sudan_2015-06-30_2015-12-27_3.txt
Running for South Sudan between 2015-12-27 and 2016-06-24
Saving South Sudan_2015-12-27_2016-06-24_0.txt
Saving South Sudan_2015-12-27_2016-06-24_1.txt
Running for South Sudan between 2016-06-24 and 2016-12-21
Saving South Sudan_2016-06-24_2016-12-21_0.txt
Saving South Sudan_2016-06-24_2016-12-21_1.txt
Saving South Sudan_2016-06-24_2016-12-21_2.txt
Saving South Sudan_2016-06-24_2016-12-21_3.txt
Running for South Sudan between 2016-12-21 and 2017-06-19
Saving South Sudan_2016-12-21_2017-06-19_0.txt
Saving South Sudan_2016-12-21_2017-06-19_1.txt
Running for South Sudan between 2017-06-19 and 2017-

In [90]:
# Scrape the Somalia search results once so they can be re-sliced by date
# in a later cell without downloading them again.
articles_df = get_bbc_articles("Somalia")

In [111]:
# Re-export a single date window for one country from the already scraped
# `articles_df`, reusing write_doc instead of duplicating its batching logic.
start_date = "2022-05-24"
end_date = "2022-11-20"
country = "Somalia"
print(f"Running for {country} between {start_date} and {end_date}")
# Dates are "YYYY-MM-DD" strings, so string comparison orders them correctly.
in_window = (articles_df['dates'] > start_date) & (articles_df['dates'] <= end_date)
subset_articles_df = articles_df.loc[in_window]
# Only write when the window has articles; previously an empty window left
# `list_text` unbound (or stale from an earlier run) before it was used.
if not subset_articles_df.empty:
    write_doc(subset_articles_df["articles"].to_list(), country, start_date, end_date)

Running for Somalia between 2022-05-24 and 2022-11-20
Saving Somalia_2022-05-24_2022-11-20_0.txt
Saving Somalia_2022-05-24_2022-11-20_1.txt
