In [1]:
import time
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

In [None]:
start_date = "2/22/2024"
end_date = "2/28/2024"

In [2]:
def generate_dates_m_d_Y_compatible(start_date, end_date):
    start = datetime.strptime(start_date, "%m/%d/%Y")
    end = datetime.strptime(end_date, "%m/%d/%Y")
    step = timedelta(days=1)
    
    date_list = []
    while start <= end:
        date_str = f"{start.month}/{start.day}/{start.year}"
        date_list.append(date_str)
        start += step
    
    return date_list


def get_search_results(url, retries=5, sleep_time=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
     }
    
    for attempt in range(retries):
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            print(response)
            soup = BeautifulSoup(response.text, 'html.parser')
            search_results = soup.find_all('a')
            
            links = [link['href'] for link in search_results if link.has_attr('href')]
            links = [x for x in links if 'https://www.bbc.com/news/' in x]
            links = [x for x in links if '/live/' not in x]
            links = [x.replace('/url?q=', '') for x in links]
            links = [x.split('&')[0] for x in links]
            links = [x for x in links if x[-1].isdigit()]
            
            return links
        else:
            print(f'Response status code {response.status_code}. Retrying in {sleep_time} seconds...')
            time.sleep(sleep_time)
    
    print("Failed to retrieve data after multiple attempts.")
    return []

In [3]:
url = 'https://www.google.com/search?q=news+site:bbc.com/news&tbs=cdr:1,cd_min:<DATE>,cd_max:<DATE>&tbm=nws&start=<START>'
max_page_num = 100
sleep_time = 0.5

dates = generate_dates_m_d_Y_compatible(start_date, end_date)

url_list = []
for date in dates:
    for start in [str(x * 10) for x in range(max_page_num)]:
        request_url = url.replace('<START>', start).replace('<DATE>', date)
        search_links = get_search_results(request_url)
        url_list += search_links
        print('-' * 100)
        print(f'date:{date} start:{start} total:{len(set(url_list))}')
        for link in search_links:
            print(link)
        if len(search_links) == 0:
            break
        time.sleep(sleep_time)

<Response [200]>
----------------------------------------------------------------------------------------------------
date:2/22/2024 start:0 total:10
https://www.bbc.com/news/science-environment-68374520
https://www.bbc.com/news/world-middle-east-68375460
https://www.bbc.com/news/world-asia-india-68366859
https://www.bbc.com/news/world-asia-india-68378060
https://www.bbc.com/news/world-asia-india-68366869
https://www.bbc.com/news/entertainment-arts-68338730
https://www.bbc.com/news/world-europe-68374769
https://www.bbc.com/news/world-australia-68377379
https://www.bbc.com/news/world-68367482
https://www.bbc.com/news/business-68377742
<Response [200]>
----------------------------------------------------------------------------------------------------
date:2/22/2024 start:10 total:20
https://www.bbc.com/news/technology-68368439
https://www.bbc.com/news/world-europe-68374811
https://www.bbc.com/news/world-europe-68368372
https://www.bbc.com/news/uk-68367973
https://www.bbc.com/news/busine

In [4]:
url_list = set(url_list)
len(url_list)

1370

In [5]:
def extract_content_from_url(url):
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.HTTPError as http_err:
        return f"HTTP error occurred: {http_err}"
    except Exception as err:
        return f"An error occurred: {err}"

    soup = BeautifulSoup(response.text, 'lxml')
    
    main_content = soup.find('main')
    content_list = []

    if main_content:
        
        pattern = re.compile(r'(text-block|subheadline-block)')

        content_divs = main_content.find_all('div', attrs={'data-component': pattern})

        for div in content_divs:
            content = ' '.join(div.stripped_strings)
            content_list.append(content)
    
    joined_string = '\n'.join(content_list)

    return joined_string


In [10]:
def fetch_url(url):
    extracted_contents = extract_content_from_url(url)
    if len(extracted_contents) > 100:
        return extracted_contents
    else:
        return None

all_news = []

with ThreadPoolExecutor(max_workers=10) as executor:
    
    future_to_url = {executor.submit(fetch_url, url): url for url in url_list}
    
    for future in tqdm(as_completed(future_to_url), total=len(url_list)):
        url = future_to_url[future]
        try:
            content = future.result()
            if content:
                print('-' * 100)
                print(content)
                all_news.append(content)
            else:
                print('-' * 100)
                print('length < 100')
        except Exception as exc:
            print(f'{url} generated an exception: {exc}')
            

  0%|          | 0/1370 [00:00<?, ?it/s]

----------------------------------------------------------------------------------------------------
An error occurred: HTTPSConnectionPool(host='www.bbc.com', port=443): Max retries exceeded with url: /news/av/uk-wales-68372713 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:992)')))
----------------------------------------------------------------------------------------------------
Part of the A1 in West Yorkshire has reopened after a crash involving an HGV which went across all lanes of the carriageway.
Diversions were put in place on the northbound stretch between junctions 39 for Upton and 40 for Darrington on Friday morning.
National Highways said the HGV involved in the crash was recovered and the area was reopened at around 08:50 GMT.
Motorists were earlier warned of delays and urged to allow extra time.
Follow BBC Yorkshire on Facebook , X (formerly Twitter) and Instagram . Send your story ideas to yorkslincs.news@bbc.co.uk .
-----------------

In [11]:
all_news_cut = [x[:5000] for x in all_news]
all_news_cut = all_news_cut[-1000:]
len(all_news_cut), max([len(x) for x in all_news_cut]), min([len(x) for x in all_news_cut])

(1000, 5000, 106)

In [12]:
import json

path = 'bbc_news_240222_to_240228.json'

with open(path, 'w') as f:
    json.dump(all_news_cut, f, ensure_ascii=True, indent=4)

In [13]:
with open(path, 'r') as f:
    check = json.load(f)
check

['A lawyer for Russian opposition leader Alexei Navalny, who died in prison earlier this month, has been briefly detained in Moscow.\nRussian media said Vasily Dubkov was held for "violating public order".\nRussian officials have not confirmed that he was arrested nor the reason why. But Mr Dubkov told news outlet Verstka he was freed later on Tuesday.\nMr Dubkov accompanied Navalny\'s mother to the Arctic prison colony where he died on 16 February.\nIn October 2023, other lawyers for Navalny - Vadim Kobzev, Igor Sergunin, and Aleksei Lipster - were arrested on charges of "extremism".\nIn January, Olga Mikhailova, another lawyer for the opposition leader, said she had been charged with the same crime and decided to remain in exile.\nRussian authorities banned the Anti-Corruption Foundation, the organisation led by Navalny, for "extremism" in 2021.\nThe opposition leader\'s body was held by prison authorities for more than a week following his death. His mother, Lyudmila Navalnaya, trav