# BBC Health Crawler

In [1]:
from urllib.parse import urljoin

import requests
import tiktoken
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from tqdm import tqdm

parent_url = 'https://www.bbc.com/news/health'
urls = []
# Collect article links from listing pages 1-10 of the BBC Health section.
for num in tqdm(range(1, 11)):
    # A timeout keeps a single stalled connection from hanging the whole crawl.
    res = requests.get(parent_url, params={'page': num}, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero links.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    # NOTE(review): this class string looks build-generated and may change when
    # the site is redeployed; if no links are found, re-check the selector.
    for li in soup.find_all('li', class_='ssrcss-qtbdxl-StyledListItem e1d6xluq3'):
        a = li.find('a')
        if a is not None and 'href' in a.attrs:
            # urljoin resolves root-relative hrefs against the site and leaves
            # already-absolute hrefs intact (plain concatenation would mangle them).
            urls.append(urljoin(parent_url, a['href']))

# text_splitter = CharacterTextSplitter(
#     separator='\n\n',
#     chunk_size = 3000,
#     chunk_overlap = 300,
#     length_function = len,
#     is_separator_regex=False
# )

# Drop duplicate links while keeping first-seen order. A plain set() would
# reorder the URLs differently in every kernel session (string hashing is
# randomised), making the crawl order and the preview below non-reproducible.
urls = list(dict.fromkeys(urls))

# Chunk by token count rather than by character count.
tokenizer = tiktoken.get_encoding('cl100k_base')
def token_length(text: str) -> int:
    """Return the number of cl100k_base tokens in ``text``.

    Args:
        text: The string to measure.

    Returns:
        The length of the token sequence produced by ``tokenizer``.
    """
    # disallowed_special=() makes tiktoken encode strings such as
    # "<|endoftext|>" as ordinary text. With the default setting, encode()
    # raises ValueError if scraped text happens to contain such a literal.
    return len(tokenizer.encode(text, disallowed_special=()))

# Chunking budget. Both values are measured with token_length, i.e. in tokens.
CHUNK_SIZE_TOKENS = 300
CHUNK_OVERLAP_TOKENS = 30

text_splitter = RecursiveCharacterTextSplitter(
    length_function=token_length,
    chunk_size=CHUNK_SIZE_TOKENS,
    chunk_overlap=CHUNK_OVERLAP_TOKENS,
)

# Download every article page and split it into chunks with text_splitter.
all_articles = []
failed_urls = []
for url in tqdm(urls):
    try:
        docs = WebBaseLoader(url).load_and_split(text_splitter)
    except requests.RequestException as exc:
        # One transient network error should not throw away minutes of
        # already-downloaded articles; record it and move on.
        failed_urls.append((url, exc))
        continue
    all_articles.extend(docs)

# Surface skipped pages explicitly so an incomplete crawl is never silent.
if failed_urls:
    print(f'{len(failed_urls)} of {len(urls)} URLs failed to load:')
    for failed_url, exc in failed_urls:
        print(f'  {failed_url}: {exc!r}')

# Preview the first chunk as a sanity check.
all_articles[0]


USER_AGENT environment variable not set, consider setting it to identify your requests.
100%|██████████| 10/10 [00:07<00:00,  1.29it/s]
100%|██████████| 230/230 [05:04<00:00,  1.32s/it]


Document(metadata={'source': 'https://www.bbc.com/news/articles/c0kj1pmr7jdo', 'title': "Covid-19 vaccine: Omagh man wants help after 'life ruined' by jab", 'description': 'Larry Lowe used to have an active life but says everyday tasks are now beyond him.', 'language': 'en-GB'}, page_content="Covid-19 vaccine: Omagh man wants help after 'life ruined' by jabSkip to contentBritish Broadcasting CorporationHomeNewsUS ElectionSportBusinessInnovationCultureArtsTravelEarthVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifyUS ElectionElection pollsKamala HarrisDonald TrumpJD VanceTim WalzSportBusinessExecutive LoungeTechnology of BusinessWomen at the HelmFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArts

In [2]:
import json

# Keep only the text and metadata of each chunk, as plain dicts.
bbc_health_news = [
    {'page_content': doc.page_content, 'metadata': doc.metadata}
    for doc in all_articles
]

# json.dump escapes non-ASCII characters by default, so the bytes written are
# the same as before; the explicit encoding only removes the dependence on the
# platform's locale default.
with open('bbc_health_news.json', 'w', encoding='utf-8') as json_file:
    json.dump(bbc_health_news, json_file, indent=4)

In [5]:
# Reload the saved chunks from disk. JSON text is UTF-8, so decode it as such
# rather than with whatever the platform's locale default happens to be.
with open('bbc_health_news.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
# Number of records loaded from bbc_health_news.json.
len(data)

1182

In [8]:
# Inspect the first loaded record.
data[0]

{'page_content': "Covid-19 vaccine: Omagh man wants help after 'life ruined' by jabSkip to contentBritish Broadcasting CorporationHomeNewsUS ElectionSportBusinessInnovationCultureArtsTravelEarthVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifyUS ElectionElection pollsKamala HarrisDonald TrumpJD VanceTim WalzSportBusinessExecutive LoungeTechnology of BusinessWomen at the HelmFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable Bu