# Reliable News Web Scraping

In [None]:
! nvidia-smi

In [60]:
! pip install requests



In [61]:
# Imports
import requests
import pandas as pd
from bs4 import BeautifulSoup
from typing import List, Tuple

In [63]:
def get_articles_links(page: int) -> List[str]:
    """Gets links to articles from termedia.pl.

    Downloads one listing page of the coronavirus section and collects the
    first link found inside every ``div`` with class ``pl2Pos``.

    Args:
        page (int): Page number of termedia.pl to process.

    Returns:
        links (List[str]): List of absolute links to articles, in the order
            they appear on the page. Entries without a usable link are
            skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    base_url = 'https://www.termedia.pl'
    url = base_url + '/koronawirus/?&p=' + str(page)
    print(f'Processing page {page}: {url}')
    # Without a timeout a single stalled connection would hang the scrape.
    res = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    links = []
    for art in soup.find_all('div', attrs={'class': 'pl2Pos'}):
        # href=True skips anchors without a target; a container with no
        # such anchor is ignored rather than crashing on None['href'].
        anchor = art.find('a', href=True)
        if anchor is None:
            continue
        links.append(base_url + anchor['href'].strip())
    return links


def extract_text_from_article(url: str) -> Tuple[str, str]:
    """Gets the article's title and content.

    The title is read from the first ``div`` with class ``pageTitle`` and the
    content is the space-joined text of every ``div`` with class
    ``articleContent``.

    Args:
        url (str): URL of the article.

    Returns:
        (Tuple [str, str]): Tuple containing:
            title (str): Article's title, or '' if the page has no title
                element.
            text (str): Article's content, or '' if the page has no content
                elements.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a single stalled connection would hang the scrape.
    res = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    res.encoding = 'utf-8'
    soup_art = BeautifulSoup(res.text, 'html.parser')

    # find() returns None when the element is missing; guard it here, since
    # this is the lookup that can actually fail with AttributeError.
    title_div = soup_art.find('div', attrs={'class': 'pageTitle'})
    title = title_div.text if title_div is not None else ''

    # find_all() returns an empty list when nothing matches, so the join
    # simply yields '' and needs no exception handling.
    content_divs = soup_art.find_all('div', attrs={'class': 'articleContent'})
    text = ' '.join(div.text for div in content_divs)
    return title, text

In [None]:
# Scrape every listing page, download each linked article and store the
# results in an Excel file with one row per article.
articles_counter = 0
pages_to_scrap = 144

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. This avoids the repeated reallocation of growing a frame with .loc and
# does not assume a fixed number of articles per page (the former row label
# 10 * (page - 1) + idx overwrote rows whenever a page listed more than ten).
rows = []

for page in range(1, pages_to_scrap + 1):
    for link in get_articles_links(page):
        title, text = extract_text_from_article(link)
        # Every article from this source is labelled 'true'.
        rows.append(['true', title, text, link])
        articles_counter += 1

df = pd.DataFrame(rows, columns=['Verdict', 'Title', 'Text', 'Url'])

# The `encoding` keyword was removed from DataFrame.to_excel in pandas 2.0
# (it was already unused), so it is no longer passed.
df.to_excel('termedia_dataset.xlsx', index=False)
display(df.head())
print(f'Scraped articles in total: {len(df)}')