# Web Scraping of Portuguese News

## Tools

In [9]:
#Import modules
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import openpyxl
import json

# Read the project settings from config.json (resolved against the working directory)
with open("config.json", "r") as config_file:
    config = json.loads(config_file.read())

# Pick the project root: the "dir" entry of the config for local runs, or a path
# derived from the working directory when the GITHUB_ACTIONS variable is set
if not os.getenv("GITHUB_ACTIONS"):
    base_dir = config["dir"]
else:
    repo_name = "Analysis-of-Portuguese-news"
    two_levels_up = os.path.join(os.getcwd(), "../..", repo_name)
    base_dir = os.path.abspath(two_levels_up)

# Full path of the workbook that the scraped news is written to
filename = os.path.join(base_dir, config["data"]["news"])

# Create the folder that will hold the workbook if it is not there yet
data_dir = os.path.dirname(filename)
os.makedirs(data_dir, exist_ok=True)

print(f"Saving to {filename}")

Saving to /Users/freitas/Desktop/Projetos/Analysis-of-Portuguese-news/Data/news.xlsx


## Web Scraping

### URL extraction

In [10]:
# Request headers: a browser-like User-Agent plus a contact address ("From")
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    "From": "miguel_freitas_12@hotmail.com"
}

# Today's date as a YYYY-MM-DD string; used below to keep only today's articles
good_morning = datetime.now().strftime("%Y-%m-%d")

# The base URL of the news site
base_url = "https://sicnoticias.pt"

# Endpoint of the specific section (empty string = front page)
section_endpoint = ""

# Combine the base URL with the endpoint of the section
url = base_url + section_endpoint

# Every link found inside the article lists, as absolute URLs
full_urls = []

# Send a GET request to the URL. The timeout (in seconds) prevents the cell from
# hanging forever on an unresponsive server, and connection problems are reported
# instead of aborting the notebook with a traceback.
try:
    response = requests.get(url, headers=headers, timeout=30)
except requests.RequestException as request_error:
    response = None
    print(f"Failed to retrieve content: {request_error}")

if response is not None and response.status_code == 200:

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Walk every <ul class="list-articles"> and every <a href=...> inside it
    for article_list in soup.find_all("ul", class_="list-articles"):
        for link in article_list.find_all("a", href=True):
            href = link.get("href")

            # Prefix the base URL when the href is a site-relative path
            full_url = base_url + href if href.startswith("/") else href
            full_urls.append(full_url)

elif response is not None:
    print(f"Failed to retrieve content: Status code {response.status_code}")

# Keep only URLs longer than 90 characters that contain today's date, for purposeful links
dated_urls = [candidate for candidate in full_urls if len(candidate) > 90 and good_morning in candidate]

# Remove duplicates while keeping the order in which the links appear on the page,
# so repeated runs process the articles in a reproducible order
intended_urls = list(dict.fromkeys(dated_urls))

### Information extraction

In [11]:
def _extract_title(soup):
    """Return the text of the first <h1 class="title"> tag, or "" when there is none."""
    h1_tag = soup.find("h1", class_="title")
    return h1_tag.text if h1_tag is not None else ""


def _extract_description(soup):
    """Return the article lead text, or "" when no lead can be found.

    The lead is looked up in two places, in this order:
    1. the first <p class="g-article-lead lead"> tag;
    2. inside <div class="g-article-lead lead"> tags: the first <p> that contains
       <span> tags, with the span texts joined together (this keeps leads that mix
       bold and regular spans in one piece).
    """
    lead_paragraph = soup.find("p", class_="g-article-lead lead")
    if lead_paragraph is not None:
        return lead_paragraph.text

    for lead_div in soup.find_all("div", class_="g-article-lead lead"):
        for paragraph in lead_div.find_all("p"):
            span_texts = [span_tag.text for span_tag in paragraph.find_all("span")]
            if span_texts:
                return "".join(span_texts)

    return ""


def _extract_category(soup):
    """Return the text of the first <p class="category"> tag, or "" when there is none."""
    category_tag = soup.find("p", class_="category")
    return category_tag.text if category_tag is not None else ""


# One entry per successfully downloaded article; the three lists stay index-aligned
# because each article contributes exactly one value to each of them
titles = []
descriptions = []
categories = []

# Loop through each url found
for each_link in intended_urls:

    # Request the article with the same headers as the front-page request and a
    # timeout (seconds) so a stalled connection cannot block the whole run
    try:
        response = requests.get(each_link, headers=headers, timeout=30)
    except requests.RequestException as request_error:
        print(f"Skipping {each_link}: {request_error}")
        continue

    # Articles that cannot be retrieved are skipped entirely (no partial rows)
    if response.status_code != 200:
        print(f"Skipping {each_link}: Status code {response.status_code}")
        continue

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Missing fields are stored as "" so that every article yields one full row
    titles.append(_extract_title(soup))
    descriptions.append(_extract_description(soup))
    categories.append(_extract_category(soup))

print(f"{len(titles)} titles found!\n{len(categories)} categories found!\n{len(descriptions)} descriptions found!")

# Sanity check: the lists are built in lockstep, so they must have equal lengths.
# (Dropping empty descriptions to "repair" a mismatch would pair the remaining
# descriptions with the wrong titles, so no such repair is attempted.)
assert len(titles) == len(descriptions) == len(categories), "titles, descriptions and categories are out of sync"

40 titles found!
40 categories found!
40 descriptions found!


### Storing data in an Excel file

In [12]:
# Build today's rows; the scalar date string is repeated on every row
news = pd.DataFrame({"date": good_morning, "category": categories, "title": titles, "description": descriptions})

try:
    # Whether a workbook from a previous run is already on disk
    workbook_exists = os.path.isfile(filename)

    if workbook_exists:

        # Read existing data
        existing_data = pd.read_excel(filename, sheet_name = "Sheet1")

        # Empty cells are read back as NaN, while freshly scraped rows use "" for a
        # missing field. Normalise the stored rows to "" so that a re-scraped row
        # with an empty field is recognised as a duplicate of its stored copy.
        existing_data = existing_data.fillna({"category": "", "title": "", "description": ""})

        # Append new data
        combined_data = pd.concat([existing_data, news], ignore_index=True)

        # Remove duplicates in case the program is run more than once per day
        combined_data = combined_data.drop_duplicates()

    else:
        # If file does not exist, use new data as the combined data
        combined_data = news

    # Append mode (with sheet replacement) only works on an existing workbook;
    # on the very first run the file has to be created in write mode instead
    if workbook_exists:
        writer_options = {"mode": "a", "if_sheet_exists": "replace"}
    else:
        writer_options = {"mode": "w"}

    # Write combined data back to Excel
    with pd.ExcelWriter(filename, engine="openpyxl", **writer_options) as writer:
        combined_data.to_excel(writer, sheet_name = "Sheet1", index=False)

    print(f"News from {good_morning} downloaded!\n{len(titles)} new headlines added to the file.\n{len(combined_data)} headlines in total.")

# Report (rather than raise) any error while reading or writing the Excel file
except Exception as e:
    print(f"An error occurred: {e}")

News from 2025-01-26 downloaded!
40 new headlines added to the file.
11775 headlines in total.
