# Endi.com Web Scraper

## Import Modules
* requests -> to send HTTP/1.1 requests
* bs4/BeautifulSoup -> parsing HTML documents
* itertools -> counter functionality

In [22]:
import requests
from bs4 import BeautifulSoup, element
from itertools import count

## Extract and Save HTML Content

In [23]:
# Section page that lists the security news teasers
news_url = "https://www.elnuevodia.com/noticias/seguridad"

# Download the section page with an HTTP GET request
news_html = requests.get(url=news_url)

## Parse HTML Content

In [24]:
# Parse the downloaded page text with Python's built-in HTML parser
main_soup = BeautifulSoup(markup=news_html.text, features="html.parser")

## Find Articles

In [25]:
# Collect every <article> teaser whose class attribute matches the news teaser layout
articles = main_soup.find_all(
    name="article",
    attrs={"class": "standard-teaser-container condensed-horizontal news"},
)

## Function to Extract Articles Data
* Title
* Path
* url
* Author

In [26]:
def get_artcl_data(article: element.Tag) -> "dict | None":
    """
    Extract one article's teaser data from the endi.com digital newspaper.

    Parameters
    ----------
    article : element.Tag
        a single ``<article>`` teaser tag taken from the section page

    Returns
    -------
    dict or None
        a dictionary with the keys "headline", "url" and "author".
        None is returned when the teaser is not tagged "Seguridad" or
        "Noticias" (silently), or when one of the expected elements is
        missing from the teaser (after printing a message).
    """
    try:
        # Look the section slug up once; only "Seguridad"/"Noticias" teasers are kept
        slug = article.find(name="h5", attrs={"class": "standard-teaser-subheading slug"}).text
        if slug not in ("Seguridad", "Noticias"):
            return None

        # Extract article title
        article_headline = article.find(
            name="h3", attrs={"class": "standard-teaser-headline teaser-headline"}
        ).text

        # Extract article path (first link in the teaser that carries an href)
        article_path = article.find(name="a", href=True)["href"]

        # Extract article author
        article_author = article.find("div", {"class": "authors-byline-text"}).text
    except (AttributeError, TypeError):
        # find() returns None when nothing matches: reading .text on it raises
        # AttributeError and subscripting it with ['href'] raises TypeError
        print("¡No se encontró resultado!")
        return None

    # Build the absolute article url from the site root and the relative path
    article_url = "https://www.elnuevodia.com{path}".format(path=article_path)

    return {"headline": article_headline, "url": article_url, "author": article_author}

## Function to Extract Articles Content
It returns these fields in a new dictionary, which is then merged into the article's dictionary.
* Subheadline
* Date
* Time
* Content

In [27]:
def get_artcl_content(article_data: dict, timeout: float = 30.0) -> dict:
    """
    Download an article page and extract its content fields.

    Parameters
    ----------
    article_data : dict
        article dictionary; only its "url" entry is read
    timeout : float, optional
        seconds to wait for the server before giving up, so a stalled
        connection cannot block the scraper indefinitely (default 30.0)

    Returns
    -------
    dict
        a new dictionary with the keys "headline_sub", "date", "time" and
        "content"; a field whose element is not found in the page is
        returned as an empty string

    Raises
    ------
    KeyError
        when article_data has no "url" entry
    requests.exceptions.RequestException
        propagated from requests.get (connection failure, timeout, ...)
    """
    article_html = requests.get(article_data["url"], timeout=timeout)
    article_soup = BeautifulSoup(article_html.text, "html.parser")

    def _div_text(css_class: str) -> str:
        # find() returns None when the element is absent; fall back to ""
        tag = article_soup.find(name="div", attrs={"class": css_class})
        return tag.text if tag is not None else ""

    headline_sub = _div_text("article-headline__subheadline")

    # The date line is split on "-" into date and time; the time part is
    # optional here so a date without "-" no longer raises IndexError
    date_time_parts = _div_text("article-headline__date").split("-")
    article_date = date_time_parts[0].strip()
    article_time = date_time_parts[1].strip() if len(date_time_parts) > 1 else ""

    # Join the text of every body paragraph into a single string
    paragraphs = article_soup.find_all(name="p", attrs={"class": "content-element"})
    content = " ".join(paragraph.text for paragraph in paragraphs)

    return {"headline_sub": headline_sub, "date": article_date, "time": article_time, "content": content}

## Iterate Articles and Construct Dictionary 

In [28]:
data_handler = {}

# First pass: collect teaser data, keyed by the teaser's 1-based position on the page.
# get_artcl_data returns None for teasers it rejects or cannot parse; those are
# skipped so the second pass never receives None.
for position, article in enumerate(articles, start=1):
    article_data = get_artcl_data(article=article)
    if article_data is not None:
        data_handler[str(position)] = article_data

# Second pass: download each kept article and merge its content fields into its entry
for article_data in data_handler.values():
    article_data.update(get_artcl_content(article_data=article_data))

## Final Dictionary output

In [29]:
# Bare expression: evaluating the name makes the notebook render the collected dictionary
data_handler

{'1': {'headline': 'Informe sobre la investigación de Justicia por excarcelación de feminicida de Manatí estaría listo en septiembreSe espera que el sospechoso, Hermes Ávila Vázquez, sea entrevistado por las fiscales encargadas de emitir el informe',
  'url': 'https://www.elnuevodia.com/noticias/seguridad/notas/informe-sobre-la-investigacion-de-justicia-por-excarcelacion-de-feminicida-de-manati-estaria-listo-en-septiembre/',
  'author': 'Por Richard I. Colón Badillo',
  'headline_sub': 'Se espera que el sospechoso, Hermes Ávila Vázquez, sea entrevistado por las fiscales encargadas de emitir el informe',
  'date': '25 de junio de 2024',
  'time': '4:15 PM',
  'content': 'El secretario de Justicia, Domingo Emanuelli, adelantó que no espera tener hasta mediados de septiembre de este año un informe de parte de las fiscales de la División de Integridad Pública que investigan el manejo de la excarcelación de Hermes Ávila Vázquez por parte del Departamento de Corrección. “Esto es un caso que 

# Testing Code