# Endi.com Web Scraper

## Import Modules
* requests -> to send HTTP/1.1 requests
* pandas -> loading the municipalities list
* bs4/BeautifulSoup -> parsing HTML documents
* itertools -> counter functionality
* spacy -> NLP
* unidecode -> unicode to ASCII characters

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, element
from itertools import count
import spacy
from unidecode import unidecode


## Variables

In [2]:
# Listing page of the "seguridad" news section that the scraper starts from
news_url = (
    "https://www.elnuevodia.com"
    "/noticias/seguridad"
)

## Extract and Save HTML Content

In [3]:
# GET request to the base url. Without an explicit timeout `requests` waits
# indefinitely, so a stalled server would hang the notebook.
news_html = requests.get(news_url, timeout=10)

## Parse HTML Content

In [4]:
# Parse the downloaded listing page with Python's built-in HTML parser
main_soup = BeautifulSoup(markup=news_html.text, features='html.parser')

## Find Articles

In [5]:
# Collect every <article> teaser whose class attribute matches the listing layout
articles = main_soup.find_all(
    name='article',
    attrs={'class': 'standard-teaser-container condensed-horizontal news'},
)

## Function to Print Better Dictionaries

In [6]:
def pretty_print_dict(d):
    """
    Render a dictionary of dictionaries as an indented, human-readable string.

    Each outer key is written on its own line followed by ``": "``; every
    inner key/value pair follows on a line indented by four spaces.
    """
    # Adapted from: https://www.geeksforgeeks.org/python-pretty-print-a-dictionary-with-dictionary-value/
    lines = []
    for outer_key, inner in d.items():
        # Header line for the outer entry
        lines.append(f'{outer_key}: \n')
        # One indented line per inner key/value pair
        lines.extend(f'    {inner_key}: {inner[inner_key]}\n' for inner_key in inner)
    return ''.join(lines)

## Function to Extract Articles Data
* Title
* Path
* url
* Author

In [7]:
def get_artcl_data(article: "element.Tag") -> "dict | None":
    """
    Extract the teaser data of one article from the endi.com listing page.

    Parameters
    ----------
    article : element.Tag
        A single ``<article>`` teaser tag taken from the parsed listing page.

    Returns
    -------
    dict or None
        A dictionary with the keys ``headline``, ``url`` and ``author``.
        ``None`` is returned (after printing a message) when any of the
        expected elements is missing from the teaser.
    """
    try:
        # Extract article title
        article_headline = article.find(
            name='h3', attrs={'class': 'standard-teaser-headline teaser-headline'}
        ).text

        # Extract article path (first anchor that carries an href)
        article_path = article.find(name="a", href=True)['href']

        # Build the absolute article url from the site root and the path
        article_url = f"https://www.elnuevodia.com{article_path}"

        # Extract article author
        article_author = article.find("div", {"class": "authors-byline-text"}).text
    except (AttributeError, TypeError, KeyError):
        # `find` returns None when nothing matches, so the attribute access or
        # subscript above fails. Only those lookup failures are treated as
        # "not found"; anything else (e.g. KeyboardInterrupt) propagates.
        print("¡No se encontró resultado!")
        return None

    # Return dictionary with data
    return {"headline": article_headline, "url": article_url, "author": article_author}

## Function to Extract Articles Content
Also, the returned fields are merged into the article's dictionary by the caller.
* Subheadline
* Date
* Time
* Content

In [8]:
def get_artcl_content(article_data: dict, timeout: float = 10) -> dict:
    """
    Download an article page and extract its sub-headline, date, time and body.

    Parameters
    ----------
    article_data : dict
        Article dictionary; only its ``url`` key is read.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    dict
        A dictionary with the keys ``headline_sub``, ``date``, ``time`` and
        ``content``. A field whose element is not present in the page is
        returned as an empty string instead of raising.
    """
    # Without a timeout `requests` can block forever on an unresponsive server
    article_html = requests.get(article_data["url"], timeout=timeout)

    article_soup = BeautifulSoup(article_html.text, "html.parser")

    def _div_text(css_class: str) -> str:
        # `find` returns None when no element matches; fall back to "" so one
        # oddly structured article does not abort the whole scrape
        node = article_soup.find(name="div", attrs={"class": css_class})
        return node.text if node is not None else ""

    headline_sub = _div_text("article-headline__subheadline")

    # The date element is split on "-": first part is the date, second the time.
    # NOTE(review): the captured time can carry trailing text from nested
    # elements (e.g. "6:15 AMActualizado el ..."); it is kept as-is here.
    date_time_parts = _div_text("article-headline__date").split("-")
    article_date = date_time_parts[0].strip()
    article_time = date_time_parts[1].strip() if len(date_time_parts) > 1 else ""

    # Join the text of every body paragraph into one string
    paragraphs = article_soup.find_all(name="p", attrs={"class": "content-element"})
    content = " ".join(paragraph.text for paragraph in paragraphs)

    return {"headline_sub": headline_sub, "date": article_date, "time": article_time, "content": content}

## Iterate Articles and Construct Dictionary 

In [9]:
data_handler = {}
stats_handler = {}

# Extract the teaser data of every article. `get_artcl_data` returns None when
# a teaser lacks an expected element; those are dropped here because the
# content step below needs a dictionary with a "url" key.
extracted_articles = [get_artcl_data(article=article) for article in articles]
valid_articles = [data for data in extracted_articles if data is not None]

# Keys are consecutive string numbers starting at "1" (no gaps for skipped teasers)
for position, article_data in enumerate(valid_articles, start=1):
    data_handler[str(position)] = article_data

# Download each article page and merge its fields into the same dictionary
for article_data in data_handler.values():
    article_data.update(get_artcl_content(article_data=article_data))

## Load Data of Municipalities of Puerto Rico

In [10]:
# Load the municipality names from the local resource file into a
# single-column DataFrame
# (alternative source: https://en.wikipedia.org/wiki/Pueblos_in_Puerto_Rico#List_of_Pueblos)
municipalities_df = pd.read_csv(
    "resources/puerto_rico_municipalities.txt",
    sep=" ",
    header=None,
    names=["municipality"],
)

# Transliterate every name to plain ASCII so it can be compared with the
# unidecoded article text
municipalities_list = list(map(unidecode, municipalities_df["municipality"]))

## Configure spaCy model

In [11]:
# Load the small Spanish spaCy pipeline used for named-entity recognition
npl = spacy.load(name="es_core_news_sm")

## Add municipalities to data handler
!!!   Need to package this into a function   !!!

In [12]:
# Build the lookup set once instead of on every iteration
known_municipalities = set(municipalities_list)


def _find_municipalities(text):
    """Return the known municipalities that the spaCy model tags as LOC in `text`."""
    doc = npl(unidecode(text))
    # Entities are lower-cased with spaces replaced by underscores before the
    # comparison -- presumably the format used in the municipalities file.
    found = {ent.text.lower().replace(" ", "_") for ent in doc.ents if ent.label_ == "LOC"}
    return found & known_municipalities


# Iterate over every article in data_handler
for article_data in data_handler.values():
    if not article_data:
        # Nothing was extracted for this entry
        continue

    # Municipalities mentioned in the article body take precedence ...
    municipalities_validated = _find_municipalities(article_data.get("content", ""))
    if not municipalities_validated:
        # ... and the headline is only consulted when the body yields none.
        # (The headline text is read from "headline"; reading "content" twice
        # made this fallback a no-op.)
        municipalities_validated = _find_municipalities(article_data.get("headline", ""))

    article_data.update({"pueblos": list(municipalities_validated)})

## Final Data

In [13]:
# Render the collected articles as indented text and display them
final_report = pretty_print_dict(data_handler)
print(final_report)

1: 
    headline: Una mujer muerta y otra herida luego de ser tiroteadas en IsabelaLa policía encontró a las víctimas en el interior de un auto volcado y baleado en la urbanización Medina 
    url: https://www.elnuevodia.com/noticias/seguridad/notas/una-mujer-muerta-y-otra-herida-luego-de-ser-tiroteadas-en-isabela/
    author: Por Agustín Criollo Oquero
    headline_sub: La policía encontró a las víctimas en el interior de un auto volcado y baleado en la urbanización Medina 
    date: 28 de junio de 2024
    time: 6:15 AMActualizado el 28 de junio de 2024
    content: Una mujer resultó muerta y otra herida de bala luego de que el automóvil en el que viajaban fue tiroteado en una urbanización en el municipio de Isabela, informó el Negociado de la Policía. Según el informe preliminar, a las 10:16 de la noche del jueves se registraron disparos frente a la residencia número 15, de la calle 12, en la urbanización Medina del mencionado municipio. Al llegar al lugar, los agentes encontraron u

# Testing Code