In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Search terms submitted to the prensalibre search page; crawler() runs one
# paginated query per term.
query_words = {'ciberseguridad', 'seguridad informatica', 'filtran datos', 'sitio web', 'cibercrimen'}

# Keywords to look for inside the result pages. crawler() declares this
# argument as a set, so it is created with set(): a bare `{}` literal would
# build an empty dict instead.
key_words = set()


def make_url(page, search_string):
    """Build the prensalibre search URL for a query string and a result page.

    Pages start on 1; per the original note, the site answers with code 404
    once the page number runs past the last page of results.

    parameters:
        page(int): number of page
        search_string(str): query string to be used
    returns:
        str: resulting url
    """
    # Local import so the function stays self-contained.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into "+" (same as the plain replace it
    # supersedes) and also percent-encodes characters such as "&", "=", "#"
    # or accented letters that would otherwise corrupt the query string.
    query_string = quote_plus(str(search_string))
    return f"https://www.prensalibre.com/page/{page}/?s={query_string}"

# https://www.prensalibre.com/?s=

def crawler(query_words: set, key_words: set, limit=1):
    """Crawl prensalibre search results and collect data about every article found.

    For each query word, result pages 1..limit are requested through the URL
    built by make_url, and the article links on each page are gathered with
    get_results. Paging for a query stops at the first 404 response. Every
    gathered article URL is then downloaded and summarised with analyse_page.

    parameters:
        query_words(set): words to be queried on the web page
        key_words(set): key words to be searched inside the page results
            (not used by the current implementation; kept so existing
            callers keep working)
        limit(int): limit of pages per query
    returns:
        pd.DataFrame: one row per downloaded article, holding the fields
            returned by analyse_page plus an 'html' column with the raw
            page source
    """
    queries_urls = []

    for query_word in query_words:
        page_num = 1  # result pages are 1-indexed
        while page_num <= limit:
            response = requests.get(make_url(page_num, query_word))
            # A 404 means we ran past the last page of results for this query.
            if response.status_code == 404:
                break
            soup = BeautifulSoup(response.text, "html.parser")
            queries_urls.extend(get_results(soup))
            page_num += 1
    print(queries_urls)

    data = []
    for url in queries_urls:
        # Links without a usable href cannot be requested.
        if not url:
            continue
        response = requests.get(url)
        # Skip articles that no longer exist instead of analysing the error
        # page (the former bare `next` expression here did nothing).
        if response.status_code == 404:
            continue
        html = response.text
        page_data = analyse_page(BeautifulSoup(html, "html.parser"))
        page_data['html'] = html  # keep the raw html alongside the parsed fields
        data.append(page_data)

    return pd.DataFrame(data)
            
            

def get_results(soups):
    """Get the list of article links from a prensalibre search-results page.

    parameters:
        soups(BeautifulSoup): parsed html of the prensalibre query webpage
    returns:
        list: href of every link found inside an <h1> element carrying the
            "story-title" class, in document order
    """
    result_list = []

    # find_all takes the attribute filter as its second argument, so the
    # class restriction is really applied here. With select() the dict was
    # received as a namespace map and every <h1> on the page was matched.
    # Searching the parsed tree directly also avoids serialising the matches
    # and re-parsing them with an unspecified parser.
    for heading in soups.find_all("h1", {"class": "story-title"}):
        # href=True skips anchors without an href, which would otherwise
        # put None entries in the result.
        for link in heading.find_all("a", href=True):
            result_list.append(link.get("href"))

    return result_list

def analyse_page(html_soup):
    """Extract the metadata of a single prensalibre article page.

    parameters:
        html_soup(BeautifulSoup): parsed html of the prensalibre single news webpage
    returns:
        dict: with the keys
            "publishing_date": "datetime" attribute of the first <time> tag,
                or None when there is no such tag/attribute
            "author": text of the first <span class="author vcard">, or None
            "tags_archive": upper-cased text of every link inside the first
                <div class="tag-list-podcast">; empty list when absent
            "title": the first <h1> tag with class "plus-title" or
                "sart-title" (the tag object itself, not its text), or None
    """
    # Publishing date
    time_tag = html_soup.find("time")
    publishing_date = time_tag.get("datetime") if time_tag is not None else None

    author_tag = html_soup.find("span", {'class': 'author vcard'})
    author = author_tag.text if author_tag is not None else None

    tags = html_soup.find("div", {'class': 'tag-list-podcast'})
    if tags is not None:
        tags_archive = [t.text.upper() for t in tags.find_all('a')]
    else:
        tags_archive = []

    # A tuple (not a set) fixes the lookup priority, so the chosen title is
    # the same on every run when a page carries both classes.
    title = None
    for css_class in ('plus-title', 'sart-title'):
        title = html_soup.find("h1", {"class": css_class})
        if title is not None:
            break

    return {
        "publishing_date": publishing_date,
        "author": author,
        "tags_archive": tags_archive,
        "title": title,
    }
    
    

if __name__ == "__main__":
    # Run the crawl over every configured query word, 10 result pages at
    # most per word. `df` is bound at module level so the following cells
    # can use it.
    df = crawler(
        limit=10,
        key_words=key_words,
        query_words=query_words,
    )

In [None]:
# Print a summary of the crawled DataFrame to check what was collected.
df.info()

In [None]:
# Save the crawl results to info.csv in the working directory, leaving out
# the DataFrame index.
df.to_csv('info.csv', index=False)
