In [18]:
# Web scraping libraries.
import requests
from bs4 import BeautifulSoup

# Sentiment analysis libraries.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import mtranslate

# System libraries.
import sys
import json
import os
import urllib
import datetime

# Lithops.
from threading import Thread
from lithops.multiprocessing import Pool
from lithops import Storage

# HTTP headers sent with every scraping request (browser-like user agent).
header = { 'user-agent':'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0' }

# Topic currently being searched; assigned by regularSearch() and read by the
# scrapers both to build search URLs and as the storage key prefix.
SEARCH_KEY = ''
# Storage client handle; assigned by regularSearch() once the scrapers finish.
STORAGE = None
# Name of the bucket where scraped news are stored and read back from.
BUCKET = 'news-bucket'

# Number of results requested from the Diari de Tarragona search endpoint
# (sent as its `size` parameter): higher values return more results.
DEFAULT_SEARCH_RESULTS = 200

In [19]:
###############################################
# WEB SCRAPER FOR THE WEBSITE WWW.CCMA.CAT    #
###############################################
def ccma_process_news(link):
    """Scrape one www.ccma.cat article, score its sentiment and store it as JSON.

    The function is mapped over article links by ``ccma_query`` through a
    Lithops ``Pool``, so a failure on a single article must not raise:
    anything that cannot be scraped is skipped instead.

    Args:
        link: Absolute URL of the article page.

    Returns:
        1 when the article was parsed and stored, 0 when it was skipped
        (request failure, unexpected page layout or translation failure).
    """
    # Any request failure (not only redirect loops) skips this article instead
    # of propagating out of the worker.
    try:
        r = requests.get(link, headers=header)
    except requests.exceptions.RequestException:
        return 0

    soup = BeautifulSoup(r.text, 'html.parser')
    frame = soup.find("div", class_='span8')
    if frame is None:
        return 0

    # Mandatory elements: without them the page is not a regular news article.
    title_tag = frame.find("h1", class_='titol')
    date_tag = frame.find("time", class_='data')
    body_frame = frame.find("div", class_='R-itemNotiCos')
    if title_tag is None or date_tag is None or body_frame is None:
        return 0

    news_format = {}

    # Put news link.
    news_format['link'] = link

    # Get news header.
    news_format['title'] = title_tag.text

    # Get news starter (optional element).
    starter_tag = frame.find("h2", class_='entradeta')
    starter = starter_tag.text if starter_tag is not None else ''
    news_format['starter'] = starter.replace("\n","").replace("\t","")

    # Get news date: only the first space-separated token of the <time> text.
    news_format['date'] = date_tag.text.split(" ")[0]

    # Get news paragraphs (each paragraph is prefixed with a single space).
    body = "".join(" "+paragraph.text for paragraph in body_frame.find_all("p"))
    news_format['body'] = body.replace("\n","").replace("\t","")

    # Get news sentiment analysis: VADER is fed the text translated to English.
    analyzer = SentimentIntensityAnalyzer()
    try:
        translated = mtranslate.translate(news_format['starter']+'\n'+news_format['body'],'en','auto')
    except urllib.error.HTTPError:
        return 0
    news_format['sentiment'] = analyzer.polarity_scores(translated)['compound']

    # Get news total words number (space-separated tokens over every field
    # stored so far, including link, date and sentiment).
    news_format['words_number'] = sum(len(str(field).split(" ")) for field in news_format.values())

    # Store the news content to the cloud COS, under the shared bucket constant.
    storage = Storage()
    storage.put_object(bucket=BUCKET, key=SEARCH_KEY+'/ccma/'+news_format['title'].replace(" ","_")+'.json', body = json.dumps(news_format))

    return 1

def ccma_get_links():
    """Collect the article URLs returned by the www.ccma.cat search for SEARCH_KEY.

    Returns:
        List of absolute article URLs, without duplicates, in discovery order.
    """
    search_url = 'https://www.ccma.cat/cercador/'

    def fetch_page(page_number):
        # Passing the query through `params` lets requests URL-encode the
        # search term (spaces, accents, '&', ...).
        r = requests.get(search_url, params={'text': SEARCH_KEY, 'profile': 'noticies', 'pagina': page_number}, headers=header)
        return BeautifulSoup(r.text, 'html.parser')

    def collect_links(soup, link_to_news, seen):
        for news in soup.find_all("li", class_='F-llistat-item'):
            anchor = news.find("a")
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            url = "https://www.ccma.cat"+href
            if url not in seen:
                seen.add(url)
                link_to_news.append(url)

    first_page = fetch_page(1)

    # Get the number of pages in the website (4th token of the pager text).
    pages = 0
    counter = first_page.find(class_='numeracio')
    if counter is not None:
        try:
            pages = int(counter.text.split(" ")[3])
        except (IndexError, ValueError):
            pages = 0

    link_to_news = []
    seen = set()

    # The first page is already downloaded: parse it instead of requesting it again.
    collect_links(first_page, link_to_news, seen)

    # NOTE(review): page index 0 is still requested, as the original code did;
    # confirm how the site handles it. Duplicated links are dropped either way.
    for i in range(pages+1):
        if i == 1:
            continue
        collect_links(fetch_page(i), link_to_news, seen)

    return link_to_news

def ccma_query():
    """Scrape every ccma search result in the cloud and report how many were stored.

    The summary is printed as well as returned: regularSearch() runs this
    function as a Thread target, where the return value is discarded.

    Returns:
        The summary message.
    """
    link_to_news = ccma_get_links()

    # Start cloud multiprocessing only when there is something to process.
    if link_to_news:
        with Pool() as pool:
            result = pool.map(ccma_process_news, link_to_news)
        count = sum(result)
    else:
        count = 0

    if count == 0:
        message = "ccma: no results found."
    else:
        message = "ccma: "+str(count)+" results found."

    print(message)
    return message

# -------------------------------------------------------------------------------------------------------------------------------------

# -------------------------------------------------------------------------------------------------------------------------------------

In [20]:
###########################################################
# WEB SCRAPER FOR THE WEBSITE WWW.DIARIDETARRAGONA.COM    #
###########################################################

def dtg_process_news(link):
    """Scrape one www.diaridetarragona.com article, score its sentiment and store it.

    The function is mapped over article links by ``dtg_query`` through a
    Lithops ``Pool``, so a failure on a single article must not raise:
    anything that cannot be scraped is skipped instead.

    Args:
        link: Absolute URL of the article page.

    Returns:
        1 when the article was parsed and stored, 0 when it was skipped
        (request failure, unexpected page layout or translation failure).
    """
    # Any request failure (not only redirect loops) skips this article instead
    # of propagating out of the worker.
    try:
        r = requests.get(link, headers=header)
    except requests.exceptions.RequestException:
        return 0

    soup = BeautifulSoup(r.text, 'html.parser')
    head = soup.find("header", class_='news-header')
    if head is None:
        return 0

    # Mandatory elements: without them the page is not a regular news article.
    title_tag = head.find("h1", class_='news-title')
    date_tag = head.find("time", class_='news-date')
    body_frame = soup.find("div", class_='news-body')
    if title_tag is None or date_tag is None or body_frame is None:
        return 0

    news_format = {}

    # Put news link.
    news_format['link'] = link

    # Get news header.
    news_format['title'] = title_tag.text.replace("\n","").replace("\t","")

    # Get news starter (optional element).
    starter_tag = head.find("div",class_='news-excerpt')
    starter = starter_tag.text if starter_tag is not None else ''
    news_format['starter'] = starter.replace("\n","").replace("\t","")

    # Get news date: only the first space-separated token of the <time> text.
    news_format['date'] = date_tag.text.replace("\n","").replace("\t","").split(" ")[0]

    # Get news paragraphs (each paragraph is prefixed with a single space).
    body = "".join(" "+paragraph.text for paragraph in body_frame.find_all("p"))
    news_format['body'] = body.replace("\n","").replace("\t","")

    # Get news sentiment analysis: VADER is fed the text translated to English.
    analyzer = SentimentIntensityAnalyzer()
    try:
        translated = mtranslate.translate(news_format['starter']+'\n'+news_format['body'],'en','auto')
    except urllib.error.HTTPError:
        return 0
    news_format['sentiment'] = analyzer.polarity_scores(translated)['compound']

    # Get news total words number (space-separated tokens over every field
    # stored so far, including link, date and sentiment).
    news_format['words_number'] = sum(len(str(field).split(" ")) for field in news_format.values())

    # Store the news content to the cloud COS, under the shared bucket constant.
    storage = Storage()
    storage.put_object(bucket=BUCKET, key=SEARCH_KEY+'/diaridetarragona/'+news_format['title'].replace(" ","_")+'.json', body = json.dumps(news_format))

    return 1

def dtg_get_links():
    """Collect the article URLs returned by the Diari de Tarragona search for SEARCH_KEY.

    A single request asks the site's AJAX search endpoint for up to
    DEFAULT_SEARCH_RESULTS results.

    Returns:
        List of absolute article URLs.
    """
    # Auxiliary variables.
    link_to_news = []

    # Passing the query through `params` lets requests URL-encode the search
    # term; the parameter order matches the original hand-built URL.
    r = requests.get(
        'https://www.diaridetarragona.com/ajax/get_search_news.html',
        params={
            'viewmore': '/ajax/get_search_news.html',
            'page': 1,
            'size': DEFAULT_SEARCH_RESULTS,
            'search': SEARCH_KEY,
        },
        headers=header,
    )
    soup = BeautifulSoup(r.text, 'html.parser')

    # Get the links to the news, skipping result entries without a usable link.
    for news in soup.find_all("div", class_='news-data'):
        anchor = news.find("a")
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        link_to_news.append("https://www.diaridetarragona.com"+href)

    return link_to_news

def dtg_query():
    """Scrape every Diari de Tarragona search result in the cloud and report the count.

    The summary is printed as well as returned: regularSearch() runs this
    function as a Thread target, where the return value is discarded.

    Returns:
        The summary message.
    """
    link_to_news = dtg_get_links()

    # Start cloud multiprocessing only when there is something to process.
    if link_to_news:
        with Pool() as pool:
            result = pool.map(dtg_process_news, link_to_news)
        count = sum(result)
    else:
        count = 0

    if count == 0:
        message = "DiarideTarragona: no results found."
    else:
        message = "DiarideTarragona: "+str(count)+" results found."

    print(message)
    return message


In [21]:
def _dbc_parse_date(text):
    """Convert a Catalan long date into 'dd/mm/yyyy'.

    Assumes the site prints dates such as "12 d'abril de 2021" or
    "3 de desembre de 2021" (this is what the original replace chain targeted;
    TODO confirm against live pages). Text that does not match is returned
    stripped and otherwise unchanged.
    """
    import re

    months = {
        "gener": 1, "febrer": 2, "març": 3, "abril": 4, "maig": 5, "juny": 6,
        "juliol": 7, "agost": 8, "setembre": 9, "octubre": 10, "novembre": 11,
        "desembre": 12,
    }
    # day, then "de" / "d'" / "d’", then month name, then "de", then year.
    # Whitespace is optional everywhere and the month is matched lazily so the
    # leading "de" of "desembre" is not mistaken for the preposition.
    match = re.search(r"(\d{1,2})\s*(?:de\s*|d[’'])\s*([a-zç]+?)\s*de\s*(\d{4})", text.lower())
    if match is None or match.group(2) not in months:
        return text.strip()
    return "%02d/%02d/%s" % (int(match.group(1)), months[match.group(2)], match.group(3))


def dbc_process_news(link):
    """Scrape one www.diaridebarcelona.cat article, score its sentiment and store it.

    The function is mapped over article links by ``dbc_query`` through a
    Lithops ``Pool``, so a failure on a single article must not raise:
    anything that cannot be scraped is skipped instead.

    Args:
        link: Absolute URL of the article page.

    Returns:
        1 when the article was parsed and stored, 0 when it was skipped
        (request failure, unexpected page layout or translation failure).
    """
    # Any request failure (not only redirect loops) skips this article instead
    # of propagating out of the worker.
    try:
        r = requests.get(link, headers=header)
    except requests.exceptions.RequestException:
        return 0
    soup = BeautifulSoup(r.text, 'html.parser')

    news_format = {}

    # Put news link.
    news_format['link'] = link

    # Get news header.
    title = soup.find("div", class_='title-opening-section')
    if title is None:
        return 0         # No title block: per the original note, an opinion piece.
    news_format['title'] = title.text.replace("\n","").replace("\t","").replace("\"","")

    # Get news starter.
    description = soup.find("div", class_='description')
    if description is None:
        return 0         # No description block: per the original note, an interview.
    news_format['starter'] = description.text.replace("\n","").replace("\t","").replace("\xa0"," ").replace("\"","")

    # Get news date, normalised to dd/mm/yyyy like the other sources.
    date_frame = soup.find("div", class_='info-date')
    date_tag = date_frame.find("span", class_='date') if date_frame is not None else None
    if date_tag is None:
        return 0
    news_format['date'] = _dbc_parse_date(date_tag.text)

    # Get news paragraphs: the article text lives in the 4th matching container.
    containers = soup.find_all("div", class_='component-html pb-3')
    if len(containers) < 4:
        return 0
    body = "".join(" "+paragraph.text for paragraph in containers[3].find_all("p"))
    news_format['body'] = body.replace("\n","").replace("\t","").replace("\xa0"," ").replace("\"","")

    # Get news sentiment analysis: VADER is fed the text translated to English.
    analyzer = SentimentIntensityAnalyzer()
    try:
        translated = mtranslate.translate(news_format['starter']+'\n'+news_format['body'],'en','auto')
    except urllib.error.HTTPError:
        return 0
    news_format['sentiment'] = analyzer.polarity_scores(translated)['compound']

    # Get news total words number (space-separated tokens over every field
    # stored so far, including link, date and sentiment).
    news_format['words_number'] = sum(len(str(field).split(" ")) for field in news_format.values())

    # Store the news content to the cloud COS, under the shared bucket constant.
    storage = Storage()
    storage.put_object(bucket=BUCKET, key=SEARCH_KEY+'/diaridebarcelona/'+news_format['title'].replace(" ","_")+'.json', body = json.dumps(news_format))

    return 1

def dbc_get_links():
    """Collect the article URLs returned by the www.diaridebarcelona.cat search for SEARCH_KEY.

    Returns:
        List of article URLs, exactly as found in the result pages.
    """
    search_url = 'https://www.diaridebarcelona.cat/search'

    # Auxiliary variables.
    link_to_news = []

    # The topic comes from SEARCH_KEY like every other request in this file
    # (sys.argv[1] is not the search topic when running inside a notebook).
    # `params` lets requests URL-encode the term.
    r = requests.get(search_url, params={'q': SEARCH_KEY}, headers=header)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Get the number of pages in the website: last '='-separated value of the
    # link inside the <li class="first"> pager element.
    pages = 0
    frame = soup.find(lambda tag: tag.name == 'li' and tag.get('class') == ['first'])
    anchor = frame.find("a") if frame is not None else None
    href = anchor.get('href') if anchor is not None else None
    if href:
        try:
            pages = int(href.split("=")[-1])
        except ValueError:
            pages = 0

    # Get the links to the news.
    for i in range(pages+1):
        r = requests.get(search_url, params={'q': SEARCH_KEY, 'start': i}, headers=header)
        soup = BeautifulSoup(r.text, 'html.parser')

        for news in soup.find_all(class_='col-sm-6 col-lg-3 mb-20px mb-lg-30px'):
            # We get the link to the news page, skipping cards without one.
            heading = news.find(class_='h1 modul-petit')
            anchor = heading.find("a") if heading is not None else None
            href = anchor.get('href') if anchor is not None else None
            if href:
                link_to_news.append(href)

    return link_to_news

def dbc_query():
    """Scrape every Diari de Barcelona search result in the cloud and report the count.

    The summary is printed (this function runs as a Thread target in
    regularSearch()) and also returned, matching the other ``*_query``
    functions.

    Returns:
        The summary message.
    """
    link_to_news = dbc_get_links()

    # Start cloud multiprocessing only when there is something to process.
    if link_to_news:
        with Pool() as pool:
            result = pool.map(dbc_process_news, link_to_news)
        count = sum(result)
    else:
        count = 0

    if count == 0:
        message = "DiarideBarcelona: no results found."
    else:
        message = "DiarideBarcelona: "+str(count)+" results found."

    print(message)
    return message

In [22]:
def menu():
    """Print the main menu and read the user's choice.

    Returns:
        The chosen option as an int, or -1 when the input is not an integer,
        so the caller reports a wrong option instead of crashing.
    """
    print("0 - Leave the search engine.")
    print("1 - Regular search using a topic.")
    print("2 - Advanced search using filters.")
    print("3 - Advanced search with filters and data analytics.")
    try:
        return int(input("Choice: "))
    except ValueError:
        # Blank or non-numeric input: -1 is not a valid menu option.
        return -1

def get_object_cloud(key):
    """Download one stored news object from BUCKET and decode it from JSON.

    This function is mapped over keys through a Lithops ``Pool``, so it
    creates its own ``Storage`` client, exactly like the ``*_process_news``
    workers do, instead of depending on the module-level ``STORAGE`` handle
    that only the driver assigns.
    NOTE(review): capturing that global client in a function shipped to
    workers is a likely source of the "cannot pickle '_thread.lock' object"
    error shown in the notebook output; confirm.

    Args:
        key: Object key inside BUCKET.

    Returns:
        The decoded JSON document.
    """
    storage = Storage()
    return json.loads(storage.get_object(BUCKET, key))

def regularSearch():
    """Ask for a topic, scrape the three news sites and load every stored result.

    Sets the module-level SEARCH_KEY (the topic) and STORAGE (the storage
    client), runs the three site queries in parallel threads and, once they
    have all finished, downloads every object stored under the topic prefix.

    Returns:
        The list produced by mapping get_object_cloud over the stored keys.
    """
    global SEARCH_KEY, STORAGE

    # Keep asking until a non-empty topic is given.
    SEARCH_KEY = ''
    while not SEARCH_KEY:
        SEARCH_KEY = input("Choose the topic: ")

    # One thread per site, each started as soon as it is created.
    workers = []
    for site_query in (ccma_query, dtg_query, dbc_query):
        worker = Thread(target=site_query)
        worker.start()
        workers.append(worker)

    # Wait for every site, in the same order they were launched.
    for worker in workers:
        worker.join()

    STORAGE = Storage()
    stored_keys = STORAGE.list_keys(BUCKET, SEARCH_KEY+'/')
    with Pool() as pool:
        loaded_news = pool.map(get_object_cloud, stored_keys)
    return loaded_news
    
def _parse_threshold_filter(raw, cast, label):
    """Parse '<bias> more|less' into (bias, sign); None when blank or invalid."""
    tokens = raw.split()
    if not tokens:
        return None
    if len(tokens) == 2 and tokens[1] in ('more', 'less'):
        try:
            return cast(tokens[0]), tokens[1]
        except ValueError:
            pass
    print("Ignoring "+label+" filter: expected '<bias> more' or '<bias> less'.")
    return None


def _parse_date_filter(raw):
    """Parse 'before|after yyyy/mm/dd' into (time_frame, datetime); None when blank or invalid."""
    tokens = raw.split()
    if not tokens:
        return None
    if len(tokens) == 2 and tokens[0] in ('before', 'after'):
        try:
            year, month, day = (int(part) for part in tokens[1].split("/"))
            return tokens[0], datetime.datetime(year, month, day)
        except ValueError:
            pass
    print("Ignoring date filter: expected 'before yyyy/mm/dd' or 'after yyyy/mm/dd'.")
    return None


def _passes_threshold(value, threshold_filter):
    """Tell whether value satisfies a (bias, sign) filter; no filter always passes."""
    if threshold_filter is None:
        return True
    bias, sign = threshold_filter
    return value >= bias if sign == 'more' else value <= bias


def _passes_date(date_text, date_filter):
    """Tell whether a stored 'dd/mm/yyyy' date satisfies a (time_frame, datetime) filter."""
    if date_filter is None:
        return True
    time_frame, specified_date = date_filter
    try:
        day, month, year = (int(part) for part in str(date_text).split("/"))
        news_date = datetime.datetime(year, month, day)
    except ValueError:
        # A date that cannot be read cannot be shown to satisfy the filter.
        return False
    return news_date <= specified_date if time_frame == 'before' else news_date >= specified_date


def advancedFilterSearch():
    """Run a regular search and keep only the news matching the user's filters.

    Three optional filters are read from the user (blank means "no filter"):
      * sentiment:   '<bias> more|less'  - keep news whose sentiment is at
        least / at most <bias>.
      * word number: '<bias> more|less'  - keep news whose word count is at
        least / at most <bias>.
      * date:        'before|after yyyy/mm/dd' - keep news dated on or
        before / on or after that day. Stored dates are read as dd/mm/yyyy.
    A filter that cannot be parsed is reported and ignored.

    Returns:
        A new list with the news that pass every active filter.
    """
    apply_sentiment = input("Filter by sentiment analysis? (bias[-1,1] more/less) (blank if none)   ")
    apply_wordnumber = input("Filter by word numbers (get the news with more/less words than specified)? (bias more/less) (blank if none)   ")
    apply_date = input("Filter by date (get the news before/after the specified date)? (before/after date[yyyy/mm/dd]) (blank if none)   ")

    # Parse the filters once, before the (slow) search, instead of per item.
    sentiment_filter = _parse_threshold_filter(apply_sentiment, float, "sentiment")
    wordnumber_filter = _parse_threshold_filter(apply_wordnumber, int, "word number")
    date_filter = _parse_date_filter(apply_date)

    news = regularSearch()

    # Build a new list: removing items from a list while iterating over it
    # skips the element that follows each removal.
    return [
        n for n in news
        if _passes_threshold(float(n['sentiment']), sentiment_filter)
        and _passes_threshold(int(n['words_number']), wordnumber_filter)
        and _passes_date(n['date'], date_filter)
    ]

def advancedAnalyticsSearch():
    """Run a filtered search; the analytics step is still a placeholder.

    Returns:
        The filtered news list, so the caller can write it to the results file
        (the original returned nothing, which made that step fail).
    """
    news = advancedFilterSearch()
    print("analytics")
    return news

def printToFile(vector):
    """Write the news to results.txt, one JSON document per line.

    Args:
        vector: Iterable of JSON-serialisable news records; None is treated as
            an empty result set.
    """
    with open("results.txt","w") as f:
        for news in vector or []:
            # One record per line keeps the file parseable; without a
            # separator the documents run into each other.
            f.write(json.dumps(news))
            f.write("\n")

def options(choice):
    """Run the action selected in the menu.

    Choices 1-3 run the matching search and write its result to the results
    file, 0 announces the exit and anything else is reported as wrong.
    """
    if choice == 0:
        print("Leaving...")
        return

    if choice == 1:
        found = regularSearch()
    elif choice == 2:
        found = advancedFilterSearch()
    elif choice == 3:
        found = advancedAnalyticsSearch()
    else:
        print("Wrong option.")
        return

    printToFile(found)
          
# ------------------------------------------------------------------------------------- #
if __name__ == '__main__':
    # Show the menu and run the selected action until the user picks 0.
    while True:
        choice = menu()
        options(choice)
        if choice == 0:
            break
    print("See u soon")

0 - Leave the search engine.
1 - Regular search using a topic.
2 - Advanced search using filters.
3 - Advanced search with filters and data analytics.
2021-06-18 00:24:05,481 [INFO] lithops.config -- Lithops v2.3.4
2021-06-18 00:24:05,498 [INFO] lithops.storage.backends.ibm_cos.ibm_cos -- IBM COS Storage client created - Region: eu-gb
2021-06-18 00:24:05,500 [INFO] lithops.serverless.backends.ibm_cf.ibm_cf -- IBM CF client created - Region: us-south - Namespace: lluisoriol.colom@estudiants.urv.cat_dev
2021-06-18 00:24:05,543 [INFO] lithops.executors -- Serverless Executor created with ID: 3d75f6-11
2021-06-18 00:24:05,565 [INFO] lithops.invokers -- ExecutorID 3d75f6-11 | JobID M000 - Selected Runtime: repstail123/sdpractica2:sdpract2 - 512MB
2021-06-18 00:24:05,576 [INFO] lithops.job.job -- ExecutorID 3d75f6-11 | JobID M000 - Uploading function and data - Total: 2.3KiB
2021-06-18 00:24:06,361 [INFO] lithops.invokers -- ExecutorID 3d75f6-11 | JobID M000 - Starting function invocation: C

TypeError: cannot pickle '_thread.lock' object