In [28]:
import requests
from bs4 import BeautifulSoup
import re
from joblib import load
import pandas as pd
from fuzzywuzzy import fuzz

In [29]:
COMPANIES = ["BCP", "Banco Comercial Português"]
companies = COMPANIES

In [30]:
LINKS = {"www.publico.pt": "Público", #from https://www.kadaza.pt
         "publico.pt": "Público",
         "www.dn.pt": "Diário de Notícias",
         "www.rtp.pt": "RTP",
         "rpt.pt": "RTP",
         "www.cmjornal.pt": "Correio da Manhã",
         "www.iol.pt": "IOL",
         "www.tvi24.iol.pt": "TVI",
         "tvi24.iol.pt": "TVI",
         "noticias.sapo.pt": "Sapo",
         "observador.pt": "Observador",
         "expresso.pt": "Expresso",
         "www.expresso.pt": "Expresso",
         "sol.sapo.pt": "SOL",
         "www.jornaldenegocios.pt": "Jornal de Negócios",
         "www.jn.pt": "Jornal de Notícias",
         "jn.pt": "Jornal de Notícias",
         "ionline.pt": "Jornal i",
         "sicnoticias.pt": "SIC",
         "www.sicnoticias.pt": "SIC",
         "www.lux.iol.pt": "Lux",
         "www.ionline.pt": "Jornal i",
         "news.google.pt": "Google",
         "www.dinheirovivo.pt": "Dinheiro Vivo",
         "www.aeiou.pt": "AEIOU",
         "aeiou.pt": "AEIOU",
         "www.tsf.pt": "TSF",
         "tsf.pt": "TSF",
         "www.sabado.pt": "Sábado",
         "economico.sapo.pt": "Jornal Económico",
         "cnnportugal.iol.pt": "CNN Portugal"}

news_sources = LINKS

---

In [31]:
def api_request(search, websites, date, dedup):
    """
    Makes a request to the arquivo.pt API to search for a specific term within specified websites and date range.

    Parameters:
    search (str): The expression or word to look for.
    websites (str): Comma-separated websites where the search should be performed.
    date (list): A list containing two dates in the format [YYYYMMDD, YYYYMMDD] representing the start and end dates for the search.

    Returns:
    list: A list of response items from the arquivo.pt API.

    Notes:
    - The function constructs a URL for the API request based on the provided parameters.
    - If the number of response items is 500, a message is printed indicating that some data might have been lost due to the limit.
    """
    search = f"q=%22{search.replace(' ', '%20')}%22"
    websites = f"&siteSearch={websites}"
    date = f"&from={date[0]}&to={date[1]}"    
    url = (
        f"https://arquivo.pt/textsearch?{search}{websites}{date}"
        "&fields=linkToExtractedText,tstamp,linkToNoFrame"
        f"&maxItems=500&dedupValue={dedup}&dedupField=url&prettyPrint=false&type=html"
        )
    json = requests.get(url).json()
    data = json["response_items"]
    if len(data) == 500:
        print(f"You might have lost some data: {search, date}")
    return data

In [32]:
def api_request_bulk(companies, news_sources=news_sources):
    """
    Makes bulk API requests for a list of companies over a range of years and returns the results.

    Args:
        companies (list): A list of company aliases to request data for.
        news_sources (dict): A dictionary of news sources with their corresponding keys.

    Returns:
        dict: A dictionary where each key is a link to a no-frame version of the article, and each value is a dictionary containing:
            - "tstamp" (int): The timestamp of the article in the format YYYYMM.
            - "linkToExtractedText" (str): The link to the extracted text of the article.
            - "source" (str): The source of the article.
    """
    websites = ",".join(list(news_sources.keys()))
    api_answer = {}
    for alias in companies:
        for year in range(2000, 2021):
            if year < 2010:
                dedup = 25
            else:
                dedup = 2
            api_aliasS1 = api_request(alias, websites, [int(f"{year}0101"), int(f"{year}0630")], dedup)
            api_aliasS2 = api_request(alias, websites, [int(f"{year}0701"), int(f"{year}1231")], dedup)
            api_alias = api_aliasS1 + api_aliasS2
            for item in api_alias:
                source = "unknown"
                for wbsite in news_sources.keys():
                    if wbsite in item["linkToNoFrame"]:
                        source = news_sources[wbsite]
                        break
                api_answer[item["linkToNoFrame"]] = {"tstamp": int(str(item["tstamp"])[:6]),
                                                      "linkToExtractedText": item["linkToExtractedText"],
                                                      "source": source}
    return api_answer

In [33]:
def extracText(linkToExtractedText):
    """
    Extracts text content from a given URL.

    Args:
        linkToExtractedText (str): The URL from which to extract text.

    Returns:
        str: The extracted text content if the request is successful.
        str: "1min" if the request is rate-limited (status code 429).
        int: 404 if the requested resource is not found (status code 404).
        int: 404 if the request fails with any other status code.

    Raises:
        requests.exceptions.RequestException: If there is an issue with the HTTP request.
    """
    response = requests.get(linkToExtractedText)
    status_code = response.status_code
    if status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        return soup.get_text()
    elif status_code == 429:
        return "1min"
    elif status_code == 404:
        return 404
    else:
        print(f"Request failed with status code {status_code}. Link was {linkToExtractedText}")
        return 404

In [34]:
# FEATURES for ML model for news filtering
# [['IstALIAS', 'propAN', 'txtSZ', 'countALI', 'countDTS', 'countHOUR', 'countCAPS']]
#
#This module provides feature extraction functions for a machine learning model used for news filtering.
#
#Functions:
#    IstALIAS(text, aliases):
#        Determines the position of the first alias in the text and returns the number of words before it.
#    
#    propAN(text, aliases):
#        Calculates the proportion of alphanumeric characters in the text.
#    
#    txtSZ(text, aliases):
#        Returns the size (length) of the text.
#    
#    countALI(text, aliases):
#        Counts the number of times any alias appears in the text.
#    
#    countDTS(text, aliases):
#        Counts the number of date occurrences in the text.
#    
#    countHOUR(text, aliases):
#        Counts the number of time occurrences (in hh:mm format) in the text.
#    
#    countCAPS(text, aliases):
#        Counts the number of uppercase words in the text.

def IstALIAS(text, aliases):
    """where does the first alias appear, title?"""
    indexs = []
    for alias in aliases:
        index = text.lower().find(alias.lower())
        if index != -1:
            indexs.append(index)
    try:
        a = text[:min(indexs)].count(' ')
    except:
        a = 10000000000000000000
    return a

def propAN(text, aliases):
    """proportion of alphanumeric chars in the text"""
    alphanumeric_chars = sum(char.isalnum() for char in text)
    proportion = alphanumeric_chars / len(text)
    return proportion

def txtSZ(text, aliases):
    """text size"""
    return len(text)

def countALI(text, aliases):
    """count how many aliases appear in the text"""
    alias_count = {expression: 0 for expression in aliases}
    for alias in aliases:
        # Use re.escape to handle any special characters in the expression
        pattern = re.escape(alias.lower())
        matches = re.findall(alias, text.lower())
        alias_count[alias] = len(matches)
    return sum(alias_count.values())

def countDTS(text, aliases):
    """count how many dates appear in the text"""
    date_pattern = r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})\b'
    # 10/11/2024', '10/10/2024', '12-25-1990', '2024-11-05', '01/10/2024'
    dates = re.findall(date_pattern, text)
    date_count = len(dates)
    return date_count

def countHOUR(text, aliases):
    """count how many hours (ex.: hh:mm) appear in the text"""
    time_pattern = r'\b([01]?[0-9]|2[0-3]):[0-5][0-9]\b'   
    occurrences = re.findall(time_pattern, text)
    return len(occurrences)

def countCAPS(text, aliases):
    """count how many WORDS are upper"""
    words = text.split()
    uppercase_word_count = sum(1 for word in words if word.isupper())
    return uppercase_word_count

# Load the saved model
clf = load('dtree01.joblib')

In [35]:
# In case its needed to filter sentences which only contain  keywords
def filter_sentences_by_keywords(text, aliases):
    """
    Filters sentences from the given text that contain any of the specified keywords (aliases).

    Args:
        text (str): The input text to be filtered.
        aliases (list of str): A list of keywords to filter sentences by.

    Returns:
        str: A single string containing the filtered sentences joined together.
    """
    # Split the text by punctuation and also by multiple spaces or newlines
    sentences = re.split(r'(?<=[.!?]) +|\s{2,}|\n+', text)
    # Filter sentences that contain any of the aliases
    filtered_sentences = [sentence for sentence in sentences if any(keyword.lower() in sentence.lower() for keyword in aliases)]
    # Join the filtered sentences back into a single string
    filtered_text = ' '.join(filtered_sentences)
    return filtered_text

In [36]:
def nearDuplicates(text, lista, threshold=90):
    """
    Check if a given text is a near duplicate of any text in a list based on a similarity threshold.

    Args:
        text (str): The text to check for near duplicates.
        lista (list of str): The list of texts to compare against.
        threshold (int, optional): The similarity threshold (default is 90).

    Returns:
        bool: True if a near duplicate is found, False otherwise.
    """
    for txt in lista:
        if fuzz.ratio(txt, text) > threshold:
            return True
    return False

# SOMETHING IS WRONG WITH THIS NEXT CODE, DELETING TO MANY NEWS IDK WHY !

In [None]:
def filterNews(news_json, companies = companies):
    features = ['IstALIAS', 'propAN', 'txtSZ', 'countALI', 'countDTS', 'countHOUR', 'countCAPS']
    news_index = -1
    news_list = list(news_json.keys())
    delete_news = []
    processed_news = []
    df_features = {}
    while news_index < len(news_list)-1:
        news_index += 1
        news = news_list[news_index]
        text = extracText(news_json[news]['linkToExtractedText'])
        if text == 404:
            continue
        elif text == "1min":
            news_index -= 1
            # Process the news that have been extracted so far (could be a function)
            curr_news = news_index
            for ExtractedText in reversed(processed_news):
                for feature in features:
                    df_features[feature] = [globals()[feature](ExtractedText, companies)]
                probability = clf.predict_proba(pd.DataFrame(df_features))[0, 1]
                if probability < 0.1: #.4
                    delete_news.append(news_list[curr_news])
                    pass
                elif probability <= 0.6:
                    news_json[news_list[curr_news]]["probability"] = round(probability, 3)
                    ExtractedText = filter_sentences_by_keywords(ExtractedText, companies)
                else:
                    news_json[news_list[curr_news]]["probability"] = round(probability, 3)
                curr_news -= 1
            processed_news = []

        else:
            if text in processed_news:
                processed_news.append(" ")
            elif nearDuplicates(text, processed_news):
                processed_news.append(" ")
            else:
                processed_news.append(news)
        
    # If the last news is reached, process the remaining news (could be a function)
    curr_news = news_index
    for ExtractedText in reversed(processed_news):
        for feature in features:
            df_features[feature] = [globals()[feature](ExtractedText, companies)]
        probability = clf.predict_proba(pd.DataFrame(df_features))[0, 1]
        if probability < 0.1:
            delete_news.append(news_list[curr_news])
            pass
        elif probability <= 0.6:
            news_json[news_list[curr_news]]["probability"] = round(probability, 3)
            ExtractedText = filter_sentences_by_keywords(ExtractedText, companies)
        else:
            news_json[news_list[curr_news]]["probability"] = round(probability, 3)
        curr_news -= 1

    print(f"News to delete: {len(delete_news)}")
    for news_to_delete in delete_news:
        news_json.pop(news_to_delete, None)
                
    return news_json

In [38]:
news_json = api_request_bulk(companies)
print(len(news_json))
news_json1 = filterNews(news_json)
print(len(news_json1))

1459
141


6m 53s | 6m 22s, 141 | 1009 141 | original: 807

# MISSING NEW AND SENT, maybe give bool if sentiment or not, because its only used in graph maybe not always necessary