In [1]:
# !pip install selenium pandas

In [2]:
import os
import time
import pandas as pd
import textwrap
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from datetime import datetime

In [3]:
# Função para iniciar o webdriver
def start_driver():
    """Create and return a headless Chrome WebDriver.

    Chrome is started with ``--headless`` (no graphical interface),
    ``--no-sandbox`` and ``--disable-dev-shm-usage``.

    Returns:
        The ``webdriver.Chrome`` instance. This function does not close it;
        the caller has to call ``quit()`` when done.
    """
    chrome_options = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

# Launch the browser session used by the scraping call further down.
driver = start_driver()

In [4]:
def find_stock_market_news(driver):
    """Open the Investing.com stock-market news page and return its list items as HTML.

    Args:
        driver: Selenium WebDriver used to load the page.

    Returns:
        A list holding the ``outerHTML`` string of every ``<li>`` element
        found inside the ``ul[data-test='news-list']`` element.
    """
    driver.get("https://www.investing.com/news/stock-market-news")
    time.sleep(5)  # fixed pause to let the page load

    # Locate the news list container, then every <li> beneath it.
    list_items = driver.find_element(
        By.CSS_SELECTOR, "ul[data-test='news-list']"
    ).find_elements(By.TAG_NAME, "li")

    print(f"Found {len(list_items)} items in news_list element")

    # Serialise each element to an HTML string.
    html_snippets = []
    for element in list_items:
        html_snippets.append(element.get_attribute('outerHTML'))

    return html_snippets

# Always close the browser, even when scraping raises, so that no headless
# Chrome process is left running in the background.
try:
    news_items = find_stock_market_news(driver)
finally:
    driver.quit()

Found 74 items in news_list element


In [5]:
# Show the scraped items (one outerHTML string per <li> element).
news_items

['<li class="list_list__item__dwS6E !mt-0 border-t border-solid border-[#E6E9EB] py-6"><article data-test="article-item" class="news-analysis-v2_article__wW0pT flex w-full sm:flex-row-reverse md:flex-row"><figure tabindex="-1" class="hidden justify-center sm:ml-6 sm:block md:ml-0 md:mr-6 h-[59px] w-[80px] md:h-[104px] md:w-[140px] "><div class="relative"><div class="absolute z-1 flex items-center justify-center bg-[#8488924d] h-[59px] w-[80px] md:h-[104px] md:w-[140px] "><div class="lazyload-wrapper "><img src="https://i-invdn-com.investing.com/news/external-images-thumbnails/LYNXMPEK9R0QF_L.jpg" alt="" data-test="item-image" class="m-auto h-[58px] max-w-[80px] md:h-auto md:max-h-[104px] md:max-w-[140px]"></div></div><div class="lazyload-wrapper "><img src="https://i-invdn-com.investing.com/news/external-images-thumbnails/LYNXMPEK9R0QF_L.jpg" alt="" class="relative left-0 right-0 object-cover blur-sm h-[59px] w-[80px] md:h-[104px] md:w-[140px] " style="clip-path: inset(0px);"></div></d

In [6]:
# Função para converter tempo para timestamp
def convert_time_to_timestamp(time_text):
    """Convert a relative or absolute time string into a timestamp.

    Supported inputs:
      * relative minutes, e.g. ``"7 minutes ago"`` or ``"a minute ago"``
      * relative hours, e.g. ``"2 hours ago"`` or ``"an hour ago"``
      * anything else is handed to ``pd.to_datetime`` as an absolute date.

    Args:
        time_text: The time string to convert.

    Returns:
        ``datetime.now()`` minus the stated amount for relative inputs,
        otherwise the result of ``pd.to_datetime(time_text)``.

    Raises:
        ValueError: If a relative string does not start with an integer or
            with "a"/"an".
    """
    def _leading_count(text):
        # The amount is the first whitespace-separated token. An indefinite
        # article ("a minute ago", "an hour ago") stands for one unit; the
        # original int() call crashed on it.
        token = text.split()[0]
        if token.lower() in ('a', 'an'):
            return 1
        return int(token)

    # 'minute' is checked before 'hour', matching the original precedence.
    if 'minute' in time_text:
        return datetime.now() - pd.to_timedelta(_leading_count(time_text), unit='m')
    elif 'hour' in time_text:
        return datetime.now() - pd.to_timedelta(_leading_count(time_text), unit='h')
    # Add more rules here as needed for other time formats.
    else:
        return pd.to_datetime(time_text)

In [7]:
def extract_text_string(html_string, keyword, start_char, end_char, division_index: int=1):
    """Return the text found between two delimiters near a keyword.

    The HTML is split on ``keyword`` and the piece at ``division_index`` is
    searched: the result is the stripped text between the first
    ``start_char`` and the next ``end_char`` after it.

    Args:
        html_string: HTML text to search.
        keyword: Marker the HTML is split on.
        start_char: Delimiter that precedes the wanted text.
        end_char: Delimiter that follows the wanted text.
        division_index: Which piece of the split to search. The default 1 is
            the text right after the first ``keyword``; 0 is the text before it.

    Returns:
        The extracted text with surrounding whitespace removed, or None when
        the keyword or a delimiter is missing, or when any error occurs
        (the error is printed).
    """
    try:
        parts = html_string.split(keyword)

        # A single piece means the keyword never occurred in the HTML.
        if len(parts) < 2:
            print(f'({len(parts)})\n\n({keyword})\n{parts}\n\n')
            return None

        chunk = parts[division_index]

        marker_pos = chunk.find(start_char)
        if marker_pos < 0:
            return None

        # The wanted text begins right after the start delimiter.
        begin = marker_pos + len(start_char)
        stop = chunk.find(end_char, begin)
        if stop < 0:
            return None

        return chunk[begin:stop].strip()
    except Exception as e:
        print(f"Erro ao extrair texto: {e}")
        return None

In [8]:
# Função para extrair dados de um item de notícia
def extract_news_item_data(item):
    """Extract link, title, description and publish date from one news item.

    Each field is located with ``extract_text_string`` and then passed
    through ``html.unescape``: the input is serialised HTML, so characters
    such as ``&`` arrive as entities (``S&amp;P 500``) and would otherwise be
    stored in their escaped form.

    Args:
        item: HTML string of a single news list item.

    Returns:
        A dict with the keys 'Link', 'Title', 'Description' and
        'PublishDate' (a value is None when that field was not found), or
        None if an unexpected error occurs while processing the item.
    """
    from html import unescape  # standard library; imported locally to keep this cell self-contained

    def _extract(keyword, start_char, end_char, **kwargs):
        # Look the field up, then decode HTML entities in whatever was found.
        raw = extract_text_string(item, keyword, start_char, end_char, **kwargs)
        return None if raw is None else unescape(raw)

    try:
        # Article link: division_index=0 searches the text BEFORE the
        # "article-title-link" marker for the href attribute.
        link = _extract("article-title-link", 'href="', '"', division_index=0)
        print(f'Article Link: {link}')

        # Article title: text content right after the marker.
        title = _extract("article-title-link", ">", "<")
        print(f'Article Title: {title}')

        # Article description.
        description = _extract("article-description", ">", "<")
        print(f'Article Description: {description}')

        # Publish date, read from the datetime attribute.
        publish_date = _extract("article-publish-date", 'datetime="', '"')
        print(f'Publish Date: {publish_date}')

        return {
            'Link': link,
            'Title': title,
            'Description': description,
            'PublishDate': publish_date
        }
    except Exception as e:
        print(f"Erro ao processar item: {e}")
        return None

In [9]:
# Função para extrair os dados das notícias
def extract_news_data(news_items):
    """Build a DataFrame from the scraped news list items.

    Only items whose HTML starts with the news ``<li>`` class prefix are
    parsed, and only truthy parse results become rows.

    Args:
        news_items: Iterable of HTML strings, one per list item.

    Returns:
        A pandas DataFrame with one row per successfully parsed news item.
    """
    news_li_prefix = '<li class="list_list__item__dwS6E !mt-0 border-t'

    # Parse matching items lazily, in their original order.
    parsed = (
        extract_news_item_data(html_item)
        for html_item in news_items
        if html_item.startswith(news_li_prefix)
    )

    # Keep only the items that produced data.
    rows = [row for row in parsed if row]

    return pd.DataFrame(rows)

# Parse the scraped HTML items into a DataFrame and write it to
# 'news_data.csv' without the index column.
news_df = extract_news_data(news_items)
news_df.to_csv('news_data.csv', index=False)
print("Dados salvos em 'news_data.csv'")

Article Link: https://www.investing.com/news/stock-market-news/biden-proposal-would-effectively-prohibit-sale-of-polestar-cars-in-us-automaker-says-3687083
Article Title: Polestar says Biden proposal would 'effectively prohibit' sale of its cars in US
Article Description: By David Shepardson WASHINGTON (Reuters) -Polestar said on Monday that a proposed Biden administration rule to bar the use of Chinese vehicle hardware and software would...
Publish Date: 2024-10-29 00:00:43
Article Link: https://www.investing.com/news/stock-market-news/brazil-institute-sues-social-media-giants-for-525-million-over-excessive-use-by-minors-3687085
Article Title: Brazil institute sues social media giants for $525 million over excessive use by minors
Article Description: By Ricardo Brito BRASILIA (Reuters) - Brazil's Collective Defense Institute has filed two lawsuits demanding 3 billion reais ($525.27 million) from the Brazilian units of TikTok,...
Publish Date: 2024-10-28 23:50:37
Article Link: https://

In [10]:
def scrape_and_save_news():
    """Scrape the current news list and append it to 'news_data.csv'.

    Starts a browser, collects the news items' HTML, closes the browser,
    parses the items into a DataFrame and appends the rows to the CSV file.
    The header row is written only when the file does not exist yet or is
    empty. Nothing is written when no news could be extracted.
    """
    csv_path = 'news_data.csv'

    driver = start_driver()
    try:
        # Collect the raw HTML first. The original passed the driver itself
        # to extract_news_data, which expects the list of HTML strings.
        news_items = find_stock_market_news(driver)
    finally:
        driver.quit()  # close the browser even if scraping failed

    news_df = extract_news_data(news_items)

    # An empty frame has no columns; appending it would create a file
    # without a header and later runs would then skip the header too.
    if news_df.empty:
        print("Nenhuma notícia extraída; 'news_data.csv' não foi alterado")
        return

    # Write the header only for a new (or zero-byte) file.
    write_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

    # Append the rows so previously saved data is kept.
    news_df.to_csv(csv_path, mode='a', header=write_header, index=False)
    print("Dados adicionados em 'news_data.csv'")
    
# Chama a função principal
# scrape_and_save_news()