In [149]:
import locale
import os
import re
import unicodedata
from datetime import datetime, timedelta
from urllib.parse import urljoin

import openai
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [137]:
# Load variables from a .env file into the environment before anything reads
# them; this must run before the os.getenv() call below.
load_dotenv()
# Switch time formatting (strftime month/day names) to Brazilian Portuguese for
# the whole process. locale.setlocale raises locale.Error when the 'pt_BR.utf8'
# locale is not installed on the machine.
locale.setlocale(locale.LC_TIME, 'pt_BR.utf8')
# OpenAI client; the key comes from the environment and is None when
# OPENAI_API_KEY is not set.
client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))

In [138]:
def str_cleasing(str):
    """Collapse every run of whitespace in ``str`` into a single space.

    Newlines, tabs and repeated spaces all become one ``' '``. Leading and
    trailing whitespace is collapsed to a single space as well, not removed;
    callers that need a trimmed string must call ``.strip()`` themselves.

    Args:
        str: Text to clean. The parameter name shadows the built-in ``str``
            inside this function; it is kept so existing callers keep working.

    Returns:
        The text with each whitespace run replaced by one space.
    """
    # One pass is enough: the earlier chain of extra substitutions (newline
    # replacement, edge handling, repeated double-space removal) could never
    # change the result once whitespace runs had been collapsed.
    return re.sub(r'\s+', ' ', str)

In [139]:
def normalize_string(text):
    """Turn a display name into a lowercase, accent-free, hyphenated slug.

    The text is decomposed with Unicode NFKD, combining marks (accents) are
    dropped, the result is lowercased and every space becomes a hyphen.

    Args:
        text: The string to convert.

    Returns:
        The slug form of ``text``.
    """
    base_chars = []
    for char in unicodedata.normalize('NFKD', text):
        # Combining characters carry the accents split off by NFKD.
        if unicodedata.combining(char):
            continue
        base_chars.append(char)
    unaccented = ''.join(base_chars)
    return unaccented.lower().replace(' ', '-')

In [140]:
def yesterday_date_dn(today=None):
    """Return yesterday's date as ``'DD de Mês de YYYY'`` in Portuguese.

    The month name is capitalised (e.g. ``'10 de Agosto de 2024'``) and the
    day is zero-padded to two digits. Month names come from a fixed table so
    the result does not depend on which locale is installed or active in the
    process.

    Args:
        today: Optional ``datetime`` used as "now". Defaults to
            ``datetime.now()``; mainly useful for testing or back-filling.

    Returns:
        The formatted date string for the day before ``today``.
    """
    month_names = (
        'Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho',
        'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro',
    )
    reference = datetime.now() if today is None else today
    ontem = reference - timedelta(days=1)
    return f'{ontem.day:02d} de {month_names[ontem.month - 1]} de {ontem.year}'

In [177]:
def getNewsUrlsFromG1(team, mainurl, date_path='2024/08/10'):
    """Collect news article links for one team and one day from a listing page.

    Downloads ``mainurl``, scans every ``<a>`` element and keeps the hrefs
    that contain ``'<team>/noticia/<date_path>/'``.

    Args:
        team: Team slug as it appears in article URLs (e.g. ``'fortaleza'``).
        mainurl: URL of the team listing page to scan.
        date_path: ``'YYYY/MM/DD'`` segment the article URL must contain. The
            default is the date that used to be hard-coded here, so existing
            calls behave as before; pass another date to fetch other days.

    Returns:
        List of matching hrefs, in page order, without duplicates.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    browsers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome / 86.0.4240.198Safari / 537.36"}
    # Without a timeout requests can wait forever on an unresponsive server.
    page = requests.get(mainurl, headers=browsers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    marker = f'{team}/noticia/{date_path}/'
    hrefs = []
    for a in soup.find_all('a'):
        newsUrl = a.get('href')
        # Anchors without an href yield None and are skipped.
        if newsUrl and marker in newsUrl:
            hrefs.append(newsUrl)

    # Drop duplicates while preserving first-seen order.
    return list(dict.fromkeys(hrefs))

In [142]:
def getNewsFromG1(url):
    """Download one article page and extract its title and body text.

    Args:
        url: Address of the article page.

    Returns:
        A dict with keys ``"url"``, ``"title"`` and ``"content"``, or ``None``
        when the page has no ``h1.content-head__title`` element (for example
        a video or live-coverage page with a different layout). ``content`` is
        the text of all ``p.content-text__container`` elements concatenated,
        with whitespace runs collapsed and the ends trimmed.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    browsers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome / 86.0.4240.198Safari / 537.36"}
    # Without a timeout requests can wait forever on an unresponsive server.
    page = requests.get(url, headers=browsers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # find() returns None when nothing matches; calling get_text() on that
    # would raise AttributeError, so report "no article here" instead.
    title_tag = soup.find('h1', class_="content-head__title")
    if title_tag is None:
        return None
    title = title_tag.get_text()

    paragraphs = soup.find_all('p', class_="content-text__container")
    content = ''.join(p.get_text() for p in paragraphs)
    content = re.sub(r'\s+', ' ', content).strip()

    return {
        "url": url,
        "title": title,
        "content": content
    }

In [143]:
def getNewsUrlsFromDN(mainUrl, word):
    """Collect absolute article URLs from a Diário do Nordeste listing page.

    Downloads ``mainUrl``, scans every ``<a>`` element and keeps the hrefs
    that contain ``word`` and are not pagination links (``'?page='``).

    Args:
        mainUrl: URL of the listing page to scan.
        word: Substring an href must contain to be selected.

    Returns:
        List of absolute URLs, in page order, without duplicates. Site-relative
        hrefs are resolved against the Diário do Nordeste domain; hrefs that
        are already absolute are returned unchanged.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    baseUrl = 'https://diariodonordeste.verdesmares.com.br'
    browsers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome / 86.0.4240.198Safari / 537.36"}
    # Without a timeout requests can wait forever on an unresponsive server.
    page = requests.get(mainUrl, headers=browsers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    selectedUrls = []
    for a in soup.find_all('a'):
        newsUrl = a.get('href')
        # Anchors without an href yield None; `word in None` would raise.
        if not newsUrl:
            continue
        if word in newsUrl and '?page=' not in newsUrl:
            # urljoin leaves absolute hrefs alone, so the domain is never
            # prepended to a URL that already carries it.
            selectedUrls.append(urljoin(baseUrl, newsUrl))

    # Drop duplicates while preserving first-seen order.
    return list(dict.fromkeys(selectedUrls))

In [144]:
def getNewsFromDN(url):
    """Download one Diário do Nordeste article published yesterday.

    Args:
        url: Address of the article page.

    Returns:
        A dict with keys ``"url"``, ``"title"`` and ``"content"``, or ``None``
        when the page lacks the expected title or timestamp element, or when
        its timestamp does not contain the string returned by
        ``yesterday_date_dn()``. ``content`` holds one cleaned paragraph per
        line, each terminated by ``'\\n'``.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    browsers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome / 86.0.4240.198Safari / 537.36"}
    # Without a timeout requests can wait forever on an unresponsive server.
    page = requests.get(url, headers=browsers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # find() returns None when nothing matches; treat a page without the
    # expected headline or timestamp as "not an article" instead of crashing.
    titleTag = soup.find('h1', class_="text-heading text-heading-4xl m-l-article__heading")
    timeTag = soup.find('time', class_="m-l-article__date_created")
    if titleTag is None or timeTag is None:
        return None

    title = str_cleasing(titleTag.get_text()).strip()

    timestamp = timeTag.get_text()
    if yesterday_date_dn() not in timestamp:
        return None

    # Clean each paragraph on its own. Cleaning the accumulated text would
    # turn the newline appended after earlier paragraphs back into a space
    # and re-scan the whole text on every iteration.
    content = ""
    for c in soup.find_all('div', class_="m-l-article__content space-y-6 prose"):
        for p in c.find_all('p'):
            content += str_cleasing(p.get_text()) + '\n'

    return {
        "url": url,
        "title": title,
        "content": content
    }

In [145]:
# Teams to scrape. Each entry has a display name ("timeName") and a list of
# sources; each source pairs a short source id ("g1" or "dn", matched by the
# driver loop) with the listing-page URL to scan for that team.
times = [
    {
        "timeName": 'Fortaleza', 
        "sources": [
            {"source": "g1", "url": "https://ge.globo.com/ce/futebol/times/fortaleza/"},
            {"source": "dn", "url": "https://diariodonordeste.verdesmares.com.br/noticias/sobre/Esportes-fortaleza%20esporte%20clube"}
        ]
    }
]

In [171]:
# Fetch every article for every configured team and source.
# NOTE: newsRefereces is rebuilt for each team, so after the loop it only
# holds the articles of the last team in `times`.
for team in times:
    newsRefereces = []
    # URL slug of the team; used by both sources so that adding another team
    # to `times` does not keep filtering by a fixed team name.
    teamSlug = normalize_string(team['timeName'])
    for source in team['sources']:
        if source['source'] == 'g1':
            urls = getNewsUrlsFromG1(teamSlug, source['url'])
            for url in urls:
                newsData = getNewsFromG1(url)
                newsRefereces.append(newsData)
        elif source['source'] == 'dn':
            urls = getNewsUrlsFromDN(source['url'], teamSlug)
            for url in urls:
                newsData = getNewsFromDN(url)
                newsRefereces.append(newsData)
        else:
            print('Fonte não encontrada')
    # Fetchers return None for pages that are skipped; drop those entries.
    newsRefereces = [x for x in newsRefereces if x is not None]


In [182]:
# Scratch cell: exercise the G1 class kept in a sibling notebook.
import import_ipynb

# Runs the sibling notebook; presumably this is what brings G1 into this
# namespace — TODO confirm.
# NOTE(review): the trailing `import G1` appears to be handed to %run as
# script arguments rather than executed as an import statement — confirm.
%run "../jgc_news/g1.ipynb" import G1

# Use the G1 class
g1 = G1()
# The return value of this call is discarded.
g1.yesterday_url_str()
# The saved output of this cell is an AttributeError
# ('NoneType' object has no attribute 'get_text').
news = g1.getNewsData('https://ge.globo.com/ce/futebol/times/fortaleza/noticia/2024/08/10/renato-kayzer-sai-sangrando-apos-cotovelada-de-newton-em-fortaleza-x-criciuma.ghtml')


AttributeError: 'NoneType' object has no attribute 'get_text'

In [185]:
# Scratch cell: exercise the DN class kept in a sibling notebook.
import import_ipynb

# Runs the sibling notebook; presumably this is what brings DN into this
# namespace — TODO confirm.
# NOTE(review): the trailing `import G1` appears to be handed to %run as
# script arguments; it names G1 although the cell uses DN — confirm.
%run "../jgc_news/dn.ipynb" import G1

# Use the DN class
dn = DN()
# The saved output of this cell ends in an UnboundLocalError for a local
# variable named 'hrefs'.
news = dn.getUrls('https://diariodonordeste.verdesmares.com.br/noticias/sobre/Esportes-fortaleza%20esporte%20clube', 'fortaleza')
print(news)


/
/opiniao
/ceara
/jogada
/negocios
/pontopoder
/seguranca
/verso
/ultima-hora
/ultima-hora/automovel
/ultima-hora/ciencia
/estilo-de-vida/culinaria
/entretenimento/e-hit
/entrevista
/ultima-hora/mundo
/ultima-hora/pais
/papo-carreira
/ser-saude
/ultima-hora/seu-direito
/estilo-de-vida/sisi
/ultima-hora/tecnologia
/entretenimento/zoeira
/ultima-hora/al
/ultima-hora/ba
/ultima-hora/ma
/ultima-hora/pb
/ultima-hora/pe
/ultima-hora/pi
/ultima-hora/rn
/ultima-hora/se
/jogada/futebol/brasileirao-serie-a
https://privacidade.geq.com.br/
/projeto-comprova
/publicidade-legal
/ultima-hora
https://diariodonordeste.verdesmares.com.br/noticias/sobre/Estados-Cear%C3%A1-Praia%20%C3%89%20Vida
https://diariodonordeste.verdesmares.com.br/noticias/sobre/educacao-financeira-2.16387
https://diariodonordeste.verdesmares.com.br/noticias/sobre/Institucional-Projeto%20Elas
https://diariodonordeste.verdesmares.com.br/noticias/sobre/Esportes-Arena%20da%20Vida
/ceara/terra-de-sabidos
/projetos/engenharias-que-tran

UnboundLocalError: cannot access local variable 'hrefs' where it is not associated with a value