In [22]:
import re
from datetime import datetime, timedelta, timezone
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
class XmlToNews:
    """Download an Atom-style XML feed and expose its recent entries.

    Constructing an instance performs the HTTP request immediately and
    stores the results on the instance:

    * ``xml_soup`` -- the parsed feed (BeautifulSoup with the ``'xml'`` parser).
    * ``title`` -- the feed title without its leading ``"<prefix> - "`` part.
    * ``news`` -- list of dicts with the keys ``id``, ``title``, ``url``,
      ``source`` and ``publishedAt`` for every sufficiently recent entry.

    Args:
        base_url: URL of the feed to download.
        timeout: Seconds to wait for the HTTP response before giving up.
        max_age_hours: Entries published more than this many hours ago are
            left out of ``news``.
    """

    # (domain, label) pairs used by get_source; checked in this order.
    _SOURCES = (
        ('tntsports.com.br', 'tntsports'),
        ('correiobraziliense.com.br', 'correiobraziliense'),
        ('placar.com.br', 'placar'),
        ('flamengo.com.br', 'flamengo'),
        ('agoracomvoce.com.br', 'agoracomvoce'),
        ('folhape.com.br', 'folhape'),
        ('espn.com.br', 'espn'),
        ('uol.com.br', 'uol'),
        ('ge.globo.com', 'geglobo'),
    )

    # Browser-like User-Agent. The previous literal had stray spaces around
    # the "/" separators and no space before "Safari", which made it malformed.
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    )

    def __init__(self, base_url, timeout=10, max_age_hours=24):
        self.base_url = base_url
        # Both settings must exist before get_xml()/get_news() run below.
        self.timeout = timeout
        self.max_age_hours = max_age_hours
        self.xml_soup = self.get_xml()
        self.title = self.get_title()
        self.news = self.get_news()

    def yesterday_url_str(self):
        """Return yesterday's local date formatted as ``DD/MM/YYYY``."""
        yesterday = datetime.now() - timedelta(days=1)
        return yesterday.strftime('%d/%m/%Y')

    def get_xml(self):
        """Download ``base_url`` and return it parsed as XML.

        Raises:
            requests.RequestException: On connection problems, timeouts or
                a non-success HTTP status.
        """
        response = requests.get(
            self.base_url,
            headers={'User-Agent': self._USER_AGENT},
            timeout=self.timeout,  # without it a stalled server blocks forever
        )
        # Fail here instead of parsing an error page as if it were the feed.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'xml')

    def get_title(self):
        """Return the feed title without its ``"<prefix> - "`` part.

        Only the first ``' - '`` is treated as the separator, so a title that
        itself contains ``' - '`` is kept whole. If there is no separator the
        full title is returned; if the feed has no ``<title>`` the result is
        an empty string.
        """
        node = self.xml_soup.find('title')
        if node is None:
            return ''
        title = node.get_text()
        _prefix, separator, remainder = title.partition(' - ')
        return remainder if separator else title

    @staticmethod
    def _host_of(url):
        """Return the lower-cased host name that ``url`` finally points to.

        A redirect link of the form ``https://www.google.<tld>/url?...&url=<target>``
        is unwrapped so that the host of ``<target>`` is returned. An empty
        string is returned when no host can be determined.
        """
        if not url:
            return ''
        try:
            parsed = urlparse(url)
            if not parsed.netloc:
                # Scheme-less input such as "www.example.com/page".
                parsed = urlparse('//' + url)
            host = parsed.hostname or ''
            if parsed.path == '/url' and 'google.' in host:
                wrapped = parse_qs(parsed.query).get('url')
                if wrapped:
                    host = urlparse(wrapped[0]).hostname or ''
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6 host).
            return ''
        return host

    def get_source(self, url):
        """Map an article URL to a short label for its publisher.

        The decision is based on the host name of the article (after
        unwrapping a Google redirect link), not on the raw URL text, so a
        known domain that merely appears in a path or query string does not
        produce a false match.

        Returns:
            The label of the first matching known domain, or ``'outro'``.
        """
        host = self._host_of(url)
        for domain, label in self._SOURCES:
            if host == domain or host.endswith('.' + domain):
                return label
        return 'outro'

    def get_until_now(self, ts_str):
        """Return the hours elapsed between ``ts_str`` and now.

        Args:
            ts_str: UTC timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

        Raises:
            ValueError: If ``ts_str`` does not match that format.
        """
        published = datetime.strptime(ts_str, '%Y-%m-%dT%H:%M:%SZ')
        published = published.replace(tzinfo=timezone.utc)
        elapsed = datetime.now(timezone.utc) - published
        return elapsed.total_seconds() / 3600

    def get_news(self):
        """Collect the feed entries published within ``max_age_hours``.

        Entries that are too old are skipped one by one rather than ending
        the scan, so the result does not depend on the feed being sorted by
        date. Entries lacking an expected element, a link ``href`` or a
        parsable timestamp are skipped as well.

        Returns:
            A list of dicts with the keys ``id``, ``title``, ``url``,
            ``source`` and ``publishedAt``, in feed order.
        """
        result = []
        for entry in self.xml_soup.find_all('entry'):
            id_node = entry.find('id')
            title_node = entry.find('title', type='html')
            link_node = entry.find('link')
            published_node = entry.find('published')
            nodes = (id_node, title_node, link_node, published_node)
            if any(node is None for node in nodes):
                continue

            url = link_node.get('href')
            if url is None:
                continue

            published_at = published_node.get_text()
            try:
                age_hours = self.get_until_now(published_at)
            except ValueError:
                continue
            if age_hours > self.max_age_hours:
                continue

            result.append({
                'id': id_node.get_text(),
                'title': title_node.get_text(),
                'url': url,
                'source': self.get_source(url),
                'publishedAt': published_at,
            })
        return result

# Download the alert feed and build the list of recent entries.
news = XmlToNews(
    "https://www.google.com.br/alerts/feeds/11679571385690767210/11098906599508033294"
)
# Bare expression: displays the collected entries when run as a notebook cell.
news.news

[{'id': 'tag:google.com,2013:googlealerts/feed:3571392005149868623',
  'title': '<b>Flamengo</b> bate o Ceará na Gávea, pelo Brasileiro Sub-17',
  'url': 'https://www.google.com/url?rct=j&sa=t&url=https://www.flamengo.com.br/noticias/futebol-de-base/flamengo-bate-o-ceara-na-gavea--pelo-brasileiro-sub-17&ct=ga&cd=CAIyHTJmNTc2YWJhNTI3Y2MzYjY6Y29tLmJyOnB0OkJS&usg=AOvVaw176ky0wPFr6aWvtBa_cG0a',
  'source': 'flamengo',
  'tilnow': 0.3185197302777778,
  'publishedAt': '2024-08-15T18:46:12Z'},
 {'id': 'tag:google.com,2013:googlealerts/feed:2893688343268496447',
  'title': 'Quem é Carlos Alcaraz, alvo do <b>Flamengo</b> e possível contratação mais cara do <b>futebol</b> brasileiro?',
  'url': 'https://www.google.com/url?rct=j&sa=t&url=https://www.terra.com.br/esportes/flamengo/quem-e-carlos-alcaraz-alvo-do-flamengo-e-possivel-contratacao-mais-cara-do-futebol-brasileiro,faddb5d62c8230a399904e0f36c12dd31nqgp1u7.html&ct=ga&cd=CAIyHTJmNTc2YWJhNTI3Y2MzYjY6Y29tLmJyOnB0OkJS&usg=AOvVaw2uj20P4UjLjbkI1-