# In [2]:
import nltk
from nltk.corpus import stopwords
import re
from unidecode import unidecode
import openai
import os
import datetime
import requests

# In [None]:
class SocratesNews:
    """Build a news article from reference articles and publish it.

    Constructing an instance runs the whole pipeline eagerly: the references
    are filtered, a prompt is assembled from the survivors, the OpenAI chat
    completions API is asked to write the article, and the result is POSTed
    to ``{API_URL}/api/news``.

    Environment variables read: ``OPENAI_API_KEY``, ``API_URL``, ``UID`` and
    ``SUPPORT_UID``.

    Attributes set by ``__init__``:
        references: the list passed in; it is filtered in place.
        sectionId, sectionArr, tags: stored as given and forwarded to the API.
        stop_words: Portuguese stop words from the NLTK corpus.
        script: the prompt returned by ``get_script``.
        news: ``{"title": ..., "content": ...}`` returned by ``get_news``.
        publish: the timestamp returned by ``publish_news``.
    """

    # A reference is kept only if its preprocessed text has at least this
    # many words in total and this many distinct words...
    MIN_WORDS = 100
    MIN_UNIQUE_WORDS = 100
    # ...and at most this many distinct words.
    MAX_UNIQUE_WORDS = 500
    # References stop being appended to the prompt once it reaches this many
    # space-separated words.
    MAX_PROMPT_WORDS = 8000
    # Seconds to wait for the publishing API before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, references, sectionId, sectionArr, tags):
        """Store the inputs and run the filter -> prompt -> generate -> publish pipeline.

        Args:
            references: mutable list of objects exposing ``title``, ``text``
                and ``url`` attributes.
            sectionId: section identifier forwarded to the publishing API.
            sectionArr: sequence of section names; the last item is used as
                the team name and the first as the section name.
            tags: tags forwarded to the publishing API.
        """
        self.references = references
        self.sectionId = sectionId
        self.sectionArr = sectionArr
        self.tags = tags
        # Only hit the network when the corpus is not installed yet, instead
        # of on every construction.
        try:
            self.stop_words = stopwords.words('portuguese')
        except LookupError:
            nltk.download('stopwords')
            self.stop_words = stopwords.words('portuguese')
        self.script = self.get_script()
        self.news = self.get_news()
        self.publish = self.publish_news()

    def preprocess_text(self, text):
        """Return ``text`` normalised for word counting.

        Digits and punctuation are removed, the text is lower-cased, stop
        words are dropped and the result is transliterated with ``unidecode``.
        """
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        stop_words = set(self.stop_words)
        kept = [word for word in text.lower().split() if word not in stop_words]
        return unidecode(' '.join(kept))

    def _is_usable(self, ref):
        """Return True when ``ref`` has a title and a text within the size bounds."""
        title = getattr(ref, 'title', None)
        text = getattr(ref, 'text', None)
        if title is None or text is None:
            return False
        try:
            cleaned = self.preprocess_text(text)
        except TypeError:
            # Non-string text cannot be measured; treat it as unusable.
            return False
        words = cleaned.split(' ')
        unique_count = len(set(words))
        return (
            len(words) >= self.MIN_WORDS
            and self.MIN_UNIQUE_WORDS <= unique_count <= self.MAX_UNIQUE_WORDS
        )

    def filter_news(self):
        """Drop unusable references from ``self.references`` in place.

        The list is rebuilt with slice assignment rather than calling
        ``remove`` during iteration, which would skip the element following
        each removal.
        """
        self.references[:] = [ref for ref in self.references if self._is_usable(ref)]
        return None

    def show_news_data(self):
        """Print the URL, title and raw word count of every reference."""
        for counter, ref in enumerate(self.references, start=1):
            if ref.text is not None:
                summary = 'Possui text com ' + str(len(ref.text.split(' '))) + ' palavras'
            else:
                summary = "N/A"
            print(f'#{counter} >> {ref.url}')
            print(f'{ref.title}')
            print(summary)
            print('\n')

    def get_script(self):
        """Filter the references and return the prompt sent to the model.

        References are appended in order until the prompt reaches
        ``MAX_PROMPT_WORDS`` space-separated words.
        """
        # A single pass is enough now that filter_news no longer skips items.
        self.filter_news()

        team = self.sectionArr[-1]
        parts = [
            f'Escreva um texto jornalístico sem muita enrolação entre 400 e 500 palavras que será publicado em um jornal de grande circulação falando sobre um time de futebol brasileiro, o {team}.\n',
            f'Crie o texto em linguagem direta e apenas com informações relevantes sobre o time de futebol {team}.\n',
            'Considere como referêcia exclusivamente o que foi extraído de alguns sites de prestígio, elencados abaixo.\n\n',
        ]

        # len(s.split(' ')) == s.count(' ') + 1, so the running space count
        # gives the same word count without re-splitting the whole prompt.
        spaces = sum(part.count(' ') for part in parts)
        for ref in self.references:
            if spaces + 1 >= self.MAX_PROMPT_WORDS:
                break
            piece = f'{ref.title}\n{ref.text}\n\n'
            parts.append(piece)
            spaces += piece.count(' ')

        return ''.join(parts)

    @staticmethod
    def _parse_news(prediction):
        """Split the model output into a title and a body.

        Lines of 10 characters or fewer are discarded. The first remaining
        line becomes the title (with ``*`` and ``#`` removed and surrounding
        whitespace stripped); the rest are joined with blank lines.

        Raises:
            ValueError: if no line longer than 10 characters is present.
        """
        paragraphs = [line for line in (prediction or '').split("\n") if len(line) > 10]
        if not paragraphs:
            raise ValueError('Model response contains no usable paragraph')
        title = paragraphs[0].replace("*", "").replace("#", "").strip()
        return {"title": title, "content": "\n\n".join(paragraphs[1:])}

    def get_news(self):
        """Send ``self.script`` to the OpenAI chat API and return the parsed article.

        Returns:
            dict with ``"title"`` and ``"content"`` keys.

        Raises:
            ValueError: if the response has no usable paragraph.
        """
        client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
        message = [{
            'role': 'user',
            'content': self.script
        }]
        response = client.chat.completions.create(
            messages=message,
            model="gpt-4o-mini",
            max_tokens=10000,
            temperature=0,
        )
        return self._parse_news(response.choices[0].message.content)

    def publish_news(self):
        """POST the generated article to ``{API_URL}/api/news``.

        A non-200 status is reported on stdout; it is not raised.

        Returns:
            ``datetime.datetime.now()`` taken after the request, whether or
            not the API answered with status 200.
        """
        url = f'{os.getenv("API_URL")}/api/news'

        payload = {
            "uid": os.getenv("UID"),
            "supportUid": os.getenv("SUPPORT_UID"),
            "h1": self.sectionArr[-1],
            "h2": self.news["title"],
            "text": self.news["content"],
            "sectionId": self.sectionId,
            "sectionName": self.sectionArr[0],
            "sectionArr": self.sectionArr,
            "tags": self.tags,
        }
        # Without a timeout a stalled server would block this call forever.
        response = requests.post(url, json=payload, timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            print(f'Erro: {response.status_code}')

        return datetime.datetime.now()
    
    
    
    
    
    
    
