### Imports necessários

In [None]:
import sys
from pathlib import Path
ROOT_DIR = Path().resolve().parents[0] 
sys.path.append(str(ROOT_DIR))
import word_analiser
from sqlite_handler import SQLiteHandler
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


### Variaveis e constantes

- Carregar base de dados

In [2]:
sqlite = SQLiteHandler(Path().resolve() / 'dbs_union.db')

df = pd.DataFrame(sqlite.fetch_data('posts_unificados'))
df.columns = [
    "id", "entity", "user", "text", "title", "url", "owner",
    "likes", "shares", "coments", "views", "date", "origem"
]
df.head()

Unnamed: 0,id,entity,user,text,title,url,owner,likes,shares,coments,views,date,origem
0,d34575e3b28a351e7fd5476400a12e2c,Nike,,,"Descontou na Netshoes: compre produtos Nike, A...",https://www.tecmundo.com.br/produto/405060-des...,Sem fonte,,,,,Sem tempo,bing_google
1,0ef88057c79a7e8da76b1605beae5b1e,Nike,,,Nike oferece Roupas de moda casual para adulto...,https://revistaforum.com.br/cupom/nike/20-off-...,revistaforum,,,,,11meses,bing_google
2,e8dcc3562bcaeb7adf2335e67d94bae5,Nike,,,Nike oferece Roupas polos para adulto masculin...,https://revistaforum.com.br/cupom/nike/15-off-...,revistaforum,,,,,11meses,bing_google
3,5e6ac8525c60fdf475554e59a6c9da9a,Nike,,,Cupom de desconto Nike,https://www.cartacapital.com.br/cupons/cashbac...,Carta Capital,,,,,2meses,bing_google
4,eaf630cd2f8ec2299371279a1482f053,Nike,,,Subida de preços da Nike começa na próxima sem...,https://clickpetroleoegas.com.br/subida-de-pre...,Click Petróleo e Gás,,,,,12dia(s),bing_google


- Classificando textos

In [3]:
# Cria a nova coluna vazia
df["classificacao"] = ""

for index, row in df.iterrows():
    if row["origem"] == "bing_google":
        texto = row["title"] if pd.notna(row["title"]) else ""
    else:
        texto = row["text"] if pd.notna(row["text"]) else ""
    try:
        classificacao = word_analiser.word_analyser(texto)[0]['label']
    except Exception as e:
        print(f"Erro na linha {index}: {e}")
        classificacao = "erro"
        
    df.at[index, "classificacao"] = classificacao


Erro na linha 717: The size of tensor a (813) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 866: The size of tensor a (536) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 1934: The size of tensor a (549) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 1957: The size of tensor a (557) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 1992: The size of tensor a (529) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 2090: The size of tensor a (624) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 2098: The size of tensor a (1094) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 2101: The size of tensor a (684) must match the size of tensor b (512) at non-singleton dimension 1
Erro na linha 2103: The size of tensor a (684) must match the size of tensor b (512) at n

In [4]:

# Substitui NaN por 0 nas métricas
df[['likes', 'shares', 'views', 'coments']] = df[['likes', 'shares', 'views', 'coments']].fillna(0)

# Normaliza as colunas por seu máximo (evita divisão por zero com replace)
for col in ['likes', 'shares', 'views', 'coments']:
    max_val = df[col].max()
    if max_val > 0:
        df[col + '_norm'] = df[col] / max_val
    else:
        df[col + '_norm'] = 0


In [5]:
def calcular_score_post(row):
    if row.get('origem', '') != 'bing_google':
        likes = row.get('likes_norm', 0) or 0
        shares = row.get('shares_norm', 0) or 0
        views = row.get('views_norm', 0) or 0
        coments = row.get('coments_norm', 0) or 0

        base_score = (
            0.3 * likes +
            0.4 * shares +
            0.1 * views +
            0.2 * coments
        )
        sentimento = row.get('classificacao', '').lower()
        if sentimento == 'positivo':
            fator = 1.2
        elif sentimento == 'neutro':
            fator = 1.0
        elif sentimento == 'negativo':
            fator = 0.7
        else:
            fator = 1.0
    else:
        base_score = 1

        sentimento = row.get('classificacao', '').lower()
        if sentimento == 'positivo':
            fator = 1
        elif sentimento == 'neutro':
            fator = 0.5
        elif sentimento == 'negativo':
            fator = 0.01
        else:
            fator = 1.0

    

    return base_score * fator

df['score_post'] = df.apply(calcular_score_post, axis=1)

In [6]:
df.tail()

Unnamed: 0,id,entity,user,text,title,url,owner,likes,shares,coments,views,date,origem,classificacao,likes_norm,shares_norm,views_norm,coments_norm,score_post
3929,417cdfd9c85d5ad67db881d389881c6e,Adidas,jay,I wish I still had the videos nigga used to be...,,,,0.0,0.0,0.0,11.0,,x,NEUTRAL,0.0,0.0,1.2e-05,0.0,1e-06
3930,57310199e28eaf37fd44d9b221cecec1,Adidas,にき,ヒョンジンがY-3着てたりadidas着てるのほんとにすき,,,,0.0,0.0,0.0,36.0,,x,NEGATIVE,0.0,0.0,4e-05,0.0,4e-06
3931,ed4f1caed5701a77195f3565fe182b32,Adidas,Soulpower,Si hubo mucha plata de Adidas,,,,0.0,0.0,0.0,1464.0,,x,NEUTRAL,0.0,0.0,0.001612,0.0,0.000161
3932,a0599c6edf0e0f2e6430d3bd125a7920,Adidas,生活の知恵Box,adidas(アディダス)のメンズのジャージ上下セット　<PR>,,,,0.0,0.0,0.0,308.0,,x,NEUTRAL,0.0,0.0,0.000339,0.0,3.4e-05
3933,4e640ccedb0c981b89ddc89b5717e654,Adidas,Arlos.ip,This initial,,,,3.0,0.0,2.0,30.0,,x,NEUTRAL,1.3e-05,0.0,3.3e-05,0.000233,5.4e-05


In [7]:

df['entity'] = df['entity'].str.strip().str.lower()

# Mapeia nomes padronizados
correcoes = {
    'openai': 'OpenAI',
    'insper': 'Insper',
    'google': 'Google',
    'adiddas': 'Adidas',
    'adidas': 'Adidas', 
    'nike': 'Nike',
    'zara': 'Zara',
    'renner': 'Renner'
}

df['entity'] = df['entity'].replace(correcoes)


media_entidades = df.groupby('entity')['score_post'].mean().reset_index()
media_entidades.columns = ['entity', 'score_medio']

# Calcula a média geral
media_geral = media_entidades['score_medio'].mean()

# Adiciona uma nova coluna com o score relativo (comparado à média)
media_entidades['score_relativo'] = media_entidades['score_medio'] - media_geral

# 5. Normalização do score relativo para o intervalo [0, 10]
min_score = media_entidades['score_relativo'].min()
max_score = media_entidades['score_relativo'].max()

media_entidades['score_normalizado'] = (
    (media_entidades['score_relativo'] - min_score) / (max_score - min_score)
) * 10

# 6. Opcional: arredonda para deixar mais apresentável
media_entidades['score_normalizado'] = media_entidades['score_normalizado'].round(2)

In [8]:
media_entidades

Unnamed: 0,entity,score_medio,score_relativo,score_normalizado
0,Adidas,0.383207,0.111131,6.45
1,Google,0.233207,-0.038868,3.78
2,Insper,0.144115,-0.127961,2.18
3,Nike,0.021839,-0.250237,0.0
4,OpenAI,0.581727,0.309651,10.0
5,Renner,0.27793,0.005855,4.57
6,Zara,0.262506,-0.00957,4.3


In [11]:
sqlite.setup_table("entity_med", {
    "entity": "TEXT",
    "score_med": "REAL",
    "relat_score": "REAL",
    "normal_score": "REAL"
})

for _, row in media_entidades.iterrows():
    sqlite.insert_data("entity_med", {
        "entity": row["entity"],
        "score_med": row["score_medio"],
        "relat_score": row["score_relativo"],
        "normal_score": row["score_normalizado"]
    })