In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import faiss
from sklearn.decomposition import TruncatedSVD


In [2]:
# VAR ENV
news_folder = '../Data/itens/itens'
user_news_folder = '../Data/files/treino'


In [3]:
def read_all_csv_files_in_folder(folder_path):
    # List to hold DataFrames
    dataframes = []

    # Iterate over all files in the folder
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            # Read the CSV file and append the DataFrame to the list
            df = pd.read_csv(file_path)
            dataframes.append(df)

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)
    return combined_df

In [4]:
user_interactions = read_all_csv_files_in_folder(user_news_folder)

In [5]:
news_data = read_all_csv_files_in_folder(news_folder)

In [6]:
news_data["content"] = news_data["title"] + " " + news_data["body"] + " " + news_data["caption"]
news_data.head()

Unnamed: 0,page,url,issued,modified,title,body,caption,content
0,13db0ab1-eea2-4603-84c4-f40a876c7400,http://g1.globo.com/am/amazonas/noticia/2022/0...,2022-06-18 20:37:45+00:00,2023-04-15 00:02:08+00:00,Caso Bruno e Dom: 3º suspeito tem prisão tempo...,"Após audiência de custódia, a Justiça do Amazo...",Jeferson da Silva Lima foi escoltado por agent...,Caso Bruno e Dom: 3º suspeito tem prisão tempo...
1,92907b73-5cd3-4184-8d8c-e206aed2bf1c,http://g1.globo.com/pa/santarem-regiao/noticia...,2019-06-20 17:19:52+00:00,2023-06-16 20:19:15+00:00,Linguajar dos santarenos é diferenciado e chei...,Vista aérea de Santarém\nÁdrio Denner/ AD Prod...,As expressões santarenas não significam apenas...,Linguajar dos santarenos é diferenciado e chei...
2,61e07f64-cddf-46f2-b50c-ea0a39c22050,http://g1.globo.com/mundo/noticia/2022/07/08/e...,2022-07-08 08:55:52+00:00,2023-04-15 04:25:39+00:00,Ex-premiê Shinzo Abe morre após ser baleado no...,Novo vídeo mostra que assassino de Shinzo Abe ...,Ex-primeiro-ministro foi atingido por tiros de...,Ex-premiê Shinzo Abe morre após ser baleado no...
3,30e2e6c5-554a-48ed-a35f-6c6691c8ac9b,http://g1.globo.com/politica/noticia/2021/09/0...,2021-09-09 19:06:46+00:00,2023-06-07 17:44:54+00:00,"Relator no STF, Fachin vota contra marco tempo...","Relator no STF, Fachin vota contra marco tempo...",Ministro defendeu que posse indígena é diferen...,"Relator no STF, Fachin vota contra marco tempo..."
4,9dff71eb-b681-40c7-ac8d-68017ac36675,http://g1.globo.com/politica/noticia/2021/09/1...,2021-09-15 19:16:13+00:00,2023-06-07 17:43:39+00:00,"\nApós 2 votos, pedido de vista suspende julga...",Após um pedido de vista (mais tempo para análi...,"Pelo marco temporal, índios só podem reivindic...","\nApós 2 votos, pedido de vista suspende julga..."


In [7]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")  # Baixar a lista de stop words
stop_words_pt = stopwords.words("portuguese")  # Carregar stop words do PT-BR


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muril\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Vetorização do conteúdo das notícias
vectorizer = TfidfVectorizer(stop_words=stop_words_pt, max_features=50000)
tfidf_matrix = vectorizer.fit_transform(news_data["content"])

In [9]:
# Reduzir para 300 dimensões (ou ajuste para seu caso)
svd = TruncatedSVD(n_components=300)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

In [10]:
# Criar índice FAISS com a matriz reduzida
#index = faiss.IndexFlatL2(tfidf_reduced.shape[1])
#index.add(tfidf_reduced.astype("float32"))
index = faiss.read_index("faiss_index.bin")


In [11]:
# Criando um dicionário para mapear IDs das notícias
news_index = {news_data.iloc[i]["page"]: i for i in range(len(news_data))}

In [35]:
def recommend_news_faiss(user_id, user_interactions, index, news_data, top_n=5):
    """
    Gera recomendações de notícias para um usuário usando FAISS.

    user_id: ID do usuário
    user_interactions: Dicionário {user_id: [lista de notícias lidas]}
    index: Índice FAISS treinado
    news_data: DataFrame com notícias
    top_n: Número de recomendações desejadas
    
    """

    # Verifica se o usuário existe no dataset
    if user_id not in user_interactions:
        return "Usuário não encontrado."

    read_news = user_interactions[user_id]  # Notícias que o usuário já leu
    print(len(read_news))
    if not read_news:
        return "Nenhuma notícia lida por esse usuário."

    # Converter lista de IDs de notícias para índices da matriz TF-IDF
    read_indices = [news_index[nid] for nid in read_news if nid in news_index]
    print(len(read_indices))
    if not read_indices:
        return "Não há notícias válidas no histórico do usuário."

    # Pegar os vetores das notícias que ele já leu
    read_vectors = tfidf_reduced[read_indices]
    print(len(read_vectors))
    # Buscar as notícias mais similares no índice FAISS
    _, I = index.search(read_vectors, top_n + len(read_news))  # Pegamos mais para remover duplicatas

    # Coletar os IDs das recomendações, excluindo as que já foram lidas
    recommended_news = []
    for news_indices in I:
        for idx in news_indices:
            news_id = news_data.iloc[idx]["page"]
            if news_id not in read_news and news_id not in recommended_news:
                recommended_news.append(news_id)
            if len(recommended_news) >= top_n:
                break

    return recommended_news if recommended_news else "Nenhuma recomendação disponível."



107
107
107


In [13]:
user_interactions.head()

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,"c8aab885-433d-4e46-8066-479f40ba7fb2, 68d2039c...","1657146417045, 1657146605778, 1657146698738","76, 38, 41","20380, 21184, 35438","50.3, 18.18, 16.46","2, 1, 1","1657146417045, 1657146605778, 1657146698738"
1,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,"3325b5a1-979a-4cb3-82b6-63905c9edbe8, fe856057...","1656684240278, 1656761266729, 1656761528085, 1...","7, 80, 2, 1, 7, 62, 26, 44, 4, 4, 14, 45, 13, ...","6049, 210489, 8672, 10000, 30000, 123007, 9965...","25.35, 45.66, 35.3, 28.05, 36.53, 47.57, 55.33...","1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1...","1656684240278, 1656761266729, 1656761528085, 1..."
2,0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...,Logged,107,"04756569-593e-4133-a95a-83d35d43dbbd, 29b6b142...","1656678946256, 1656701076495, 1656701882565, 1...","0, 0, 0, 0, 0, 44, 0, 0, 2, 1, 0, 0, 0, 44, 0,...","311274, 140000, 32515, 157018, 118689, 159243,...","67.58, 47.22, 41.52, 63.09, 51.38, 65.11, 71.9...","1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1...","1656678946256, 1656701076495, 1656701882565, 1..."
3,c1e8d644329a78ea1f994292db624c57980b2886cfbc2d...,Non-Logged,56,"1f2b9c2f-a2d2-4192-b009-09065da8ec23, 04756569...","1658333312180, 1658404553818, 1658408449062, 1...","8, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 1...","182696, 91925, 30000, 273655, 126409, 42980, 1...","58.26, 72.66, 22.57, 59.89, 40.36, 36.35, 14.7...","1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","1658333312180, 1658404553818, 1658408449062, 1..."
4,e777d1f31d4d955b63d60acc13df336d3903f52ab8f8f4...,Non-Logged,4,"bebdeb3e-1699-43e0-a1b8-989f5a6ab679, f4b484a7...","1658766608801, 1658766608801, 1660084035094, 1...","579, 579, 7, 2","801396, 801396, 10000, 10000","78.74, 78.74, 16.71, 9.34","7, 7, 1, 1","1658766608801, 1658766608801, 1660084035094, 1..."


In [None]:
user_dict = user_interactions.groupby("userId")["history"].apply(list).to_dict()
#split history ,
user_dict = {k: [x.split(",") for x in v][0] for k, v in user_dict.items()}
user_dict = {k: [x.strip() for x in v] for k, v in user_dict.items()}
user_dict

In [32]:
recommend_news_faiss("0adffd7450d3b9840d8c6215f0569ad942e782fb19b805367b02b709b73f42a1", user_dict, index, news_data)

107
107
107


['df8107a6-d4c6-4a81-b616-59954fecf7a5',
 '1138eccb-bda8-48f9-a3ff-da64725b6b32',
 '26c6f3db-3bfc-4bfc-b7bc-6b273f457ed0',
 'be26528d-71de-4066-8dcd-6aa0df6ed405',
 'b9b5215a-a4ac-492c-84a7-10f2ce140bf5',
 '2595f507-da08-4882-8c95-c0e16a1b64a4',
 '16162a6c-e495-4ad5-8d64-d7f243baac3c',
 'bf000044-173b-4172-837b-71f9bb3b2c30',
 '2911a7aa-0f44-453f-9563-9973c1e84993',
 '75925628-7f2c-4378-bcd6-c426acff85ff',
 'da98ee61-460a-4070-9baf-cbf6b332f2d8',
 '5ed87b37-e22a-46ac-a0ac-c1d24d0fdf1a',
 'ed080132-3ebe-4535-9b7f-323152187154',
 '5fb7d709-c205-47c0-8542-5a1610645a5c',
 '3eb9c23b-541d-4d14-85fc-88da4dcd3473',
 '42656b0b-0584-4b25-a566-468557a8fd81',
 '187e8053-8065-4fe5-9028-716fe1f01ea0',
 '6cba99f0-1a5c-4269-93cf-4d8d7f2137e2',
 '61174be1-6d7b-4c8b-8d3b-4d147afe26ca',
 'c881fdba-e600-4b9b-8c07-8bd6eed8775c',
 'd05ff1d1-bc8c-409c-8978-11bad2651fa0',
 '43ba73b5-56d0-4f04-bbfd-c502722dd478',
 '59ed3406-f0e4-440f-b243-34752851aedd',
 '9201771f-0733-48f5-a078-0366f66261f0',
 '5ff38371-6539-

In [25]:
# Get line of news_data by id
news_data[news_data['page'] == 'f4b484a7-38f7-4246-be9e-4e2cd8373bcd']

#bebdeb3e-1699-43e0-a1b8-989f5a6ab679, f4b484a7-38f7-4246-be9e-4e2cd8373bcd, esid:conteudo_editorial_g1#materia#https://especiais.g1.globo.com/economia/concursos-e-emprego/lista-de-concursos-publicos-e-vagas-de-emprego/, esid:conteudo_editorial_g1#materia#http://especiais.g1.globo.com/economia/concursos-e-emprego/lista-de-concursos-publicos-e-vagas-de-emprego/

Unnamed: 0,page,url,issued,modified,title,body,caption,content
88597,f4b484a7-38f7-4246-be9e-4e2cd8373bcd,http://g1.globo.com/economia/noticia/2022/08/0...,2022-08-04 15:27:29+00:00,2022-09-21 19:02:56+00:00,Empréstimo consignado para beneficiários do Au...,Piso do Auxílio Brasil será de R$ 600 até o fi...,Governo aprovou Medida Provisória que permite ...,Empréstimo consignado para beneficiários do Au...


In [22]:
#faiss.write_index(index, "faiss_index.bin")
print("Índice FAISS salvo com sucesso!")


Índice FAISS salvo com sucesso!


In [19]:
#import pickle
#
## Criar um dicionário com o índice FAISS e o vetor TF-IDF
#model_data = {
#    "faiss_index": "faiss_index.bin",
#    "vectorizer": vectorizer,  # TfidfVectorizer treinado
#    "news_index": news_index,  # Dicionário {news_id: índice}
#}
#
#with open("model_data.pkl", "wb") as f:
#    pickle.dump(model_data, f)

In [None]:
#with open("model_data.pkl", "rb") as f:
#    model_data = pickle.load(f)
#
#index = faiss.read_index(model_data["faiss_index"])
#vectorizer = model_data["vectorizer"]
#news_index = model_data["news_index"]
#
#print("Modelo FAISS e metadados carregados!")