## Importando bibliotecas:

In [None]:
#!pip install lightfm

In [None]:
# Mount Google Drive at /content/drive so the files referenced below can be
# read and written (google.colab is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
import torch
import pickle
import gc
from time import sleep
from sklearn.decomposition import TruncatedSVD

## Definindo hiperparâmetros

In [None]:
# Weights of the engagement-score linear combination. Each key names the
# signal the weight applies to; the four weights add up to 1.0.
hiperparametros = {
    'timeOnPage_normalized' : 0.2,
    'scrollPercentage_normalized' : 0.2,
    'pageVisitsCount_normalized' : 0.3,
    'numberOfClicksHistory_normalized' : 0.3
}

## Definindo funções auxiliares

In [None]:
def parse_float_list(value):
    """Parse a comma-separated string into a list of floats.

    Args:
        value: Raw cell content, e.g. ``"1.5, 2"``. Missing values
            (NaN/None) and empty strings are accepted.

    Returns:
        list[float]: The parsed numbers, or ``[]`` when the cell is missing
        or blank.

    Raises:
        ValueError: If a non-blank item is not a valid float.
    """
    if pd.isna(value):
        return []
    # read_csv converters receive '' (not NaN) for empty cells, and
    # ''.split(',') is [''] -- skip blank items instead of calling float('').
    return [float(item) for item in value.split(',') if item.strip()]

def parse_str_list(value):
    """Parse a comma-separated string into a list of strings.

    Whitespace around each item is preserved exactly as in the input; only
    blank items are discarded.

    Args:
        value: Raw cell content, e.g. ``"a, b"``. Missing values (NaN/None)
            and empty strings are accepted.

    Returns:
        list[str]: The items, or ``[]`` when the cell is missing or blank.
    """
    if pd.isna(value):
        return []
    # ''.split(',') is [''], which would otherwise report a one-item list
    # holding an empty identifier for an empty cell.
    return [str(item) for item in value.split(',') if item.strip()]

def parse_int_list(value):
    """Parse a comma-separated string into a list of integers.

    Args:
        value: Raw cell content, e.g. ``"1, 2, 3"``. Missing values
            (NaN/None) and empty strings are accepted.

    Returns:
        list[int]: The parsed numbers, or ``[]`` when the cell is missing
        or blank.

    Raises:
        ValueError: If a non-blank item is not a valid integer.
    """
    if pd.isna(value):
        return []
    # read_csv converters receive '' (not NaN) for empty cells, and
    # ''.split(',') is [''] -- skip blank items instead of calling int('').
    return [int(item) for item in value.split(',') if item.strip()]

def parse_int_list_valid(value):
    """Parse a bracketed, whitespace/newline/comma separated list of ints.

    Accepts strings such as ``"[1 2\\n3]"`` or ``"[1, 2]"``: brackets are
    removed, spaces and newlines are treated as separators, and empty
    tokens are ignored. Missing values (NaN/None) yield ``[]``.
    """
    if pd.isna(value):
        return []

    # One pass: delete the brackets, turn spaces/newlines into commas.
    to_commas = str.maketrans({'[': None, ']': None, '\n': ',', ' ': ','})
    tokens = (piece.strip() for piece in value.translate(to_commas).split(','))

    return [int(token) for token in tokens if token]

In [None]:
# Base directory for every file this notebook reads or writes. Keep the
# trailing slash: several paths are built as f'{PATH}<file name>'.
PATH = '/content/drive/MyDrive/'

In [None]:
# Load the previously fitted text models used by get_embedding.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts produced by a trusted run of this project.
with open(f'{PATH}/tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open(f'{PATH}/svd_model.pkl', 'rb') as f:
    svd = pickle.load(f)

def get_embedding(texto):
    """Return the reduced TF-IDF embedding of a single text.

    The text is vectorized with the loaded ``vectorizer`` and projected with
    the loaded ``svd`` model.

    Args:
        texto: The text to embed. Missing values (None/NaN) are embedded as
            the empty string; other non-string values are converted with
            ``str``.

    Returns:
        The first (and only) row produced by ``svd.transform``, or ``None``
        if the vectorizer returned no rows.
    """
    # Presumably `vectorizer` is a scikit-learn text vectorizer, which rejects
    # NaN/None documents -- TODO confirm. Coerce so a missing title does not
    # abort the whole `.apply(get_embedding)` pass.
    if not isinstance(texto, str):
        is_missing = texto is None or (
            pd.api.types.is_scalar(texto) and pd.isna(texto)
        )
        texto = '' if is_missing else str(texto)

    transformed_text = vectorizer.transform([texto])

    if transformed_text.shape[0] > 0:
        tfidf_reduced = svd.transform(transformed_text)
        return tfidf_reduced[0]

    return None

In [None]:
def weighted_avg_embedding(user_group):
    """Average a user's news embeddings, weighted by engagement score.

    Args:
        user_group: Mapping/DataFrame with an ``'engagement_score'`` column
            (numeric) and a ``'news_embedding'`` column (equal-length
            vectors), one entry per interaction.

    Returns:
        numpy.ndarray: The weighted mean embedding (same length as each
        input vector).
    """
    # Force a float copy: with integer scores the in-place operations below
    # raise a casting error, and the copy keeps the caller's data untouched.
    scores = np.array(user_group['engagement_score'], dtype=float)
    embeddings = np.stack(user_group['news_embedding'])

    # Small offset so a group whose scores are all zero still gets valid
    # (uniform) weights instead of a zero weight sum.
    scores += 1e-6

    total = scores.sum()
    if total > 0:
        scores /= total

    return np.average(embeddings, axis=0, weights=scores)

## Carregando dados de treinamento

In [None]:
# Converters shared by every training shard: the history columns are parsed
# from comma-separated strings into Python lists while the CSV is read.
ACCESS_CONVERTERS = {
    'scrollPercentageHistory': parse_float_list,
    'timestampHistory' : parse_int_list,
    'numberOfClicksHistory' : parse_int_list,
    'timeOnPageHistory' : parse_int_list,
    'pageVisitsCountHistory' : parse_int_list,
    'history' : parse_str_list,
}


def load_access_part(part_number, frac=0.50, random_state=42):
    """Read one training shard and return a reproducible random sample.

    Args:
        part_number: Shard number; the file read is
            ``{PATH}treino_parte{part_number}.csv``.
        frac: Fraction of the shard's rows to keep.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        pandas.DataFrame: The sampled rows with the history columns parsed.
    """
    shard = pd.read_csv(
        f'{PATH}treino_parte{part_number}.csv', converters=ACCESS_CONVERTERS
    )
    return shard.sample(frac=frac, random_state=random_state)


access1 = load_access_part(1)
access2 = load_access_part(2)
access3 = load_access_part(3)
access4 = load_access_part(4)
access5 = load_access_part(5)
access6 = load_access_part(6)

df_access = pd.concat([access1, access2, access3, access4, access5, access6], ignore_index=True)

In [None]:
# Drop the per-shard frames; their rows were already concatenated into
# df_access.
del access1
del access2
del access3
del access4
del access5
del access6

In [None]:
# Load the three news-item shards and stack them into a single frame with a
# fresh 0..n-1 index.
news1 = pd.read_csv(f'{PATH}/itens-parte1.csv')
news2 = pd.read_csv(f'{PATH}/itens-parte2.csv')
news3 = pd.read_csv(f'{PATH}/itens-parte3.csv')

df_news= pd.concat([news1, news2, news3], ignore_index=True)

NameError: name 'pd' is not defined

In [None]:
# Drop the per-shard news frames; their rows were already concatenated into
# df_news.
del news1
del news2
del news3

## Pré-processamento

In [None]:
# Deduplicate on the non-list columns only (lists are unhashable, so they
# cannot take part in duplicate detection). A column is classified by the
# type of its first value. Rows with missing values are dropped afterwards.
first_value_is_list = {
    col: isinstance(df_access[col].iloc[0], list) for col in df_access.columns
}
cols_to_check = [col for col, is_list in first_value_is_list.items() if not is_list]
df_access = df_access.drop_duplicates(subset=cols_to_check).dropna()

In [None]:
# Flatten every user's parallel history lists into one record per
# (user, news item) interaction. A row whose lists disagree in length, or
# whose values cannot be converted, is recorded in `lines_erros` instead of
# aborting the run.
news_lines = []
lines_erros = []

# Checked against `history` in this order.
parallel_columns = (
    'timestampHistory',
    'numberOfClicksHistory',
    'timeOnPageHistory',
    'scrollPercentageHistory',
    'pageVisitsCountHistory',
)

for _, row in df_access.iterrows():
    try:
        history_len = len(row['history'])
        if any(len(row[column]) != history_len for column in parallel_columns):
            raise ValueError("As listas nas colunas têm comprimentos diferentes.")

        for position in range(history_len):
            record = {
                'userId': str(row['userId']),
                'userType': str(row['userType']),
                'historySize': int(row['historySize']),
                'newsId': str(row['history'][position]),
                'timestampHistory': int(row['timestampHistory'][position]),
                'numberOfClicksHistory': int(row['numberOfClicksHistory'][position]),
                'timeOnPageHistory': int(row['timeOnPageHistory'][position]),
                'scrollPercentageHistory': float(row['scrollPercentageHistory'][position]),
                'pageVisitsCountHistory': int(row['pageVisitsCountHistory'][position]),
            }
            news_lines.append(record)

    except Exception as error:
        failure = {'line': row['userId'], 'error': str(error)}
        lines_erros.append(failure)

In [None]:
# Report the rows rejected while flattening the histories, if any.
if not lines_erros:
    print("Nenhum erro encontrado.")
else:
    print("Erros encontrados:")
    for erro in lines_erros:
        print(erro)

Nenhum erro encontrado.


In [None]:
# Build the interaction table: one row per record collected in news_lines.
access_exploded= pd.DataFrame(news_lines)

In [None]:
# Release the intermediate Python lists; the records now live in
# access_exploded.
del news_lines
del lines_erros

### Normalizando data de publicação para trabalhar com timestamp

In [None]:
# Parse the publication date as UTC; unparseable values become NaT
# (errors='coerce').
df_news['issued_timestamp'] = pd.to_datetime(df_news['issued'], utc=True, errors='coerce')
# Convert to epoch milliseconds (whole seconds * 1000); missing dates map to
# None. Presumably this matches the unit of timestampHistory, which it is
# compared against later -- TODO confirm.
df_news['issued_timestamp'] = df_news['issued_timestamp'].apply(lambda x: int(x.timestamp()) * 1000 if pd.notnull(x) else None)
# Min-max scale the publication timestamp, fitting the scaler on the news table.
scaler_issued = MinMaxScaler()
df_news['issued_timestamp_normalized'] = scaler_issued.fit_transform(df_news[['issued_timestamp']])

### Gerando embeddings das notícias

In [None]:
# Embed every news title with get_embedding (one vector per article).
df_news['news_embedding'] = df_news['title'].apply(get_embedding)

### Realizando merge entre os datasets

In [None]:
# Normalise both join keys (trim surrounding whitespace, lowercase) so the
# newsId/page merge below matches on equivalent identifiers.
access_exploded['newsId'] = access_exploded['newsId'].str.strip().str.lower()
df_news['page'] = df_news['page'].str.strip().str.lower()

In [None]:
# Save the news table (metadata, publication timestamp and title embedding)
# as Parquet, then release the temporary selection.
df_save_news = df_news[['page', 'title', 'caption', 'body', 'url','issued_timestamp', 'news_embedding']]
df_save_news.to_parquet(f'{PATH}df_news.parquet', index=False)
del df_save_news

In [None]:
# Attach the news attributes to every interaction. The inner join discards
# interactions whose newsId has no matching article; `page` duplicates
# `newsId` after the join, so it is dropped.
inner_joined = (
    access_exploded
    .merge(df_news, how='inner', left_on='newsId', right_on='page')
    .drop(columns=['page'])
)

In [None]:
# Keep only reads that happened strictly after the article's publication
# time; rows with a missing publication timestamp compare False and are
# dropped as well.
inner_joined_filtred = inner_joined.loc[
    inner_joined['timestampHistory'].gt(inner_joined['issued_timestamp'])
]

In [None]:
# The unfiltered join is no longer needed once inner_joined_filtred exists.
del inner_joined

### Calculando popularidade

In [None]:
# Popularity of an article = number of remaining interactions that reference
# it, min-max scaled. Only the scaled column is kept.
popularity_counts = (
    inner_joined_filtred['newsId']
    .value_counts()
    .rename_axis('newsId')
    .reset_index(name='popularity_score')
)

inner_joined_filtred = inner_joined_filtred.merge(
    popularity_counts, how='left', on='newsId'
)

scaler_popularity = MinMaxScaler()
inner_joined_filtred['popularity_score_normalized'] = scaler_popularity.fit_transform(
    inner_joined_filtred[['popularity_score']]
)

inner_joined_filtred = inner_joined_filtred.drop(columns=['popularity_score'])

### Calculando score de engajamento

In [None]:
# Engagement score.
# Step 1: min-max scale every raw interaction signal so they share one range.
scaler_timestamp = MinMaxScaler()
inner_joined_filtred['timestamp_normalized'] = scaler_timestamp.fit_transform(
    inner_joined_filtred[['timestampHistory']]
)

scaler_numberOfClicksHistory = MinMaxScaler()
inner_joined_filtred['numberOfClicksHistory_normalized'] = scaler_numberOfClicksHistory.fit_transform(
    inner_joined_filtred[['numberOfClicksHistory']]
)

scaler_timeOnPage = MinMaxScaler()
inner_joined_filtred['timeOnPage_normalized'] = scaler_timeOnPage.fit_transform(
    inner_joined_filtred[['timeOnPageHistory']]
)
scaler_scroll = MinMaxScaler()
inner_joined_filtred['scrollPercentage_normalized'] = scaler_scroll.fit_transform(
    inner_joined_filtred[['scrollPercentageHistory']]
)
scaler_pageVisits = MinMaxScaler()
inner_joined_filtred['pageVisitsCount_normalized'] = scaler_pageVisits.fit_transform(
    inner_joined_filtred[['pageVisitsCountHistory']]
)

# Step 2: weighted sum of the NORMALIZED signals. Combining the raw columns
# instead lets the signal with the largest magnitude (time on page, in the
# tens of thousands) swamp the others, which makes the configured weights
# meaningless. Each weight is looked up under the name of the column it
# multiplies; 0.1 is the fallback for a missing key.
inner_joined_filtred['engagement_score_raw'] = (
    hiperparametros.get('timeOnPage_normalized',0.1) * inner_joined_filtred['timeOnPage_normalized'] +
    hiperparametros.get('scrollPercentage_normalized',0.1) * inner_joined_filtred['scrollPercentage_normalized'] +
    hiperparametros.get('pageVisitsCount_normalized',0.1)  * inner_joined_filtred['pageVisitsCount_normalized'] +
    hiperparametros.get('numberOfClicksHistory_normalized',0.1) * inner_joined_filtred['numberOfClicksHistory_normalized']
)

# Step 3: rescale the combined score itself.
scaler_engagement = MinMaxScaler()
inner_joined_filtred['engagement_score'] = scaler_engagement.fit_transform(inner_joined_filtred[['engagement_score_raw']])

In [None]:
# Preview the first rows of the engineered interaction table.
inner_joined_filtred.head()

Unnamed: 0,userId,userType,historySize,newsId,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,url,...,issued_timestamp_normalized,news_embedding,popularity_score_normalized,timestamp_normalized,numberOfClicksHistory_normalized,timeOnPage_normalized,scrollPercentage_normalized,pageVisitsCount_normalized,engagement_score_raw,engagement_score
0,f737327be39d6e6f4f29702c77137b715fdb41d693a9b8...,Non-Logged,1,6e1511be-b04d-475f-9338-cc9a6b4f10d1,1658406502336,0,23248,32.13,1,http://g1.globo.com/ba/bahia/noticia/2019/07/1...,...,0.614686,"[0.09124679079248565, 0.027996247035304258, 0....",0.002811,0.453216,0.0,0.000827,6.1e-05,0.0,4656.326,0.000827
1,2de1394fa39edb0fa95eb1197d858fddc2cc0da24930e3...,Logged,1,12e4f2ef-66e2-4816-8267-cf20d45fce79,1656898821075,16,87672,43.84,1,http://g1.globo.com/mundo/noticia/2022/01/10/p...,...,0.926325,"[0.3197532296569859, -0.10587621610525713, 0.4...",0.002977,0.065437,0.001099,0.003746,8.3e-05,0.0,17548.268,0.003747
2,c44c5655b5093387fe48c1657d33a1903149f2c24c555c...,Non-Logged,1,8cd82ad5-bb03-4872-bdbb-a56cef1445aa,1659526627549,0,8672,6.56,1,http://g1.globo.com/meio-ambiente/noticia/2021...,...,0.905481,"[0.30600950530297216, -0.09707926833721148, 0....",0.003804,0.741316,0.0,0.000166,1.2e-05,0.0,1736.012,0.000166
3,76571af9a2e9e1b05183e31872fa1ca38fbfeb4bc578c5...,Non-Logged,1,0c316026-18b5-4da3-a9d3-10b3696a88ad,1657325082237,14,32973,79.05,1,http://g1.globo.com/mg/sul-de-minas/noticia/20...,...,0.909637,"[0.3036497558370645, 0.5902878081033303, -0.11...",0.000248,0.175072,0.000961,0.001268,0.00015,0.0,6614.91,0.001271
4,9897f1558bbe8237c14f4eb66fafdbfc5a82e349981423...,Logged,2,f6b5d170-48b9-4f8e-88d4-c84b6668f3bd,1660046047726,1,65499,22.28,1,http://g1.globo.com/politica/blog/andreia-sadi...,...,0.991264,"[0.2672366129675922, 0.33229563038249665, 0.37...",0.712915,0.874912,6.9e-05,0.002741,4.2e-05,0.0,13104.856,0.00274


### Calculando embedding médio ponderado do usuário

In [None]:
# One engagement-weighted average embedding per user. Only the two columns
# the helper reads are selected, so apply() does not operate on the grouping
# column (pandas deprecates that and emits a warning).
user_embeddings = (
    inner_joined_filtred
    .groupby('userId')[['engagement_score', 'news_embedding']]
    .apply(weighted_avg_embedding)
)

  user_embeddings = inner_joined_filtred.groupby('userId').apply(weighted_avg_embedding)


In [None]:
# Attach each user's weighted embedding to every one of their interaction rows.
inner_joined_filtred['user_weighted_embedding'] = inner_joined_filtred['userId'].map(user_embeddings)

In [None]:
# Save the users: one row per userId with its weighted embedding.
# NOTE(review): userId is presumably selected explicitly so it survives as a
# column, since the groupby index is not written (index=False) -- confirm.
grouped_users = inner_joined_filtred.groupby('userId')[['userId', 'user_weighted_embedding']].first()
grouped_users.to_parquet(f'{PATH}df_users.parquet', index=False)
del grouped_users

In [None]:
# Save the interactions (user, news item, read timestamp, engagement score)
# as Parquet, then release the temporary selection.
access = inner_joined_filtred[['userId', 'newsId', 'timestampHistory' ,'engagement_score']]
access.to_parquet(f'{PATH}df_access.parquet', index=False)
del access

In [None]:
# Expose the normalized values under the shorter column names.
# issued_timestamp is overwritten here with its normalized version.
inner_joined_filtred['popularity_score'] = inner_joined_filtred['popularity_score_normalized']
inner_joined_filtred['issued_timestamp'] = inner_joined_filtred['issued_timestamp_normalized']

### Salvando dados pré-processados para posterior treinamento de modelo

In [None]:
# Keep only the columns that are written to the chunk files below.
inner_joined_filtred = inner_joined_filtred[['userId', 'newsId', 'engagement_score', 'popularity_score', 'issued_timestamp' ,'news_embedding', 'user_weighted_embedding']]

In [None]:
# Split the table into 6 consecutive row blocks of near-equal size.
# np.array_split applied directly to a DataFrame goes through the deprecated
# DataFrame.swapaxes (it emits a FutureWarning), so the same block sizes are
# computed by hand and the frame is sliced by position instead.
n_chunks = 6
base_size, remainder = divmod(len(inner_joined_filtred), n_chunks)
# As in np.array_split, the first `remainder` blocks get one extra row.
chunk_sizes = [base_size + 1 if k < remainder else base_size for k in range(n_chunks)]
chunk_bounds = np.cumsum([0] + chunk_sizes)
chunks = [
    inner_joined_filtred.iloc[start:stop]
    for start, stop in zip(chunk_bounds[:-1], chunk_bounds[1:])
]

  return bound(*args, **kwds)


In [None]:
# Drop the name of the full table, run a garbage-collection pass and pause
# for 10 seconds before writing the chunks.
del inner_joined_filtred
gc.collect()
sleep(10)

In [None]:
# Write every chunk to its own Parquet file ({PATH}chunk_<i>.parquet).
for i, chunk in enumerate(chunks):

    #chunk['news_embedding'] = chunk['news_embedding'].apply(lambda x: ','.join(map(str, x)) if x is not None else None)
    #chunk['user_weighted_embedding'] = chunk['user_weighted_embedding'].apply(lambda x: ','.join(map(str, x)) if x is not None else None)

    chunk.to_parquet(f"{PATH}chunk_{i}.parquet", index=False)
    print(f'chunk_{i}.parquet salvo!')


    # NOTE(review): `del chunk` only unbinds the loop variable; the `chunks`
    # list still references every frame, so this frees no memory by itself.
    del chunk
    gc.collect()
    sleep(5)

print("Processamento concluído!")

chunk_0.parquet salvo!
chunk_1.parquet salvo!
chunk_2.parquet salvo!
chunk_3.parquet salvo!
chunk_4.parquet salvo!
chunk_5.parquet salvo!
Processamento concluído!
