## Importando bibliotecas

In [None]:
#!pip install lightfm
#!pip install -U bentoml

## Definindo hiperparâmetros

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from lightfm import LightFM
from lightfm.data import Dataset

In [None]:
hiperparametros = {
    'k': 10,
    'num_threads' : 2,
    'loss' : 'warp',
    'item_alpha' :1e-5, # penalidade de features de items
    'user_alpha' :1e-5,  # penalidade de features de usuarios
    'random_state' : 42,
    'epochs' : 10,
    'heuristic': {
        'popularity_wheight' : 0.8,
        'recency_wheight' : 0.2
    }
}

In [None]:
PATH = '/content/drive/MyDrive/'

## Modelagem

### Carregando treinamento

In [None]:
train1 = pd.read_parquet(f'{PATH}chunk_0.parquet').sample(frac=0.01)
train2 = pd.read_parquet(f'{PATH}chunk_1.parquet')
train3 = pd.read_parquet(f'{PATH}chunk_2.parquet')
train4 = pd.read_parquet(f'{PATH}chunk_3.parquet')
train5 = pd.read_parquet(f'{PATH}chunk_4.parquet')
train6 = pd.read_parquet(f'{PATH}chunk_5.parquet')

train_data = pd.concat([train1, train2, train3, train4, train5, train6], ignore_index=True)

In [None]:
del train1
del train2
del train3
del train4
del train5
del train6

In [None]:
train_data.head()

Unnamed: 0,userId,newsId,engagement_score,popularity_score,issued_timestamp,news_embedding,user_weighted_embedding
19018,6cc090e8e3a058a35e3cbe83a33b13d33cad4ff1ef3c65...,6f75413a-d1d4-4f32-8ecd-f02a532a7b06,0.00364,0.058138,0.99045,"[0.026605241722758355, 0.01291719678240135, 0....","[0.33602800752146056, 0.020283905642255774, 0...."
97460,e838e83b35e07112893890b7c4848cc9f2631ad7a68b17...,e987b2ab-5b21-4320-8470-a0a9947d12f2,0.002041,0.0427,0.987104,"[0.06985880609630948, 0.042148433484136015, 0....","[0.3337053855989045, 0.08034954486359032, 0.02..."
125304,81b6ee3309bbb266444fd31d9161ad5e0c5a065fe7d8df...,17f9cb6a-ec41-49ad-a52c-748b318c7e4c,0.000227,0.006898,0.994309,"[0.051342856906292346, 0.01261143205606899, 0....","[0.33464175710557886, 0.07202656897011842, 0.0..."
115844,7faf146514fdf9e61f39af35b9cb5854e847dd2bc8f78f...,d2593c3d-2347-40d9-948c-b6065e8459a9,0.001134,1.0,0.988088,"[0.41131494845979033, 0.16391853784663804, 0.0...","[0.38531372849297446, -0.008160789690843535, 0..."
183691,3026a21a23d1c53a0260339d4cf6cca2c3c0580fdca003...,64be40be-0f38-400e-bdb5-229289ac9ce3,0.000679,0.018887,0.985075,"[0.3697049023898116, 0.1607022227848471, -0.04...","[0.4845935623383476, -0.1464559127313242, -0.0..."


### Heurística

In [None]:
heuristic_popularity = (
    train_data.drop_duplicates(subset='newsId')
    .sort_values(by='popularity_score', ascending=False)
    ['newsId']
    .tolist()
)
heuristic_recency = (
    train_data.drop_duplicates(subset='newsId')
    .sort_values(by='issued_timestamp', ascending=True)
    ['newsId']
    .tolist()
)

In [None]:
df_popularity = (
    train_data.drop_duplicates(subset='newsId')
    .sort_values(by='popularity_score', ascending=False)
    [['newsId', 'popularity_score']]
)
df_recency = (
    train_data.drop_duplicates(subset='newsId')
    .sort_values(by='issued_timestamp', ascending=True)
    [['newsId', 'issued_timestamp']]
)

In [None]:
df_combine = df_popularity.merge(df_recency, on='newsId')

In [None]:
#Normalizando para atribuição de pesos
df_combine['popularity_score'] = df_combine['popularity_score'] / df_combine['popularity_score'].max()
df_combine['recency_score'] = 1 - (df_combine['issued_timestamp'].rank() / len(df_combine))

In [None]:
df_combine['combine_score'] = (
    hiperparametros.get('heuristic',None).get('popularity_wheight',None) * df_combine['popularity_score'] +
    hiperparametros.get('heuristic',None).get('recency_wheight',None)  * df_combine['recency_score']
)

In [None]:
df_combine = df_combine.sort_values(by='combine_score', ascending=False)
heuristic_combine = df_combine['newsId'].tolist()
print("combine ponderado:", heuristic_combine)

combine ponderado: ['d2593c3d-2347-40d9-948c-b6065e8459a9', '855d20b7-53f2-4678-a10f-55402d085018', '4c63d7cd-4902-4ffb-9b94-578b1b2151f0', 'f6b5d170-48b9-4f8e-88d4-c84b6668f3bd', '89fa73f0-4341-4de4-bb2a-e429ef96bd43', 'e5185368-70f8-4998-a738-ca22f300da7b', '61e07f64-cddf-46f2-b50c-ea0a39c22050', '458bf0ec-efb4-4bfd-9446-c80295e6aa87', '1a3641be-fa6a-4d72-a5eb-9069686a88ee', '362d282e-5d9d-4691-86a6-21bcac0703d7', 'e384ec29-136e-4241-9321-49b367b8cbd5', 'c041f6d7-df78-40ee-8832-6772a0db14bb', '07b5ae90-4161-4463-945e-41a48fdced21', '38578b5c-4509-49df-ad79-c62cc914e4a5', '15281e10-e6bc-48bc-9b1b-94402f83699b', '8e0884e2-50ba-44c0-87b9-d64d913288af', '8d477e04-3bab-4ad9-8fe3-799059238a9c', 'bd4e7054-4043-4acf-9a49-7d883152189d', '5b90df37-0b8b-44d6-abdb-6fad31217c26', 'cb324527-6a66-491d-b53c-6a6d7ece566f', '1045a19a-a686-47b8-aa54-33c1063613d6', '529a23e0-baa8-4824-9554-416ceba464ae', '7f747187-72e4-4791-996b-4518742dc672', '5af99ef3-adff-464d-9976-5c5d6391b014', '83568c85-7264-4e45-

### Pré processamento

In [None]:
# Criando mapeamentos de itens para recência
item_recency = {row['newsId']: row['issued_timestamp'] for _, row in train_data.iterrows()}

In [None]:
user_embeddings_train = {row['userId']: np.array(row['user_weighted_embedding']) for _, row in train_data.iterrows()}

In [None]:
news_embeddings_train = {row['newsId']: np.array(row['news_embedding']) for _, row in train_data.iterrows()}

### Criando matrizes

In [None]:
num_users_train = len(train_data['userId'])
embedding_dim_users_train = len(next(iter(user_embeddings_train.values())))

In [None]:
num_news_train = len(train_data['newsId'])
embedding_dim_news_train = len(next(iter(news_embeddings_train.values())))

In [None]:
users_feature_matrix_train = np.zeros((num_users_train, embedding_dim_users_train))
user_id_map_train = {user_id: i for i, user_id in enumerate(list(user_embeddings_train.keys()))}

In [None]:
item_feature_matrix_train = np.zeros((num_news_train, embedding_dim_news_train))
item_id_map_train = {news_id: i for i, news_id in enumerate(list(news_embeddings_train.keys()))}

In [None]:
for user_id, embedding in user_embeddings_train.items():
    users_feature_matrix_train[user_id_map_train[user_id]] = embedding

In [None]:
for news_id, embedding in news_embeddings_train.items():
    item_feature_matrix_train[item_id_map_train[news_id]] = embedding

### Criando dataset lightFM

In [None]:
dataset = Dataset()
dataset.fit(
    users=train_data['userId'].unique(),
    user_features=[f"emb_{i}" for i in range(embedding_dim_news_train)],
    items=train_data['newsId'].unique(),
    item_features=[f"emb_{i}" for i in range(embedding_dim_news_train)] + ['recency'] + ['popularity'],
)

In [None]:
dataset.mapping()

({'6cc090e8e3a058a35e3cbe83a33b13d33cad4ff1ef3c65431a2693f8acc79305': 0,
  'e838e83b35e07112893890b7c4848cc9f2631ad7a68b17a735e7db0a1647be88': 1,
  '81b6ee3309bbb266444fd31d9161ad5e0c5a065fe7d8dfbac7c3b0cd137eeb50': 2,
  '7faf146514fdf9e61f39af35b9cb5854e847dd2bc8f78f778fa61fc7e4e094be': 3,
  '3026a21a23d1c53a0260339d4cf6cca2c3c0580fdca003ec2f948b6edfb2b79c': 4,
  '33eaa9788ca42389decfdd6ca940cef8fd202acd15b93d21094e0185689ae296': 5,
  '02ed360418524ee2fd0a1999b70fbdff60e1a9fe5920c068b341c86335ddbc26': 6,
  'bd5fc30086a4bf9f66c93ca9966be7e8677bf358e9671b170ad7b84600984e5a': 7,
  '904598f368c64ec1b0541d58bfa606e20bf6f1295765cb834d35eb618a62f428': 8,
  'd616d60701fe6869361bae8d0d2e2c9ed245565c16c80066af859b0604be7077': 9,
  '1081231723c0038568b4b25ab7db6fb030e65f4925138435257c1a1a9661e2a2': 10,
  'dcc827eb68ed2cfeeffa6dc1fd3c5eee0d9599e08c9d65916a9743a4336e076d': 11,
  '031f476e95d7560063c7c92a835b522c19a2803055c1627445258638090a55df': 12,
  '1cfc71b503c7ffb32afd0e5685d7cdd805cbfecbca8f6

In [None]:
(interactions_train, wheights_train) = dataset.build_interactions([(row['userId'], row['newsId'], row['engagement_score']) for _, row in train_data.iterrows()])

In [None]:
item_recency_train = {row['newsId']: row['issued_timestamp'] for _, row in train_data.iterrows()}
item_popularity_train = {row['newsId']: row['popularity_score'] for _, row in train_data.iterrows()}

In [None]:
item_features_train = dataset.build_item_features(
    [
        (news_id,
         {f"emb_{i}": value for i, value in enumerate(embedding)} |
         {"recency": item_recency_train[news_id]} |
         {"popularity": item_popularity_train[news_id]})
        for news_id, embedding in news_embeddings_train.items()
    ], normalize=False
)

In [None]:
user_features_train = dataset.build_user_features(
    [
        (user_id, {f"emb_{i}": value for i, value in enumerate(embedding_avg)})
        for user_id, embedding_avg in user_embeddings_train.items()
    ],
    normalize=False
)

In [None]:
# Treinando modelo
model = LightFM(
      loss=hiperparametros.get('loss', 'warp'),
      item_alpha=hiperparametros.get('item_alpha', 1e-5),
      user_alpha=hiperparametros.get('user_alpha', 1e-5),
      random_state=hiperparametros.get('random_state', 42)
      )
model.fit(
      interactions_train,
      epochs=hiperparametros.get('epochs',10),
      num_threads=hiperparametros.get('num_threads',2),
      item_features=item_features_train,
      user_features=user_features_train
)

<lightfm.lightfm.LightFM at 0x7db0d1c56e10>

In [None]:
# Mapeamentos de usuários e itens
user_mapping_train = dataset.mapping()[0]
item_mapping_train = dataset.mapping()[2]

In [None]:
user_mapping_train, item_mapping_train

({'6cc090e8e3a058a35e3cbe83a33b13d33cad4ff1ef3c65431a2693f8acc79305': 0,
  'e838e83b35e07112893890b7c4848cc9f2631ad7a68b17a735e7db0a1647be88': 1,
  '81b6ee3309bbb266444fd31d9161ad5e0c5a065fe7d8dfbac7c3b0cd137eeb50': 2,
  '7faf146514fdf9e61f39af35b9cb5854e847dd2bc8f78f778fa61fc7e4e094be': 3,
  '3026a21a23d1c53a0260339d4cf6cca2c3c0580fdca003ec2f948b6edfb2b79c': 4,
  '33eaa9788ca42389decfdd6ca940cef8fd202acd15b93d21094e0185689ae296': 5,
  '02ed360418524ee2fd0a1999b70fbdff60e1a9fe5920c068b341c86335ddbc26': 6,
  'bd5fc30086a4bf9f66c93ca9966be7e8677bf358e9671b170ad7b84600984e5a': 7,
  '904598f368c64ec1b0541d58bfa606e20bf6f1295765cb834d35eb618a62f428': 8,
  'd616d60701fe6869361bae8d0d2e2c9ed245565c16c80066af859b0604be7077': 9,
  '1081231723c0038568b4b25ab7db6fb030e65f4925138435257c1a1a9661e2a2': 10,
  'dcc827eb68ed2cfeeffa6dc1fd3c5eee0d9599e08c9d65916a9743a4336e076d': 11,
  '031f476e95d7560063c7c92a835b522c19a2803055c1627445258638090a55df': 12,
  '1cfc71b503c7ffb32afd0e5685d7cdd805cbfecbca8f6

### Carreagando validação

In [None]:
def parse_int_list_valid(value):
    if pd.isna(value):
        return []

    str_list = value.replace("[", "").replace("]", "").replace("\n", ",").replace(" ", ",")

    result = [int(item.strip()) for item in str_list.split(",") if item.strip()]

    return result

def parse_str_list_valid(value):
    if pd.isna(value):
        return []

    str_list = value.replace("[", "").replace("]", "").replace("\n", ",").replace(" ", ",").replace("'", "")

    result = [item.strip() for item in str_list.split(",") if item.strip()]

    return result

In [None]:
val_data = pd.read_csv(f'{PATH}validacao.csv', converters={
    'history': parse_str_list_valid,
    'timestampHistory': parse_int_list_valid
})

In [None]:
val_data['userId'] = val_data['userId'].astype(str)
val_data['history'] = val_data['history'].apply(lambda x: [str(i) for i in x])

In [None]:
val_data['type_history'] = val_data['history'].apply(type)

In [None]:
# Função para validar o modelo
def simple_validation(model, val_data, user_mapping_train, item_mapping_train, top_n=5):
    result = []
    total_users = len(user_mapping_train.values())

    for _, row in val_data.iterrows():
        user_id = row['userId']
        history = row['history']

        internal_user_id = user_mapping_train.get(user_id)
        if internal_user_id is None:
            print(f"Usuário {user_id} não encontrado no mapeamento. Ignorando.")
            continue

        scores = model.predict(
            user_ids=internal_user_id,
            item_ids=np.array(list(item_mapping_train.values()))
        )

        top_items = np.argsort(-scores)[:top_n]
        news_recommended = [list(item_mapping_train.keys())[item] for item in top_items]

        print(f"Usuário: {user_id}")
        print(f"Notícias recomendadas: {news_recommended}")
        print(f"Notícias lidas: {history}")
        print(f"Acerto: {'Sim' if any(news in history for news in news_recommended) else 'Não'}")
        print("-" * 40)

        result.append({
            'user_id': user_id,
            'history': history,
            'news_recommended': news_recommended,
            'hit_recommended': any(news in history for news in news_recommended),
            'heuristic_popularity': heuristic_popularity[:top_n],
            'hit_heuristic_popularity': any(news in history for news in heuristic_popularity[:top_n]),
            'heuristic_recency': heuristic_recency[:top_n],
            'hit_heuristic_recency': any(news in history for news in heuristic_recency[:top_n]),
            'heuristic_combine': heuristic_combine[:top_n],
            'hit_heuristic_combine': any(news in history for news in heuristic_combine[:top_n]),
        })

    return result

In [None]:
df_result = pd.DataFrame(simple_validation(model, val_data, user_mapping_train, item_mapping_train, top_n=hiperparametros.get('k',5)))

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
Usuário d302337e74e43444a9c03f5bf19f0ba6a5a85c871cb931d8768ed5d2e3c5897d não encontrado no mapeamento. Ignorando.
Usuário 1b273ca6f932160058996e56802d04fb70329f8ed1ad60738e194e9682281165 não encontrado no mapeamento. Ignorando.
Usuário ac4bf940349d0012074bd5dc71aec5d3394d55247d4c2cd8d6a54f4642013cca não encontrado no mapeamento. Ignorando.
Usuário 282f122713cd57f64e1f976d7d6ab78a80d6dc71ba635389464abb586529c712 não encontrado no mapeamento. Ignorando.
Usuário 9d2860d429f51e2ef8c68d2098788b12413e3369d7113bf9f90974145f5d17e4 não encontrado no mapeamento. Ignorando.
Usuário f794d64a75d7765bff3bbb593d55b80d26b35cce59e0d92d5298f7fc60b1f96c não encontrado no mapeamento. Ignorando.
Usuário 78bee1cd4960ce25928217a7762c510db195df79abb3b239f699d273883ddc10 não encontrado no mapeamento. Ignorando.
Usuário 038ce9f776f94558ede974975652037cc4da07235ec4595ccd75ab35fa5f9cc7 não encontrado no mapeamento. Ignorando.
Usuário c83db00

In [None]:
df_result.head(100)

Unnamed: 0,user_id,history,news_recommended,hit_recommended,heuristic_popularity,hit_heuristic_popularity,heuristic_recency,hit_heuristic_recency,heuristic_combine,hit_heuristic_combine
0,fadeba4a4c9b9b9436295676ea32687592582cabca39e9...,"[4a0fcad6-a43c-4a3a-b405-41e463008bd2, 93150cf...","[a76dc579-5e75-4d29-84ec-0e8b7cf477e5, 91dd61b...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
1,cf1ef0e961dd7435ac88e0f5f3ba2014f301861ed515b2...,"[df8aa871-86d8-4ad5-8c7d-70cc509ec0b2, fc6d569...","[3211b1db-98b2-47f1-b641-8dedfc38f654, e40ae16...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
2,5bc0bbe89e34a4df8436218e9e6d99db191e42f973e8d4...,[be89a7da-d9fa-49d4-9fdc-388c27a15bc8],"[1dc0c4a8-0e13-454d-b3cc-16181bdf727d, fd3c217...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
3,85429dd0a6868e3e134f0da1f7317912d640c91b2533e8...,[eb23272d-8e6c-479d-b972-eabeb5f6f3dd],"[3211b1db-98b2-47f1-b641-8dedfc38f654, 3482566...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
4,6da503c22404be1c30128e886d1bdf35243d65acb9ed16...,"[1c4dd193-af76-4d6e-ada2-ef9e6979b6be, aeab0e4...","[b2cbb11c-edd1-4ec2-92fc-23d40d23fac7, b6bad9c...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
...,...,...,...,...,...,...,...,...,...,...
95,b65d7b93506e97144766d0bbe03adc8da189c384bcda2d...,[eb23272d-8e6c-479d-b972-eabeb5f6f3dd],"[ae495db6-141a-43ef-841b-d5f01ab280fb, fbffe52...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
96,c117656e03e3045d649e2af18d08ace410dcf75d30df91...,"[950dc359-68bc-4789-a955-6c28849c9344, cd814be...","[3211b1db-98b2-47f1-b641-8dedfc38f654, d1b1b97...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
97,4330ddb239d210528259dc2a75143d3f3fd460c58002d4...,"[4c3d47a1-6f4b-424f-8944-6c227e686c5c, 9c764c3...","[b2cbb11c-edd1-4ec2-92fc-23d40d23fac7, a832a6e...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False
98,6e4621c8cdaa7eaba069c3d6cc03d9f7966c0a24653a1a...,"[1523a1a4-30fc-463b-83bf-a1272edfba69, 190930d...","[3211b1db-98b2-47f1-b641-8dedfc38f654, 6d773e0...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, f6b5d17...",False,"[883f4280-2bfc-4d2d-b9c1-9fdae0f8bf76, 9258aa7...",False,"[d2593c3d-2347-40d9-948c-b6065e8459a9, 855d20b...",False


In [None]:
df_filtered = df_result[df_result['hit_recommended'] == True]
df_filtered.count()

Unnamed: 0,0
user_id,0
history,0
news_recommended,0
hit_recommended,0
heuristic_popularity,0
hit_heuristic_popularity,0
heuristic_recency,0
hit_heuristic_recency,0
heuristic_combine,0
hit_heuristic_combine,0


In [None]:
df_filtered = df_result[df_result['hit_heuristic_popularity'] == True]
df_filtered.count()

Unnamed: 0,0
user_id,1
history,1
news_recommended,1
hit_recommended,1
heuristic_popularity,1
hit_heuristic_popularity,1
heuristic_recency,1
hit_heuristic_recency,1
heuristic_combine,1
hit_heuristic_combine,1


In [None]:
df_filtered = df_result[df_result['hit_heuristic_recency'] == True]
df_filtered.count()

Unnamed: 0,0
user_id,0
history,0
news_recommended,0
hit_recommended,0
heuristic_popularity,0
hit_heuristic_popularity,0
heuristic_recency,0
hit_heuristic_recency,0
heuristic_combine,0
hit_heuristic_combine,0


In [None]:
df_filtered = df_result[df_result['hit_heuristic_combine'] == True]
df_filtered.count()

Unnamed: 0,0
user_id,0
history,0
news_recommended,0
hit_recommended,0
heuristic_popularity,0
hit_heuristic_popularity,0
heuristic_recency,0
hit_heuristic_recency,0
heuristic_combine,0
hit_heuristic_combine,0


In [None]:
hit_rate_model = df_result['hit_recommended'].mean()
hit_rate_popularity = df_result['hit_heuristic_popularity'].mean()
hit_rate_recency = df_result['hit_heuristic_recency'].mean()
hit_rate_combined = df_result['hit_heuristic_combine'].mean()
print("-" * 40)
print(f'hit hate de model = {hit_rate_model}')
print(f'hit hate de popularity = {hit_rate_popularity}')
print(f'hit hate de recency = {hit_rate_recency}')
print(f'hit hate de combined = {hit_rate_combined}')
print("-" * 40)

----------------------------------------
hit hate de model = 0.0
hit hate de popularity = 0.0012515644555694619
hit hate de recency = 0.0
hit hate de combined = 0.0
----------------------------------------


### Salvando modelo