## Importando bibliotecas

In [1]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831157 sha256=5d3c70dda2f76f0a3928d31688b0832212511d20131c286248892460ef0dc57a
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [2]:
# Mount Google Drive so the training chunks under MyDrive can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

ValueError: mount failed

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from lightfm import LightFM
from lightfm.data import Dataset
import shutil
import pickle
from google.colab import files
import cloudpickle as cp

In [None]:
# Base directory (inside the mounted Drive) that holds the chunk_*.parquet files.
# The trailing slash matters: file names are appended directly to this string.
PATH = '/content/drive/MyDrive/'

## Criando custom lightFM

In [None]:
class CustomLightFM:
    """LightFM model bundled with its ID encoders, training data and a popularity fallback.

    Attributes:
        model: the underlying LightFM estimator.
        user_encoder / news_encoder: LabelEncoders mapping external ids to
            the integer ids passed to the model.
        scaler_popularity / scaler_recency: MinMaxScalers created here for
            the caller's use; this class does not call them.
        user_count / news_count: number of ids known to each encoder, kept
            in sync by add_user / add_news.
        interactions / weights: training interaction matrix and optional
            sample weights from the last fit.
        user_features / item_features: feature matrices used by the last
            fit; predict reuses them.
        more_popularity: mapping whose values are the popular news ids
            returned when a prediction cannot be computed.
        state: free-form status flag, initialised to 'ACTIVE'.
    """

    def __init__(self, loss="warp"):
        self.model = LightFM(
            loss=loss,
            item_alpha=1e-5,
            user_alpha=1e-5,
            random_state=42
        )

        self.user_encoder = LabelEncoder()
        self.news_encoder = LabelEncoder()

        self.scaler_popularity = MinMaxScaler()
        self.scaler_recency = MinMaxScaler()

        self.user_count = 0
        self.news_count = 0

        self.interactions = None
        self.weights = None

        self.user_features = None
        self.item_features = None

        self.more_popularity = None

        self.state = 'ACTIVE'

    def fit(self, interactions, user_features=None, item_features=None, epochs=10, num_threads=1, verbose = True, sample_weight=None):
        """Train the model on interactions with optional user/item features.

        Args:
            interactions: interaction matrix forwarded to LightFM.fit.
            user_features: optional user feature matrix.
            item_features: optional item feature matrix.
            epochs: number of training epochs.
            num_threads: number of training threads.
            verbose: forwarded to LightFM.fit.
            sample_weight: optional per-interaction weights forwarded to
                LightFM.fit; None trains unweighted, as before.

        Returns:
            True once training has finished.
        """
        print('INICIANDO PROCESSOD DE RETREINAMENTO')

        self.model.fit(
            interactions,
            user_features=user_features,
            item_features=item_features,
            sample_weight=sample_weight,
            epochs=epochs,
            num_threads=num_threads,
            verbose=verbose,
        )

        self.interactions = interactions
        if sample_weight is not None:
            self.weights = sample_weight

        # Remember exactly what the model was trained with: predict() must
        # hand the same feature matrices back to LightFM.
        self.user_features = user_features
        self.item_features = item_features

        return True

    @staticmethod
    def _register(encoder, external_id):
        """Return the position of external_id in encoder.classes_, appending it if unknown."""
        classes = getattr(encoder, 'classes_', None)
        if classes is None:
            # Encoder has not been fitted yet: start from an empty vocabulary.
            classes = np.array([], dtype=object)

        matches = np.flatnonzero(np.asarray(classes == external_id))
        if matches.size:
            return int(matches[0])

        # The new id is the position the value takes in classes_, so it can
        # never collide with an id that is already assigned.
        # NOTE(review): appending can leave classes_ unsorted; confirm that
        # LabelEncoder.transform still resolves appended ids for the id dtype in use.
        new_id = len(classes)
        encoder.classes_ = np.append(classes, external_id)
        return new_id

    def add_user(self, user_id):
        """Register a user (if new) and return its internal integer id."""
        internal_id = self._register(self.user_encoder, user_id)
        self.user_count = len(self.user_encoder.classes_)
        return internal_id

    def add_news(self, news_id):
        """Register a news item (if new) and return its internal integer id."""
        internal_id = self._register(self.news_encoder, news_id)
        self.news_count = len(self.news_encoder.classes_)
        return internal_id

    def _popular_fallback(self, top_n):
        """Return up to top_n popular news ids, skipping empty placeholders."""
        if not self.more_popularity:
            return []
        popular = [news_id for news_id in self.more_popularity.values() if news_id is not None]
        return popular[:top_n]

    def predict(self, user_id, news_ids, top_n=5) -> list:
        """Recommend the top_n items among news_ids for an external user id.

        Scores every candidate with the trained model, using the feature
        matrices stored by fit, and returns the external news ids with the
        highest scores. If anything fails (unknown user or news id, model
        not trained, feature shape mismatch) the popularity fallback is
        returned instead, as a best-effort answer.
        """
        try:
            user_internal = self.user_encoder.transform([user_id])[0]
            news_internal = self.news_encoder.transform(news_ids)

            user_array = np.full(len(news_internal), user_internal)

            scores = self.model.predict(
                user_array,
                news_internal,
                user_features=self.user_features,
                item_features=self.item_features,
            )

            top_items = np.argsort(-scores)[:top_n]

            return list(self.news_encoder.inverse_transform(news_internal[top_items]))
        except Exception:
            # Deliberate best-effort: any failure degrades to popular items.
            return self._popular_fallback(top_n)

In [None]:
# Single wrapper instance used for the rest of the notebook (default loss: "warp").
model = CustomLightFM()

### Carregando dados de treinamento

In [None]:
# Only a small random sample of chunk 0 is loaded for this experiment; the
# remaining chunks are left commented out below.
# NOTE(review): with such a small fraction the sample can be empty on small files.
SAMPLE_FRAC = 0.00001
SAMPLE_SEED = 42  # fixed seed so Restart & Run All draws the same rows

train_data_0 = pd.read_parquet(f'{PATH}chunk_0.parquet').sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
# train_data_1 = pd.read_parquet(f'{PATH}chunk_1.parquet')
# train_data_2 = pd.read_parquet(f'{PATH}chunk_2.parquet')
# train_data_3 = pd.read_parquet(f'{PATH}chunk_3.parquet')
# train_data_4 = pd.read_parquet(f'{PATH}chunk_4.parquet')
# train_data_5 = pd.read_parquet(f'{PATH}chunk_5.parquet')

train_data = train_data_0 #pd.concat([train_data_0, train_data_1, train_data_2, train_data_3, train_data_4, train_data_5], ignore_index=True)

In [None]:
# Drop the per-chunk name; the rows stay reachable through `train_data`.
del train_data_0
# del train_data_1
# del train_data_2
# del train_data_3
# del train_data_4
# del train_data_5

### Criando encoders do modelo

In [None]:
# Fit the model's encoders on the external ids and store the integer codes.
train_data['user_encoded'] = model.user_encoder.fit_transform(train_data['userId'])
train_data['news_encoded'] = model.news_encoder.fit_transform(train_data['newsId'])

# Keep the wrapper's counters in step with the fitted encoders; otherwise
# add_user / add_news would start handing out ids from 0 and collide with
# ids that are already assigned.
model.user_count = len(model.user_encoder.classes_)
model.news_count = len(model.news_encoder.classes_)

### Criando heurística

In [None]:
# Order rows from most to least popular; later cells take the head of this frame.
train_data = train_data.sort_values(by='popularity_score', ascending=False)

In [None]:
# Preview the first rows (after the popularity sort above).
train_data.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

### Treinando modelo

In [None]:
# Encoded user id -> embedding as a NumPy array. When a user appears in
# several rows, the embedding from the last such row is kept.
user_embeddings_train = {
    encoded_user: np.array(embedding)
    for encoded_user, embedding in zip(train_data['user_encoded'], train_data['user_weighted_embedding'])
}

In [None]:
# Encoded news id -> embedding as a NumPy array. When a news item appears in
# several rows, the embedding from the last such row is kept.
news_embeddings_train = {
    encoded_news: np.array(embedding)
    for encoded_news, embedding in zip(train_data['news_encoded'], train_data['news_embedding'])
}

#### Criando matrizes

In [None]:
# Number of distinct users (not interaction rows): the feature matrix has
# one row per user, so counting rows would oversize it with all-zero rows.
num_users_train = train_data['user_encoded'].nunique()
# Embedding width, read from the first stored user embedding.
embedding_dim_users_train = len(next(iter(user_embeddings_train.values())))

In [None]:
# Number of distinct news items (not interaction rows): the feature matrix
# has one row per item, so counting rows would oversize it with all-zero rows.
num_news_train = train_data['news_encoded'].nunique()
# Embedding width, read from the first stored news embedding.
embedding_dim_news_train = len(next(iter(news_embeddings_train.values())))

In [None]:
# Row position of each user inside the dense matrix, in dict insertion order.
user_id_map_train = {user_id: position for position, user_id in enumerate(user_embeddings_train)}
# Dense user-feature matrix, zero-initialised; filled in a later cell.
users_feature_matrix_train = np.zeros(shape=(num_users_train, embedding_dim_users_train))

In [None]:
# Row position of each news item inside the dense matrix, in dict insertion order.
item_id_map_train = {news_id: position for position, news_id in enumerate(news_embeddings_train)}
# Dense item-feature matrix, zero-initialised; filled in a later cell.
item_feature_matrix_train = np.zeros(shape=(num_news_train, embedding_dim_news_train))

In [None]:
# Copy every user's embedding into its row of the dense matrix.
for user_id in user_embeddings_train:
    row_position = user_id_map_train[user_id]
    users_feature_matrix_train[row_position] = user_embeddings_train[user_id]

In [None]:
# Copy every news item's embedding into its row of the dense matrix.
for news_id in news_embeddings_train:
    row_position = item_id_map_train[news_id]
    item_feature_matrix_train[row_position] = news_embeddings_train[news_id]

In [None]:
# Register ids in ascending order. LightFM's Dataset numbers users/items in
# the order it first sees them, and train_data is sorted by popularity at
# this point; registering 0..n-1 in order makes the Dataset's internal index
# equal to the LabelEncoder id, which is what CustomLightFM.predict passes
# to the model.
user_ids_sorted = sorted(train_data['user_encoded'].unique().tolist())
news_ids_sorted = sorted(train_data['news_encoded'].unique().tolist())

dataset = Dataset()
dataset.fit(
    users=user_ids_sorted,
    # User feature names follow the *user* embedding width (the news width
    # was used here before, which breaks when the two differ).
    user_features=[f"emb_{i}" for i in range(embedding_dim_users_train)],
    items=news_ids_sorted,
    item_features=[f"emb_{i}" for i in range(embedding_dim_news_train)] + ['recency'] + ['popularity'],
)

In [None]:
# One (user, item, weight) triple per training row; the weight is the engagement score.
interaction_triples = list(
    train_data[['user_encoded', 'news_encoded', 'engagement_score']].itertuples(index=False, name=None)
)
(interactions_train, wheights_train) = dataset.build_interactions(interaction_triples)

In [None]:
# Keep the interaction matrix and its weights on the wrapper.
model.interactions = interactions_train
model.weights = wheights_train

In [None]:
# Per-item recency and popularity, keyed by encoded news id. When an item
# appears in several rows, the value from the last such row is kept.
# NOTE(review): these values are used unscaled as feature weights further
# down (normalize=False) while model.scaler_recency / scaler_popularity are
# never applied - confirm whether scaling was intended.
item_recency_train = dict(zip(train_data['news_encoded'], train_data['issued_timestamp']))
item_popularity_train = dict(zip(train_data['news_encoded'], train_data['popularity_score']))

In [None]:
# Assemble one (news_id, {feature name: weight}) entry per item: the
# embedding components first, then recency and popularity.
item_feature_rows = []
for news_id, embedding in news_embeddings_train.items():
    feature_weights = {f"emb_{i}": value for i, value in enumerate(embedding)}
    feature_weights["recency"] = item_recency_train[news_id]
    feature_weights["popularity"] = item_popularity_train[news_id]
    item_feature_rows.append((news_id, feature_weights))

# normalize=False keeps the weights exactly as supplied.
item_features_train = dataset.build_item_features(item_feature_rows, normalize=False)

In [None]:
# Keep the item feature matrix on the wrapper.
model.item_features = item_features_train

In [None]:
# Assemble one (user_id, {feature name: weight}) entry per user from the
# components of that user's embedding.
user_feature_rows = []
for user_id, embedding_avg in user_embeddings_train.items():
    feature_weights = {f"emb_{i}": value for i, value in enumerate(embedding_avg)}
    user_feature_rows.append((user_id, feature_weights))

# normalize=False keeps the weights exactly as supplied.
user_features_train = dataset.build_user_features(user_feature_rows, normalize=False)

In [None]:
# Keep the user feature matrix on the wrapper.
model.user_features = user_features_train

In [None]:
# Show the id/feature mappings the Dataset has built (displayed as the cell output).
dataset.mapping()

In [None]:
# Inspect the item feature matrix built above.
print(item_features_train)

In [None]:
# Train for 10 epochs on 2 threads with both feature matrices.
# NOTE(review): wheights_train (engagement scores) is not passed here, so
# training treats every interaction equally - confirm that is intended.
model.fit(
    interactions_train,
    user_features=user_features_train,
    item_features=item_features_train,
    epochs=10,
    num_threads=2,
)

### Criando heurística

In [None]:
# Ten most popular *distinct* news items. Rows are interactions, so the same
# newsId can occur many times; without de-duplication one item could fill
# the whole list. Sorting here also removes the dependency on an earlier
# cell having sorted train_data in place.
top_news = (
    train_data[['newsId', 'news_encoded', 'popularity_score']]
    .sort_values(by='popularity_score', ascending=False)
    .drop_duplicates(subset='newsId')
    .head(10)
)

In [None]:
# Rank (0 = most popular) -> newsId. The rank comes from enumerate, not from
# the DataFrame index: index labels are the original row labels, so using
# them as keys left ranks 0..9 set to None and stored the real ids under
# arbitrary labels.
more_popularity = dict(enumerate(top_news['newsId']))

In [None]:
# Store a copy of the popularity ranking on the wrapper; predict returns its
# values when it cannot compute a recommendation.
model.more_popularity = dict(more_popularity)

In [None]:
# Ten most recent rows. Sort on the full frame first: 'issued_timestamp' is
# not among the selected columns, so sorting after the selection raises KeyError.
top_news = (
    train_data
    .sort_values(by='issued_timestamp', ascending=False)
    [['newsId', 'news_encoded', 'popularity_score']]
    .head(10)
)

### Salvando modelo