# Disentangled Multimodal Representation Learning for Recommendation (DMRL)

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import cornac
import numpy as np
import pandas as pd
import torch
from cornac.data import TextModality
from cornac.eval_methods import RatioSplit
from cornac.metrics import NDCG
from cornac.models.dmrl.recom_dmrl import DMRL

from utils import load_data, preprocessing_content_data


# Row-limit override, left disabled (uncomment to display every row of a frame):
# pd.set_option('display.max_rows', None)
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

## Load and process data

In [4]:
# Load the three project tables through the local `utils` helper. Later cells read
# UserId / ItemId / Rating / Timestamp from `ratings`, ItemId plus the descriptive
# columns from `content`, and UserId / ItemId from `targets` (the pairs to score).
ratings, content, targets = load_data()

In [5]:
# Calendar day of each interaction; used by the per-day counts in "Basic analysis".
ratings["TimestampDate"] = ratings["Timestamp"].dt.date
# Replace exact-zero ratings with a small positive value.
# NOTE(review): presumably so zero ratings are not treated as "no interaction"
# downstream - confirm against the data description.
ratings.loc[ratings["Rating"].eq(0), "Rating"] = 0.01

In [6]:
# Start from every column name of `content`, then remove the leading one so only
# the descriptive columns remain. `pop(0)` returns the removed name, which is what
# this cell displays.
content_columns = list(content.columns)
content_columns.pop(0)

'ItemId'

In [7]:
# Build one free-text document per item by joining all descriptive columns.
content_processed = content[['ItemId']].copy()

# Missing values must be blanked using the ORIGINAL null mask: `astype(str)` turns
# NaN into the literal string 'nan', so a `fillna('')` applied afterwards is a
# no-op and the word "nan" would leak into the text fed to the text encoder.
content_features = content[content_columns]
content_processed["text"] = (
    content_features.astype(str)
    .where(content_features.notna(), '')
    .agg(' '.join, axis=1)
)

In [11]:
# ratings_cornac_dataset = Dataset.build(df[['UserId', 'ItemId', 'Rating', "text"]].values.tolist(), fmt='UIR')

## Basic analysis

In [None]:
# Preview the first rows of the interactions table.
ratings.head()

In [None]:
# Distinct users and distinct items present in the interactions
ratings["UserId"].nunique(), ratings["ItemId"].nunique()

In [None]:
# Distribution of basket sizes: distinct items per (user, exact timestamp) purchase
(
    ratings
    .groupby(["UserId", "Timestamp"])["ItemId"]
    .nunique()
    .value_counts()
)

In [None]:
# Distribution of distinct items purchased per (user, calendar day)
(
    ratings
    .groupby(["UserId", "TimestampDate"])["ItemId"]
    .nunique()
    .value_counts()
)

In [None]:
# Distribution of the number of distinct purchase timestamps per user
(
    ratings
    .groupby("UserId")["Timestamp"]
    .nunique()
    .value_counts()
)

In [None]:
# Distribution of the number of distinct purchase days per user
(
    ratings
    .groupby("UserId")["TimestampDate"]
    .nunique()
    .value_counts()
)

In [None]:
# Number of missing values in each column of the item content table.
content.isna().sum()

In [None]:
# Distinct rating values present after the zero -> 0.01 replacement above
ratings["Rating"].unique()

## Train model

In [10]:
# Wrap the per-item concatenated text as a cornac text modality keyed by ItemId;
# `ids` and `corpus` come from the same frame, so they are aligned row by row.
item_text_modality = TextModality(
    ids=content_processed["ItemId"].tolist(),
    corpus=content_processed["text"].tolist(),
)

In [11]:
# Random 80/20 train/test split over (UserId, ItemId, Rating) triples, with the
# item text modality attached so the model can read it from the split.
# NOTE(review): rating_threshold=0.5 sits just above the 0.01 used for zero
# ratings - presumably so those count as negatives at evaluation; confirm.
ratio_split = RatioSplit(
    data=ratings[["UserId", "ItemId", "Rating"]].to_numpy().tolist(),
    item_text=item_text_modality,
    test_size=0.2,
    rating_threshold=0.5,
    exclude_unknowns=True,
    seed=123,
    verbose=True,
)

rating_threshold = 0.5
exclude_unknowns = True
---
Training data:
Number of users = 46776
Number of items = 26964
Number of ratings = 527776
Max rating = 10.0
Min rating = 0.0
Global mean = 7.3
---
Test data:
Number of users = 46776
Number of items = 26964
Number of ratings = 123946
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 46776
Total items = 26964


In [33]:
# ratio_split.train_set.uid_map

In [None]:
# Raw id -> internal index lookups taken from the training split
uid_map, iid_map = (
    ratio_split.train_set.uid_map,
    ratio_split.train_set.iid_map,
)

In [18]:
# dir(ratio_split)

In [None]:
# Peek at a few entries of each id -> index mapping. A bare `uid_map` line shows
# nothing (only a cell's last expression is displayed), and echoing a whole
# mapping prints one entry per user/item, so preview a bounded slice of both.
PREVIEW_SIZE = 5
dict(list(uid_map.items())[:PREVIEW_SIZE]), dict(list(iid_map.items())[:PREVIEW_SIZE])

In [12]:

# Configure the DMRL recommender (training itself happens when the experiment runs).
# NOTE(review): embedding_dim=100 with num_factors=2 presumably splits the
# embedding into two disentangled factors of 50 dimensions each - confirm
# against the cornac DMRL documentation.
dmrl_recommender = DMRL(
    # optimisation
    epochs=20,
    batch_size=4096,
    learning_rate=0.01,
    # model size
    embedding_dim=100,
    num_factors=2,
    # regularisation / sampling
    decay_c=0.01,
    decay_r=0.5,
    num_neg=3,
    # logging
    log_metrics=False,
)

In [13]:
# Put everything together into an experiment: train DMRL on the ratio split and
# report NDCG with its default cut-off (shown as NDCG@-1 in the result table).
# Requires the top-level `cornac` package to be imported.
# The experiment object is kept in a variable so it stays available after the
# run instead of being discarded.
dmrl_experiment = cornac.Experiment(
    eval_method=ratio_split,
    models=[dmrl_recommender],
    metrics=[NDCG()],
)
dmrl_experiment.run()


[DMRL] Training started!


  saved_ids = torch.load(id_path)
  self.features = torch.load(path)


Using device cuda for training
  batch 5 loss: 2839.7349609375
  batch 10 loss: 2838.8173828125
  batch 15 loss: 2823.062060546875
  batch 20 loss: 2752.82568359375
  batch 25 loss: 2639.58984375
  batch 30 loss: 2528.148681640625
  batch 35 loss: 2455.21103515625
  batch 40 loss: 2305.06240234375
  batch 45 loss: 2113.9444580078125
  batch 50 loss: 1926.7844482421874
  batch 55 loss: 1752.9337890625
  batch 60 loss: 1700.745751953125
  batch 65 loss: 1653.53310546875
  batch 70 loss: 1632.4863525390624
  batch 75 loss: 1605.7842529296875
  batch 80 loss: 1592.30791015625
  batch 85 loss: 1568.596435546875
  batch 90 loss: 1565.9516357421876
  batch 95 loss: 1551.0882080078125
  batch 100 loss: 1533.9483154296875
  batch 105 loss: 1506.35341796875
  batch 110 loss: 1498.362939453125
  batch 115 loss: 1481.4756591796875
  batch 120 loss: 1442.396435546875
  batch 125 loss: 1490.872265625
Epoch: 0 is done
  batch 5 loss: 1323.680517578125
  batch 10 loss: 1297.060302734375
  batch 15 los

Ranking: 100%|██████████| 19966/19966 [05:46<00:00, 57.69it/s]


TEST:
...
     | NDCG@-1 | Train (s) | Test (s)
---- + ------- + --------- + --------
DMRL |  0.2341 |   64.3903 | 346.1104






In [73]:
# TODO: how can this algorithm be used to handle new (cold-start) items?
# TODO: how can this algorithm be used to handle new (cold-start) users?


# TODO: do we have content for every item, including items that appear only in the test set?

In [77]:
# Score every (UserId, ItemId) pair listed in `targets` with the trained DMRL model.
# Pairs that cannot be scored - the user or the item is absent from the training
# split - keep the sentinel rating -1.
UNKNOWN_RATING = -1.0

train_uid_map = ratio_split.train_set.uid_map
train_iid_map = ratio_split.train_set.iid_map

# Float sentinel array: model scores are floats, so starting from a float buffer
# avoids writing floats into an integer column (which relies on silent upcasting).
predicted_ratings = np.full(len(targets), UNKNOWN_RATING, dtype=np.float64)

# Positional 0..n-1 index so each group's index addresses `predicted_ratings`
# directly, whatever index `targets` carries.
targets_positional = targets.reset_index(drop=True)

# A single groupby pass replaces two full boolean scans of `targets` per user.
# sort=False keeps users in order of first appearance; dropna=False keeps rows
# with a missing UserId in the loop so they are still reported below.
for user_id, user_targets in targets_positional.groupby("UserId", sort=False, dropna=False):
    # Training-set index of the user to predict for
    user_index = train_uid_map.get(user_id)

    if user_index is None:
        print(f"User {user_id} is not in the train set")
        continue

    # Training-set index of each requested item (None when unseen in training)
    item_indices = [train_iid_map.get(item_id) for item_id in user_targets["ItemId"].to_list()]
    is_known = np.array([idx is not None for idx in item_indices], dtype=bool)

    if not is_known.any():
        # Nothing scorable for this user; calling the model with an empty
        # index tensor is avoided and the sentinel is kept.
        continue

    known_items_tensor = torch.tensor(
        [idx for idx in item_indices if idx is not None],
        dtype=torch.long,
    )

    # Predicted scores, one per known item, in the same order as the tensor
    known_scores = dmrl_recommender.score(user_index=user_index, item_indices=known_items_tensor)

    # Write the scores back at the rows of the known items; unknown items keep -1
    row_positions = user_targets.index.to_numpy()
    predicted_ratings[row_positions[is_known]] = np.asarray(known_scores)

target_prediction = targets.copy()
target_prediction["Rating"] = predicted_ratings

User 019f8b946a is not in the train set
User 046dd69e7a is not in the train set
User 08b0ad8c2b is not in the train set
User 0b49b88c68 is not in the train set
User 0bf04bee73 is not in the train set
User 0d858ba37c is not in the train set
User 0e095e054e is not in the train set
User 1230a1696b is not in the train set
User 1344a31569 is not in the train set
User 14345de2c2 is not in the train set
User 19892cd2c9 is not in the train set
User 1c53aae1c1 is not in the train set
User 1e44fdf9c4 is not in the train set
User 23ea75a4cd is not in the train set
User 254a5ecb7a is not in the train set
User 263138a8b9 is not in the train set
User 289319b9e5 is not in the train set
User 290db70122 is not in the train set
User 2a38a4a931 is not in the train set
User 2b12a84466 is not in the train set
User 2b634a9d15 is not in the train set
User 2bba9f4124 is not in the train set
User 2e61866ea9 is not in the train set
User 2efbe23ad5 is not in the train set
User 3002c4f348 is not in the train set


In [79]:
# Rank each user's candidate items from highest to lowest predicted rating
target_prediction = target_prediction.sort_values(
    by=["UserId", "Rating"],
    ascending=[True, False],
)

In [80]:
# Persist the ranked predictions, including the predicted Rating column
target_prediction.to_csv(path_or_buf="submissao_2_DMRL.csv", index=False)

In [81]:
# Remove the Rating column, keeping the ranked row order.
# Rebinds `target_prediction`, so this cell cannot be re-run on its own once the
# column is gone; re-run the prediction and sorting cells first.
target_prediction = target_prediction.drop(columns=["Rating"])

In [82]:
# Persist the current `target_prediction` frame under the "sem_rating" file name
target_prediction.to_csv(path_or_buf="submissao_2_DMRL_sem_rating.csv", index=False)