# Disentangled Multimodal Representation Learning for Recommendation (DMRL)

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import torch
import numpy as np
import pandas as pd
import cornac
from cornac.metrics import NDCG
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.models.dmrl.recom_dmrl import DMRL

from utils import load_data, preprocessing_content_data


# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'torch'

## Load and process data

In [29]:
ratings, content, targets = load_data()

In [30]:
ratings["TimestampDate"] = ratings['Timestamp'].dt.date
ratings.loc[ratings.Rating == 0, "Rating"] = 0.01

In [31]:
content_columns = content.columns.to_list()
content_columns.pop(0)

'ItemId'

In [32]:
content_processed = content[['ItemId']].copy()
content_processed["text"] = content[content_columns].astype(str).fillna('').agg(' '.join, axis=1)

In [6]:
# ratings_cornac_dataset = Dataset.build(df[['UserId', 'ItemId', 'Rating', "text"]].values.tolist(), fmt='UIR')

## Basic analysis

In [33]:
ratings.head()

Unnamed: 0,UserId,ItemId,Timestamp,Rating,TimestampDate
0,c4ca4238a0,91766eac45,2013-10-05 22:00:50,8.0,2013-10-05
1,c81e728d9d,5c739554f7,2013-08-17 16:26:38,9.0,2013-08-17
2,c81e728d9d,48f6d7ce7c,2013-08-17 13:28:27,8.0,2013-08-17
3,c81e728d9d,e9318d627a,2013-06-15 15:38:09,1.0,2013-06-15
4,a87ff679a2,17e6357973,2014-01-31 23:27:59,8.0,2014-01-31


In [34]:
# Number of unique users and items
ratings.UserId.nunique(), ratings.ItemId.nunique()

(51671, 29674)

In [35]:
# how many itens purchased by each user purchase
ratings.groupby(["UserId", 'Timestamp'])["ItemId"].nunique().value_counts()

1     659392
2         54
3         14
6          3
7          2
11         2
4          2
22         1
28         1
20         1
8          1
38         1
Name: ItemId, dtype: int64

In [36]:
# how many itens purchased by each user day by day
ratings.groupby(["UserId", 'TimestampDate'])["ItemId"].nunique().value_counts()

1      420843
2       60533
3       14547
4        4755
5        2065
        ...  
60          1
363         1
145         1
189         1
82          1
Name: ItemId, Length: 88, dtype: int64

In [37]:
# how many times each user purchased items
ratings.groupby("UserId")['Timestamp'].nunique().value_counts()

1      23092
2       6193
3       3341
4       2229
5       1646
       ...  
471        1
427        1
321        1
429        1
392        1
Name: Timestamp, Length: 440, dtype: int64

In [12]:
# how many times each user purchased items per day
ratings.groupby("UserId")['TimestampDate'].nunique().value_counts()

1      25113
2       6048
3       3158
4       2187
5       1579
       ...  
420        1
198        1
602        1
224        1
332        1
Name: TimestampDate, Length: 341, dtype: int64

In [180]:
import json
import pandas as pd

# Step 1: Load the JSONL file containing movie data
with open('data/content.jsonl', 'r') as f:
    movies_data = [json.loads(line) for line in f]

# Step 2: Load the CSV file containing user ratings
df = pd.read_csv('submissoes/submissao_2_DMRL.csv')
# Normalize the Rating column
df["Rating"] = df["Rating"].apply(lambda x: min(x, 10))
print(df.head())


       UserId      ItemId     Rating
0  0006246bee  80d1dae630  10.000000
1  0006246bee  aad36aac60   9.308985
2  0006246bee  c1ee6829f5   8.054197
3  0006246bee  5506c7cf69   7.856953
4  0006246bee  ade4907055   7.736493


In [181]:
# Step 3: Create a dictionary of imdbRatings from the movies data
imdbRatings = {movie['ItemId']: movie['imdbRating'] for movie in movies_data}
metascore = {movie['ItemId']: movie['Metascore'] for movie in movies_data}
imdbVotes = {movie['ItemId']: movie['imdbVotes'] for movie in movies_data}
boxOffice = {}

for movie in movies_data:
    item_id = movie['ItemId']
    box_office = movie.get('BoxOffice', 'N/A')
    if box_office == 'N/A':
        box_office = 0  # Replace 'N/A' with 0
    else:
        # Remove any non-numeric characters like "$" and commas, then convert to an integer
        box_office = int(''.join(c for c in box_office if c.isdigit()))
    boxOffice[item_id] = box_office

print(boxOffice)

# Step 4: Add the imdbRating to each row in the CSV file
df['Metascore'] = df['ItemId'].map(metascore)
df['imdbRating'] = df['ItemId'].map(imdbRatings)
df['BoxOffice'] = df['ItemId'].map(boxOffice)
df['imdbVotes'] = df['ItemId'].map(imdbVotes)

df["Metascore"] = pd.to_numeric(df["Metascore"], errors='coerce')
df["imdbRating"] = pd.to_numeric(df["imdbRating"], errors='coerce')
df["imdbVotes"] = pd.to_numeric(df["imdbVotes"], errors='coerce')
df["BoxOffice"] = df["BoxOffice"].replace({"$": "", ",": ""}, regex=True)
df["BoxOffice"] = pd.to_numeric(df["BoxOffice"], errors='coerce')
min_box_office = df['BoxOffice'].min()
max_box_office = df['BoxOffice'].max()
df['BoxOffice'] = (df['BoxOffice'] - min_box_office) / (max_box_office - min_box_office) * 10
min_imdb_vote = df['imdbVotes'].min()
max_imdb_vote = df['imdbVotes'].max()
df['imdbVotes'] = (df['imdbVotes'] - min_imdb_vote) / (max_imdb_vote - min_imdb_vote) * 10

df["Metascore"] = df["Metascore"]/10
df["Metascore"] = df["Metascore"].fillna(5)
df["imdbRating"] = df["imdbRating"].fillna(5)
df["imdbVotes"] = df["imdbVotes"].fillna(0)
df['WeightedScore'] = (
    0.3 * df["imdbRating"] +
    0.3 * df["imdbVotes"] +
    0.2 * df["BoxOffice"] +
    0.1 * df["Metascore"] +
    0.1 * df["Rating"]
)

df_ordered = df.sort_values(['UserId', 'WeightedScore'], ascending=[True, False])
df_ordered.to_csv('submissao_2_with_imdb.csv', index=False)

{'c9f0f895fb': 0, 'd3d9446802': 0, 'c20ad4d76f': 0, '8e296a067a': 0, '54229abfcf': 0, '1afa34a7f9': 0, '41ae36ecb9': 0, 'eed5af6add': 0, '13f3cf8c53': 0, '42e77b6363': 0, '013a006f03': 0, '43cca4b3de': 0, '54f5f4071f': 0, 'c5866e93ca': 0, 'f15d337c70': 0, 'd254c8a084': 0, '6f1d0705c9': 0, '33ef701c80': 0, 'c4d2ce3f3e': 130000, '68897f19b1': 0, 'a3ec6dd8d5': 0, '6abcc8f243': 0, 'ef35613fc5': 0, 'f016f25df0': 0, 'aa36c88c27': 0, '70afbf2259': 0, '597c7b407a': 0, '09def3ebbc': 0, '64a7157cf3': 0, 'e32c51ad39': 0, 'e1cd50f4a9': 0, 'e1054bf2d7': 0, '412604be30': 0, '3366297a63': 0, '233f1dd0f3': 0, 'd98c1545b7': 0, '1b84c4cee2': 0, 'e562cd9c07': 0, '1ee942c6b1': 0, '498f2c2168': 0, '55479c55eb': 0, '324545ee1d': 0, '77ef24b42f': 0, 'f6b6d2a114': 0, '7dd3ed2e12': 0, '89c86ad4bb': 0, '46e0eae7d5': 0, '64697505ab': 0, 'ed0e6f99c8': 0, '46ba59a699': 0, '7aa7b77461': 0, '67b878df6c': 0, 'a24bdc3e59': 0, 'e0be0edcb0': 0, '8f2f470bb9': 0, '8fe6833df8': 0, '8808eda0dd': 0, 'f65854da46': 0, 'ede862d

In [169]:
df_ordered = df_ordered.drop(columns=["Rating", "Metascore", "imdbRating", "BoxOffice", "imdbVotes"])
df_ordered.to_csv('submissao_2_with_imdb.csv', index=False)

In [117]:
df_ordered["BoxOffice"].max()

10.0

In [13]:
content.isna().sum()

ItemId              0
Title               0
Year                0
Rated               0
Released            0
Runtime             0
Genre               0
Director            0
Writer              0
Actors              0
Plot                0
Language            0
Country             0
Awards              0
Poster              0
Ratings             0
Metascore           0
imdbRating          0
imdbVotes           0
Type                0
DVD                24
BoxOffice          24
Production         24
Website            24
Response            0
totalSeasons    37989
Season          38011
Episode         38011
seriesID        38011
dtype: int64

In [14]:
ratings.Rating.unique()

array([ 8.  ,  9.  ,  1.  ,  7.  ,  6.  , 10.  ,  5.  ,  4.  ,  2.  ,
        3.  ,  0.01])

## Train model

In [15]:
item_text_modality = TextModality(
    corpus=content_processed.text.to_list(),
    ids=content_processed.ItemId.to_list(),
)

In [17]:
ratio_split = RatioSplit(
    data=ratings[['UserId', 'ItemId', 'Rating']].values.tolist(),
    test_size=0.2,
    exclude_unknowns=False,
    verbose=True,
    seed=123,
    rating_threshold=0.5,
    item_text=item_text_modality,
)

rating_threshold = 0.5
exclude_unknowns = False
---
Training data:
Number of users = 46776
Number of items = 26964
Number of ratings = 527776
Max rating = 10.0
Min rating = 0.0
Global mean = 7.3
---
Test data:
Number of users = 51671
Number of items = 29674
Number of ratings = 131944
Number of unknown users = 4895
Number of unknown items = 2710
---
Total users = 51671
Total items = 29674


In [33]:
# ratio_split.train_set.uid_map

In [38]:
uid_map = ratio_split.train_set.uid_map
iid_map = ratio_split.train_set.iid_map

In [18]:
# dir(ratio_split)

In [40]:
uid_map

OrderedDict([('e1228be46d', 0),
             ('9d7f7160ba', 1),
             ('cf655092dc', 2),
             ('1c37dd710f', 3),
             ('1de77f6211', 4),
             ('2180529230', 5),
             ('0479081e54', 6),
             ('4b49840660', 7),
             ('f8f380f45c', 8),
             ('0a552abf94', 9),
             ('16437d40c2', 10),
             ('c180ea7992', 11),
             ('50523d9bd9', 12),
             ('1a01b44869', 13),
             ('df1a336b7e', 14),
             ('8a165b628f', 15),
             ('811f58cb17', 16),
             ('c219b83bdb', 17),
             ('bcdbe9d490', 18),
             ('49901fe640', 19),
             ('0c78695a47', 20),
             ('a8b97b6718', 21),
             ('cc635eaebd', 22),
             ('6d6b72ce0e', 23),
             ('0a6ad41f08', 24),
             ('f72a23858b', 25),
             ('296add4760', 26),
             ('7716d0fc31', 27),
             ('812b4ba287', 28),
             ('c0ab525d63', 29),
             ('96337

In [41]:
iid_map

OrderedDict([('d758333742', 0),
             ('1c8d7fa9b1', 1),
             ('4990232eaf', 2),
             ('f4bbd22c88', 3),
             ('cc1ed84fbe', 4),
             ('25f3f689dd', 5),
             ('a583adc4c0', 6),
             ('dc69f6da04', 7),
             ('4ed2719efd', 8),
             ('76f67e470a', 9),
             ('8ba41c73bd', 10),
             ('1656111407', 11),
             ('12ae84b41c', 12),
             ('dd220f791f', 13),
             ('d6936b08a6', 14),
             ('19041e4b04', 15),
             ('627ff0c9d3', 16),
             ('85069ffbee', 17),
             ('52e1600389', 18),
             ('c67bc1e6f3', 19),
             ('20642c89bb', 20),
             ('166bd88c0d', 21),
             ('e97b853fb7', 22),
             ('f1042a1edc', 23),
             ('b109cf95f7', 24),
             ('4e9e77c396', 25),
             ('28c8b008da', 26),
             ('2d71d7660f', 27),
             ('5260445cdb', 28),
             ('070ab2d42f', 29),
             ('03e32

In [20]:

# Instantiate DMRL recommender
dmrl_recommender = DMRL(
    batch_size=4096,
    epochs=20,
    log_metrics=False,
    learning_rate=0.01,
    num_factors=2,
    decay_r=0.5,
    decay_c=0.01,
    num_neg=3,
    embedding_dim=100,
)

In [24]:
# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split, models=[dmrl_recommender], metrics=[NDCG()]
).run()


[DMRL] Training started!
Pre-encoding the entire corpus. This might take a while.
Using device cpu for training
  batch 5 loss: 2839.757568359375
  batch 10 loss: 2839.749658203125
  batch 15 loss: 2830.976318359375
  batch 20 loss: 2783.773046875
  batch 25 loss: 2676.637255859375
  batch 30 loss: 2577.335986328125
  batch 35 loss: 2486.578271484375
  batch 40 loss: 2375.962060546875
  batch 45 loss: 2244.845458984375
  batch 50 loss: 2090.9804443359376
  batch 55 loss: 1927.3907958984375
  batch 60 loss: 1770.8798583984376
  batch 65 loss: 1691.787744140625
  batch 70 loss: 1628.7230712890625
  batch 75 loss: 1590.2730712890625
  batch 80 loss: 1567.23955078125
  batch 85 loss: 1549.6627197265625
  batch 90 loss: 1586.110791015625
  batch 95 loss: 1533.3525634765624
  batch 100 loss: 1549.5770263671875
  batch 105 loss: 1527.08916015625
  batch 110 loss: 1525.4596435546875
  batch 115 loss: 1477.9546630859375
  batch 120 loss: 1474.546875
  batch 125 loss: 1515.8119873046876
Epoch: 

Ranking:   0%|          | 0/24991 [00:00<?, ?it/s]


IndexError: index out of range in self

In [73]:
# como usar esse algoritmo para tratar itens novos?
# # como usar esse algoritmo para tratar usuarios novos?


# eu tenho o conteudo de todos os itens, incluive itens que estao apenas n conjunto de teste?

In [77]:
target_prediction = targets.copy()
target_prediction["Rating"] = -1

user_id_list = targets.UserId.unique()
for user_id in user_id_list:
    # Get the train dataframe index of the user to predict
    user_index = ratio_split.train_set.uid_map.get(user_id)

    if user_index is None:
        print(f"User {user_id} is not in the train set")
        continue

    # Flter by items to predict 
    items_to_predict = targets.loc[targets.UserId == user_id, "ItemId"].to_list()

    # Get the train dataframe index of the items to predict
    items_to_predict_index = np.array([ratio_split.train_set.iid_map.get(item_id) for item_id in items_to_predict])

    items_to_predict_tensor = torch.tensor([idx for idx in items_to_predict_index if idx is not None])

    # Get the position of items that are not in the train set
    none_indices = [i for i, x in enumerate(items_to_predict_index) if x is None]

    # Get the prediction for the items
    line_rating = dmrl_recommender.score(user_index=user_index, item_indices=items_to_predict_tensor)

    # Insert -1 in the position of items that are not in the train set
    for index_to_insert in none_indices:
        line_rating = np.insert(line_rating, index_to_insert, -1)

    # Insert the prediction in the target_prediction dataframe
    target_prediction.loc[targets.UserId == user_id, "Rating"] = line_rating

User 019f8b946a is not in the train set
User 046dd69e7a is not in the train set
User 08b0ad8c2b is not in the train set
User 0b49b88c68 is not in the train set
User 0bf04bee73 is not in the train set
User 0d858ba37c is not in the train set
User 0e095e054e is not in the train set
User 1230a1696b is not in the train set
User 1344a31569 is not in the train set
User 14345de2c2 is not in the train set
User 19892cd2c9 is not in the train set
User 1c53aae1c1 is not in the train set
User 1e44fdf9c4 is not in the train set
User 23ea75a4cd is not in the train set
User 254a5ecb7a is not in the train set
User 263138a8b9 is not in the train set
User 289319b9e5 is not in the train set
User 290db70122 is not in the train set
User 2a38a4a931 is not in the train set
User 2b12a84466 is not in the train set
User 2b634a9d15 is not in the train set
User 2bba9f4124 is not in the train set
User 2e61866ea9 is not in the train set
User 2efbe23ad5 is not in the train set
User 3002c4f348 is not in the train set


In [79]:
target_prediction = target_prediction.sort_values(["UserId", "Rating"], ascending=[True, False])

In [26]:
target_prediction

NameError: name 'target_prediction' is not defined

In [80]:
target_prediction.to_csv("submissao_2_DMRL.csv", index=False)

In [81]:
target_prediction = target_prediction.drop(columns="Rating")

In [82]:
target_prediction.to_csv("submissao_2_DMRL_sem_rating.csv", index=False)