# `00.` Preparation

## `i.` Importing Libraries

In [None]:
%%capture
!pip install LibRecommender

In [None]:
import numpy as np
import pandas as pd
from libreco.data import random_split, split_by_ratio_chrono, DatasetPure, DatasetFeat, DataInfo
from libreco.algorithms import NCF, DeepFM
from libreco.evaluation import evaluate

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
import torch
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product
from sklearn.preprocessing import MultiLabelBinarizer
from IPython.display import display, clear_output
from torch.utils.data import Dataset, DataLoader, SequentialSampler, BatchSampler
import tensorflow as tf

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# for reproducibility
import os
import random

def seed_everything(seed=42):
  """Seed the random number generators used in this notebook.

  Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
  every CUDA device), and switches cuDNN to deterministic, non-autotuned
  kernels.

  Args:
    seed: Integer seed applied to every generator. Defaults to 42.
  """
  random.seed(seed)
  # Only inherited by subprocesses started after this point; the running
  # interpreter fixed its hash seed at start-up.
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  # Seed all GPUs, not just the current device (no-op without CUDA).
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  # cuDNN autotuning can select different kernels from run to run.
  torch.backends.cudnn.benchmark = False
seed_everything()

## `ii.` Loading Dataset

In [None]:
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
tags_df = pd.read_csv('tags.csv')
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
links = pd.read_csv('links.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


## `iii.` Data Exploration

### `a.` Explore Null Values


In [None]:
movies_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [None]:
links.isnull().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

In [None]:
tags_df.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [None]:
ratings_df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### `b.` Count Unique Items & Users

In [None]:
print(f"""Unique Users: {ratings_df['userId'].nunique()}
Unique Items: {movies_df['movieId'].nunique()}""")

Unique Users: 610
Unique Items: 9742


### `c.` Check Duplicates

In [None]:
movies_df.duplicated().sum()

0

In [None]:
tags_df.duplicated().sum()

0

In [None]:
links.duplicated().sum()

0

In [None]:
ratings_df.duplicated().sum()

0

## `iv.` Data Preprocessing

#### Preprocessing for User Similarity Model

In [None]:
movie_ratings = pd.merge(ratings_df, movies_df, on='movieId')
movie_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [None]:
movie_ratings['genres'] = movie_ratings['genres'].str.split('|')
movie_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,5,1,4.0,847434962,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,7,1,4.5,1106635946,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
3,15,1,2.5,1510577970,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
4,17,1,4.5,1305696483,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),"[Action, Thriller]"
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),"[Action, Crime, Drama]"
100833,610,160836,3.0,1493844794,Hazard (2005),"[Action, Drama, Thriller]"
100834,610,163937,3.5,1493848789,Blair Witch (2016),"[Horror, Thriller]"


In [None]:
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movie_ratings['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
genres_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
100832,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
100833,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
100834,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0


In [None]:
# Attach the one-hot genre columns to the ratings, then discard the raw genre
# list and the "(no genres listed)" placeholder column.
movie_ratings_users_df = (
    pd.concat([movie_ratings, genres_df], axis=1)
    .drop(columns=['genres', '(no genres listed)'])
)
movie_ratings_users_df

Unnamed: 0,userId,movieId,rating,timestamp,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,847434962,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,1106635946,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,1510577970,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,17,1,4.5,1305696483,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
100833,610,160836,3.0,1493844794,Hazard (2005),1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
100834,610,163937,3.5,1493848789,Blair Witch (2016),0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0


In [None]:
movie_ratings_users_df.to_csv('movie_ratings_users_df.csv', index=False)

In [None]:
def prepend_text_based_on_length(imdbid):
  """Format a numeric IMDb id as an IMDb title id such as ``tt0114709``.

  The digits are left-padded with zeros to a minimum width of seven and
  prefixed with ``tt``. Ids that already have seven or more digits are not
  padded, so ``5476944`` becomes ``tt5476944`` rather than ``tt05476944``.

  Args:
    imdbid: The IMDb id as an int or a string of digits. A value that
      already starts with ``tt`` is normalised rather than prefixed again,
      which keeps the cell safe to re-run.

  Returns:
    The id as a string of the form ``tt`` + at least seven digits.
  """
  # read_csv yields integers for this column, so do not assume a string.
  digits = str(imdbid).strip()
  if digits.startswith('tt'):
    digits = digits[2:]
  return 'tt' + digits.zfill(7)

# Apply the function to the imdbid column
links['imdbId'] = links['imdbId'].apply(prepend_text_based_on_length)

In [None]:
links

Unnamed: 0,movieId,imdbId
0,1,tt0114709
1,2,tt0113497
2,3,tt0113228
3,4,tt0114885
4,5,tt0113041
...,...,...
9737,193581,tt05476944
9738,193583,tt05914996
9739,193585,tt06397426
9740,193587,tt08391976


In [None]:
links.to_csv('Links_processed.csv', index=False)

In [None]:
# Read the file back through the same relative path it was written to, so the
# notebook does not depend on the working directory being /content.
links_df = pd.read_csv("Links_processed.csv")

In [None]:
links_df

Unnamed: 0,movieId,imdbId
0,1,tt0114709
1,2,tt0113497
2,3,tt0113228
3,4,tt0114885
4,5,tt0113041
...,...,...
9737,193581,tt05476944
9738,193583,tt05914996
9739,193585,tt06397426
9740,193587,tt08391976


#### Preprocessing for Item Similarity Model

In [None]:
import pandas as pd

def preprocess_movie_ratings(ratings_df, movies_df):
    """Join ratings with movie metadata and tidy the title, year and genres.

    Args:
        ratings_df: Ratings frame containing a ``movieId`` column.
        movies_df: Movies frame containing ``movieId``, ``title`` and
            ``genres`` columns, with genres as a ``|``-separated string.

    Returns:
        A new frame with one row per rating whose title carries a
        parenthesised four-digit year. ``year`` holds that year as a string,
        ``title`` has the year removed and is stripped of surrounding
        whitespace, and ``genres`` is a list of genre names. Rows without a
        year are dropped.
    """
    joined = ratings_df.merge(movies_df, on='movieId')
    raw_titles = joined['title']

    return (
        joined
        .assign(
            year=raw_titles.str.extract(r'\((\d{4})\)', expand=False),
            title=raw_titles.str.replace(r'\(\d{4}\)', '', regex=True).str.strip(),
        )
        .dropna(subset=['year'])
        .assign(genres=lambda frame: frame['genres'].str.split('|'))
    )


movie_ratings_item = preprocess_movie_ratings(ratings_df, movies_df)
movie_ratings_item.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,1,4.0,964982703,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,5,1,4.0,847434962,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
2,7,1,4.5,1106635946,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
3,15,1,2.5,1510577970,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
4,17,1,4.5,1305696483,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995


In [None]:
movie_ratings_item.to_csv("movies_rating_item_df.csv",index =False)

In [None]:
# Read the file back through the same relative path it was written to, so the
# notebook does not depend on the working directory being /content.
# Note: after the CSV round trip `genres` holds the string form of each list
# (e.g. "['Adventure', 'Animation']"), not a Python list.
movie_ratings_item_df = pd.read_csv("movies_rating_item_df.csv")
movie_ratings_item_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,1,4.0,964982703,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']",1995
1,5,1,4.0,847434962,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']",1995
2,7,1,4.5,1106635946,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']",1995
3,15,1,2.5,1510577970,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']",1995
4,17,1,4.5,1305696483,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']",1995


In [None]:
def load_and_preprocess_movies_data(movies_df):
    """Return a cleaned copy of the movies table with genres and year split out.

    Args:
        movies_df: Movies frame with a ``title`` column (optionally ending in
            a parenthesised four-digit year) and a ``genres`` column holding a
            ``|``-separated string.

    Returns:
        A new frame in which ``genres`` is a list of genre names, ``title``
        has the ``(YYYY)`` part removed and is stripped of surrounding
        whitespace, and ``year`` is numeric (NaN when the title carries no
        year). The input frame is left unmodified.
    """
    # Work on a copy: mutating the caller's frame made this cell
    # non-idempotent (a second run would split already-split genres into NaN)
    # and silently altered the raw `movies_df` used elsewhere.
    processed = movies_df.copy()
    raw_titles = processed['title']

    processed['genres'] = processed['genres'].str.split('|')
    processed['year'] = pd.to_numeric(
        raw_titles.str.extract(r'\((\d{4})\)', expand=False), errors='coerce'
    )
    processed['title'] = raw_titles.str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
    return processed
movies_genres_df = load_and_preprocess_movies_data(movies_df)
movies_genres_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,[Comedy],1995.0


In [None]:
movies_genres_df.to_csv("movies_genres_df.csv",index= False)

In [None]:
# Read the file back through the same relative path it was written to, so the
# notebook does not depend on the working directory being /content.
# Note: after the CSV round trip `genres` holds the string form of each list
# (e.g. "['Adventure', 'Children']"), not a Python list.
movies_genres_df_new = pd.read_csv("movies_genres_df.csv")
movies_genres_df_new

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']",1995.0
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']",1995.0
2,3,Grumpier Old Men,"['Comedy', 'Romance']",1995.0
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",1995.0
4,5,Father of the Bride Part II,['Comedy'],1995.0
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"['Action', 'Animation', 'Comedy', 'Fantasy']",2017.0
9738,193583,No Game No Life: Zero,"['Animation', 'Comedy', 'Fantasy']",2017.0
9739,193585,Flint,['Drama'],2017.0
9740,193587,Bungo Stray Dogs: Dead Apple,"['Action', 'Animation']",2018.0


-----------------------------------

# `01.` DeepFM

### `#` Split Data into Training, Evaluation, and Test Dataframes

In [None]:
# NOTE(review): `movie_ratings_df` is not assigned in any earlier cell visible in
# this notebook (only `movie_ratings`, `movie_ratings_users_df`,
# `movie_ratings_item` and `movie_ratings_item_df` are), so on a fresh kernel
# this line would raise NameError. TODO: confirm which frame is intended and
# derive it explicitly here.
# Rename the id / rating / timestamp columns to user / item / label / time,
# presumably the names LibRecommender expects — confirm against its docs.
movie_ratings_df = movie_ratings_df.rename(columns={'userId': 'user', 'movieId': 'item', 'rating': 'label', 'timestamp': 'time'})

In [None]:
train_data, eval_data, test_data = random_split(movie_ratings_df, multi_ratios=[0.8, 0.1, 0.1], seed=42)

In [None]:
# Convert the three split frames into LibRecommender datasets. `data_info`
# comes from the training set and is passed to the DeepFM constructor and
# saved alongside the model further down.
# NOTE(review): no feature-column arguments are passed to `build_trainset`, so
# presumably only the user/item ids reach the model — confirm against the
# DatasetFeat documentation if side features were intended.
train_data, data_info = DatasetFeat.build_trainset(train_data)
eval_data = DatasetFeat.build_evalset(eval_data)
test_data = DatasetFeat.build_testset(test_data)

### `#` Build and Train Model

In [None]:
import tensorflow as tf
tf.compat.v1.reset_default_graph()
deepfm = DeepFM(
        "ranking",
        data_info,
        loss_type="cross_entropy",
        embed_size=16,
        n_epochs=5,
        lr=1e-3,
        lr_decay=False,
        reg=None,
        batch_size=128,
        num_neg=1,
        use_bn=False,
        dropout_rate=None,
        hidden_units=(128, 64, 32),
        tf_sess_config=None,
    )

In [None]:
deepfm.fit(
        train_data,
        neg_sampling=True,
        verbose=2,
        shuffle=True,
        eval_data=eval_data,
        metrics=[
            "loss",
            "balanced_accuracy",
            "roc_auc",
            "pr_auc",
            "precision",
            "recall",
            "map",
            "ndcg",
        ],
    )

Training start time: [35m2024-05-25 17:30:24[0m
total params: [33m177,405[0m | embedding params: [33m163,018[0m | network params: [33m14,387[0m


train: 100%|██████████| 1261/1261 [00:06<00:00, 187.99it/s]


Epoch 1 elapsed: 6.718s
	 [32mtrain_loss: 0.5334[0m


eval_pointwise: 100%|██████████| 3/3 [00:00<00:00, 29.85it/s]
eval_listwise: 100%|██████████| 597/597 [00:07<00:00, 74.81it/s] 


	 eval log_loss: 0.4704
	 eval balanced_accuracy: 0.7809
	 eval roc_auc: 0.8587
	 eval pr_auc: 0.8505
	 eval precision@10: 0.0677
	 eval recall@10: 0.0551
	 eval map@10: 0.1401
	 eval ndcg@10: 0.2088


train: 100%|██████████| 1261/1261 [00:04<00:00, 300.09it/s]


Epoch 2 elapsed: 4.214s
	 [32mtrain_loss: 0.4773[0m


eval_pointwise: 100%|██████████| 3/3 [00:00<00:00, 126.19it/s]
eval_listwise: 100%|██████████| 597/597 [00:06<00:00, 85.30it/s] 


	 eval log_loss: 0.4612
	 eval balanced_accuracy: 0.7839
	 eval roc_auc: 0.8626
	 eval pr_auc: 0.8530
	 eval precision@10: 0.0677
	 eval recall@10: 0.0559
	 eval map@10: 0.1526
	 eval ndcg@10: 0.2185


train: 100%|██████████| 1261/1261 [00:04<00:00, 294.63it/s]


Epoch 3 elapsed: 4.292s
	 [32mtrain_loss: 0.462[0m


eval_pointwise: 100%|██████████| 3/3 [00:00<00:00, 120.54it/s]
eval_listwise: 100%|██████████| 597/597 [00:07<00:00, 81.43it/s] 


	 eval log_loss: 0.4580
	 eval balanced_accuracy: 0.7877
	 eval roc_auc: 0.8657
	 eval pr_auc: 0.8560
	 eval precision@10: 0.0719
	 eval recall@10: 0.0603
	 eval map@10: 0.1387
	 eval ndcg@10: 0.2108


train: 100%|██████████| 1261/1261 [00:04<00:00, 291.00it/s]


Epoch 4 elapsed: 4.343s
	 [32mtrain_loss: 0.4428[0m


eval_pointwise: 100%|██████████| 3/3 [00:00<00:00, 129.91it/s]
eval_listwise: 100%|██████████| 597/597 [00:05<00:00, 104.23it/s]


	 eval log_loss: 0.4648
	 eval balanced_accuracy: 0.7887
	 eval roc_auc: 0.8677
	 eval pr_auc: 0.8607
	 eval precision@10: 0.0879
	 eval recall@10: 0.0754
	 eval map@10: 0.1957
	 eval ndcg@10: 0.2733


train: 100%|██████████| 1261/1261 [00:05<00:00, 212.04it/s]


Epoch 5 elapsed: 5.957s
	 [32mtrain_loss: 0.4098[0m


eval_pointwise: 100%|██████████| 3/3 [00:00<00:00, 107.61it/s]
eval_listwise: 100%|██████████| 597/597 [00:05<00:00, 110.22it/s]


	 eval log_loss: 0.4629
	 eval balanced_accuracy: 0.7955
	 eval roc_auc: 0.8778
	 eval pr_auc: 0.8709
	 eval precision@10: 0.0925
	 eval recall@10: 0.0852
	 eval map@10: 0.1977
	 eval ndcg@10: 0.2823


In [None]:
# do final evaluation on test data
evaluate(
    model=deepfm,
    data=test_data,
    neg_sampling=True,
    metrics=["loss"],
)

eval_pointwise: 100%|██████████| 3/3 [00:00<00:00, 70.15it/s]


{'loss': 0.45941932315759293}

In [None]:
deepfm.recommend_user(user=3, n_rec=10)

{3: array([ 296,  356,    1, 2858, 2959,  858, 1265, 5952,  588,  780])}

In [None]:
data_info.save(path="/content/", model_name="deepfm")

In [None]:
deepfm.save(
        path="/content/", model_name="deepfm", manual=True, inference_only=True
    )

In [None]:
tf.compat.v1.reset_default_graph()
data_info = DataInfo.load("/content/", model_name="deepfm")
print(data_info)

n_users: 610, n_items: 8964, data density: 1.4753 %


In [None]:
model = DeepFM.load(
        path="/content/", model_name="deepfm", data_info=data_info, manual=True
    )

total params: [33m177,405[0m | embedding params: [33m163,018[0m | network params: [33m14,387[0m
