In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from math import sqrt
from pomegranate.bayesian_network import BayesianNetwork
from torch import tensor
from torch.masked import MaskedTensor

## Load data

In [None]:
movies = pd.read_csv("./data/movies.dat", sep="::", header=None, engine="python", encoding_errors="ignore")
movies.columns = ["movie_id", "title", "genres"]
movies = movies.set_index("movie_id")
movies

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy
...,...,...
3948,Meet the Parents (2000),Comedy
3949,Requiem for a Dream (2000),Drama
3950,Tigerland (2000),Drama
3951,Two Family House (2000),Drama


In [3]:
ratings = pd.read_csv("../data/ratings.dat", sep="::", header=None, engine="python")
ratings.columns = ["user_id", "movie_id", "rating", "timestamp"]
ratings = ratings.set_index(["user_id", "movie_id"])
ratings

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
user_id,movie_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291
...,...,...,...
6040,1091,1,956716541
6040,1094,5,956704887
6040,562,5,956704746
6040,1096,4,956715648


## Preprocessing

### Movies

In [4]:
movies["genres"] = movies["genres"].apply(lambda genres: genres.split("|"))
genres = set()

for g in movies["genres"]:
    genres.update(g)

genres = dict([*zip(sorted([*genres]), range(len(genres)))])
genres

{'Action': 0,
 'Adventure': 1,
 'Animation': 2,
 "Children's": 3,
 'Comedy': 4,
 'Crime': 5,
 'Documentary': 6,
 'Drama': 7,
 'Fantasy': 8,
 'Film-Noir': 9,
 'Horror': 10,
 'Musical': 11,
 'Mystery': 12,
 'Romance': 13,
 'Sci-Fi': 14,
 'Thriller': 15,
 'War': 16,
 'Western': 17}

In [5]:
movies["genres"] = movies["genres"].apply(lambda genres_list: tuple([genres[g] for g in genres_list]))
movies = movies.drop(columns="title")
movies

Unnamed: 0_level_0,genres
movie_id,Unnamed: 1_level_1
1,"(2, 3, 4)"
2,"(1, 3, 8)"
3,"(4, 13)"
4,"(4, 7)"
5,"(4,)"
...,...
3948,"(4,)"
3949,"(7,)"
3950,"(7,)"
3951,"(7,)"


### Ratings

In [6]:
most_interactive_user = ratings.index.get_level_values("user_id").value_counts(ascending=False).iloc[:1]
user_id_test, nmax_ratings = [*zip(most_interactive_user.index, most_interactive_user.values)][0]
print(user_id_test, nmax_ratings)

4169 2314


In [7]:
ratings_test = ratings.loc[user_id_test].sample(int(nmax_ratings * 0.2)).reset_index()
ratings_test["user_id"] = user_id_test
ratings_test = ratings_test.set_index(["user_id", "movie_id"])
ratings_train = ratings.loc[ratings.index.difference(ratings_test.index)]
ratings_test = ratings_test.reset_index("user_id", drop=True)
print(ratings.shape, ratings_train.shape, ratings_test.shape)

(1000209, 2) (999747, 2) (462, 2)


## Graph contruct

In [8]:
parents_node_list = [() for _ in range(len(genres))]
parents_node_list.extend(movies["genres"])
parents_node_list = tuple(parents_node_list)
parents_node_list

((),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (2, 3, 4),
 (1, 3, 8),
 (4, 13),
 (4, 7),
 (4,),
 (0, 5, 15),
 (4, 13),
 (1, 3),
 (0,),
 (0, 1, 15),
 (4, 7, 13),
 (4, 10),
 (2, 3),
 (7,),
 (0, 1, 13),
 (7, 15),
 (7, 13),
 (15,),
 (4,),
 (0,),
 (0, 4, 7),
 (5, 7, 15),
 (15,),
 (7, 14),
 (7, 13),
 (7,),
 (7,),
 (13,),
 (1, 14),
 (7,),
 (7,),
 (7, 14),
 (1, 13),
 (3, 4, 7),
 (7, 13),
 (7,),
 (6,),
 (4,),
 (4, 13),
 (7,),
 (7, 16),
 (0, 5, 7),
 (7,),
 (0, 1),
 (4, 7),
 (7, 13),
 (5, 15),
 (2, 3, 11, 13),
 (7, 13),
 (5, 15),
 (0, 7, 15),
 (4,),
 (7,),
 (3, 4),
 (7,),
 (1, 3, 8),
 (7,),
 (7, 13),
 (7, 12),
 (1, 3, 8),
 (7, 15),
 (7,),
 (4,),
 (4, 13),
 (4,),
 (14, 15),
 (7,),
 (4, 13),
 (4,),
 (0, 4, 5, 10, 15),
 (0,),
 (4, 7),
 (7, 11),
 (7, 13),
 (4, 7),
 (14, 15),
 (6,),
 (7,),
 (7, 15),
 (7,),
 (5, 7, 13),
 (7,),
 (7,),
 (4, 7),
 (7, 13),
 (1, 7),
 (3, 4),
 (4,),
 (0, 15),
 (7,),
 (7, 15),
 (4, 13),
 (7,),
 (0, 15),
 (4,),
 (7,),
 (0, 15),
 (6,)

In [9]:
pgm = BayesianNetwork(structure=parents_node_list)

## Fitting & Predicting

In [10]:
observations = ratings_train.reset_index().pivot(index="user_id", columns="movie_id", values="rating").fillna(0)
nodes = pd.Index([*genres.keys(), *movies.index.astype(str)])
observations.columns = observations.columns.astype(str)
missing_movies_id = nodes.difference(observations.columns)
tmp = pd.DataFrame(0, index=observations.index, columns=missing_movies_id)
observations = pd.concat([tmp, observations], axis=1)
observations = observations[nodes].astype(int)
observations

Unnamed: 0_level_0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
pgm.fit(observations.values)

BayesianNetwork(
  (distributions): ModuleList(
    (0-17): 18 x Categorical()
    (18): ConditionalCategorical(
      (probs): ParameterList(  (0): Parameter containing: [torch.float32 of size 1x1x1x6])
      (_w_sum): [tensor([[[0.]]])]
      (_xw_sum): [tensor([[[[0., 0., 0., 0., 0., 0.]]]])]
      (_log_probs): [tensor([[[[-0.4214, -5.9336, -4.5953, -2.8626, -1.9787, -1.9969]]]])]
    )
    (19): ConditionalCategorical(
      (probs): ParameterList(  (0): Parameter containing: [torch.float32 of size 1x1x1x6])
      (_w_sum): [tensor([[[0.]]])]
      (_xw_sum): [tensor([[[[0., 0., 0., 0., 0., 0.]]]])]
      (_log_probs): [tensor([[[[-0.1234, -4.9685, -4.0240, -3.1227, -3.2381, -4.8350]]]])]
    )
    (20): ConditionalCategorical(
      (probs): ParameterList(  (0): Parameter containing: [torch.float32 of size 1x1x6])
      (_w_sum): [tensor([[0.]])]
      (_xw_sum): [tensor([[[0., 0., 0., 0., 0., 0.]]])]
      (_log_probs): [tensor([[[-0.0824, -4.9220, -4.1523, -3.4539, -4.0522, -4.

In [12]:
test = tensor([observations.loc[user_id_test]])
test_masked = MaskedTensor(test, test != 0)
test_masked.shape

  test = tensor([observations.loc[user_id_test]])
  test_masked = MaskedTensor(test, test != 0)


torch.Size([1, 3901])

In [13]:
preds = pgm.predict_proba(test_masked)

In [14]:
ratings_test["probs"] = ratings_test.apply(lambda row: preds[nodes.get_loc(str(row.name))][0].numpy()[1:], axis=1)
ratings_test["probs"] = ratings_test["probs"].apply(lambda probs: [(value + 1, prob / sum(probs)) for value, prob in enumerate(probs)])
ratings_test["pred"] = ratings_test["probs"].apply(lambda probs: sum([value * prob for value, prob in probs]))
ratings_test

Unnamed: 0_level_0,rating,timestamp,probs,pred
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1281,5,967165087,"[(1, 0.018604653), (2, 0.046511628), (3, 0.213...",4.027907
2696,3,965334304,"[(1, 0.0875), (2, 0.050000004), (3, 0.18750001...",3.700000
842,2,976589775,"[(1, 0.20666668), (2, 0.28), (3, 0.28), (4, 0....",2.593333
1677,3,973310173,"[(1, 0.16666667), (2, 0.19444442), (3, 0.36111...",2.750000
1747,5,973310735,"[(1, 0.026490066), (2, 0.13245033), (3, 0.3200...",3.487859
...,...,...,...,...
513,3,975805057,"[(1, 0.045454547), (2, 0.22727273), (3, 0.3333...",3.121212
2013,3,965693742,"[(1, 0.022544283), (2, 0.117552325), (3, 0.351...",3.455716
1026,4,967165158,"[(1, 0.14285713), (2, 0.0), (3, 0.14285713), (...",3.571428
1913,5,971578874,"[(1, 0.012048192), (2, 0.10240963), (3, 0.1987...",3.819277


In [15]:
rmse = sqrt(mean_absolute_error(ratings_test["rating"], ratings_test["pred"]))
rmse

0.8306743424775149