# Predict from user ratings

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import f1_score, accuracy_score
from torchmetrics.classification import MultilabelF1Score

import os

In [2]:
ratings = pd.read_csv('../dataset/ratings.csv', index_col=False)

In [3]:
ratings

Unnamed: 0,userid,movieid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [4]:
users = pd.read_csv('../dataset/users.csv')
movies_train = pd.read_csv('../dataset/movies_train.csv')
movies_test = pd.read_csv('../dataset/movies_test.csv')
movies_test

Unnamed: 0,movieid,title,genre,img_path
0,3397,the great muppet caper,"[""Children's"", 'Comedy']",dataset/images/3397.jpg
1,2067,doctor zhivago,"['Drama', 'Romance', 'War']",dataset/images/2067.jpg
2,2651,frankenstein meets the wolf man,['Horror'],dataset/images/2651.jpg
3,2989,for your eyes only,['Action'],dataset/images/2989.jpg
4,3415,the mirror,['Drama'],dataset/images/3415.jpg
...,...,...,...,...
772,2309,the inheritors,['Drama'],dataset/images/2309.jpg
773,2421,"the karate kid, part ii","['Action', 'Adventure', 'Drama']",dataset/images/2421.jpg
774,3255,a league of their own,"['Comedy', 'Drama']",dataset/images/3255.jpg
775,974,algiers,"['Drama', 'Romance']",dataset/images/974.jpg


In [16]:
user_rating_train = {}
for movie_id in movies_train['movieid']:
  rating_for_current_movie = np.zeros(6040)
  # rated_users = ratings.index[ratings['movieid'] == movie_id].tolist()
  # get list of users that rated the movies, adifferent way
  rated_users = ratings.loc[ratings['movieid'] == movie_id].userid.tolist()

  rated_v = ratings['rating'].values
  for user in rated_users:
    rating_for_current_movie[user - 1] = int(rated_v[user])
  # user_rating_train[movie_id] = rating_for_current_movie
    user_rating_train.update({movie_id:rating_for_current_movie})
#   user_rating_train.append({movie_id:rating_for_current_movie})
# user_rating_train = pd.DataFrame(user_rating_train, columns=['movieid', 'ratings'])
user_rating_train

{1281: array([0., 0., 0., ..., 0., 0., 0.]),
 3755: array([0., 0., 0., ..., 0., 0., 0.]),
 1852: array([0., 0., 0., ..., 0., 0., 0.]),
 2735: array([0., 0., 4., ..., 0., 0., 0.]),
 2424: array([0., 0., 0., ..., 0., 0., 0.]),
 2047: array([0., 0., 0., ..., 0., 0., 0.]),
 526: array([0., 0., 0., ..., 0., 0., 0.]),
 2826: array([0., 0., 0., ..., 0., 0., 0.]),
 824: array([0., 0., 0., ..., 0., 0., 0.]),
 2141: array([0., 0., 0., ..., 0., 0., 0.]),
 3674: array([0., 0., 0., ..., 0., 0., 0.]),
 3238: array([0., 0., 0., ..., 0., 0., 0.]),
 2166: array([0., 0., 0., ..., 0., 0., 0.]),
 1511: array([0., 0., 0., ..., 0., 0., 0.]),
 2084: array([0., 0., 0., ..., 0., 0., 0.]),
 3485: array([0., 0., 0., ..., 0., 0., 0.]),
 2263: array([0., 0., 0., ..., 0., 0., 0.]),
 2429: array([0., 0., 0., ..., 0., 0., 0.]),
 1331: array([0., 0., 0., ..., 0., 0., 0.]),
 3734: array([0., 0., 0., ..., 0., 0., 0.]),
 2109: array([0., 0., 0., ..., 0., 0., 0.]),
 3404: array([0., 0., 0., ..., 0., 0., 0.]),
 1238: array

In [76]:
user_rating_test = []
for movie_id in movies_test['movieid']:
  rating_of_movie = np.zeros(len(users['userid']))
  rated_users = ratings.index[ratings['movieid'] == movie_id].tolist()
  for index in rated_users:
    rating_of_movie[int(ratings['userid'].values[index]) - 1] = int(ratings['rating'].values[index])
  user_rating_test.append(rating_of_movie)

In [36]:
with open("../dataset/genres.txt", "r") as file:
  genres = file.read().split('\n')

genres

['Crime',
 'Thriller',
 'Fantasy',
 'Horror',
 'Sci-Fi',
 'Comedy',
 'Documentary',
 'Adventure',
 'Film-Noir',
 'Animation',
 'Romance',
 'Drama',
 'Western',
 'Musical',
 'Action',
 'Mystery',
 'War',
 "Children's"]

In [49]:
genres_map = {}
for index, genre in enumerate(genres):
  genres_map[genre] = index

genres_map

{'Crime': 0,
 'Thriller': 1,
 'Fantasy': 2,
 'Horror': 3,
 'Sci-Fi': 4,
 'Comedy': 5,
 'Documentary': 6,
 'Adventure': 7,
 'Film-Noir': 8,
 'Animation': 9,
 'Romance': 10,
 'Drama': 11,
 'Western': 12,
 'Musical': 13,
 'Action': 14,
 'Mystery': 15,
 'War': 16,
 "Children's": 17}

In [42]:
X_train = np.array(user_rating_train, dtype='int32')
X_test = np.array(user_rating_test, dtype='int32')
X_train.shape

(2484, 6040)

In [51]:
y_train = np.empty(shape=(len(movies_train['movieid']), 18), dtype='int32')
for index, genres_of_movie in enumerate(movies_train['genre']):
  genres_of_movie = eval(genres_of_movie)  # Convert string representation of list to actual list
  movie_genres = np.zeros(18)
  for genre in genres_of_movie:
    x = genres_map[genre]
    movie_genres[x] = 1
  y_train[index] = movie_genres

y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int32)

In [55]:
y_test = np.empty(shape=(len(movies_test['movieid']), 18), dtype='int32')
for index, genres_of_movie in enumerate(movies_test['genre']):
  movie_genres = np.zeros(18)
  genres_of_movie = eval(genres_of_movie)  # Convert string representation of list to actual list
  for genre in genres_of_movie:
    movie_genres[genres_map[genre]] = 1
  y_test[index] = movie_genres

y_test

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [56]:
sum_labels = np.sum(y_train, axis=0)
sum_labels

array([ 139,  306,   52,  213,  180,  750,   73,  188,   31,   70,  291,
       1032,   43,   87,  331,   64,   99,  173])

In [68]:
model = MultiOutputClassifier(MultinomialNB())
# model = MultiOutputClassifier(ComplementNB())
# model = OneVsRestClassifier(ComplementNB())

In [69]:
# fit
model.fit(X_train, y_train)

In [70]:
# predict
preds = model.predict(X_test)
preds_proba = model.predict_proba(X_test)
print(preds)
print(preds_proba)

[[0 0 1 ... 0 0 1]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]]
[array([[1.00000000e+000, 3.92155082e-252],
       [1.00000000e+000, 6.01925477e-277],
       [1.00000000e+000, 2.44689887e-044],
       ...,
       [1.00000000e+000, 0.00000000e+000],
       [9.95642367e-001, 4.35763336e-003],
       [1.00000000e+000, 1.32519769e-086]]), array([[1.00000000e+000, 2.23010801e-288],
       [1.00000000e+000, 3.86202580e-298],
       [9.99916174e-001, 8.38259745e-005],
       ...,
       [1.00000000e+000, 0.00000000e+000],
       [9.99999996e-001, 3.86924150e-009],
       [1.00000000e+000, 4.39348544e-028]]), array([[7.93516689e-44, 1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 2.83909007e-43],
       ...,
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.66897301e-21],
       [1.00000000e+00, 1.48575168e-41]]), array([[1.00000000e+000, 9.30194427e-182],
       [1.00000000e+000, 0.00000

In [71]:
preds_proba = np.array(preds_proba, dtype='float32')
preds_proba.shape

(18, 777, 2)

In [None]:
print(f"{preds_proba[0]}")

In [72]:
# f1 score from sklearn
f1_score(y_test, preds, average='macro')

0.48075922607992033

In [73]:

f1 = MultilabelF1Score(num_labels=len(genres), threshold=0.5, average="macro")
preds = torch.Tensor(preds)
y_test = torch.Tensor(y_test)
print('mF1 :', f1(preds, y_test))

mF1 : tensor(0.4808)


In [74]:
accuracy_score(y_test, preds)

0.16216216216216217