# Predict from user ratings

In [17]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import f1_score, accuracy_score
from torchmetrics.classification import MultilabelF1Score

import os

In [18]:
ratings = pd.read_csv('../dataset/ratings.csv', index_col=False)

In [19]:
ratings

Unnamed: 0,userid,movieid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [20]:
users = pd.read_csv('../dataset/users.csv')
movies_train = pd.read_csv('../dataset/movies_train.csv')
movies_test = pd.read_csv('../dataset/movies_test.csv')
movies_test

Unnamed: 0,movieid,title,genre,img_path
0,3397,the great muppet caper,"[""Children's"", 'Comedy']",dataset/images/3397.jpg
1,2067,doctor zhivago,"['Drama', 'Romance', 'War']",dataset/images/2067.jpg
2,2651,frankenstein meets the wolf man,['Horror'],dataset/images/2651.jpg
3,2989,for your eyes only,['Action'],dataset/images/2989.jpg
4,3415,the mirror,['Drama'],dataset/images/3415.jpg
...,...,...,...,...
772,2309,the inheritors,['Drama'],dataset/images/2309.jpg
773,2421,"the karate kid, part ii","['Action', 'Adventure', 'Drama']",dataset/images/2421.jpg
774,3255,a league of their own,"['Comedy', 'Drama']",dataset/images/3255.jpg
775,974,algiers,"['Drama', 'Romance']",dataset/images/974.jpg


In [21]:
user_rating_train = []
for movie_id in movies_train['movieid']:
  rating_of_movie = np.zeros(len(users['userid']))
  rated_users = ratings.index[ratings['movieid'] == movie_id].tolist()
  for index in rated_users:
    rating_of_movie[int(ratings['userid'].values[index]) - 1] = int(ratings['rating'].values[index])
  user_rating_train.append(rating_of_movie)

In [22]:
user_rating_test = []
for movie_id in movies_test['movieid']:
  rating_of_movie = np.zeros(len(users['userid']))
  rated_users = ratings.index[ratings['movieid'] == movie_id].tolist()
  for index in rated_users:
    rating_of_movie[int(ratings['userid'].values[index]) - 1] = int(ratings['rating'].values[index])
  user_rating_test.append(rating_of_movie)

In [23]:
with open("../dataset/genres.txt", "r") as file:
  genres = file.read().split('\n')

genres

['Crime',
 'Thriller',
 'Fantasy',
 'Horror',
 'Sci-Fi',
 'Comedy',
 'Documentary',
 'Adventure',
 'Film-Noir',
 'Animation',
 'Romance',
 'Drama',
 'Western',
 'Musical',
 'Action',
 'Mystery',
 'War',
 "Children's"]

In [24]:
genres_map = {}
for index, genre in enumerate(genres):
  genres_map[genre] = index

genres_map

{'Crime': 0,
 'Thriller': 1,
 'Fantasy': 2,
 'Horror': 3,
 'Sci-Fi': 4,
 'Comedy': 5,
 'Documentary': 6,
 'Adventure': 7,
 'Film-Noir': 8,
 'Animation': 9,
 'Romance': 10,
 'Drama': 11,
 'Western': 12,
 'Musical': 13,
 'Action': 14,
 'Mystery': 15,
 'War': 16,
 "Children's": 17}

In [25]:
X_train = np.array(user_rating_train, dtype='int32')
X_test = np.array(user_rating_test, dtype='int32')
X_train.shape

(2484, 6040)

In [26]:
y_train = np.empty(shape=(len(movies_train['movieid']), 18), dtype='int32')
for index, genres_of_movie in enumerate(movies_train['genre']):
  genres_of_movie = eval(genres_of_movie)  # Convert string representation of list to actual list
  movie_genres = np.zeros(18)
  for genre in genres_of_movie:
    x = genres_map[genre]
    movie_genres[x] = 1
  y_train[index] = movie_genres

y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int32)

In [27]:
y_test = np.empty(shape=(len(movies_test['movieid']), 18), dtype='int32')
for index, genres_of_movie in enumerate(movies_test['genre']):
  movie_genres = np.zeros(18)
  genres_of_movie = eval(genres_of_movie)  # Convert string representation of list to actual list
  for genre in genres_of_movie:
    movie_genres[genres_map[genre]] = 1
  y_test[index] = movie_genres

y_test

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [28]:
sum_labels = np.sum(y_train, axis=0)
sum_labels

array([ 139,  306,   52,  213,  180,  750,   73,  188,   31,   70,  291,
       1032,   43,   87,  331,   64,   99,  173])

In [29]:
model = MultiOutputClassifier(MultinomialNB())
# model = MultiOutputClassifier(ComplementNB())
# model = OneVsRestClassifier(ComplementNB())

In [30]:
# fit
model.fit(X_train, y_train)

In [31]:
# predict
preds = model.predict(X_test)
preds_proba = model.predict_proba(X_test)
print(preds)
print(preds_proba)

[[0 0 1 ... 0 0 1]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]]
[array([[1.00000000e+000, 3.92155082e-252],
       [1.00000000e+000, 6.01925477e-277],
       [1.00000000e+000, 2.44689887e-044],
       ...,
       [1.00000000e+000, 0.00000000e+000],
       [9.95642367e-001, 4.35763336e-003],
       [1.00000000e+000, 1.32519769e-086]]), array([[1.00000000e+000, 2.23010801e-288],
       [1.00000000e+000, 3.86202580e-298],
       [9.99916174e-001, 8.38259745e-005],
       ...,
       [1.00000000e+000, 0.00000000e+000],
       [9.99999996e-001, 3.86924150e-009],
       [1.00000000e+000, 4.39348544e-028]]), array([[7.93516689e-44, 1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 2.83909007e-43],
       ...,
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.66897301e-21],
       [1.00000000e+00, 1.48575168e-41]]), array([[1.00000000e+000, 9.30194427e-182],
       [1.00000000e+000, 0.00000

In [32]:
preds_proba = np.array(preds_proba, dtype='float32')
preds_proba.shape

(18, 777, 2)

In [33]:
print(f"{preds_proba[0]}")

[[1.0000000e+00 0.0000000e+00]
 [1.0000000e+00 0.0000000e+00]
 [1.0000000e+00 2.3822074e-44]
 ...
 [1.0000000e+00 0.0000000e+00]
 [9.9564236e-01 4.3576332e-03]
 [1.0000000e+00 0.0000000e+00]]


In [72]:
# f1 score from sklearn
f1_score(y_test, preds, average='macro')

0.48075922607992033

In [73]:

f1 = MultilabelF1Score(num_labels=len(genres), threshold=0.5, average="macro")
preds = torch.Tensor(preds)
y_test = torch.Tensor(y_test)
print('mF1 :', f1(preds, y_test))

mF1 : tensor(0.4808)


In [74]:
accuracy_score(y_test, preds)

0.16216216216216217

In [34]:
import torch

def P_at_K(k, pred, truth):
    # print(pred)
    _, indices = torch.topk(pred, k=k)
    correct = 0
    for id in indices:
        if truth[id] > 0:
            correct += 1
    return correct / k

def AP_at_K(k, pred, truth):
    AP = 0
    for i in range(1, k+1):
        AP += P_at_K(i, pred, truth) 
    return AP / k

def MAP_at_K(k, pred_list, truth_list):
    MAP = 0
    for i in range(len(pred_list)):
        MAP += AP_at_K(k, pred_list[i], truth_list[i])
    return MAP / len(pred_list)


def normalize(pred, topk=False):
    pred_1 = torch.zeros(pred.shape)
    for i in range(pred.shape[0]):
        if topk:
            ids = torch.topk(pred[i], k=3).indices
        else:
            ids = [j*(pred[i][j] > 0.0) for j in range(len(pred[i]))]
        for id in ids:
            pred_1[i][id] = 1
    return pred_1

def print_metrics(pred, truth, thres=0.5):
    print("--------------------------------------")
    print('MAP@1 ', MAP_at_K(1, pred, truth))
    print('MAP@2 ', MAP_at_K(2, pred, truth))
    print('MAP@3 ', MAP_at_K(3, pred, truth))
    print('MAP@4 ', MAP_at_K(4, pred, truth))

    print("--------------------------------------")
    from torchmetrics.functional.classification import multilabel_f1_score
    from torchmetrics.classification import MultilabelF1Score


    # f1arr = []
    # for thres in range(1, 20):
    #     thres /= 20
    #     f1ma = MultilabelF1Score(num_labels=18, threshold=thres, average='macro')
    #     # f1arr.append((thres,f1ma(pred, truth).tolist()))
    #     print(thres, ' : ', f1ma(pred, truth).tolist())

    # print('mF1 - micro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='micro').tolist())
    # print('mF1 - macro:    ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='macro').tolist())
    # print('mF1 - weighted: ', multilabel_f1_score(pred, truth, num_labels=18, threshold=thres, average='weighted').tolist())

    f1ma = MultilabelF1Score(num_labels=18, threshold=thres, average='macro')
    print('F1 macro :', f1ma(pred, truth).tolist())
    f1mi = MultilabelF1Score(num_labels=18, threshold=thres, average='micro')
    print('F1 micro :', f1mi(pred, truth).tolist())

    print("--------------------------------------")
    # from torchmetrics.functional.classification import multilabel_accuracy
    from torchmetrics.classification import MultilabelAccuracy
    acc = MultilabelAccuracy(num_labels=18, threshold=thres)
    print('Accuracy :', acc(pred, truth).tolist())

    print("--------------------------------------")
    # from torchmetrics.functional.classification import multilabel_precision
    from torchmetrics.classification import MultilabelPrecision
    prec = MultilabelPrecision(num_labels=18, threshold=thres, average='macro')
    print('Precision :', prec(pred, truth).tolist())

    print("--------------------------------------")
    # from torchmetrics.functional.classification import multilabel_recall
    from torchmetrics.classification import MultilabelRecall
    rec = MultilabelRecall(num_labels=18, threshold=thres, average='macro')
    print('Recall :', rec(pred, truth).tolist())


In [35]:
print_metrics(preds, y_test, thres=0.7)

--------------------------------------


TypeError: topk(): argument 'input' (position 1) must be Tensor, not numpy.ndarray