<a href="https://colab.research.google.com/github/MariaZharova/HM-recommender-system/blob/main/Movie_recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [285]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

## Data structure

In [286]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [287]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [288]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [289]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [290]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


## Data preprocessing

In [291]:
# prepare unique columns, merge and clean data
movies['mov_num'] = movies.index
ratings = pd.merge(ratings, movies, on='movieId', how='left')
ratings['movieId'] = ratings['mov_num']

In [292]:
# prepare data for training and test 
X = ratings[['userId', 'movieId']]
y = ratings['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)
train_rating_df = pd.merge(X_train, y_train, right_index=True, left_index=True)
test_rating_df = pd.merge(X_test, y_test, right_index=True, left_index=True)

In [293]:
train_rating_df.head()

Unnamed: 0,userId,movieId,rating
3047,20,1545,5.0
31659,219,1282,1.5
34075,232,3010,3.5
86750,561,1807,3.0
33503,226,5917,4.5


In [294]:
test_rating_df.head()

Unnamed: 0,userId,movieId,rating
92163,597,10,3.0
71427,459,7212,5.0
993,7,5376,1.0
6514,45,287,4.0
6011,42,911,5.0


In [295]:
# numbers of uninique users, movies
user_max = ratings['userId'].max() + 1
movie_max = ratings['movieId'].max() + 1
test_max = np.shape(test_rating_df)[0]

In [296]:
# create a user rating matrices for training and test sets
train_rating = np.zeros((user_max, movie_max))
for index, row in train_rating_df.iterrows():
    train_rating[int(row['userId']), int(row['movieId'])] = row['rating']
test_rating = np.zeros((user_max, movie_max))
for index, row in test_rating_df.iterrows():
    test_rating[int(row['userId']), int(row['movieId'])] = row['rating']

In [297]:
# create training and test sets 
train_record = train_rating > 0
train_record = np.array(train_record, dtype=int)
test_record = test_rating > 0
test_record = np.array(test_record, dtype=int)

## Implementation of ASVD algorithm

In [336]:
def ASVD_reccommender(mean_rating, user_max, movie_max, train_record, test_record, train_rating, test_rating):
    """
        Implementation of ASVD algorithm (matrix factorization).

        INPUT:
        mean_rating - mean value of ratings,
        user_max, movie_max - numbers of users and ratings + 1, 
        train_record, test_record - training and test sets (contains 1 or 0), 
        train_rating, test_rating - train and test data,
        alpha - parameter for loss function.

        OUTPUT:
        predict - predict value
    """
    mu = mean_rating
    bu = tf.Variable(tf.zeros((user_max, 1), dtype=tf.float32))
    bi = tf.Variable(tf.zeros((1, movie_max), dtype=tf.float32))
    Q = tf.Variable(tf.random.normal([movie_max, f], stddev=1/(f**0.5), dtype=tf.float32))
    x = tf.Variable(tf.random.normal([movie_max, f], stddev=1/(f**0.5), dtype=tf.float32))
    y = tf.Variable(tf.random.normal([movie_max, f], stddev=1/(f**0.5), dtype=tf.float32))
    train_rec = tf.constant(train_record, dtype=tf.float32)
    test_rec = tf.constant(test_record, dtype=tf.float32)
    N = np.zeros((user_max, movie_max), dtype=np.float32)
    R = np.zeros((user_max, movie_max), dtype=np.float32)
    Nu = np.sum(train_record, axis=1)
    for i in range(0, user_max):
        if Nu[i] == 0:
            Nu[i] = 1
    for i in range(0, movie_max):
        N[:,i] = Nu
    Nu = N ** 0.5
    Ru = np.sum(test_record, axis=1)
    for i in range(0, user_max):
        if Ru[i] == 0:
            Ru[i] = 1
    for i in range(0, movie_max):
        R[:,i] = Ru
    Ru = R ** 0.5
    QX = tf.matmul(Q, x, transpose_b=True)
    Qxt = tf.matmul(test_rec, QX)
    Qxt =(test_rating - mu - bi - bu) * Qxt
    QY = tf.matmul(Q, y, transpose_b=True)
    Qyt = tf.matmul(train_rec, QY)
    predict = mu + bu + bi + Qxt / Ru + Qyt / Nu

    return predict

## Test algorithm

In [337]:
# launching function and get prediction
f = 100
alpha = 0.01
mean_rating = np.mean(train_rating_df['rating'])
predict = ASVD_reccommender(mean_rating, user_max, movie_max, train_record, test_record, train_rating, test_rating)

In [377]:
RMSE = (np.sum(((test_rating - predict) * test_record) ** 2) / test_max)  ** 0.5
MAE = np.sum(((np.abs(test_rating - predict) * test_record)) / test_max)
print(RMSE)
print(MAE)

1.0511658554147856
0.8366933410800012


## Test with polara

In [361]:
from polara.recommender.data import RecommenderData
from polara.recommender.models import SVDModel
data_model = RecommenderData(merged, 'userId', 'movieId', 'rating', seed=10)

In [378]:
data_model.prepare_training_only()
svd = SVDModel(data_model)
svd.build()
svd.switch_positive = 4
svd.evaluate('relevance')

Preparing data...
Data is ready. No action was taken.
Done.
There are 100836 events in the training and 0 events in the holdout.
PureSVD training time: 0.067s
Relevance(precision=0.5671698376088903, recall=0.2879056679025798, fallout=0.0687745859398142, specifity=0.8675655478084651, miss_rate=0.6332628399391265)
