In [27]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from numpy import loadtxt

In [28]:
ratings=pd.read_csv('ratings_small.csv')
print(ratings)

Y = np.ndarray(
    shape=(np.max(ratings.movieId.values), np.max(ratings.userId.values)),
    dtype=np.uint8)

Y[ratings.movieId.values-1, ratings.userId.values-1] = ratings.rating.values
print(Y)
print(Y.shape)

        userId  movieId  rating   timestamp
0            1       31     2.5  1260759144
1            1     1029     3.0  1260759179
2            1     1061     3.0  1260759182
3            1     1129     2.0  1260759185
4            1     1172     4.0  1260759205
...        ...      ...     ...         ...
99999      671     6268     2.5  1065579370
100000     671     6269     4.0  1065149201
100001     671     6365     4.0  1070940363
100002     671     6385     2.5  1070979663
100003     671     6565     3.5  1074784724

[100004 rows x 4 columns]
[[0 0 0 ... 0 4 5]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(163949, 671)


In [29]:
def cost_func(X,W,b,Y,R,lambda_):
  j=(tf.linalg.matmul(X,tf.transpose(W))+b-Y)*R
  J=0.5*tf.reduce_sum(j**2)+(lambda_/2)*(tf.reduce_sum(X**2)+tf.reduce_sum(W**2)) 
  return J

In [30]:
def normalizeRatings(Y, R):
    Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1) # denominator can be 0 so we add a very small number so that division is always defined
    Ynorm = Y - np.multiply(Ymean, R)
    return(Ynorm, Ymean)

In [31]:
def compute_R(Y):
  R=(Y!=0)
  return R

In [32]:
num_features=100
num_movies,num_users=Y.shape

tf.random.set_seed(1234)
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

optimizer = keras.optimizers.Adam(learning_rate=1e-1)
R=compute_R(Y)
Ynorm,Ymean=normalizeRatings(Y,R)

In [None]:
def cost_func_validation(X,W,b,Y,R):  # Cost function for finding optimum value of regularization parameter(without regularization) and to be called on the validation set to compare
  j=(tf.linalg.matmul(X,tf.transpose(W))+b-Y)*R
  J=0.5*tf.reduce_sum(j**2) 
  return J

In [33]:
iterations = 200
lambda_ = 1 # Here also we can use cross validation to find the best value of lambda by iterating over a range of values and then choosing the one with the least cost function value for the validation set
for iter in range(iterations):
    with tf.GradientTape() as tape:
        cost_value = cost_func(X, W, b, Ynorm, R, lambda_)
    grads = tape.gradient( cost_value, [X,W,b] )
    optimizer.apply_gradients( zip(grads, [X,W,b]) )
    print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 13205588.7
Training loss at iteration 1: 9802253.9
Training loss at iteration 2: 7555731.1
Training loss at iteration 3: 6033305.0
Training loss at iteration 4: 4949016.7
Training loss at iteration 5: 4132707.5
Training loss at iteration 6: 3489885.3
Training loss at iteration 7: 2967797.7
Training loss at iteration 8: 2535359.1
Training loss at iteration 9: 2173022.8
Training loss at iteration 10: 1868116.1
Training loss at iteration 11: 1611921.5
Training loss at iteration 12: 1397546.3
Training loss at iteration 13: 1218812.6
Training loss at iteration 14: 1070058.9
Training loss at iteration 15: 946330.8
Training loss at iteration 16: 843409.3
Training loss at iteration 17: 757628.6
Training loss at iteration 18: 685735.2
Training loss at iteration 19: 624932.1
Training loss at iteration 20: 572961.5
Training loss at iteration 21: 528066.2
Training loss at iteration 22: 488858.3
Training loss at iteration 23: 454216.8
Training loss at iteration 24: 423

In [34]:
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
pm = p + Ymean # These are the predictions

In [35]:
print(pm*R)

[[ 0.          0.          0.         ...  0.          3.98482421
   4.93884506]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.          0.         -0.         ...  0.          0.
  -0.        ]
 [-0.          0.         -0.         ...  0.          0.
  -0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [36]:
for i in range(pm.shape[0]):
  for j in range(pm.shape[1]):
    if R[i][j]==True:
      print(f"The predicted value is {pm[i][j]} and real value is {Y[i][j]}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
The predicted value is 1.1252547413726683 and real value is 1
The predicted value is 2.9050465615124463 and real value is 3
The predicted value is 2.9105747887216444 and real value is 3
The predicted value is 2.0411686955799233 and real value is 2
The predicted value is 1.0811890612915689 and real value is 1
The predicted value is 2.966593297842264 and real value is 3
The predicted value is 4.797830547407292 and real value is 5
The predicted value is 2.918172705734518 and real value is 3
The predicted value is 1.0720819709090634 and real value is 1
The predicted value is 1.9981233874108149 and real value is 2
The predicted value is 4.05544293681247 and real value is 4
The predicted value is 3.088599125029532 and real value is 3
The predicted value is 4.902086722856748 and real value is 5
The predicted value is 4.041545419249175 and real value is 4
The predicted value is 1.014858974480442 and real value is 1
The predicted 

In [37]:
# now for a new user we need to learn its parameters w and b and then make predictions using k closest movies to x_i using ||(x(k)-x(i))||^2 distance metric
# this is better than getting predictions for all movies and then sorting them because we are only considering k movies and not all movies

In [38]:
# But collaborative filtering suffers from a cold start problem for a new user who has not yet rated many movies or when a new movie comes out which has yet not been rated by many users
# and to address this other hybrid methods are used alongside collaborative filtering
# Moreover we cannot utilise the extra information available regarding the user and items like demography, genres etc which is done by content-based filtering
# So collaborative filtering is used when  we already have many many user ratings for different movies

In [39]:
# For existing user say user 15 we make predictions by

user_i_predictions=pm[:,15]
top_predictions=np.argsort(-user_i_predictions)
print(user_i_predictions[top_predictions])

# So we can then suggest maybe like top 100 of them to the user

[5.61108139 5.59284256 5.57964235 ... 0.5457503  0.54574946 0.54574746]


In [40]:
print(X)

<tf.Variable 'X:0' shape=(163949, 100) dtype=float64, numpy=
array([[ 6.98665714e-01,  2.48201733e-01,  3.39791154e-01, ...,
         2.33969864e-01, -6.87878544e-01,  2.11163037e-01],
       [ 2.51696370e-01,  7.79864938e-02, -3.83311322e-02, ...,
         6.91003959e-02,  6.92823100e-02, -5.71004933e-01],
       [ 2.10981489e-01,  5.66983113e-02, -9.91609560e-03, ...,
         7.27563660e-02,  2.20906098e-01, -1.87668972e-01],
       ...,
       [ 4.42851811e-06,  1.36559759e-05,  4.15360062e-07, ...,
        -8.02446809e-06, -6.99659544e-06,  1.50837647e-05],
       [-6.18792701e-06,  8.50461241e-07, -6.90297670e-06, ...,
        -1.90874737e-06, -5.95848660e-05, -5.72301777e-05],
       [-4.29432973e-03,  1.73374885e-03, -2.05681439e-03, ...,
         5.56651795e-04,  7.90600993e-03, -5.23180650e-03]])>


In [41]:
def compute_distance(a,b):
  dist=(np.linalg.norm(a-b))**2
  return dist

In [43]:
# Then we make the distance matrix between each X_m
dim=X.numpy().shape[0]
print(dim)
dim=100  # It takes a long time for the distance for all the movies to be computed so we showcase the functionality by only 1000 movies
dist = np.zeros((dim,dim))

for i in range(dim):
    for j in range(dim):
        dist[i,j] = compute_distance(X[i, :], X[j, :])
print(dist)

# Then we can take from each row or each column minimum k distance items which wil be most similar to the respective row or column where k is the number of similar items we want to a particular item

163949
[[ 0.         20.03541868 17.55678642 ... 12.62996586 13.92827721
  13.8940047 ]
 [20.03541868  0.         13.16419382 ...  6.73889732  7.67279592
   9.79285791]
 [17.55678642 13.16419382  0.         ...  5.70644781  7.03194179
   7.86230807]
 ...
 [12.62996586  6.73889732  5.70644781 ...  0.          0.8728681
   2.31779674]
 [13.92827721  7.67279592  7.03194179 ...  0.8728681   0.
   3.49162801]
 [13.8940047   9.79285791  7.86230807 ...  2.31779674  3.49162801
   0.        ]]
