In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [3]:
movies = pd.read_csv(r'D:\Reinforcement_Learning\ml-latest-small\movies.csv')
ratings = pd.read_csv(r'D:\Reinforcement_Learning\ml-latest-small\ratings.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Reinforcement_Learning\\ml-latest-small\\movies.csv'

In [None]:
nu = len(ratings['userId'].unique())#number of unique users
nm = len(movies['movieId'].unique())#number of unique movies
print(nu, nm)

610 9742


In [None]:
# Step 1: Get full list of movieIds
all_movie_ids = movies['movieId'].unique()

# Step 2: Pivot ratings
rates = ratings.pivot(index='movieId', columns='userId', values='rating')

# Step 3: Reindex to include all movieIds (adds NaNs for missing movies)
rates = rates.reindex(all_movie_ids)

# Optional: reset index (so movieId is a column)
rates = rates.reset_index()

# Step 2: Add genre columns (get dummies) from movies
genres = movies[['movieId', 'genres']].copy()
genre_dummies = genres['genres'].str.get_dummies(sep='|')
genre_dummies = genre_dummies.drop(columns=['(no genres listed)'], errors='ignore')  # skip if not present

# Combine movieId with genre columns
genres_df = pd.concat([genres['movieId'], genre_dummies], axis=1)

# Step 3: Merge with the pivoted ratings
final_df = pd.merge(rates.reset_index(), genres_df, on='movieId')

In [None]:
final_df.to_csv(r'D:\Reinforcement_Learning\ml-latest-small\rates.csv', index=False)

In [None]:
genres_df

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9738,193583,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9739,193585,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9740,193587,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
genre_list = genre_dummies.columns.tolist()
k = len(genre_list)  # Number of genres
genre_list.extend(['movieId', 'index'])  # Add movieId to the genre list
num_df = final_df.drop(columns=genre_list)
num_df = np.array(num_df, dtype = np.float64)
np.nan_to_num(num_df, copy=False, nan=0.0)  # Replace NaN with 0

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [None]:
#mean normalsation part
mean_ratings = np.nanmean(num_df, axis=0)
normalised = num_df - mean_ratings
Y = normalised

In [None]:
tf.random.set_seed(1234)
#movie seen or not
R = np.array(num_df > 0, dtype = np.int32)
#X parameter
X = tf.Variable(tf.random.normal(shape=(Y.shape[0], k), dtype=tf.float64), name='X')
#W parameter
W = tf.Variable(tf.random.normal(shape=(nu, k), dtype=tf.float64), name='W')
#b parameter
b = tf.Variable(tf.zeros(shape=(1, nu), dtype = tf.float64), name='b')

In [None]:
print("X:", X.shape)
print("W:", W.shape)
print("b:", b.shape)
print("Y:", Y.shape)
print("R:", R.shape)

X: (9742, 19)
W: (610, 19)
b: (1, 610)
Y: (9742, 610)
R: (9742, 610)


In [None]:
optimiser = tf.keras.optimizers.Adam(learning_rate=0.05)

In [None]:
iters = 2000
lambda_ = 1

for i in range(iters):
    with tf.GradientTape() as tape:
        J = cofi_cost_func_v(X, W, b, Y, R, lambda_)
    gradients = tape.gradient(J, [X, W, b])
    optimiser.apply_gradients(zip(gradients, [X, W, b]))
    if i%20==0:
        print(f"Training loss at iteration {i}:{J:0.4f}")

Training loss at iteration 0:1647410.9435
Training loss at iteration 20:185649.2420
Training loss at iteration 40:66171.6543
Training loss at iteration 60:42375.3748
Training loss at iteration 80:33147.4769
Training loss at iteration 100:28042.7659
Training loss at iteration 120:24672.3843
Training loss at iteration 140:22265.9670
Training loss at iteration 160:20479.1605
Training loss at iteration 180:19116.4457
Training loss at iteration 200:18053.2530
Training loss at iteration 220:17207.6723
Training loss at iteration 240:16524.0230
Training loss at iteration 260:15963.4432
Training loss at iteration 280:15497.6187
Training loss at iteration 300:15105.2586
Training loss at iteration 320:14770.0771
Training loss at iteration 340:14479.5893
Training loss at iteration 360:14224.3033
Training loss at iteration 380:13997.0133
Training loss at iteration 400:13792.2115
Training loss at iteration 420:13605.6573
Training loss at iteration 440:13434.1092
Training loss at iteration 460:13275.

In [None]:
predictions = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
predictions = predictions + mean_ratings  # Adding the mean ratings back to the predictions

In [None]:
print("Predictions shape:", predictions.shape)
print("Predictions sample:", predictions[0])  # Displaying a sample

Predictions shape: (9742, 610)
Predictions sample: [4.51311555 3.9838566  2.68956362 3.89212656 4.07588901 5.14358214
 4.87399592 3.98156557 4.5951443  2.41852374 3.54996301 4.45721011
 3.78516494 3.32167211 3.23380926 3.92390304 4.28519289 3.91766213
 3.24537961 4.9430709  4.25922198 4.03385205 3.11613713 4.04541423
 4.95522859 3.53810957 3.827355   3.05066848 4.36593864 4.79398933
 4.40482269 3.82219646 4.07427072 3.42432416 4.40064477 2.59474218
 4.09823463 3.62448032 3.03941082 4.51786919 2.41564006 3.72097213
 5.04765085 3.56977627 3.86502924 4.65871063 3.47662766 4.32898898
 4.29784798 3.19679659 4.72567812 3.87981803 5.         3.14816516
 3.93062118 4.12652672 4.6551857  4.37848839 4.50377832 3.59972683
 4.09587973 3.83935148 4.31357607 3.98568824 3.74759687 3.81385495
 4.37467613 3.36029982 4.43593442 3.63814676 4.21874177 3.93959339
 3.9463666  4.27474233 3.68259002 1.33809753 3.89359912 3.54135524
 4.76961955 4.13733566 2.70287212 2.78279752 3.74745493 3.5438555
 4.52661583 