In [3]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Regularised squared-error cost of the collaborative-filtering model.

    Only the entries flagged in R contribute to the error term, and X and W
    (but not b) are L2-penalised. Everything is expressed with tensorflow
    operations so the result can be differentiated inside a custom training loop.

    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)):  matrix of user parameters
      b (ndarray (1, num_users)):            vector of user bias parameters
      Y (ndarray (num_movies,num_users)):    matrix of user ratings of movies
      R (ndarray (num_movies,num_users)):    mask, R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float):                       regularization parameter
    Returns:
      J (scalar): cost
    """
    # Predicted rating for every (movie, user) pair.
    scores = tf.linalg.matmul(X, tf.transpose(W)) + b

    # Prediction error, zeroed wherever R is 0.
    masked_error = (scores - Y) * R

    data_term = 0.5 * tf.reduce_sum(masked_error**2)
    penalty = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return data_term + penalty

In [5]:
# Directory that holds the "ml-latest-small" CSV files. The default is the
# location used when the notebook was written; set the ML_LATEST_SMALL_DIR
# environment variable to run against a copy stored elsewhere without
# editing this cell.
DATA_DIR = Path(os.environ.get('ML_LATEST_SMALL_DIR', r'D:\Reinforcement_Learning\ml-latest-small'))

# movies:  one row per movie (movieId, title, genres are read later on)
# ratings: individual ratings (userId, movieId, rating are read later on)
# rates:   wide table; after the genre, 'movieId' and 'index' columns are
#          dropped, the remaining columns are used as the rating matrix
movies = pd.read_csv(DATA_DIR / 'movies.csv')
ratings = pd.read_csv(DATA_DIR / 'ratings.csv')
rates = pd.read_csv(DATA_DIR / 'ratings_filled_25_percent.csv')

In [6]:
# Distinct users come from the ratings table, distinct movies from the movies table.
nu = ratings['userId'].unique().size
nm = movies['movieId'].unique().size

print(nu, nm)

610 9742


In [7]:
# Every distinct movieId present in the movies table.
all_movie_ids = movies['movieId'].unique()

# Private copy of the two columns needed for the genre encoding.
genres = movies[['movieId', 'genres']].copy()

# One indicator column per genre label ('|' separates the labels); the
# '(no genres listed)' placeholder column is removed when it occurs.
genre_dummies = (
    genres['genres']
    .str.get_dummies(sep='|')
    .drop(columns=['(no genres listed)'], errors='ignore')
)

# movieId followed by the genre indicator columns.
genres_df = pd.concat([genres['movieId'], genre_dummies], axis=1)


In [8]:
# Display the movieId + genre-indicator table assembled in the previous cell.
genres_df

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9738,193583,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9739,193585,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9740,193587,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Genre column names; their count k is reused later as the number of learned features.
genre_list = list(genre_dummies.columns)
k = len(genre_list)

# Besides the genre columns, 'movieId' and 'index' are not ratings either.
genre_list += ['movieId', 'index']

# Whatever columns remain in `rates` form the float64 rating matrix.
num_df = np.array(rates.drop(columns=genre_list), dtype=np.float64)


In [10]:
# Mean normalisation: subtract each column's mean (NaNs are ignored when
# averaging), then overwrite the entries that are still NaN with 0.
mean_ratings = np.nanmean(num_df, axis=0)
normalised = np.nan_to_num(num_df - mean_ratings, copy=False, nan=0.0)
Y = normalised

In [11]:
tf.random.set_seed(1234)

# Mask: 1 where num_df holds a value greater than 0, otherwise 0
# (NaN entries compare False and therefore become 0).
R = (num_df > 0).astype(np.int32)

# Trainable parameters. X is drawn before W, so the seeded random values
# come out in the same order as before.
X = tf.Variable(tf.random.normal(shape=(Y.shape[0], k), dtype=tf.float64), name='X')  # one row per row of Y
W = tf.Variable(tf.random.normal(shape=(nu, k), dtype=tf.float64), name='W')          # one row per user
b = tf.Variable(tf.zeros(shape=(1, nu), dtype=tf.float64), name='b')                  # biases start at zero

In [12]:
# Report the shape of every array that is passed to the cost function.
print("\n".join(
    f"{label}: {arr.shape}"
    for label, arr in (("X", X), ("W", W), ("b", b), ("Y", Y), ("R", R))
))

X: (9742, 19)
W: (610, 19)
b: (1, 610)
Y: (9742, 610)
R: (9742, 610)


In [13]:
# Adam optimiser (learning rate 0.1); the training loop uses it to apply the gradient updates.
optimiser = tf.keras.optimizers.Adam(learning_rate=0.1)

In [14]:
iters = 1500     # number of optimisation steps
lambda_ = 0.5    # regularisation strength handed to the cost function

trainable = [X, W, b]

for i in range(iters):
    # Record the forward pass so the cost can be differentiated.
    with tf.GradientTape() as tape:
        J = cofi_cost_func_v(X, W, b, Y, R, lambda_)

    # One optimiser step on X, W and b together.
    gradients = tape.gradient(J, trainable)
    optimiser.apply_gradients(zip(gradients, trainable))

    # Log the cost on every 20th iteration.
    if not i % 20:
        print(f"Training loss at iteration {i}:{J:0.4f}")

Training loss at iteration 0:14996024.9643
Training loss at iteration 20:187652.5082
Training loss at iteration 40:67984.6511
Training loss at iteration 60:47546.2662
Training loss at iteration 80:42185.5437
Training loss at iteration 100:39248.7341
Training loss at iteration 120:37295.3983
Training loss at iteration 140:35597.8070
Training loss at iteration 160:34118.6197
Training loss at iteration 180:32851.8809
Training loss at iteration 200:31763.1358
Training loss at iteration 220:30803.9804
Training loss at iteration 240:29938.3119
Training loss at iteration 260:29148.4992
Training loss at iteration 280:28428.7855
Training loss at iteration 300:27778.4145
Training loss at iteration 320:27197.5537
Training loss at iteration 340:26685.0927
Training loss at iteration 360:26238.0189
Training loss at iteration 380:25851.7621
Training loss at iteration 400:25520.7666
Training loss at iteration 420:25239.0293
Training loss at iteration 440:25000.4827
Training loss at iteration 460:24799

In [15]:
# Learned model output X·Wᵀ + b for every (row, column) of the rating matrix ...
predictions = X.numpy() @ W.numpy().T + b.numpy()
# ... with the column means that were removed during normalisation added back.
predictions = predictions + mean_ratings

# Column 0 is taken as the user to recommend for; label the values by
# movieId and order them from highest to lowest prediction.
my_predictions = pd.Series(
    predictions[:, 0],
    index=genres_df['movieId'],
).sort_values(ascending=False)

In [31]:
# Turn the ranked Series into a two-column table (movieId, predicted_rating)
# and attach title and genres from the movies table; the left join keeps
# every prediction row in its current order.
pred_df = (
    my_predictions
    .rename_axis('movieId')
    .reset_index(name='predicted_rating')
    .merge(movies[['movieId', 'title', 'genres']], on='movieId', how='left')
)


In [32]:
user_id = 1  # or whichever user you're evaluating
# NOTE(review): the predictions were read from column 0 of the prediction
# matrix; confirm that column really belongs to this userId.

# Ratings this user actually gave, renamed so they sit next to the predictions.
user_ratings = (
    ratings.loc[ratings['userId'] == user_id, ['movieId', 'rating']]
    .rename(columns={'rating': 'actual_rating'})
)

# pred_df is merged into itself, so running this cell a second time used to
# produce 'actual_rating_x' / 'actual_rating_y' instead of 'actual_rating'.
# Dropping any column left behind by an earlier run keeps the cell idempotent.
pred_df = (
    pred_df
    .drop(columns='actual_rating', errors='ignore')
    .merge(user_ratings, on='movieId', how='left')   # left join: unrated movies get NaN
    .sort_values(by='predicted_rating', ascending=False)
)

In [40]:
print("Top 20 movie recommendations for the first user:")

# head(20) yields at most 20 rows, so a table shorter than 20 rows is printed
# in full instead of raising IndexError from a positional lookup.
for rank, row in enumerate(pred_df.head(20).itertuples(index=False), start=1):
    # Movies the user never rated carry NaN after the left merge; show 'N/A'.
    actual = 'N/A' if pd.isna(row.actual_rating) else row.actual_rating

    # Fields are read straight from `row`; in particular nothing is bound to
    # the name `genres`, so the `genres` DataFrame built earlier in the
    # notebook is no longer overwritten by a string.
    print(f"{rank}. ID: {row.movieId:<10} Title: {row.title:<60} Genre: {row.genres:<40} Predicted: {row.predicted_rating:.2f} \t Actual: {actual}")
    

Top 20 movie recommendations for the first user:
1. ID: 912        Title: Casablanca (1942)                                            Genre: Drama|Romance                            Predicted: 4.83 	 Actual: N/A
2. ID: 260        Title: Star Wars: Episode IV - A New Hope (1977)                    Genre: Action|Adventure|Sci-Fi                  Predicted: 4.82 	 Actual: 5.0
3. ID: 55820      Title: No Country for Old Men (2007)                                Genre: Crime|Drama                              Predicted: 4.80 	 Actual: N/A
4. ID: 1262       Title: Great Escape, The (1963)                                     Genre: Action|Adventure|Drama|War               Predicted: 4.80 	 Actual: N/A
5. ID: 1197       Title: Princess Bride, The (1987)                                   Genre: Action|Adventure|Comedy|Fantasy|Romance  Predicted: 4.80 	 Actual: 5.0
6. ID: 1250       Title: Bridge on the River Kwai, The (1957)                         Genre: Adventure|Drama|War                   

In [None]:
# NOTE(review): one-element array; nothing else in the visible notebook reads
# `new_user` — this looks like an unfinished cell, confirm the intent.
new_user = np.array([1,])

Predictions shape: (9742, 610)
Predictions sample: [4.36935707 4.26931466 4.18799004 ... 4.36840841 4.36347712 4.37208063]
