In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [3]:
movies = pd.read_csv(r'movies.csv')
ratings = pd.read_csv(r'ratings.csv')

In [4]:
nu = len(ratings['userId'].unique())#number of unique users
nm = len(movies['movieId'].unique())#number of unique movies
print(nu, nm)

610 9742


In [5]:
# Step 1: Get full list of movieIds
all_movie_ids = movies['movieId'].unique()

# Step 2: Pivot ratings
rates = ratings.pivot(index='movieId', columns='userId', values='rating')

# Step 3: Reindex to include all movieIds (adds NaNs for missing movies)
rates = rates.reindex(all_movie_ids)

# Optional: reset index (so movieId is a column)
rates = rates.reset_index()

# Step 2: Add genre columns (get dummies) from movies
genres = movies[['movieId', 'genres']].copy()
genre_dummies = genres['genres'].str.get_dummies(sep='|')
genre_dummies = genre_dummies.drop(columns=['(no genres listed)'], errors='ignore')  # skip if not present

# Combine movieId with genre columns
genres_df = pd.concat([genres['movieId'], genre_dummies], axis=1)

# Step 3: Merge with the pivoted ratings
final_df = pd.merge(rates.reset_index(), genres_df, on='movieId')

In [7]:
rates = final_df

In [8]:
genre_list = genre_dummies.columns.tolist()
k = len(genre_list)  # Number of genres
genre_list.extend(['movieId', 'index'])  # Add movieId to the genre list
num_df = rates.drop(columns=genre_list)
num_df = np.array(num_df, dtype = np.float64)


In [9]:
num_df

array([[4. , nan, nan, ..., 2.5, 3. , 5. ],
       [nan, nan, nan, ..., 2. , nan, nan],
       [4. , nan, nan, ..., 2. , nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [10]:
#mean normalsation part
mean_ratings = np.nanmean(num_df, axis=1)
mean_ratings = np.nan_to_num(mean_ratings, copy=False, nan=0.0)
mean_ratings = mean_ratings.reshape(-1, 1)  # Reshape to a column vector
num_df = np.nan_to_num(num_df, copy=False,  nan=0.0)
normalised = num_df - mean_ratings
Y=normalised

  mean_ratings = np.nanmean(num_df, axis=1)


In [11]:
tf.random.set_seed(1234)
#movie seen or not
R = np.array(num_df > 0, dtype = np.int32)
#X parameter
X = tf.Variable(tf.random.normal(shape=(Y.shape[0], k), dtype=tf.float64), name='X')
#W parameter
W = tf.Variable(tf.random.normal(shape=(nu, k), dtype=tf.float64), name='W')
#b parameter
b = tf.Variable(tf.zeros(shape=(1, nu), dtype = tf.float64), name='b')

In [12]:
print("X:", X.shape)
print("W:", W.shape)
print("b:", b.shape)
print("Y:", Y.shape)
print("R:", R.shape)

X: (9742, 19)
W: (610, 19)
b: (1, 610)
Y: (9742, 610)
R: (9742, 610)


In [13]:
optimiser = tf.keras.optimizers.Adam(learning_rate=0.1)

In [14]:
iters = 1500
lambda_ = 0.5

for i in range(iters):
    with tf.GradientTape() as tape:
        J = cofi_cost_func_v(X, W, b, Y, R, lambda_)
    gradients = tape.gradient(J, [X, W, b])
    optimiser.apply_gradients(zip(gradients, [X, W, b]))
    if i%20==0:
        print(f"Training loss at iteration {i}:{J:0.4f}")

Training loss at iteration 0:1027601.5322
Training loss at iteration 20:47128.7672
Training loss at iteration 40:27257.2680
Training loss at iteration 60:19142.1986
Training loss at iteration 80:15092.5671
Training loss at iteration 100:12879.7148
Training loss at iteration 120:11543.6053
Training loss at iteration 140:10681.8778
Training loss at iteration 160:10105.8072
Training loss at iteration 180:9710.3778
Training loss at iteration 200:9429.1992
Training loss at iteration 220:9220.3319
Training loss at iteration 240:9059.9701
Training loss at iteration 260:8933.5466
Training loss at iteration 280:8831.5898
Training loss at iteration 300:8747.5675
Training loss at iteration 320:8677.2777
Training loss at iteration 340:8617.4582
Training loss at iteration 360:8565.9214
Training loss at iteration 380:8520.2692
Training loss at iteration 400:8479.5113
Training loss at iteration 420:8442.2896
Training loss at iteration 440:8407.6691
Training loss at iteration 460:8375.5435
Training lo

In [15]:
P = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
P_new = P + mean_ratings  # Adding the mean ratings back to the predictions
P_new[0][np.argmax(P_new[0])]

5.699972963344608

In [16]:
predictions = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
predictions = predictions + mean_ratings  # Adding the mean ratings back to the predictions
predictions = np.clip(predictions, 0, 5)
my_predictions = predictions[:, 0] #Assuming the first user is the one we want to recommend movies for
my_predictions = pd.Series(my_predictions, index=genres_df['movieId'])
my_predictions = my_predictions.sort_values(ascending=False)

In [17]:
print(predictions[:, 0])

[4.24304894 3.80935433 3.72070743 ... 4.21088815 4.2108448  4.66457381]


In [18]:
pred_df = pd.DataFrame({
    'movieId': my_predictions.index,
    'predicted_rating': my_predictions.values
})

pred_df = pd.merge(pred_df, movies[['movieId', 'title', 'genres']], on='movieId', how='left')


In [19]:
pred_df

Unnamed: 0,movieId,predicted_rating,title,genres
0,3983,5.000000,You Can Count on Me (2000),Drama|Romance
1,27320,5.000000,"Nine Lives of Tomas Katz, The (2000)",Comedy|Drama|Fantasy
2,92475,5.000000,All Watched Over by Machines of Loving Grace (...,Documentary
3,92494,5.000000,Dylan Moran: Monster (2004),Comedy|Documentary
4,92694,5.000000,Perfect Sense (2011),Drama|Romance|Sci-Fi
...,...,...,...,...
9737,8765,0.684953,This Gun for Hire (1942),Crime|Film-Noir|Thriller
9738,6668,0.684937,"Road Home, The (Wo de fu qin mu qin) (1999)",Drama|Romance
9739,32160,0.684362,Twentieth Century (1934),Comedy
9740,1407,0.655110,Scream (1996),Comedy|Horror|Mystery|Thriller


In [20]:
mean_ratings_df= pd.DataFrame({
    'movieId': genres_df['movieId'],
    'mean_rating': mean_ratings.flatten()
})

In [21]:
user_id = 1  # or whichever user you're evaluating
user_ratings = ratings[ratings['userId'] == user_id][['movieId', 'rating']]
user_ratings = user_ratings.rename(columns={'rating': 'actual_rating'})
# Merge with predictions
pred_df = pd.merge(pred_df, user_ratings, on='movieId', how='left')
# Merge with mean ratings
pred_df = pd.merge(pred_df, mean_ratings_df, on='movieId', how='left')
# Sort predictions
pred_df = pred_df.sort_values(by='predicted_rating', ascending=False)

In [22]:
pred_df

Unnamed: 0,movieId,predicted_rating,title,genres,actual_rating,mean_rating
0,3983,5.000000,You Can Count on Me (2000),Drama|Romance,,4.166667
839,131098,5.000000,Saving Santa (2013),Animation|Children|Comedy,,5.000000
846,5404,5.000000,84 Charing Cross Road (1987),Drama|Romance,,4.500000
845,172589,5.000000,Winter in Prostokvashino (1984),Animation,,5.000000
844,172587,5.000000,Vacations in Prostokvashino (1980),Animation,,5.000000
...,...,...,...,...,...,...
9737,8765,0.684953,This Gun for Hire (1942),Crime|Film-Noir|Thriller,,0.000000
9738,6668,0.684937,"Road Home, The (Wo de fu qin mu qin) (1999)",Drama|Romance,,0.000000
9739,32160,0.684362,Twentieth Century (1934),Comedy,,0.000000
9740,1407,0.655110,Scream (1996),Comedy|Horror|Mystery|Thriller,,3.200000


In [23]:
print("Top 20 movie recommendations for the first user:")
for i in range(20):
    movie_id = pred_df['movieId'].iloc[i]
    title = pred_df['title'].iloc[i]
    genres = pred_df['genres'].iloc[i]
    pred = pred_df['predicted_rating'].iloc[i]
    mean = pred_df['mean_rating'].iloc[i] if not pd.isna(pred_df['mean_rating'].iloc[i]) else 'N/A'

    print(f"{i+1}. ID: {movie_id:<10} Title: {title:<60} Genre: {genres:<40} Predicted: {pred:.2f} MEAN_RATING: {mean:<40}")
    

Top 20 movie recommendations for the first user:
1. ID: 3983       Title: You Can Count on Me (2000)                                   Genre: Drama|Romance                            Predicted: 5.00 MEAN_RATING: 4.166666666666667                       
2. ID: 131098     Title: Saving Santa (2013)                                          Genre: Animation|Children|Comedy                Predicted: 5.00 MEAN_RATING: 5.0                                     
3. ID: 5404       Title: 84 Charing Cross Road (1987)                                 Genre: Drama|Romance                            Predicted: 5.00 MEAN_RATING: 4.5                                     
4. ID: 172589     Title: Winter in Prostokvashino (1984)                              Genre: Animation                                Predicted: 5.00 MEAN_RATING: 5.0                                     
5. ID: 172587     Title: Vacations in Prostokvashino (1980)                           Genre: Animation                                P

In [24]:
new_user = np.zeros((X.shape[0], 1), dtype=np.float64)
# Assuming new_user is a vector of zeros, we can use the same prediction logic
R_new = np.concatenate((R, new_user), axis = 1)

In [25]:
W_new = tf.Variable(tf.random.normal(shape=(1,k), dtype=tf.float64))
W_new = np.concatenate((W, W_new), axis=0)

In [26]:
b_new = tf.Variable(tf.zeros(shape=(1, 1), dtype=tf.float64))
b_new = tf.concat((b, b_new), axis=1)

In [27]:
mean_ratings_new = np.nan_to_num(mean_ratings, copy=False, nan=0.0)
mean_ratings_new.shape

(9742, 1)

In [28]:
new_Pred = (tf.linalg.matmul(X, tf.transpose(W_new))+b_new)*R_new + mean_ratings_new

In [29]:
new_Pred.shape

TensorShape([9742, 611])

In [30]:
new_user_pred = new_Pred[:, -1]
new_user_pred = pd.Series(new_user_pred, index=genres_df['movieId'])
new_user_pred = new_user_pred.sort_values(ascending=False)
print("Top 20 movie recommendations for the new user with no ratings:")
for i in range(20):
    movie_id = new_user_pred.index[i]
    pred_rating = new_user_pred.iloc[i]
    title = movies[movies['movieId'] == movie_id]['title'].values[0] if not movies[movies['movieId'] == movie_id].empty else 'Unknown Title'
    genres = genres_df[genres_df['movieId'] == movie_id].drop(columns='movieId').columns.tolist()
    mean = pred_df['mean_rating'].iloc[i] if not pd.isna(pred_df['mean_rating'].iloc[i]) else 'N/A'

    print(f"{i+1}. ID: {movie_id:<10} Title: {title:<80} Genre: {', '.join(genres):<60} Predicted Rating: {pred_rating:.2f} MEAN_RATING: {mean:<40}")

Top 20 movie recommendations for the new user with no ratings:
1. ID: 88448      Title: Paper Birds (Pájaros de papel) (2010)                                            Genre: Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western Predicted Rating: 5.00 MEAN_RATING: 4.166666666666667                       
2. ID: 3795       Title: Five Senses, The (1999)                                                          Genre: Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western Predicted Rating: 5.00 MEAN_RATING: 5.0                                     
3. ID: 138966     Title: Nasu: Summer in Andalusia (2003)                                                 Genre: Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, M