<a href="https://colab.research.google.com/github/Jasxpreet/Data-Science-Assignments/blob/main/Recommendation_Systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MapReduce & Recommender Systems

**Dataset: MovieLens 100k**


*   u.data: Contains user ratings → user_id, movie_id, rating, timestamp
*   u.item: Contains movie metadata → movie_id, movie_title, (and other unused fields)




---





**Part 1: MapReduce**


1.   Load the u.data file using built-in Python (i.e., open() and readlines()).
2. Parse and clean the data to extract: user_id, movie_id, rating.
3. Drop or skip malformed entries during parsing.



In [None]:
def load_data(filepath):
    """Parse a tab-separated ratings file into ``(user_id, movie_id, rating)`` tuples.

    A line is kept only if it splits into exactly four tab-separated fields
    and its first three fields convert to ``int``; every other line is
    skipped silently. The fourth field is not parsed.

    Args:
        filepath: Path of the ratings file to read.

    Returns:
        A list of ``(user_id, movie_id, rating)`` integer tuples in file order.
    """
    records = []
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) == 4:
                try:
                    record = (int(fields[0]), int(fields[1]), int(fields[2]))
                except ValueError:
                    # Non-numeric id or rating: treat the row as malformed.
                    pass
                else:
                    records.append(record)
    return records

In [None]:
# Parse the ratings file and preview the first few cleaned records.
data = load_data("/content/u.data")
print(f"Total records: {len(data)}")
# Slice rather than index so a file with fewer than five valid rows
# prints what it has instead of raising IndexError.
for i, record in enumerate(data[:5]):
    print(f"Sample record {i}: ", record)

Total records: 100000
Sample record 0:  (196, 242, 3)
Sample record 1:  (186, 302, 3)
Sample record 2:  (22, 377, 1)
Sample record 3:  (244, 51, 2)
Sample record 4:  (166, 346, 1)




---


4.  Most Rated Movies (Simulated MapReduce)
  *   Mapper: For each line in the dataset, emit a tuple (movie_id, 1)
  *   Reducer: Sum up all the values for each unique movie_id
  *   Output: A list of tuples (movie_id, rating_count) sorted by rating_count (descending)










In [None]:
from collections import defaultdict

def get_most_rated_movies(data):
    """Count ratings per movie and rank movies by that count.

    Args:
        data: Iterable of ``(user_id, movie_id, rating)`` tuples.

    Returns:
        A list of ``(movie_id, rating_count)`` tuples sorted by count,
        highest first. Movies with equal counts keep the order in which
        they were first seen in ``data``.
    """
    # "Map" + "reduce" in one pass: each record contributes 1 to its movie.
    counts = {}
    for _, movie_id, _ in data:
        counts[movie_id] = counts.get(movie_id, 0) + 1

    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Count ratings per movie, then print the ten movie ids with the most ratings.
most_rated_movies=get_most_rated_movies(data)
for movie_id,rating_count in most_rated_movies[:10]:
  print(f"Movie ID: {movie_id}, Rating Count: {rating_count}")

Movie ID: 50, Rating Count: 583
Movie ID: 258, Rating Count: 509
Movie ID: 100, Rating Count: 508
Movie ID: 181, Rating Count: 507
Movie ID: 294, Rating Count: 485
Movie ID: 286, Rating Count: 481
Movie ID: 288, Rating Count: 478
Movie ID: 1, Rating Count: 452
Movie ID: 300, Rating Count: 431
Movie ID: 121, Rating Count: 429




---


5. Average Rating per Movie (Simulated MapReduce)
  *   Mapper: Emit (movie_id, rating) for each record
  *   Reducer: For each movie, calculate the average of all ratings
  *   Output: A list of tuples (movie_id, average_rating) sorted by movie_id (optional)







In [None]:
def get_avg_ratings(data):
    """Compute the mean rating of every movie, rounded to two decimals.

    Args:
        data: Iterable of ``(user_id, movie_id, rating)`` tuples.

    Returns:
        A list of ``(movie_id, average_rating)`` tuples sorted by
        ``movie_id`` in ascending order.
    """
    # Mapper: accumulate [sum of ratings, number of ratings] per movie.
    totals = {}
    for _, movie_id, rating in data:
        acc = totals.setdefault(movie_id, [0, 0])
        acc[0] += rating
        acc[1] += 1

    # Reducer: turn each accumulator into a rounded mean, ordered by movie id.
    return sorted(
        ((movie_id, round(total / n, 2)) for movie_id, (total, n) in totals.items()),
        key=lambda pair: pair[0],
    )

In [None]:
# Compute per-movie averages (returned sorted by movie id) and preview the first ten.
average_ratings=get_avg_ratings(data)
for movie_id,avg_rating in average_ratings[:10]:
  print(f"Movie ID: {movie_id}, Average Rating: {avg_rating}")

Movie ID: 1, Average Rating: 3.88
Movie ID: 2, Average Rating: 3.21
Movie ID: 3, Average Rating: 3.03
Movie ID: 4, Average Rating: 3.55
Movie ID: 5, Average Rating: 3.3
Movie ID: 6, Average Rating: 3.58
Movie ID: 7, Average Rating: 3.8
Movie ID: 8, Average Rating: 4.0
Movie ID: 9, Average Rating: 3.9
Movie ID: 10, Average Rating: 3.83




---


6. Top N Movies by Average Rating (Filtered)
  *   Combine the outputs of steps 4 and 5 above (per-movie rating counts and per-movie average ratings)
  *   Filter to include only movies with at least 50 ratings
  *   Sort the remaining movies by average rating (descending)
  *   Output: Top 10 movies as a list of tuples:
(movie_id, average_rating, rating_count)









In [None]:
def get_top_n_movies(most_rated_movies, average_ratings, min_ratings=50, top_n=10):
    """Return the best-rated movies among those with enough ratings.

    Args:
        most_rated_movies: Iterable of ``(movie_id, rating_count)`` pairs.
        average_ratings: Iterable of ``(movie_id, average_rating)`` pairs.
        min_ratings: Minimum number of ratings a movie needs to qualify
            (inclusive: a movie with exactly ``min_ratings`` ratings is kept).
        top_n: Maximum number of movies to return.

    Returns:
        Up to ``top_n`` tuples ``(movie_id, average_rating, rating_count)``
        sorted by average rating, highest first. Movies with equal averages
        keep their order from ``average_ratings``.
    """
    rating_counts = dict(most_rated_movies)

    eligible = []
    for movie_id, avg_rating in average_ratings:
        # Movies absent from the count table are treated as having no ratings.
        count = rating_counts.get(movie_id, 0)
        # ">=" implements "at least min_ratings"; a strict ">" would wrongly
        # drop movies sitting exactly on the threshold.
        if count >= min_ratings:
            eligible.append((movie_id, avg_rating, count))

    eligible.sort(key=lambda row: row[1], reverse=True)
    return eligible[:top_n]

In [None]:
# Rank movies by average rating using the default min_ratings and top_n settings.
top_10_movies=get_top_n_movies(most_rated_movies,average_ratings)
for movie_id,avg_rating,count in top_10_movies:
  print(f"Movie ID: {movie_id}, Average Rating: {avg_rating}, Rating Count: {count}")

Movie ID: 408, Average Rating: 4.49, Rating Count: 112
Movie ID: 169, Average Rating: 4.47, Rating Count: 118
Movie ID: 318, Average Rating: 4.47, Rating Count: 298
Movie ID: 483, Average Rating: 4.46, Rating Count: 243
Movie ID: 64, Average Rating: 4.45, Rating Count: 283
Movie ID: 114, Average Rating: 4.45, Rating Count: 67
Movie ID: 12, Average Rating: 4.39, Rating Count: 267
Movie ID: 603, Average Rating: 4.39, Rating Count: 209
Movie ID: 50, Average Rating: 4.36, Rating Count: 583
Movie ID: 178, Average Rating: 4.34, Rating Count: 125




---



In [None]:
def load_movie_titles(file_path='u.item'):
    """Build a ``{movie_id: title}`` mapping from a pipe-delimited item file.

    The file is read as ISO-8859-1. For each line, the first ``|``-separated
    field is the movie id and the second is the title; any further fields
    are ignored. Malformed lines (fewer than two fields, or a non-integer
    id) are skipped, mirroring how ``load_data`` treats bad rating rows.

    Args:
        file_path: Path of the item file. Defaults to ``'u.item'`` relative
            to the current working directory.

    Returns:
        A dict mapping integer movie ids to title strings. If an id occurs
        more than once, the last title wins.
    """
    movie_titles = {}
    with open(file_path, encoding="ISO-8859-1") as f:
        for line in f:
            parts = line.strip().split('|')
            if len(parts) < 2:
                continue
            try:
                movie_id = int(parts[0])
            except ValueError:
                # One corrupt id should not abort loading the whole catalogue.
                continue
            movie_titles[movie_id] = parts[1]
    return movie_titles

In [None]:
# Load titles from the default path ('u.item', relative to the working directory).
movie_titles = load_movie_titles()
print("Sample movie titles:")
# Print the first five entries only; the dict iterates in insertion (file) order.
for i, (mid, title) in enumerate(movie_titles.items()):
    print(f"{mid}: {title}")
    if i == 4: break

Sample movie titles:
1: Toy Story (1995)
2: GoldenEye (1995)
3: Four Rooms (1995)
4: Get Shorty (1995)
5: Copycat (1995)


In [None]:
# Show the top-rated movies by title; ids missing from movie_titles fall back
# to the placeholder "Unknown Title".
for movie_id, avg, count in top_10_movies:
    title = movie_titles.get(movie_id, "Unknown Title")
    print(f"{title} -> Avg Rating: {avg}, Ratings: {count}")

Close Shave, A (1995) -> Avg Rating: 4.49, Ratings: 112
Wrong Trousers, The (1993) -> Avg Rating: 4.47, Ratings: 118
Schindler's List (1993) -> Avg Rating: 4.47, Ratings: 298
Casablanca (1942) -> Avg Rating: 4.46, Ratings: 243
Shawshank Redemption, The (1994) -> Avg Rating: 4.45, Ratings: 283
Wallace & Gromit: The Best of Aardman Animation (1996) -> Avg Rating: 4.45, Ratings: 67
Usual Suspects, The (1995) -> Avg Rating: 4.39, Ratings: 267
Rear Window (1954) -> Avg Rating: 4.39, Ratings: 209
Star Wars (1977) -> Avg Rating: 4.36, Ratings: 583
12 Angry Men (1957) -> Avg Rating: 4.34, Ratings: 125


# Part 2: TensorFlow Recommender Systems

**1. Preprocessing**
*   Split data into train/test (80/20)
*   Normalize/standardize ratings if needed
*   Encode user IDs, movie IDs








In [None]:
import pandas as pd
# Wrap the parsed (user_id, movie_id, rating) tuples in a DataFrame for the modelling steps.
df=pd.DataFrame(data, columns=["user_id", "movie_id", "rating"])
df.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [None]:
from sklearn.preprocessing import LabelEncoder

# Separate encoders so users and movies each get their own integer id space.
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

# 'user' / 'movie' hold the encoded ids; the raw ids stay in 'user_id' / 'movie_id'.
# The fitted encoders are reused later to map encoded ids back to raw ids.
df['user'] = user_encoder.fit_transform(df['user_id'])
df['movie'] = movie_encoder.fit_transform(df['movie_id'])

df.head()

Unnamed: 0,user_id,movie_id,rating,user,movie
0,196,242,3,195,241
1,186,302,3,185,301
2,22,377,1,21,376
3,244,51,2,243,50
4,166,346,1,165,345


In [None]:
from sklearn.model_selection import train_test_split

# Row-level random 80/20 split of the ratings with a fixed seed for reproducibility.
# NOTE(review): a random row split does not guarantee that every user or movie
# in the test set also appears in the training set — confirm this is acceptable.
train, test = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train size: {len(train)}, Test size: {len(test)}")

Train size: 80000, Test size: 20000


**2.	Content-Based Filtering**


*   Extract movie features (genres or TF-IDF from titles)
*   Train a neural network to predict user ratings based on movie features
*   Evaluate RMSE on test set


In [None]:
# Reload the id -> title mapping from the default path (same call as in Part 1).
movie_titles = load_movie_titles()

In [None]:
# One row per movie: the raw movie_id and its title.
movies_df = pd.DataFrame(list(movie_titles.items()), columns=['movie_id', 'title'])

In [None]:
# Reuse the encoder fitted on the ratings so movies_df['movie'] matches df['movie'].
# NOTE(review): transform presumably fails for ids that never occur in the
# ratings — confirm the item file contains no such ids.
movies_df["movie"] = movie_encoder.transform(movies_df["movie_id"])

In [None]:
movies_df.head()

Unnamed: 0,movie_id,title,movie
0,1,Toy Story (1995),0
1,2,GoldenEye (1995),1
2,3,Four Rooms (1995),2
3,4,Get Shorty (1995),3
4,5,Copycat (1995),4


In [None]:
# Attach each rating row's title.
# - Only 'movie' and 'title' are taken from movies_df so the result has a single
#   'movie_id' column instead of the duplicated movie_id_x / movie_id_y pair.
# - how="left" keeps every rating row in df's original order, which the later
#   index-based train/test selection depends on.
# - validate="many_to_one" fails fast if movies_df ever holds duplicate encoded
#   ids, which would otherwise duplicate rating rows and break that alignment.
df_merged = pd.merge(
    df,
    movies_df[["movie", "title"]],
    on="movie",
    how="left",
    validate="many_to_one",
)

In [None]:
df_merged.head()

Unnamed: 0,user_id,movie_id_x,rating,user,movie,movie_id_y,title
0,196,242,3,195,241,242,Kolya (1996)
1,186,302,3,185,301,302,L.A. Confidential (1997)
2,22,377,1,21,376,377,Heavyweights (1994)
3,244,51,2,243,50,51,Legends of the Fall (1994)
4,166,346,1,165,345,346,Jackie Brown (1997)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over movie titles, with the vocabulary capped at 100 terms.
# NOTE(review): the vectorizer is fitted on df_merged, i.e. one title per rating
# row across both train and test rows, so frequently rated titles weigh more in
# the vocabulary/IDF statistics — confirm this is intended.
tfidf=TfidfVectorizer(max_features=100)
tfidf_matrix=tfidf.fit_transform(df_merged['title'])
# Densify the sparse matrix into one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
# Column-wise concat; reset_index gives df_merged the same 0..n-1 index as tfidf_df.
df_final = pd.concat([df_merged.reset_index(drop=True), tfidf_df], axis=1)

In [None]:
df_final.head()

Unnamed: 0,user_id,movie_id_x,rating,user,movie,movie_id_y,title,1939,1940,1941,...,terminator,the,time,to,trek,under,wars,when,with,you
0,196,242,3,195,241,242,Kolya (1996),0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,186,302,3,185,301,302,L.A. Confidential (1997),0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22,377,1,21,376,377,Heavyweights (1994),0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,244,51,2,243,50,51,Legends of the Fall (1994),0.0,0.0,0.0,...,0.0,0.404557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,166,346,1,165,345,346,Jackie Brown (1997),0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Features are the TF-IDF columns only; the target is the unscaled rating.
X=df_final[tfidf.get_feature_names_out()]
y=df_final['rating']

# Reuse the earlier train/test split by index label.
# NOTE(review): train.index / test.index are labels of df, while df_final has a
# fresh 0..n-1 index. The selection is only correct if df_final's rows are in
# the same order and number as df's — verify the merge preserves this.
X_train = X.loc[train.index]
X_test = X.loc[test.index]
y_train = y.loc[train.index]
y_test = y.loc[test.index]

In [None]:
!pip install tensorflow



In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Content-based regressor: title TF-IDF features -> predicted rating.
# The input shape is declared with an explicit Input layer rather than the
# `input_shape=` argument on the first Dense layer; Keras warns about the
# latter when it is used inside a Sequential model.
model = tf.keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Predict rating (single unit, no activation)
])

# MSE is the training loss; MAE is tracked as an additional metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

# The last 10% of the training data is held out by Keras for validation.
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - loss: 2.5676 - mae: 1.2213 - val_loss: 1.2052 - val_mae: 0.8979
Epoch 2/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1.1946 - mae: 0.8913 - val_loss: 1.2048 - val_mae: 0.8906
Epoch 3/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.1810 - mae: 0.8842 - val_loss: 1.2002 - val_mae: 0.8882
Epoch 4/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1.1768 - mae: 0.8821 - val_loss: 1.2187 - val_mae: 0.8907
Epoch 5/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 1.1818 - mae: 0.8843 - val_loss: 1.2138 - val_mae: 0.8847
Epoch 6/10


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Flatten the model output to 1-D so it can be compared element-wise with y_test.
preds = model.predict(X_test).flatten()
# RMSE = square root of the mean squared error on the held-out test rows.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Test RMSE: {rmse:.4f}")

**3.	User-Based Collaborative Filtering (Matrix Factorization with Embeddings)**

*   Create user and item embedding layers
*   Concatenate and pass through a neural net to predict ratings

*   Loss function: MSE
*   Compare training with different embedding sizes


In [None]:
# Inputs are the encoded user/movie ids; the target is the rating divided by 5.
# NOTE: because of this scaling, the loss and RMSE of the model trained on
# y_collab are expressed in rating/5 units, not in rating points.
X_collab = df[['user', 'movie']]
y_collab = df['rating'] / 5.0


# Select the same rows as the earlier split (train/test were drawn from df,
# so their index labels refer to df's rows).
X_train_collab = X_collab.loc[train.index]
X_test_collab = X_collab.loc[test.index]
y_train_collab = y_collab.loc[train.index]
y_test_collab = y_collab.loc[test.index]

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense

# Embedding table sizes: one row per distinct encoded user / movie id.
# NOTE(review): using nunique() as input_dim assumes the encoded ids run from
# 0 to nunique()-1 — confirm against the encoder output.
num_users = df['user'].nunique()
num_movies = df['movie'].nunique()
# Only this single embedding size is trained in this cell.
embedding_size = 32

# User branch: integer id -> embedding vector -> flattened to 1-D per sample.
user_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size)(user_input)
user_vec = Flatten()(user_embedding)

# Movie branch: same structure as the user branch.
# The layers are not given explicit names, so Keras assigns them automatically.
movie_input = Input(shape=(1,))
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size)(movie_input)
movie_vec = Flatten()(movie_embedding)


# Join both embeddings and regress the (scaled) rating with one hidden layer.
concat = Concatenate()([user_vec, movie_vec]) # length = 2 * embedding_size (32 + 32 = 64)
dense = Dense(128, activation='relu')(concat)
output = Dense(1, activation='linear')(dense)

# Two-input model; MSE loss with RMSE reported as a metric.
model = Model([user_input, movie_input], output)
model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
model.summary()

In [None]:
# Train on (user, movie) id pairs against the scaled ratings.
# Inputs are passed as a list in the same order as the model's inputs.
history = model.fit(
    [X_train_collab['user'], X_train_collab['movie']],
    y_train_collab,
    epochs=10,
    batch_size=64,
    validation_split=0.1,  # 10% of the training rows held out for validation
    verbose=1
)

Epoch 1/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 0.0901 - root_mean_squared_error: 0.2844 - val_loss: 0.0370 - val_root_mean_squared_error: 0.1924
Epoch 2/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.0348 - root_mean_squared_error: 0.1865 - val_loss: 0.0360 - val_root_mean_squared_error: 0.1897
Epoch 3/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.0331 - root_mean_squared_error: 0.1819 - val_loss: 0.0356 - val_root_mean_squared_error: 0.1887
Epoch 4/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0315 - root_mean_squared_error: 0.1775 - val_loss: 0.0352 - val_root_mean_squared_error: 0.1876
Epoch 5/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0289 - root_mean_squared_error: 0.1700 - val_loss: 0.0355 - val_root_mean_squared_error: 0.1885
Epoch 6/10
[1m1125/1125[0m 

In [None]:
# Evaluate on the held-out rows; evaluate returns [loss, metric] in compile order.
test_loss, test_rmse = model.evaluate(
    [X_test_collab['user'], X_test_collab['movie']],
    y_test_collab,
    verbose=1
)

# The targets were divided by 5.0 before training, so this RMSE is in
# rating/5 units and is not directly comparable with the other models.
print(f"\nTest RMSE: {test_rmse:.4f}")
# RMSE scales linearly with the target, so multiplying by the same factor
# expresses the error in rating points.
print(f"Test RMSE on the original rating scale: {test_rmse * 5.0:.4f}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0409 - root_mean_squared_error: 0.2021

Test RMSE: 0.2027


**4.	Item-Based Collaborative Filtering**

*   Compute item-item similarity using cosine similarity from embeddings
*   Recommend top-N similar movies to a given one


In [None]:
# Extract the movie embedding layer's weights.
# Auto-generated layer names such as 'embedding_1' depend on how many layers
# were created earlier in the session (re-running the model cell shifts them),
# so the layer is located by type and vocabulary size instead of by name.
# NOTE: this assumes num_users != num_movies; if they were equal, the first
# matching Embedding layer would be returned.
movie_embedding_layer = next(
    layer for layer in model.layers
    if isinstance(layer, Embedding) and layer.input_dim == num_movies
)
movie_embeddings_matrix = movie_embedding_layer.get_weights()[0]

print("Shape of movie embeddings:", movie_embeddings_matrix.shape)


Shape of movie embeddings: (1682, 32)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise cosine similarity between all rows of the movie embedding matrix;
# entry [i, j] compares encoded movie i with encoded movie j.
cosine_sim = cosine_similarity(movie_embeddings_matrix)

print("Cosine similarity shape:", cosine_sim.shape)

Cosine similarity shape: (1682, 1682)


In [None]:
movie_id_to_title = load_movie_titles()


def recommend_similar_movies(movie_id_encoded, top_n=10):
    """Print the ``top_n`` movies most similar to a given movie.

    Similarity is read from the module-level ``cosine_sim`` matrix, and
    encoded ids are mapped back to raw ids with ``movie_encoder`` and then to
    titles with ``movie_id_to_title``.

    Args:
        movie_id_encoded: Row index into ``cosine_sim`` (the encoded movie id,
            not the raw ``movie_id``).
        top_n: Number of similar movies to print.

    Returns:
        None. Results are printed.
    """
    # Drop the query movie explicitly rather than assuming it sorts first:
    # ties or floating-point noise could otherwise place another movie at
    # rank 0 and leave the query itself in the recommendations.
    sim_scores = [
        (idx, score)
        for idx, score in enumerate(cosine_sim[movie_id_encoded])
        if idx != movie_id_encoded
    ]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:top_n]

    query_id = movie_encoder.inverse_transform([movie_id_encoded])[0]
    print(f"\nTop {top_n} movies similar to '{movie_id_to_title.get(query_id)}':")
    for idx, score in sim_scores:
        original_id = movie_encoder.inverse_transform([idx])[0]
        print(f"{movie_id_to_title.get(original_id)} -> Similarity: {score:.4f}")


In [None]:
# The argument is the encoded movie index (a row of cosine_sim), not the raw movie_id.
recommend_similar_movies(movie_id_encoded=240, top_n=10)


Top 10 movies similar to 'Last of the Mohicans, The (1992)':
Pretty Woman (1990) -> Similarity: 0.5772
While You Were Sleeping (1995) -> Similarity: 0.4910
So Dear to My Heart (1949) -> Similarity: 0.4880
Frankie Starlight (1995) -> Similarity: 0.4854
When Harry Met Sally... (1989) -> Similarity: 0.4671
Malice (1993) -> Similarity: 0.4591
Ghost (1990) -> Similarity: 0.4560
City Hall (1996) -> Similarity: 0.4525
Killer (Bulletproof Heart) (1994) -> Similarity: 0.4502
Enfer, L' (1994) -> Similarity: 0.4440


**5.	Hybrid Model**


*   Combine content-based + collaborative embeddings for better accuracy


In [None]:
# Convert movie_titles dict to DataFrame
movie_titles_df = pd.DataFrame(list(movie_titles.items()), columns=["movie_id", "title"])

# Merge with the main df to get the title column.
# Any existing 'title' column is dropped first so that re-running this cell
# does not produce title_x / title_y columns and a KeyError on the line below.
# how="left" keeps every rating row, in its original order.
df = df.drop(columns=["title"], errors="ignore").merge(movie_titles_df, on="movie_id", how="left")

df[['movie_id', 'movie', 'title']].head()

Unnamed: 0,movie_id,movie,title
0,242,241,Kolya (1996)
1,302,301,L.A. Confidential (1997)
2,377,376,Heavyweights (1994)
3,51,50,Legends of the Fall (1994)
4,346,345,Jackie Brown (1997)


In [None]:
# Raw id -> encoded id lookup (one row per movie).
movie_id_to_encoded = df[['movie_id', 'movie']].drop_duplicates()

# One row per movie with its title AND encoded id, assembled before
# vectorizing so features and ids come from the same frame and cannot drift
# out of alignment. how='left' keeps the first-appearance order of the titles.
unique_movies = (
    df[['movie_id', 'title']]
    .drop_duplicates()
    .merge(movie_id_to_encoded, on='movie_id', how='left')
)

# Apply TF-IDF on just the unique titles (English stop words removed,
# vocabulary capped at 100 terms).
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_matrix = tfidf.fit_transform(unique_movies['title'])

# Index the feature rows by encoded movie id directly. Setting the index at
# construction (instead of adding a 'movie' column and calling set_index)
# avoids overwriting a TF-IDF feature that happens to be named 'movie'.
tfidf_features_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=pd.Index(unique_movies['movie'], name='movie'),
)


In [None]:
# Final dataset for hybrid model: encoded ids plus the unscaled rating.
# A copy is taken so later edits to df_hybrid do not touch df.
df_hybrid = df[['user', 'movie', 'rating']].copy()

# Optional min-max scaling of the target, currently disabled (ratings stay unscaled):
# from sklearn.preprocessing import MinMaxScaler
# df_hybrid['rating'] = MinMaxScaler().fit_transform(df_hybrid[['rating']])


In [None]:
# 80/20 split with the same test_size and random_state as the earlier split of df.
train_df, test_df = train_test_split(df_hybrid, test_size=0.2, random_state=42)

In [None]:
# Embedding table sizes and the width of the per-movie content vector.
num_users = df['user'].nunique()
num_movies = df['movie'].nunique()
content_dim = tfidf_features_df.shape[1]  # TF-IDF feature size

# User embedding: integer id -> 32-d vector, flattened to 1-D per sample.
# Inputs are named so data can be fed as a dict keyed by these names.
user_input = layers.Input(shape=(1,), name='user')
user_embed = layers.Embedding(input_dim=num_users, output_dim=32)(user_input)
user_vec = layers.Flatten()(user_embed)

# Movie embedding: same structure as the user branch.
movie_input = layers.Input(shape=(1,), name='movie')
movie_embed = layers.Embedding(input_dim=num_movies, output_dim=32)(movie_input)
movie_vec = layers.Flatten()(movie_embed)

# Content-based TF-IDF input: precomputed features fed in as-is
# (a plain input, so it carries no trainable weights of its own).
movie_tfidf_input = layers.Input(shape=(content_dim,), name='movie_tfidf')

# Concatenate the learned embeddings with the fixed TF-IDF vector.
concat = layers.Concatenate()([user_vec, movie_vec, movie_tfidf_input])

# Dense layers; the final single unit has no activation and outputs the rating.
dense = layers.Dense(128, activation='relu')(concat)
dense = layers.Dense(64, activation='relu')(dense)
output = layers.Dense(1)(dense)

# Build model: three inputs, MSE loss, RMSE reported as a metric.
hybrid_model = Model(inputs=[user_input, movie_input, movie_tfidf_input], outputs=output)
hybrid_model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
hybrid_model.summary()


In [None]:
# For every rating row, look up its movie's TF-IDF vector by encoded movie id
# (the index of tfidf_features_df) and convert to a float32 array.
train_tfidf = tfidf_features_df.loc[train_df['movie']].values.astype('float32')
test_tfidf = tfidf_features_df.loc[test_df['movie']].values.astype('float32')


In [None]:
# Train the hybrid model; the dict keys match the names given to the Input layers.
history = hybrid_model.fit(
    x={
        'user': train_df['user'],
        'movie': train_df['movie'],
        'movie_tfidf': train_tfidf
    },
    y=train_df['rating'],  # unscaled ratings
    validation_split=0.1,
    epochs=5,
    batch_size=64
)

Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 2.3138 - root_mean_squared_error: 1.4414 - val_loss: 0.9302 - val_root_mean_squared_error: 0.9645
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.8821 - root_mean_squared_error: 0.9392 - val_loss: 0.9213 - val_root_mean_squared_error: 0.9599
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.8470 - root_mean_squared_error: 0.9203 - val_loss: 0.9006 - val_root_mean_squared_error: 0.9490
Epoch 4/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.8100 - root_mean_squared_error: 0.9000 - val_loss: 0.8880 - val_root_mean_squared_error: 0.9423
Epoch 5/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.7756 - root_mean_squared_error: 0.8807 - val_loss: 0.8878 - val_root_mean_squared_error: 0.9423


**6.	Evaluation**

*   RMSE, MAE
*   Precision@K and Recall@K for top-N recommendations


In [None]:
# Evaluate on the held-out rows; evaluate returns the loss followed by the
# compiled metric (RMSE), both on the unscaled rating.
test_loss, test_rmse = hybrid_model.evaluate(
    x={
        'user': test_df['user'],
        'movie': test_df['movie'],
        'movie_tfidf': test_tfidf
    },
    y=test_df['rating']
)

print(f"Hybrid Model Test RMSE: {test_rmse:.4f}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.8703 - root_mean_squared_error: 0.9328
Hybrid Model Test RMSE: 0.9386


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Predict on the test set (inputs keyed by the Input layer names)
test_predictions = hybrid_model.predict({
    'user': test_df['user'],
    'movie': test_df['movie'],
    'movie_tfidf': test_tfidf
})

# Flatten predictions to 1-D so they line up with test_df['rating']
test_predictions = test_predictions.flatten()

# Calculate RMSE and MAE against the unscaled test ratings
rmse = np.sqrt(mean_squared_error(test_df['rating'], test_predictions))
mae = mean_absolute_error(test_df['rating'], test_predictions)

print(f"Hybrid Model - Test RMSE: {rmse:.4f}")
print(f"Hybrid Model - Test MAE: {mae:.4f}")


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Hybrid Model - Test RMSE: 0.9386
Hybrid Model - Test MAE: 0.7394


In [None]:
def precision_at_k(y_true, y_pred, k=10, threshold=4):
    """Fraction of the top-``k`` ranked items that are relevant.

    Args:
        y_true: Ground-truth ratings, already ordered by predicted score,
            best first. The ranking must be applied by the caller.
        y_pred: Unused; kept so existing callers keep working.
        k: Cut-off rank.
        threshold: Minimum true rating for an item to count as relevant.

    Returns:
        The number of relevant items among the first ``k`` divided by ``k``.
        The denominator is always ``k``, even when fewer than ``k`` items
        are supplied.
    """
    relevant = np.asarray(y_true) >= threshold
    return np.sum(relevant[:k]) / k

def recall_at_k(y_true, y_pred, k=10, threshold=4):
    """Fraction of all relevant items that appear in the top ``k``.

    Args:
        y_true: Ground-truth ratings, already ordered by predicted score,
            best first. The ranking must be applied by the caller.
        y_pred: Unused; kept so existing callers keep working.
        k: Cut-off rank.
        threshold: Minimum true rating for an item to count as relevant.

    Returns:
        Relevant items among the first ``k`` divided by the total number of
        relevant items, or ``0.0`` when there are no relevant items at all.
    """
    relevant = np.asarray(y_true) >= threshold
    relevant_total = np.sum(relevant)
    if relevant_total == 0:
        # Avoid 0/0 for inputs with nothing relevant.
        return 0.0
    return np.sum(relevant[:k]) / relevant_total


In [None]:
from collections import defaultdict

def evaluate_top_k(test_df, predictions, k=10):
    """Print Precision@k and Recall@k averaged over users.

    For each user in ``test_df``, that user's rows are ranked by predicted
    rating (highest first) and the true ratings in that order are scored with
    ``precision_at_k`` and ``recall_at_k``. The per-user scores are then
    averaged with equal weight per user.

    Args:
        test_df: DataFrame with at least ``user`` and ``rating`` columns.
            It is copied, not modified.
        predictions: Predicted ratings, one per row of ``test_df``.
        k: Cut-off rank passed to both metrics.

    Returns:
        None. The two averages are printed.
    """
    scored = test_df.copy()
    scored['predicted_rating'] = predictions

    per_user_precision = []
    per_user_recall = []

    # One ranking per user.
    for _, user_rows in scored.groupby('user'):
        ranked = user_rows.sort_values('predicted_rating', ascending=False)
        ranked_ratings = ranked['rating'].values
        per_user_precision.append(precision_at_k(ranked_ratings, None, k))
        per_user_recall.append(recall_at_k(ranked_ratings, None, k))

    avg_precision = np.mean(per_user_precision)
    avg_recall = np.mean(per_user_recall)

    print(f"Precision@{k}: {avg_precision:.4f}")
    print(f"Recall@{k}: {avg_recall:.4f}")

# Score the hybrid model's test-set predictions as a per-user top-10 ranking.
evaluate_top_k(test_df, test_predictions, k=10)


Precision@10: 0.5693
Recall@10: 0.7103
