Book Recommendation System using Collaborative Filtering

Part I: User-Based Nearest Neighbor Recommendation using Pearson Correlation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Locations of the training and test rating files (absolute paths on the author's machine).
train_path = '/Users/laminthiri/Desktop/SP-1/mini projects/project2/rating10user91_trainset.csv'
test_path = '/Users/laminthiri/Desktop/SP-1/mini projects/project2/rating10user91_testset.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# User-item matrix: one row per user, one column per ISBN.
# Repeated (user, book) ratings are averaged.
user_item_matrix = pd.pivot_table(
    train,
    values="rating",
    index="userid",
    columns="isbn",
    aggfunc="mean",
)
user_item_matrix

isbn,014028009X,014029628X,034538475X,043935806X,044021145X,044022165X,044023722X,044651652X,059035342X,067976402X,...,671003755,671027360,671041789,679781587,743418174,786868716,804106304,805063897,842329129,971880107
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6251,,,,,,,,,,8.0,...,,,,,,,,,,
6575,,,,,,,,,,,...,,,,,,,,,,
7346,,8.0,,,,,,,,,...,,,,,,7.0,9.0,,,
11676,8.0,7.0,6.0,,1.0,,8.0,8.0,10.0,10.0,...,1.0,,5.0,,8.0,9.0,,,9.0,6.0
13552,,,,,,8.0,,8.0,,,...,,,,,10.0,9.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261829,,,,,,,,,,8.0,...,,9.0,,,,,,,,
265115,7.0,9.0,,,,,,,,,...,,,,,,,,,,
270713,,7.0,,,,,,,,,...,,,,10.0,,10.0,8.0,8.0,,
271448,,,,10.0,,,10.0,,,,...,,,,,,,,,,2.0


In [4]:
def get_common_ratings(user_item_matrix, user1, user2):
    """Return the ratings of the items that both users have rated.

    Parameters
    ----------
    user_item_matrix : pandas.DataFrame
        Frame whose rows are looked up by user label via ``.loc``.
    user1, user2 :
        Row labels of the two users to compare.

    Returns
    -------
    pandas.DataFrame
        One row per co-rated item, with one column per user holding that
        user's rating. Empty when the users share no rated item.
    """
    first = user_item_matrix.loc[user1]
    second = user_item_matrix.loc[user2]

    # An item is shared when neither user is missing a rating for it.
    both_rated = ~(first.isna() | second.isna())

    return pd.DataFrame({user1: first[both_rated], user2: second[both_rated]})


In [5]:
def cal_pearson(common_ratings, user1, user2):
    """Pearson correlation between two users over their co-rated items.

    Parameters
    ----------
    common_ratings : pandas.DataFrame
        Frame with one column per user (labelled ``user1`` and ``user2``)
        holding the ratings of the items both users rated.
    user1, user2 :
        Column labels of the two users inside ``common_ratings``.

    Returns
    -------
    float
        The correlation coefficient. ``0.0`` is returned when it is
        undefined: no co-rated items, or at least one user whose co-rated
        ratings are all identical (zero variance).
    """
    user1_ratings = common_ratings[user1]
    user2_ratings = common_ratings[user2]

    # No overlap: the correlation is undefined, treat it as "no similarity".
    if len(user1_ratings) == 0:
        return 0.0

    # Deviations of each rating from the user's mean over the co-rated items.
    user1_dev = user1_ratings - user1_ratings.mean()
    user2_dev = user2_ratings - user2_ratings.mean()

    # Numerator: sum of products of deviations.
    numerator = (user1_dev * user2_dev).sum()

    # Denominator: product of the root sums of squared deviations.
    denominator = np.sqrt((user1_dev ** 2).sum()) * np.sqrt((user2_dev ** 2).sum())

    # Zero variance for either user would divide by zero.
    if denominator == 0:
        return 0.0

    # Always hand back a plain float so callers never mix int 0 with floats.
    return float(numerator / denominator)


In [6]:
def cal_user_similarity(user1, user2):
    """Pearson similarity between two users of the global ``user_item_matrix``.

    Parameters
    ----------
    user1, user2 :
        Row labels of the two users in ``user_item_matrix``.

    Returns
    -------
    float
        ``1.0`` when both labels are the same user, ``0.0`` when the users
        have no co-rated item, otherwise the value of ``cal_pearson`` over
        the co-rated items.
    """
    if user1 == user2:
        return 1.0

    user1_ratings = user_item_matrix.loc[user1]
    user2_ratings = user_item_matrix.loc[user2]

    # Boolean mask over all items: True where both users have a rating.
    common_items = user1_ratings.notna() & user2_ratings.notna()

    # The mask always has one entry per item, so its length says nothing
    # about overlap; check whether any entry is True instead.
    if not common_items.any():
        return 0.0

    common_ratings = pd.DataFrame({
        user1: user1_ratings[common_items],
        user2: user2_ratings[common_items]
    })

    return cal_pearson(common_ratings, user1, user2)

In [7]:
# User labels as a plain Python list
users = user_item_matrix.index.tolist()

# Empty user x user table, filled in below
similarity_score = pd.DataFrame(index=users, columns=users)

# The measure is symmetric, so score each unordered pair once
# (diagonal included) and write the value into both cells.
for pos, first_user in enumerate(users):
    for second_user in users[pos:]:
        score = cal_user_similarity(first_user, second_user)
        similarity_score.at[first_user, second_user] = score
        similarity_score.at[second_user, first_user] = score

print("\nUser-User Similarity Matrix:")
similarity_score


User-User Similarity Matrix:


Unnamed: 0,6251,6575,7346,11676,13552,16795,17950,21014,23872,23902,...,241980,242083,245410,255489,258534,261829,265115,270713,271448,274004
6251,1.0,0,0,0.0,0,1.0,0,0,0,0,...,0,1.0,1.0,0,0,0,0,0,0,0
6575,0,1.0,0,0.228692,0,0,0,0,0,-1.0,...,0,0,0,0,0.5,0,0,0,0,0
7346,0,0,1.0,-0.185419,0,-1.0,0,0,0,0,...,-1.0,-1.0,0,0,0.0,0,-1.0,-0.654654,0,0
11676,0.0,0.228692,-0.185419,1.0,-0.603023,0.09557,-0.991241,-0.413664,0.904534,0.209032,...,-0.065478,-0.141731,-0.384111,0.094283,-0.378326,0.157483,0.618025,-0.339855,0.240723,-0.016062
13552,0,0,0,-0.603023,1.0,0.394771,1.0,0,0,0,...,0,0.870388,0,0,0,0,0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261829,0,0,0,0.157483,0,-0.319801,0,0,0,0,...,1.0,-0.055989,0,0.188982,0,1.0,0,1.0,0,0
265115,0,0,-1.0,0.618025,0,0,0,0,0,0,...,1.0,1.0,0,0,0.5,0,1.0,0,0,0
270713,0,0,-0.654654,-0.339855,1.0,-0.080064,0,1.0,0,0,...,0,0,0,-1.0,0,1.0,0,1.0,-1.0,0
271448,0,0,0,0.240723,0,0.054866,1.0,0.0,0,0,...,0,0,0,0,1.0,0,0,-1.0,1.0,0


In [8]:
# Persist the user-user similarity matrix, row labels included
similarity_score.to_csv(path_or_buf='PCC.csv', index=True)

In [9]:
# Per-user mean over the rated books, kept as a one-column DataFrame
users_average_rating = user_item_matrix.mean(axis=1).to_frame(name='average_rating')
users_average_rating

Unnamed: 0_level_0,average_rating
userid,Unnamed: 1_level_1
6251,8.545455
6575,6.222222
7346,8.230769
11676,7.628571
13552,8.700000
...,...
261829,7.625000
265115,8.181818
270713,8.142857
271448,7.857143


In [10]:
# For every user, remember (and print) the first five books without a rating
unread_books = {}
for user, row in user_item_matrix.iterrows():
    unread_books[user] = row[row.isna()].index[:5]
    print(f"User {user} - First 5 unread books: {list(unread_books[user])}")

User 6251 - First 5 unread books: ['014028009X', '014029628X', '034538475X', '043935806X', '044021145X']
User 6575 - First 5 unread books: ['014028009X', '014029628X', '034538475X', '043935806X', '044021145X']
User 7346 - First 5 unread books: ['014028009X', '034538475X', '043935806X', '044021145X', '044022165X']
User 11676 - First 5 unread books: ['043935806X', '044022165X', '068484267X', '1400034779', '142001740']
User 13552 - First 5 unread books: ['014028009X', '014029628X', '034538475X', '043935806X', '044021145X']
User 16795 - First 5 unread books: ['014028009X', '014029628X', '034538475X', '043935806X', '044021145X']
User 17950 - First 5 unread books: ['014028009X', '014029628X', '034538475X', '043935806X', '044021145X']
User 21014 - First 5 unread books: ['014028009X', '034538475X', '043935806X', '044021145X', '044022165X']
User 23872 - First 5 unread books: ['014028009X', '014029628X', '043935806X', '044021145X', '044022165X']
User 23902 - First 5 unread books: ['014029628X', 

In [11]:
# Predict a rating for every unread book of every user with User-User Nearest
# Neighbors (UNN) recommendation, k=5: a similarity-weighted average of the
# ratings given by the k most similar users.
predicted_ratings = {}
k = 5
for user in user_item_matrix.index:
    # k most similar users (excluding self); the similarity table has object
    # dtype, so cast to float before sorting
    sim_scores = similarity_score.loc[user].drop(user).astype(float)
    top_k_users = sim_scores.sort_values(ascending=False).head(k).index

    # Per-user values that do not depend on the book being predicted
    neighbor_sims = sim_scores[top_k_users]
    # A weighted average of raw ratings is only meaningful with positive
    # weights: a negatively correlated neighbour would otherwise subtract its
    # rating and shrink (or flip the sign of) the normaliser.
    positive_sim = neighbor_sims > 0
    user_ratings = user_item_matrix.loc[user]
    fallback = users_average_rating.loc[user, 'average_rating']

    predicted_ratings[user] = {}
    # For each unread book (NaN rating)
    for isbn in user_ratings[user_ratings.isna()].index:
        # Ratings for this book from the top k similar users
        neighbor_ratings = user_item_matrix.loc[top_k_users, isbn]
        # Only neighbours who rated the book and are positively correlated
        usable = neighbor_ratings.notna() & positive_sim
        sims_sum = neighbor_sims[usable].sum()
        if not usable.any() or sims_sum == 0:
            # No usable neighbour: fall back to the user's own average rating
            pred = fallback
        else:
            # Weighted average by similarity
            pred = (neighbor_ratings[usable] * neighbor_sims[usable]).sum() / sims_sum
        predicted_ratings[user][isbn] = pred

# Example: print predictions for first user
first_user = user_item_matrix.index[0]
print(f"Predicted ratings for user {first_user}:")
for isbn, rating in list(predicted_ratings[first_user].items())[:5]:
    print(f"ISBN: {isbn}, Predicted Rating: {rating}")

Predicted ratings for user 6251:
ISBN: 014028009X, Predicted Rating: 8.545454545454545
ISBN: 014029628X, Predicted Rating: 8.545454545454545
ISBN: 034538475X, Predicted Rating: 7.0
ISBN: 043935806X, Predicted Rating: 8.666666666666666
ISBN: 044021145X, Predicted Rating: 8.0


In [12]:
# Recompute the k nearest neighbours per user. The names are overwritten on
# every pass, so the value displayed below belongs to the last user.
k = 5
for user in user_item_matrix.index:
    # Similarities to everyone but the user itself, as floats
    sim_scores = similarity_score.loc[user].drop(user).astype(float)
    ranked = sim_scores.sort_values(ascending=False)
    top_k_users = ranked.head(k).index

top_k_users


Index([245410, 28634, 105517, 158433, 164323], dtype='int64')

In [13]:
# Keep only the first five predicted books of each user and write them to CSV
pred_list = [
    {'userid': user, 'isbn': isbn, 'predicted_rating': rating}
    for user, books in predicted_ratings.items()
    for isbn, rating in list(books.items())[:5]
]
pred_df = pd.DataFrame(pred_list)
pred_df.to_csv('P2Part1_2Recommendation_Group5.csv', index=False)
print('Predicted ratings (first 5 items per user) saved to P2Part1_2Recommendation_Group5.csv')

Predicted ratings (first 5 items per user) saved to P2Part1_2Recommendation_Group5.csv


Part II: Item-Based Collaborative Filtering using Cosine Similarity for Top-10 Book Recommendation

In [14]:
# Peek at the first rows of the raw training ratings
train.head(n=5)

Unnamed: 0,userid,isbn,rating
0,6251,60392452,10
1,6251,61009059,7
2,6251,140067477,10
3,6251,375727345,6
4,6251,380789035,7


In [15]:
# Treat user ids and ISBNs as text labels in both data sets
for frame in (train, test):
    for id_column in ("userid", "isbn"):
        frame[id_column] = frame[id_column].astype(str)


In [16]:
# User x ISBN rating table built from the string-typed ids
ratings_matrix = pd.pivot_table(train, values="rating", index="userid", columns="isbn")
ratings_matrix.head()

isbn,014028009X,014029628X,034538475X,043935806X,044021145X,044022165X,044023722X,044651652X,059035342X,067976402X,...,671003755,671027360,671041789,679781587,743418174,786868716,804106304,805063897,842329129,971880107
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101851,,,,10.0,,,,,,,...,,,,,,,,,,
102647,,,,,,,,,,,...,,,,,,,,,,
104636,,,,,10.0,8.0,,,10.0,,...,8.0,,,,,,,,,
105028,,,,,,,,,,,...,,,,,,,,,,
105517,,,,4.0,,,,,,,...,,,,,,,,,,


In [17]:
# Baselines used by the predictor: overall, per-item and per-user mean rating
global_mean = train["rating"].mean()
item_mean = ratings_matrix.mean(axis=0)
user_mean = ratings_matrix.mean(axis=1)

# Profile table: each user's mean next to the (constant) global mean
user_profile = user_mean.to_frame(name='user_mean')
user_profile['global_mean'] = global_mean
user_profile.head()


Unnamed: 0_level_0,user_mean,global_mean
userid,Unnamed: 1_level_1,Unnamed: 2_level_1
101851,9.0,8.046491
102647,7.727273,8.046491
104636,8.625,8.046491
105028,7.625,8.046491
105517,6.428571,8.046491


In [18]:
# Persist the user profile table, user ids included as the first column
user_profile.to_csv(path_or_buf='P2Part2_1Profile_Group5.csv', index=True)

In [19]:
# Centered ratings for adjusted cosine similarity: subtract each user's mean
# rating from that user's row
ratings_centered = ratings_matrix.sub(user_mean, axis=0)

items = ratings_centered.columns
sim_matrix = pd.DataFrame(index=items, columns=items, dtype=float)

# The similarity is symmetric, so compute each unordered pair once and mirror
# it instead of evaluating (i, j) and (j, i) separately.
for pos, i in enumerate(items):
    sim_matrix.at[i, i] = 1.0
    for j in items[pos + 1:]:
        # Users who rated both items
        common = ratings_centered[[i, j]].dropna()
        if common.empty:
            sim = 0.0
        else:
            num = (common[i] * common[j]).sum()
            den = np.sqrt((common[i] ** 2).sum()) * np.sqrt((common[j] ** 2).sum())
            # A zero norm (all centered ratings are 0) would divide by zero
            sim = num / den if den != 0 else 0.0
        sim_matrix.at[i, j] = sim
        sim_matrix.at[j, i] = sim

sim_matrix.head()

isbn,014028009X,014029628X,034538475X,043935806X,044021145X,044022165X,044023722X,044651652X,059035342X,067976402X,...,671003755,671027360,671041789,679781587,743418174,786868716,804106304,805063897,842329129,971880107
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
014028009X,1.0,0.090457,-1.0,-0.955511,-0.321353,0.0,-0.188963,-0.558852,0.193226,0.721767,...,-1.0,0.541769,-0.261459,0.725888,-0.846506,0.831413,0.0,0.0,1.0,0.580312
014029628X,0.090457,1.0,-0.263681,0.0,1.0,0.0,0.116626,-1.0,-1.0,-1.0,...,1.0,1.0,1.0,-0.748832,-0.650103,-0.785472,-0.015621,-0.139962,-1.0,0.098421
034538475X,-1.0,-0.263681,1.0,1.0,0.382436,-1.0,-1.0,-1.0,0.21779,-0.316947,...,1.0,-0.833406,-0.300864,0.0,-1.0,-1.0,0.0,0.0,-1.0,1.0
043935806X,-0.955511,0.0,1.0,1.0,0.612487,1.0,0.906183,0.0,-0.52498,0.921894,...,-1.0,1.0,-0.506291,-1.0,0.0,0.0,-1.0,0.0,0.0,-0.838725
044021145X,-0.321353,1.0,0.382436,0.612487,1.0,-0.515565,-0.031152,-0.709758,-0.434953,-0.496349,...,0.91557,0.707137,0.660549,-0.11139,-0.064257,-0.708507,-1.0,0.0,-0.902048,0.431095


In [20]:
# Persist the item-item similarity matrix, ISBN labels included
sim_matrix.to_csv(path_or_buf='Model.csv', index=True)

In [21]:
def predict_rating(user, item, k=10):
    """Item-based prediction of the rating ``user`` would give ``item``.

    Reads the notebook globals ``ratings_matrix``, ``sim_matrix``,
    ``user_mean``, ``item_mean`` and ``global_mean``.

    Parameters
    ----------
    user :
        Row label in ``ratings_matrix``.
    item :
        Column label in ``sim_matrix``.
    k : int, default 10
        Number of most similar rated items used as neighbours.

    Returns
    -------
    number
        ``global_mean`` for an unknown user; ``item_mean.get(item,
        global_mean)`` for an item missing from ``sim_matrix``; the user's
        mean when no usable neighbour exists; otherwise the user's mean plus
        the similarity-weighted average deviation of the neighbours'
        ratings, limited to the range 1..10.
    """
    # Unknown user
    if user not in ratings_matrix.index:
        return global_mean

    known_items = sim_matrix.columns

    # Unknown item
    if item not in known_items:
        return item_mean.get(item, global_mean)

    baseline = user_mean[user]
    rated = ratings_matrix.loc[user].dropna()

    # Only rated items that also appear in the similarity model can vote
    candidates = [isbn for isbn in rated.index if isbn in known_items]
    if len(candidates) == 0:
        return baseline

    # The k rated items most similar to the target item
    neighbours = sim_matrix[item].loc[candidates].sort_values(ascending=False).head(k)

    # Deviation of each neighbour rating from the user's mean, weighted by similarity
    deviations = ratings_matrix.loc[user, neighbours.index] - baseline
    weighted_sum = (deviations.values * neighbours.values).sum()
    norm = neighbours.abs().sum()

    pred = baseline if norm == 0 else baseline + (weighted_sum / norm)

    # Clip prediction to rating scale
    upper_capped = min(10, pred)
    return max(1, upper_capped)


# Smoke test: predict the first ISBN for the user labels "1" and "2"
first_isbn = ratings_matrix.columns[0]
print("\n--- Example Predictions ---")
print("User 1, Item first_isbn =>", predict_rating("1", first_isbn))
print("User 2, Item first_isbn =>", predict_rating("2", first_isbn))




--- Example Predictions ---
User 1, Item first_isbn => 8.046491228070176
User 2, Item first_isbn => 8.046491228070176


In [22]:
# Top-10 recommendations for every user that appears in the test set
recs = []

# Labels of an integer-typed pandas index are numpy integers, which are not
# instances of Python int, so both kinds must be accepted here.
user_labels_are_int = isinstance(ratings_matrix.index[0], (int, np.integer))
item_labels_are_int = isinstance(sim_matrix.columns[0], (int, np.integer))

for user in test["userid"].unique():
    # Match the type of the user label to the ratings_matrix index
    user_key = int(user) if user_labels_are_int else str(user)

    # Items not yet rated (every item for a user unseen in training)
    if user_key in ratings_matrix.index:
        user_ratings = ratings_matrix.loc[user_key]
        not_rated = user_ratings[user_ratings.isna()].index
    else:
        not_rated = ratings_matrix.columns

    # Keep only items in sim_matrix
    if item_labels_are_int:
        not_rated = [item for item in not_rated if item in sim_matrix.columns]
    else:
        not_rated = [str(item) for item in not_rated if str(item) in sim_matrix.columns]

    preds = []
    for item in not_rated:
        try:
            pr = predict_rating(user_key, item)
        except KeyError:
            # Best effort: skip an item whose lookup fails instead of aborting
            continue
        preds.append((item, pr))

    # sorted() is stable, so equally scored items keep their column order
    top10 = sorted(preds, key=lambda x: x[1], reverse=True)[:10]
    for rank, (item, pr) in enumerate(top10, 1):
        recs.append([user_key, item, pr, rank])

# Create DataFrame of recommendations
df_recs = pd.DataFrame(recs, columns=["userid", "isbn", "predicted_rating", "rank"])
print(df_recs.head(10))


  userid        isbn  predicted_rating  rank
0   6251  1400034779         10.000000     1
1   6251   345339681         10.000000     2
2   6251   345350499         10.000000     3
3   6251   440225701         10.000000     4
4   6251   446310786         10.000000     5
5   6251    60938455         10.000000     6
6   6251   743418174         10.000000     7
7   6251   312980140          9.806936     8
8   6251  043935806X          9.788090     9
9   6251   312976275          9.759253    10


In [23]:
# Persist the ranked recommendations without the positional index
df_recs.to_csv(path_or_buf='Recommendation.csv', index=False)

In [24]:
# Predict every (user, book) pair of the test set; predict_rating handles unknowns
preds = [
    predict_rating(test_user, test_item)
    for test_user, test_item in zip(test["userid"], test["isbn"])
]

# Root mean squared error between the true and the predicted ratings
errors = test["rating"] - preds
rmse = np.sqrt((errors ** 2).mean())
print("RMSE =", rmse)
rmse_df = pd.DataFrame([{'RMSE': round(rmse, 3)}])

RMSE = 1.623682631862843


In [25]:
# Persist the rounded RMSE without the positional index
rmse_df.to_csv(path_or_buf='RMSE.csv', index=False)