In [36]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')


In [37]:
print("Loading data...")
book = pd.read_csv('/content/Books.csv')
user = pd.read_csv('/content/Users.csv')
rating = pd.read_csv('/content/Ratings.csv')
print(f"Books: {book.shape}, Users: {user.shape}, Ratings: {rating.shape}")

Loading data...
Books: (271360, 8), Users: (278858, 3), Ratings: (1149780, 3)


In [38]:
print("\n" + "="*60)
print("BUILDING POPULARITY-BASED RECOMMENDATION")
print("="*60)


BUILDING POPULARITY-BASED RECOMMENDATION


In [39]:
rating_with_name = rating.merge(book, on='ISBN')

num_rating_df = rating_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()
num_rating_df.rename(columns={'Book-Rating': 'Num_rating'}, inplace=True)

avg_rating_df = rating_with_name.groupby('Book-Title').mean(numeric_only=True)['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating': 'Avg_rating'}, inplace=True)

popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')

pbr_df = popular_df[popular_df['Num_rating'] >= 300].sort_values('Avg_rating', ascending=False).head(100)
pbr_df = pbr_df.merge(book, on='Book-Title').drop_duplicates('Book-Title')[
    ['Book-Title', 'Book-Author', 'Publisher', 'Image-URL-M', 'Num_rating', 'Avg_rating']]

print(f"Top 100 popular books created: {pbr_df.shape}")

Top 100 popular books created: (100, 6)


In [40]:
print("\n" + "="*60)
print("PREPARING DATA FOR COLLABORATIVE FILTERING")
print("="*60)


PREPARING DATA FOR COLLABORATIVE FILTERING


In [41]:
b = rating_with_name.groupby('User-ID').count()['Book-Rating'] > 250
users_with_ratings = b[b].index
print(f"Users with 250+ ratings: {len(users_with_ratings)}")

Users with 250+ ratings: 613


In [42]:
filtered_rating = rating_with_name[rating_with_name['User-ID'].isin(users_with_ratings)]

c = filtered_rating.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = c[c].index
print(f"Books with 50+ ratings: {len(famous_books)}")

Books with 50+ ratings: 574


In [43]:
final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]
print(f"Final ratings shape: {final_ratings.shape}")

Final ratings shape: (45093, 10)


In [44]:
pt = final_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
pt.fillna(0, inplace=True)
print(f"Pivot table shape: {pt.shape}")

Pivot table shape: (574, 613)


In [45]:
print("\n" + "="*60)
print("CREATING TRAIN-TEST SPLIT")
print("="*60)


CREATING TRAIN-TEST SPLIT


In [46]:
train_data, test_data = train_test_split(final_ratings, test_size=0.2, random_state=42)
print(f"Train size: {train_data.shape}, Test size: {test_data.shape}")

Train size: (36074, 10), Test size: (9019, 10)


In [47]:
train_pt = train_data.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
train_pt.fillna(0, inplace=True)

In [48]:
print("\n" + "="*60)
print("MODEL 1: COSINE SIMILARITY")
print("="*60)


MODEL 1: COSINE SIMILARITY


In [49]:
cosine_sim = cosine_similarity(train_pt)
print(f"Cosine similarity matrix shape: {cosine_sim.shape}")

Cosine similarity matrix shape: (574, 574)


In [50]:
print("\n" + "="*60)
print("MODEL 2: K-NEAREST NEIGHBORS WITH GRIDSEARCH")
print("="*60)


MODEL 2: K-NEAREST NEIGHBORS WITH GRIDSEARCH


In [51]:
train_sparse = csr_matrix(train_pt.values)

In [52]:
param_grid_knn = {
    'n_neighbors': [5, 10, 15, 20, 25],
    'metric': ['cosine', 'euclidean', 'manhattan'],
    'algorithm': ['brute', 'auto']
}

In [53]:
print("Testing KNN with different parameters...")
print(f"Parameter grid: {param_grid_knn}")

Testing KNN with different parameters...
Parameter grid: {'n_neighbors': [5, 10, 15, 20, 25], 'metric': ['cosine', 'euclidean', 'manhattan'], 'algorithm': ['brute', 'auto']}


In [54]:
best_knn_score = float('inf')
best_knn_params = {}
knn_results = []

In [55]:
for n in param_grid_knn['n_neighbors']:
    for metric in param_grid_knn['metric']:
        for algo in param_grid_knn['algorithm']:
            try:
                knn_model = NearestNeighbors(n_neighbors=n, metric=metric, algorithm=algo)
                knn_model.fit(train_sparse)

                distances, indices = knn_model.kneighbors(train_sparse[:10])
                avg_distance = distances.mean()

                knn_results.append({
                    'n_neighbors': n,
                    'metric': metric,
                    'algorithm': algo,
                    'avg_distance': avg_distance
                })

                if avg_distance < best_knn_score:
                    best_knn_score = avg_distance
                    best_knn_params = {'n_neighbors': n, 'metric': metric, 'algorithm': algo}
                    best_knn_model = knn_model

                print(f"n={n}, metric={metric}, algo={algo}: avg_dist={avg_distance:.4f}")
            except:
                continue

print(f"\nBest KNN parameters: {best_knn_params}")
print(f"Best KNN score: {best_knn_score:.4f}")

print("\n" + "="*60)
print("MODEL 3: MATRIX FACTORIZATION (SVD) WITH GRIDSEARCH")
print("="*60)

n=5, metric=cosine, algo=brute: avg_dist=0.5636
n=5, metric=cosine, algo=auto: avg_dist=0.5636
n=5, metric=euclidean, algo=brute: avg_dist=22.9556
n=5, metric=euclidean, algo=auto: avg_dist=22.9556
n=5, metric=manhattan, algo=brute: avg_dist=92.9500
n=5, metric=manhattan, algo=auto: avg_dist=92.9500
n=10, metric=cosine, algo=brute: avg_dist=0.6577
n=10, metric=cosine, algo=auto: avg_dist=0.6577
n=10, metric=euclidean, algo=brute: avg_dist=26.6284
n=10, metric=euclidean, algo=auto: avg_dist=26.6284
n=10, metric=manhattan, algo=brute: avg_dist=109.5700
n=10, metric=manhattan, algo=auto: avg_dist=109.5700
n=15, metric=cosine, algo=brute: avg_dist=0.6963
n=15, metric=cosine, algo=auto: avg_dist=0.6963
n=15, metric=euclidean, algo=brute: avg_dist=28.1259
n=15, metric=euclidean, algo=auto: avg_dist=28.1259
n=15, metric=manhattan, algo=brute: avg_dist=116.9678
n=15, metric=manhattan, algo=auto: avg_dist=116.9678
n=20, metric=cosine, algo=brute: avg_dist=0.7189
n=20, metric=cosine, algo=auto: 

In [56]:

param_grid_svd = {
    'n_components': [10, 20, 30, 50, 100],
    'n_iter': [5, 10, 20],
    'random_state': [42]
}

best_svd_score = float('inf')
best_svd_params = {}
svd_results = []

print("Testing SVD with different parameters...")
print(f"Parameter grid: {param_grid_svd}")

Testing SVD with different parameters...
Parameter grid: {'n_components': [10, 20, 30, 50, 100], 'n_iter': [5, 10, 20], 'random_state': [42]}


In [57]:
for n_comp in param_grid_svd['n_components']:
    for n_it in param_grid_svd['n_iter']:
        try:
            svd = TruncatedSVD(n_components=n_comp, n_iter=n_it, random_state=42)
            svd_matrix = svd.fit_transform(train_pt)

            reconstructed = svd.inverse_transform(svd_matrix)
            mse = mean_squared_error(train_pt.values.flatten(), reconstructed.flatten())

            svd_results.append({
                'n_components': n_comp,
                'n_iter': n_it,
                'mse': mse,
                'explained_variance': svd.explained_variance_ratio_.sum()
            })

            if mse < best_svd_score:
                best_svd_score = mse
                best_svd_params = {'n_components': n_comp, 'n_iter': n_it}
                best_svd_model = svd
                best_svd_matrix = svd_matrix

            print(f"n_comp={n_comp}, n_iter={n_it}: MSE={mse:.4f}, Var={svd.explained_variance_ratio_.sum():.4f}")
        except:
            continue

print(f"\nBest SVD parameters: {best_svd_params}")
print(f"Best SVD score (MSE): {best_svd_score:.4f}")

n_comp=10, n_iter=5: MSE=1.2221, Var=0.1652
n_comp=10, n_iter=10: MSE=1.2218, Var=0.1654
n_comp=10, n_iter=20: MSE=1.2218, Var=0.1654
n_comp=20, n_iter=5: MSE=1.0812, Var=0.2614
n_comp=20, n_iter=10: MSE=1.0799, Var=0.2623
n_comp=20, n_iter=20: MSE=1.0798, Var=0.2624
n_comp=30, n_iter=5: MSE=0.9694, Var=0.3378
n_comp=30, n_iter=10: MSE=0.9673, Var=0.3392
n_comp=30, n_iter=20: MSE=0.9671, Var=0.3393
n_comp=50, n_iter=5: MSE=0.7917, Var=0.4592
n_comp=50, n_iter=10: MSE=0.7889, Var=0.4610
n_comp=50, n_iter=20: MSE=0.7886, Var=0.4612
n_comp=100, n_iter=5: MSE=0.4924, Var=0.6636
n_comp=100, n_iter=10: MSE=0.4896, Var=0.6655
n_comp=100, n_iter=20: MSE=0.4890, Var=0.6659

Best SVD parameters: {'n_components': 100, 'n_iter': 20}
Best SVD score (MSE): 0.4890


In [58]:
print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)

models_summary = {
    'Cosine Similarity': {'type': 'distance-based', 'complexity': 'O(n²)'},
    'K-Nearest Neighbors': {
        'type': 'neighborhood-based',
        'best_params': best_knn_params,
        'avg_distance': best_knn_score
    },
    'SVD (Matrix Factorization)': {
        'type': 'latent-factor',
        'best_params': best_svd_params,
        'mse': best_svd_score
    }
}

for model, info in models_summary.items():
    print(f"\n{model}:")
    for key, val in info.items():
        print(f"  {key}: {val}")


MODEL COMPARISON SUMMARY

Cosine Similarity:
  type: distance-based
  complexity: O(n²)

K-Nearest Neighbors:
  type: neighborhood-based
  best_params: {'n_neighbors': 5, 'metric': 'cosine', 'algorithm': 'brute'}
  avg_distance: 0.5635753327388064

SVD (Matrix Factorization):
  type: latent-factor
  best_params: {'n_components': 100, 'n_iter': 20}
  mse: 0.48902561072265993


In [59]:
def recommend_cosine(book_name, n=8):
    try:
        index = np.where(pt.index == book_name)[0][0]
        similar_items = sorted(list(enumerate(cosine_sim[index])), reverse=True, key=lambda x: x[1])[1:n+1]

        recommendations = []
        for i in similar_items:
            temp_df = book[book['Book-Title'] == pt.index[i[0]]]
            recommendations.append({
                'title': temp_df['Book-Title'].values[0],
                'author': temp_df['Book-Author'].values[0],
                'image': temp_df['Image-URL-M'].values[0],
                'similarity': i[1]
            })
        return recommendations
    except:
        return []

In [60]:
def recommend_knn(book_name, n=8):
    try:
        index = np.where(pt.index == book_name)[0][0]
        distances, indices = best_knn_model.kneighbors(
            csr_matrix(pt.iloc[index].values.reshape(1, -1)),
            n_neighbors=n+1
        )

        recommendations = []
        for i in range(1, len(indices[0])):
            idx = indices[0][i]
            temp_df = book[book['Book-Title'] == pt.index[idx]]
            recommendations.append({
                'title': temp_df['Book-Title'].values[0],
                'author': temp_df['Book-Author'].values[0],
                'image': temp_df['Image-URL-M'].values[0],
                'distance': distances[0][i]
            })
        return recommendations
    except:
        return []

In [61]:
def recommend_svd(book_name, n=8):
    try:
        index = np.where(pt.index == book_name)[0][0]
        book_vector = best_svd_matrix[index].reshape(1, -1)

        similarities = cosine_similarity(book_vector, best_svd_matrix)[0]
        similar_indices = similarities.argsort()[::-1][1:n+1]

        recommendations = []
        for idx in similar_indices:
            temp_df = book[book['Book-Title'] == pt.index[idx]]
            recommendations.append({
                'title': temp_df['Book-Title'].values[0],
                'author': temp_df['Book-Author'].values[0],
                'image': temp_df['Image-URL-M'].values[0],
                'similarity': similarities[idx]
            })
        return recommendations
    except:
        return []

print("\n" + "="*60)
print("TESTING RECOMMENDATIONS")
print("="*60)


TESTING RECOMMENDATIONS


In [62]:
test_book = "1984"
print(f"\nRecommendations for: {test_book}")

print("\n--- Cosine Similarity ---")
recs_cos = recommend_cosine(test_book, n=5)
for i, rec in enumerate(recs_cos, 1):
    print(f"{i}. {rec['title']} by {rec['author']} (sim: {rec['similarity']:.4f})")

print("\n--- K-Nearest Neighbors ---")


Recommendations for: 1984

--- Cosine Similarity ---
1. The Bonesetter's Daughter by Amy Tan (sim: 0.3109)
2. The Handmaid's Tale by Margaret Atwood (sim: 0.2684)
3. The Vampire Lestat (Vampire Chronicles, Book II) by ANNE RICE (sim: 0.2675)
4. The Hitchhiker's Guide to the Galaxy by Douglas Adams (sim: 0.2605)
5. The Catcher in the Rye by J.D. Salinger (sim: 0.2470)

--- K-Nearest Neighbors ---


In [63]:
recs_knn = recommend_knn(test_book, n=5)
for i, rec in enumerate(recs_knn, 1):
    print(f"{i}. {rec['title']} by {rec['author']} (dist: {rec['distance']:.4f})")

print("\n--- SVD (Matrix Factorization) ---")
recs_svd = recommend_svd(test_book, n=5)
for i, rec in enumerate(recs_svd, 1):
    print(f"{i}. {rec['title']} by {rec['author']} (sim: {rec['similarity']:.4f})")

print("\n" + "="*60)
print("SAVING MODELS AND DATA")
print("="*60)



--- SVD (Matrix Factorization) ---
1. The Bonesetter's Daughter by Amy Tan (sim: 0.4364)
2. Don't Sweat the Small Stuff and It's All Small Stuff : Simple Ways to Keep the Little Things from Taking Over Your Life (Don't Sweat the Small Stuff Series) by Richard Carlson (sim: 0.4284)
3. The Catcher in the Rye by J.D. Salinger (sim: 0.4189)
4. The Hitchhiker's Guide to the Galaxy by Douglas Adams (sim: 0.4104)
5. Fall On Your Knees (Oprah #45) by Ann-Marie MacDonald (sim: 0.4101)

SAVING MODELS AND DATA


In [64]:
pickle.dump(pbr_df, open('PopularBookRecommendation.pkl', 'wb'))
pickle.dump(pt, open('pt.pkl', 'wb'))
pickle.dump(book, open('book.pkl', 'wb'))
pickle.dump(cosine_sim, open('cosine_similarity.pkl', 'wb'))
pickle.dump(best_knn_model, open('knn_model.pkl', 'wb'))
pickle.dump(best_svd_model, open('svd_model.pkl', 'wb'))
pickle.dump(best_svd_matrix, open('svd_matrix.pkl', 'wb'))

print("Models saved successfully!")
print("\nFiles created:")
print("- PopularBookRecommendation.pkl")
print("- pt.pkl")
print("- book.pkl")
print("- cosine_similarity.pkl")
print("- knn_model.pkl")
print("- svd_model.pkl")
print("- svd_matrix.pkl")

print("\n" + "="*60)
print("RECOMMENDATION SYSTEM COMPLETE!")
print("="*60)

Models saved successfully!

Files created:
- PopularBookRecommendation.pkl
- pt.pkl
- book.pkl
- cosine_similarity.pkl
- knn_model.pkl
- svd_model.pkl
- svd_matrix.pkl

RECOMMENDATION SYSTEM COMPLETE!
