In [11]:
import pandas as pd
# Read the semicolon-separated ratings file; the bare name on the last line
# makes the notebook render the frame as the cell output.
df = pd.read_csv("Ratings.csv", sep=';')
df

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [25]:
import pandas as pd
# Read the semicolon-separated books file; the bare name on the last line
# makes the notebook render the frame as the cell output.
# Note: this rebinds `df`, which the earlier cell used for the ratings table.
df = pd.read_csv("Books.csv", sep=';')
df

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company
...,...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [17]:
from scipy.sparse import csr_matrix
import pandas as pd

ratings = pd.read_csv("Ratings.csv", delimiter=';')
# Keep only rows with a positive rating (0s are filtered out).
ratings = ratings[ratings['Rating'] > 0]

# Optional: drop infrequent users/books.
# Both counts are taken on the same positive-rating frame, before either
# threshold is applied.
user_counts = ratings['User-ID'].value_counts()
book_counts = ratings['ISBN'].value_counts()

# Apply both thresholds in one mask (same result as filtering one after the
# other, because the counts above are fixed). `.copy()` makes the result an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
ratings = ratings[
    ratings['User-ID'].isin(user_counts[user_counts >= 5].index)
    & ratings['ISBN'].isin(book_counts[book_counts >= 10].index)
].copy()

# Map IDs to contiguous integer indices, in order of first appearance.
user_map = {user_id: i for i, user_id in enumerate(ratings['User-ID'].unique())}
book_map = {isbn: i for i, isbn in enumerate(ratings['ISBN'].unique())}

ratings['user_index'] = ratings['User-ID'].map(user_map)
ratings['book_index'] = ratings['ISBN'].map(book_map)

# Create sparse matrix: rows are users, columns are books, values are ratings.
# The shape is passed explicitly so construction also works when the filters
# above leave no rows (shape inference fails on empty index arrays).
user_book_matrix = csr_matrix(
    (ratings['Rating'], (ratings['user_index'], ratings['book_index'])),
    shape=(len(user_map), len(book_map)),
)


In [19]:
from sklearn.datasets import dump_svmlight_file

# Write the user-by-book matrix built in the previous cell to disk in
# svmlight/libsvm text format.
X = user_book_matrix

# dump_svmlight_file is given one label per row; there are no real labels
# here, so a list of zeros is used as a placeholder (not used).
y = [0 for _ in range(X.shape[0])]

dump_svmlight_file(X, y, 'user_book_matrix.libsvm', zero_based=True)

In [27]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def load_sparse_matrix(ratings_path, books_path):
    """Load ratings and book metadata and build a sparse user-by-book matrix.

    Parameters
    ----------
    ratings_path : path to a ';'-separated CSV with columns
        'User-ID', 'ISBN' and 'Rating'.
    books_path : path to a ';'-separated CSV with columns 'ISBN' and 'Title'.

    Returns
    -------
    matrix : scipy.sparse.csr_matrix of shape (num_users, num_books) holding
        the positive ratings.
    rev_user_id_map : dict mapping row index -> original 'User-ID'.
    rev_book_id_map : dict mapping column index -> ISBN (str).
    isbn_to_title : dict mapping ISBN (str) -> title.
    """
    # ISBNs are identifiers, not numbers. Reading them as str keeps leading
    # zeros and gives both files the same key type regardless of what dtype
    # inference would have picked for each one.
    ratings = pd.read_csv(ratings_path, delimiter=';', dtype={'ISBN': str})
    books = pd.read_csv(books_path, delimiter=';', dtype={'ISBN': str})

    ratings = (
        ratings[ratings['Rating'] > 0]
        # csr_matrix sums entries that share the same (row, col), so a repeated
        # (user, book) pair would produce an out-of-scale rating. Keep the
        # last occurrence in file order instead.
        .drop_duplicates(subset=['User-ID', 'ISBN'], keep='last')
        # Independent frame: the column assignments below must not target a
        # slice of the original (avoids SettingWithCopyWarning).
        .copy()
    )

    # Contiguous indices in order of first appearance, plus the reverse lookups.
    user_id_map = {uid: idx for idx, uid in enumerate(ratings['User-ID'].unique())}
    book_id_map = {isbn: idx for idx, isbn in enumerate(ratings['ISBN'].unique())}
    rev_user_id_map = {v: k for k, v in user_id_map.items()}
    rev_book_id_map = {v: k for k, v in book_id_map.items()}

    isbn_to_title = dict(zip(books['ISBN'], books['Title']))

    ratings['user_idx'] = ratings['User-ID'].map(user_id_map)
    ratings['book_idx'] = ratings['ISBN'].map(book_id_map)

    num_users = len(user_id_map)
    num_books = len(book_id_map)

    matrix = csr_matrix(
        (ratings['Rating'], (ratings['user_idx'], ratings['book_idx'])),
        shape=(num_users, num_books),
    )

    return matrix, rev_user_id_map, rev_book_id_map, isbn_to_title

def recommend_sparse(user_idx, matrix, sim_matrix, rev_user_map, rev_book_map, isbn_to_title, k=10, top_n=5):
    """Recommend unread books for one user from the ratings of similar users.

    Parameters
    ----------
    user_idx : row index of the target user in `matrix` / `sim_matrix`.
    matrix : scipy sparse user-by-book rating matrix (positive entries = rated).
    sim_matrix : scipy sparse user-by-user similarity matrix.
    rev_user_map : dict mapping row index -> original user id.
    rev_book_map : dict mapping column index -> book id (ISBN).
    isbn_to_title : dict mapping book id -> title; missing ids are reported
        as "Unknown Title".
    k : maximum number of neighbours to use.
    top_n : maximum number of recommendations to return.

    Returns
    -------
    list of dict with keys 'User_ID', 'Book_ID', 'Book_Title' and
    'Recommendation_Score' (similarity-weighted mean of the neighbours'
    ratings), sorted by descending score; ties are broken by ascending book
    index. The list is empty when no neighbour has positive similarity or no
    neighbour rated a book the user has not rated.
    """
    similarities = sim_matrix[user_idx].toarray().ravel()

    # Exclude the target user by index rather than assuming its own similarity
    # sorts last: a tie or float rounding would otherwise keep "self" as a
    # neighbour and drop the true nearest one. Neighbours with non-positive
    # similarity are skipped because they add nothing to a positive
    # weighted mean.
    neighbour_pool = np.flatnonzero(similarities > 0)
    neighbour_pool = neighbour_pool[neighbour_pool != user_idx]
    # Stable sort on negated similarity: descending similarity, ties by index.
    by_similarity = np.argsort(-similarities[neighbour_pool], kind="stable")
    similar_users = neighbour_pool[by_similarity[:k]]
    if similar_users.size == 0:
        return []

    # Per book: numer = sum(sim * rating), denom = sum(sim) over the neighbours
    # that rated it. Two sparse mat-vec products replace per-element lookups.
    weights = similarities[similar_users]
    neighbour_ratings = matrix[similar_users]
    rated = (neighbour_ratings > 0).astype(weights.dtype)
    numer = np.asarray(neighbour_ratings.T @ weights).ravel()
    denom = np.asarray(rated.T @ weights).ravel()

    already_read = set(matrix[user_idx].nonzero()[1].tolist())
    estimated_ratings = {
        book_idx: numer[book_idx] / denom[book_idx]
        for book_idx in np.flatnonzero(denom > 0).tolist()
        if book_idx not in already_read
    }

    # Highest score first; the book index as secondary key makes ties deterministic.
    top_books = sorted(estimated_ratings.items(), key=lambda item: (-item[1], item[0]))[:top_n]

    return [{
        'User_ID': rev_user_map[user_idx],
        'Book_ID': rev_book_map[book_idx],
        'Book_Title': isbn_to_title.get(rev_book_map[book_idx], "Unknown Title"),
        'Recommendation_Score': float(score)
    } for book_idx, score in top_books]

def generate_recommendations_sparse(ratings_path, books_path, output_path):
    """Compute recommendations for every user and write them to a CSV file.

    Loads the rating matrix and lookup tables with `load_sparse_matrix`,
    computes a sparse user-user cosine-similarity matrix, calls
    `recommend_sparse` once per matrix row, and saves all returned records
    to `output_path` without the DataFrame index.
    """
    matrix, rev_user_map, rev_book_map, isbn_to_title = load_sparse_matrix(ratings_path, books_path)
    # dense_output=False keeps the user-by-user similarity matrix sparse.
    sim_matrix = cosine_similarity(matrix, dense_output=False)

    n_users = matrix.shape[0]
    # One flat list of recommendation records, in user-row order.
    all_results = [
        rec
        for row in tqdm(range(n_users), desc="Generating recommendations")
        for rec in recommend_sparse(row, matrix, sim_matrix, rev_user_map, rev_book_map, isbn_to_title)
    ]

    pd.DataFrame(all_results).to_csv(output_path, index=False)
    print(f"Done. Recommendations saved to {output_path}")

# Entry point: build recommendations for every user from the two CSV files
# and write them to a single output CSV.
if __name__ == "__main__":
    generate_recommendations_sparse(
        ratings_path="Ratings.csv",
        books_path="Books.csv",
        output_path="SparseUserBookRecommendations.csv",
    )


Generating recommendations: 100%|█████████| 77805/77805 [38:03<00:00, 34.07it/s]


Done. Recommendations saved to SparseUserBookRecommendations.csv
