In [1]:
import pandas as pd

# Column types for the raw ratings file. ISBN is read as text so values such as
# '034545104X' and '0155061224' keep their exact spelling.
ratings_dtypes = {'User-ID': int, 'ISBN': str, 'Rating': int}

ratings_df = pd.read_csv('Ratings.csv', sep=';', dtype=ratings_dtypes)
print(ratings_df)

         User-ID         ISBN  Rating
0         276725   034545104X       0
1         276726   0155061224       5
2         276727   0446520802       0
3         276729   052165615X       3
4         276729   0521795028       6
...          ...          ...     ...
1149775   276704   1563526298       9
1149776   276706   0679447156       0
1149777   276709   0515107662      10
1149778   276721   0590442449      10
1149779   276723  05162443314       8

[1149780 rows x 3 columns]


In [2]:
from scipy.sparse import coo_matrix

# Give every distinct user and every distinct ISBN a dense 0-based index,
# numbered in order of first appearance in ratings_df.
unique_users = ratings_df['User-ID'].unique()
unique_isbns = ratings_df['ISBN'].unique()
user_mapping = dict(zip(unique_users, range(len(unique_users))))
book_mapping = dict(zip(unique_isbns, range(len(unique_isbns))))

# Store the dense indices next to the raw identifiers.
ratings_df['User-ID-Mapped'] = ratings_df['User-ID'].map(user_mapping)
ratings_df['ISBN-Mapped'] = ratings_df['ISBN'].map(book_mapping)

# Rows are mapped users, columns are mapped books, values are the ratings.
user_ids = ratings_df['User-ID-Mapped']
book_ids = ratings_df['ISBN-Mapped']
ratings = ratings_df['Rating']
sparse_matrix = coo_matrix((ratings, (user_ids, book_ids)))


In [3]:
import pandas as pd

# Turn each mapping into a two-column table: original identifier -> dense index.
user_mapping_df = pd.DataFrame({
    'User-ID': list(user_mapping.keys()),
    'Mapped-ID': list(user_mapping.values()),
})
book_mapping_df = pd.DataFrame({
    'ISBN': list(book_mapping.keys()),
    'Mapped-ID': list(book_mapping.values()),
})

# Write both tables without the pandas index so each file holds only the two columns.
for mapping_df, mapping_path in (
    (user_mapping_df, 'user_mapping.csv'),
    (book_mapping_df, 'book_mapping.csv'),
):
    mapping_df.to_csv(mapping_path, index=False)

print("User and book mappings have been saved with the specified titles to 'user_mapping.csv' and 'book_mapping.csv'.")

User and book mappings have been saved with the specified titles to 'user_mapping.csv' and 'book_mapping.csv'.


In [4]:
from sklearn.datasets import dump_svmlight_file

# dump_svmlight_file requires one label per row; these zeros are placeholders
# that are removed again when the file is rewritten below.
dummy_labels = [0 for _ in range(sparse_matrix.shape[0])]

temp_file_path = 'temp_user_book_matrix.libsvm'
output_file_path = 'user_book_matrix.libsvm'

dump_svmlight_file(sparse_matrix, dummy_labels, temp_file_path, zero_based=True)

# Copy the dump line by line, dropping the leading label token and skipping
# any row that has no feature tokens left.
with open(temp_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        tokens = line.split()
        features_only = ' '.join(tokens[1:])
        if not features_only.strip():
            continue
        outfile.write(features_only + '\n')

print(f"Sparse matrix saved without labels in LibSVM format: {output_file_path}")

Sparse matrix saved without labels in LibSVM format: user_book_matrix.libsvm


In [5]:
import numpy as np
import pandas as pd
import os
from sklearn.datasets import load_svmlight_file

def calculate_sparse_cosine_similarity(team_32_vector, team_32_matrix):
    """
    Score every row of a sparse matrix against one sparse row vector using cosine similarity.

    Parameters:
        team_32_vector (csr_matrix): The sparse row vector used as the query.
        team_32_matrix (csr_matrix): Sparse matrix whose rows are compared with the query.

    Returns:
        numpy.ndarray: 1-D array with one cosine similarity score per matrix row.
    """
    # Numerator: dot product of each matrix row with the query vector.
    dot_products = team_32_matrix.dot(team_32_vector.T).toarray().ravel()

    # Denominator: product of the query's Euclidean norm and each row's Euclidean norm.
    query_norm = np.sqrt(team_32_vector.multiply(team_32_vector).sum())
    squared_row_sums = team_32_matrix.power(2).sum(axis=1)
    row_norms = np.sqrt(squared_row_sums).A1

    # The small constant keeps the division defined when a norm is zero.
    denominators = query_norm * row_norms + 1e-10
    return dot_products / denominators

def recommend_global_top_books(team_32_sparse_matrix, team_32_count=10):
    """
    Recommend globally popular books based on average ratings across all users.

    The score of a book is the mean of its column over every row of the matrix,
    so users with no stored rating for the book contribute 0 to that mean.

    Parameters:
        team_32_sparse_matrix (csr_matrix): Sparse matrix of user-item ratings.
        team_32_count (int): Number of top books to return. A value of 0 or less
            yields an empty list.

    Returns:
        list: Top books as tuples [(book_id, average_rating), ...], highest score first.
    """
    # A slice such as arr[-0:] selects the whole array, so a non-positive count
    # has to be handled explicitly instead of falling through to the slice below.
    if team_32_count <= 0:
        return []

    team_32_avg_scores = team_32_sparse_matrix.mean(axis=0).A1
    # argsort is ascending: keep the last `count` indices and reverse them.
    team_32_top_indices = np.argsort(team_32_avg_scores)[-team_32_count:][::-1]
    team_32_global_books = [(idx, team_32_avg_scores[idx]) for idx in team_32_top_indices]
    return team_32_global_books

def generate_book_recommendations(team_32_sparse_matrix, team_32_similar_users, team_32_target_user, team_32_top_k=10):
    """
    Generate up to five book recommendations for a user from their similar users' ratings.

    Candidate books are those with a non-zero rating from at least one similar
    user and no non-zero rating from the target user. Each candidate is scored
    with the similarity-weighted average of the positive ratings given by the
    similar users; if no similar user contributes any weight, the mean of the
    book's column is used instead. When fewer than five candidates exist, the
    list is topped up from the globally top-ranked books, skipping both the
    books already selected and the books the target user has already rated.

    Parameters:
        team_32_sparse_matrix (csr_matrix): Sparse user-item ratings matrix.
        team_32_similar_users (list): List of tuples [(user_id, similarity_score), ...].
        team_32_target_user (int): ID of the user to recommend books for.
        team_32_top_k (int): Currently unused; every entry of
            team_32_similar_users is considered. Kept for interface compatibility.

    Returns:
        list: Recommended books as tuples [(book_id, predicted_score), ...].
    """
    team_32_num_recommendations = 5

    team_32_user_books = set(team_32_sparse_matrix.getrow(team_32_target_user).nonzero()[1])
    team_32_all_books = set()

    # Pull each similar user's row out of the matrix once and keep it as a
    # {book: rating} dict, so the scoring loop below does cheap dict lookups
    # instead of indexing the sparse matrix for every (book, user) pair.
    team_32_neighbour_ratings = []
    for team_32_user_id, team_32_similarity in team_32_similar_users:
        team_32_row = team_32_sparse_matrix.getrow(team_32_user_id).tocsr()
        team_32_all_books.update(team_32_row.nonzero()[1])
        team_32_row_ratings = {}
        for team_32_col, team_32_value in zip(team_32_row.indices, team_32_row.data):
            # Summing keeps the result equal to matrix[user, book] even if a
            # column index happens to be stored more than once.
            team_32_row_ratings[team_32_col] = team_32_row_ratings.get(team_32_col, 0) + team_32_value
        team_32_neighbour_ratings.append((team_32_similarity, team_32_row_ratings))

    team_32_recommend_pool = team_32_all_books - team_32_user_books
    team_32_predicted_scores = {}

    for team_32_book in team_32_recommend_pool:
        team_32_total_score = 0
        team_32_weight_sum = 0
        for team_32_similarity, team_32_row_ratings in team_32_neighbour_ratings:
            team_32_book_rating = team_32_row_ratings.get(team_32_book, 0)
            if team_32_book_rating > 0:
                team_32_total_score += team_32_similarity * team_32_book_rating
                team_32_weight_sum += team_32_similarity

        if team_32_weight_sum == 0:
            # No usable weight from the similar users: fall back to the column mean.
            team_32_predicted_scores[team_32_book] = team_32_sparse_matrix[:, team_32_book].mean()
        else:
            team_32_predicted_scores[team_32_book] = team_32_total_score / team_32_weight_sum

    team_32_personalized_books = sorted(
        team_32_predicted_scores.items(), key=lambda x: x[1], reverse=True
    )[:team_32_num_recommendations]

    if len(team_32_personalized_books) < team_32_num_recommendations:
        team_32_needed_books = team_32_num_recommendations - len(team_32_personalized_books)
        # Exclude books already chosen AND books the target user has already
        # rated, mirroring the exclusion applied to the personalised pool.
        team_32_excluded_books = team_32_user_books | {
            book_id for book_id, _ in team_32_personalized_books
        }
        # Request enough global books that `needed` remain after filtering.
        team_32_global_books = recommend_global_top_books(
            team_32_sparse_matrix,
            team_32_count=team_32_needed_books + len(team_32_excluded_books)
        )
        team_32_additional_books = [
            (book_id, score) for book_id, score in team_32_global_books
            if book_id not in team_32_excluded_books
        ]
        team_32_personalized_books.extend(team_32_additional_books[:team_32_needed_books])

    return team_32_personalized_books

def find_top_similar_users_sparse(team_32_matrix, team_32_user_id, team_32_top_k=10):
    """
    Identify the top-K most similar users for a specified user based on cosine similarity.

    The specified user is never part of the result, even when the matrix has
    fewer than team_32_top_k + 1 rows.

    Parameters:
        team_32_matrix (csr_matrix): Sparse matrix of user-item ratings.
        team_32_user_id (int): The row index of the user for whom similarity is calculated.
        team_32_top_k (int): The number of similar users to retrieve. A value of
            0 or less yields an empty list.

    Returns:
        list: Top-K similar users as tuples [(user_id, similarity_score), ...],
        most similar first.
    """
    # arr[-0:] would select every row, so a non-positive K is handled up front.
    if team_32_top_k <= 0:
        return []

    team_32_user_vector = team_32_matrix.getrow(team_32_user_id)
    team_32_similarities = calculate_sparse_cosine_similarity(team_32_user_vector, team_32_matrix)
    # Push the user's own score to the bottom of the ranking ...
    team_32_similarities[team_32_user_id] = -1
    team_32_ranked = np.argsort(team_32_similarities)
    # ... and drop the user explicitly, so the -1 sentinel cannot leak into the
    # result when there are not enough other rows to fill K slots.
    team_32_ranked = team_32_ranked[team_32_ranked != team_32_user_id]
    team_32_top_indices = team_32_ranked[-team_32_top_k:][::-1]
    team_32_similar_users = [(index, team_32_similarities[index]) for index in team_32_top_indices]
    return team_32_similar_users

def process_user_chunk_and_save(
    team_32_data_file, team_32_user_map_file, team_32_book_map_file,
    team_32_titles_file, team_32_result_file, team_32_user_group, team_32_num_similar=10):
    """
    Process a chunk of users and save their book recommendations to a file, including titles.

    For every user in the chunk that appears in the user mapping, the similar
    users are found, recommendations are generated, and one output row per
    recommended book is produced. The rows are written to team_32_result_file:
    with a header if the file does not exist yet, otherwise appended without one.

    Parameters:
        team_32_data_file (str): Path to the LIBSVM file.
        team_32_user_map_file (str): Path to the user mapping file
            (columns 'User-ID' and 'Mapped-ID').
        team_32_book_map_file (str): Path to the book mapping file
            (columns 'ISBN' and 'Mapped-ID').
        team_32_titles_file (str): Path to the ';'-delimited file containing book
            titles (columns 'ISBN' and 'Title').
        team_32_result_file (str): Path to save the recommendations.
        team_32_user_group (list): List of mapped user IDs (matrix row indices) to process.
        team_32_num_similar (int): Number of similar users to consider for recommendations.

    Returns:
        None
    """
    team_32_output_columns = ['User_ID', 'Book_ID', 'Book_Title', 'Recommendation_Score']

    team_32_matrix, _ = load_svmlight_file(team_32_data_file)
    team_32_user_map = pd.read_csv(team_32_user_map_file)
    # ISBNs are identifiers, not numbers: without an explicit dtype pandas may
    # parse all-digit ISBNs as integers, dropping leading zeros and breaking
    # the ISBN -> title lookup below.
    team_32_book_map = pd.read_csv(team_32_book_map_file, dtype={'ISBN': str})
    team_32_titles = pd.read_csv(team_32_titles_file, delimiter=';', dtype={'ISBN': str})

    # Build each lookup table once instead of scanning the whole frame for
    # every user and every recommended book. drop_duplicates keeps the first
    # occurrence, i.e. the same row a "first matching row" scan would return.
    team_32_user_lookup = (
        team_32_user_map.drop_duplicates(subset='Mapped-ID')
        .set_index('Mapped-ID')['User-ID']
        .to_dict()
    )
    team_32_book_lookup = (
        team_32_book_map.drop_duplicates(subset='Mapped-ID')
        .set_index('Mapped-ID')['ISBN']
        .to_dict()
    )
    team_32_title_lookup = (
        team_32_titles.dropna(subset=['ISBN'])
        .drop_duplicates(subset='ISBN')
        .set_index('ISBN')['Title']
        .to_dict()
    )

    team_32_output = []

    for team_32_user in team_32_user_group:
        # Users without an entry in the mapping file are skipped.
        if team_32_user not in team_32_user_lookup:
            continue

        team_32_resolved_user = team_32_user_lookup[team_32_user]
        team_32_similar_users = find_top_similar_users_sparse(team_32_matrix, team_32_user, team_32_num_similar)
        team_32_recommendations = generate_book_recommendations(team_32_matrix, team_32_similar_users, team_32_user)

        for team_32_book_id, team_32_score in team_32_recommendations:
            team_32_isbn = team_32_book_lookup.get(team_32_book_id, "Unknown ISBN")
            team_32_book_title = team_32_title_lookup.get(team_32_isbn, "Unknown Title")

            team_32_output.append({
                'User_ID': team_32_resolved_user,
                'Book_ID': team_32_isbn,
                'Book_Title': team_32_book_title,
                'Recommendation_Score': team_32_score
            })

    # Explicit columns guarantee a proper header even when this chunk produced
    # no rows; otherwise a header-less file would be created and every later
    # append (written without a header) would leave the CSV without column names.
    team_32_df = pd.DataFrame(team_32_output, columns=team_32_output_columns)
    if not os.path.isfile(team_32_result_file):
        team_32_df.to_csv(team_32_result_file, index=False)
    else:
        team_32_df.to_csv(team_32_result_file, mode='a', header=False, index=False)


In [6]:
input_file = "user_book_matrix.libsvm"
output_file = "output.libsvm"

# Read every label-free row first, then rewrite each non-empty row with its
# position in the input file as the leading label.
with open(input_file, "r") as infile:
    lines = infile.readlines()

with open(output_file, "w") as outfile:
    for index, raw_line in enumerate(lines):
        features = raw_line.strip()
        if not features:
            continue
        outfile.write(f"{index} {features}" + "\n")

print(f"LIBSVM file created successfully: {output_file}")

LIBSVM file created successfully: output.libsvm


In [7]:
# Files written by the earlier cells of this notebook.
user_mapping_file = "user_mapping.csv"
book_mapping_file = "book_mapping.csv"
data_file = "output.libsvm"

# Title catalogue read by process_user_chunk_and_save.
titles_file = "Books.csv"

# Destination of the generated recommendations.
result_file = "final_recommendations.csv"

In [8]:
from sklearn.datasets import load_svmlight_file
import numpy as np

# Load the labelled LIBSVM file; the labels are not needed here.
sparse_matrix, _ = load_svmlight_file(data_file)

chunk_size = 100
total_users = sparse_matrix.shape[0]

# Split the row indices 0..total_users-1 into consecutive ranges of at most
# chunk_size users each; the last range may be shorter.
user_chunks = []
for chunk_start in range(0, total_users, chunk_size):
    chunk_stop = min(chunk_start + chunk_size, total_users)
    user_chunks.append(range(chunk_start, chunk_stop))


In [None]:
import os

# Index of the first chunk to process; set to 0 for a full run from scratch.
start_chunk = 719

# Start from a clean results file only on a full run. When resuming from a
# later chunk, the existing file holds the output of the earlier chunks and
# must be kept so the new rows are appended to it.
# NOTE: re-running a chunk that was already written appends its rows again.
if start_chunk == 0 and os.path.exists(result_file):
    os.remove(result_file)

# Chunks whose processing raised, collected so they can be re-run afterwards
# instead of being lost in the progress log.
failed_chunks = []

for i, user_chunk in enumerate(user_chunks[start_chunk:], start=start_chunk):
    try:
        print(f"Processing chunk {i + 1}/{len(user_chunks)} with {len(user_chunk)} users...")
        process_user_chunk_and_save(
            team_32_data_file=data_file,
            team_32_user_map_file=user_mapping_file,
            team_32_book_map_file=book_mapping_file,
            team_32_titles_file=titles_file,
            team_32_result_file=result_file,
            team_32_user_group=user_chunk,
            team_32_num_similar=10
        )
    except Exception as e:
        # Keep going so one bad chunk does not abort the whole run, but remember it.
        print(f"Error processing chunk {i + 1}: {e}")
        failed_chunks.append(i)

if failed_chunks:
    print(
        f"{len(failed_chunks)} chunk(s) failed and are missing from {result_file}; "
        f"re-run chunk indices: {failed_chunks}"
    )


Processing chunk 720/1053 with 100 users...
Processing chunk 721/1053 with 100 users...
Processing chunk 722/1053 with 100 users...
Processing chunk 723/1053 with 100 users...
Processing chunk 724/1053 with 100 users...
Processing chunk 725/1053 with 100 users...
Processing chunk 726/1053 with 100 users...
Processing chunk 727/1053 with 100 users...
Processing chunk 728/1053 with 100 users...
Processing chunk 729/1053 with 100 users...
Processing chunk 730/1053 with 100 users...
Processing chunk 731/1053 with 100 users...
Processing chunk 732/1053 with 100 users...
Processing chunk 733/1053 with 100 users...
Processing chunk 734/1053 with 100 users...
Processing chunk 735/1053 with 100 users...
Processing chunk 736/1053 with 100 users...
Processing chunk 737/1053 with 100 users...
Processing chunk 738/1053 with 100 users...
Processing chunk 739/1053 with 100 users...
Processing chunk 740/1053 with 100 users...
Processing chunk 741/1053 with 100 users...
Processing chunk 742/1053 with 1

Error processing chunk 873: [Errno 13] Permission denied: 'final_recommendations.csv'
Processing chunk 874/1053 with 100 users...
Error processing chunk 874: [Errno 13] Permission denied: 'final_recommendations.csv'
Processing chunk 875/1053 with 100 users...
Error processing chunk 875: [Errno 13] Permission denied: 'final_recommendations.csv'
Processing chunk 876/1053 with 100 users...
Error processing chunk 876: [Errno 13] Permission denied: 'final_recommendations.csv'
Processing chunk 877/1053 with 100 users...
Error processing chunk 877: [Errno 13] Permission denied: 'final_recommendations.csv'
Processing chunk 878/1053 with 100 users...
Processing chunk 879/1053 with 100 users...
Processing chunk 880/1053 with 100 users...
Processing chunk 881/1053 with 100 users...
Processing chunk 882/1053 with 100 users...
Processing chunk 883/1053 with 100 users...
Processing chunk 884/1053 with 100 users...
Processing chunk 885/1053 with 100 users...
Processing chunk 886/1053 with 100 users..