In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
import os

# Location of the books dataset, relative to the notebook's working directory.
file_path = './data/books.csv'

# Fail fast when the dataset is missing. Merely printing a message would leave
# `df` undefined (NameError on the next line in a fresh kernel) or, worse,
# silently keep a stale `df` from an earlier run of this cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# on_bad_lines='skip' drops rows the CSV parser cannot split into the expected
# number of fields instead of aborting the whole load.
df = pd.read_csv(file_path, quotechar='"', on_bad_lines='skip')
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(11123, 12)

In [4]:
# Number of missing values in each column.
df.isnull().sum()

bookID                0
title                 0
authors               0
average_rating        0
isbn                  0
isbn13                0
language_code         0
  num_pages           0
ratings_count         0
text_reviews_count    0
publication_date      0
publisher             0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11123.0,11123.0,11123.0,11123.0,11123.0,11123.0
mean,21310.856963,3.934075,9759880000000.0,336.405556,17942.85,542.048099
std,13094.727252,0.350485,442975800000.0,241.152626,112499.2,2576.619589
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10277.5,3.77,9780345000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780582000000.0,299.0,745.0,47.0
75%,32104.5,4.14,9780872000000.0,416.0,5000.5,238.0
max,45641.0,5.0,9790008000000.0,6576.0,4597666.0,94265.0


In [6]:
# Text columns that together describe a book.
features = ['title', 'authors', 'publisher']

# One space-separated string per row: "<title> <authors> <publisher>".
# A missing value in any of the three columns yields a missing result.
df['combined_features'] = df['title'].str.cat(
    [df['authors'], df['publisher']],
    sep=' ',
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Relative influence of the author and publisher vectors compared with the
# title vector (which keeps weight 1.0). Values below 1 make the title text
# dominate the similarity score.
AUTHOR_WEIGHT = 0.3
PUBLISHER_WEIGHT = 0.3

# Vectorize each feature separately so each column gets its own vocabulary
# and can be weighted independently.
tfidf_title = TfidfVectorizer().fit_transform(df['title'])
tfidf_publisher = TfidfVectorizer().fit_transform(df['publisher'])
tfidf_author = TfidfVectorizer().fit_transform(df['authors'])

# Reduce the weight of BOTH the author and the publisher vectors.
tfidf_author = tfidf_author * AUTHOR_WEIGHT
tfidf_publisher = tfidf_publisher * PUBLISHER_WEIGHT

# Combine all vectors column-wise into one sparse feature matrix (one row per book).
combined_matrix = hstack([tfidf_title, tfidf_publisher, tfidf_author])

# Pairwise cosine similarity between every pair of books.
# NOTE: with sparse input this returns a dense (n_books x n_books) array by
# default, so memory grows quadratically with the number of rows.
cosine_sim = cosine_similarity(combined_matrix)


In [15]:
import re

In [16]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=12):
    """Print and return the books most similar to the first title match.

    The query book is the first row of ``df`` whose ``title`` contains
    ``title`` as a literal, case-insensitive substring.

    Parameters
    ----------
    title : str
        Text to look for inside the book titles. It is matched literally,
        so characters such as ``(``, ``+`` or ``.`` have no special meaning.
    cosine_sim : array-like, optional
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``df``. Defaults to the module-level
        ``cosine_sim`` as it existed when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to produce. The default of 12
        matches the number of rows this function has always emitted.

    Returns
    -------
    list of [str, float, str]
        ``[title, score, authors]`` triples ordered from most to least
        similar. Parenthesised parts of each title are stripped.

    Raises
    ------
    IndexError
        If no title in ``df`` contains ``title``.
    """
    # Literal match (regex=False) so titles with regex metacharacters neither
    # crash nor match unintended rows; na=False keeps the mask strictly boolean.
    matches = df['title'].str.contains(title, case=False, regex=False, na=False)
    match_positions = matches.to_numpy(dtype=bool).nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f'No book title contains {title!r}')

    # Work with row POSITIONS throughout: the similarity matrix is positional,
    # so using index labels would break on any non-default DataFrame index.
    query_pos = int(match_positions[0])
    full_title = df['title'].iloc[query_pos]

    # Exclude the query book explicitly instead of dropping whatever ranks
    # first: an exact duplicate (score tie) or an all-zero feature row would
    # otherwise cause the wrong entry to be removed.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_pos])
        if pos != query_pos
    ]
    limit = max(int(top_n), 0)
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:limit]

    recommend_books = []

    print('Recommend books for ' + full_title)
    for pos, score in ranked:
        # Drop parenthesised suffixes such as series information.
        book_title = re.sub(r'\s*\([^)]*\)', '', df['title'].iloc[pos])
        author = df['authors'].iloc[pos]

        recommend_books.append([book_title, score, author])
        print('{:30} {:3f} {:30}'.format(book_title, score, author))
    return recommend_books

In [17]:
# Demo query. get_recommendations prints its ranked table as a side effect;
# the surrounding print() additionally emits the returned list of rows.
print(get_recommendations('Harry Potter'))

Recommend books for Harry Potter and the Half-Blood Prince (Harry Potter  #6)
Harry Potter and the Half-Blood Prince 0.891476 J.K. Rowling                  
Harry Potter and the Order of the Phoenix 0.758564 J.K. Rowling/Mary GrandPré    
Harry Potter Collection        0.757257 J.K. Rowling                  
Harry Potter and the Prisoner of Azkaban 0.745371 J.K. Rowling/Mary GrandPré    
Harry Potter and the Chamber of Secrets 0.724499 J.K. Rowling/Mary GrandPré    
Harry Potter Boxed Set  Books 1-5 0.711038 J.K. Rowling/Mary GrandPré    
Harry Potter and the Chamber of Secrets 0.706434 J.K. Rowling                  
Harry Potter and the Sorcerer's Stone 0.680659 J.K. Rowling/Mary GrandPré    
Harry Potter and the Philosopher's Stone 0.652968 J.K. Rowling                  
Harry Potter and the Goblet of Fire 0.650224 J.K. Rowling                  
Harry Potter and the Prisoner of Azkaban 0.636847 J.K. Rowling                  
Harry Potter Y La Piedra Filosofal 0.599366 J.K. Rowling   