In [1]:
import pandas as pd

In [2]:
# Load the Goodreads CSV (cp1251-encoded, comma-separated), keeping only
# the columns listed below, then preview the first rows.
goodreads_csv = '../data/goodreads.csv'
wanted_columns = ['id', 'Book-Title', 'Book-Author', 'ISBN', 'genres']
dataset = pd.read_csv(goodreads_csv, sep=',', encoding='cp1251', usecols=wanted_columns)
dataset.head()

Unnamed: 0,id,Book-Title,Book-Author,ISBN,genres
0,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,9780439000000.0,Young Adult
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,9780439000000.0,Fantasy
2,2657,To Kill a Mockingbird,Harper Lee,,Classics
3,1885,Pride and Prejudice,Jane Austen,,Classics
4,41865,"Twilight (Twilight, #1)",Stephenie Meyer,9780316000000.0,Young Adult


In [3]:
# Print a summary of the loaded frame: per-column dtype and non-null count,
# plus overall memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           200 non-null    int64  
 1   Book-Title   200 non-null    object 
 2   Book-Author  200 non-null    object 
 3   ISBN         150 non-null    float64
 4   genres       200 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 7.9+ KB


# Content-Based Recommendations

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Combine author and genres into a single text feature per book.
# A missing author or genre would turn the whole concatenation into NaN,
# which CountVectorizer rejects as an invalid document, so missing values
# are treated as empty strings instead.
dataset['features'] = (
    dataset['Book-Author'].fillna('') + ' ' + dataset['genres'].fillna('')
)

In [6]:
# Bag-of-words encoding of the combined author/genre text:
# learn the vocabulary first, then count token occurrences per book.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(dataset['features'])
feature_matrix = count_vectorizer.transform(dataset['features'])
feature_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 627 stored elements and shape (200, 308)>

In [7]:
# Pairwise cosine similarity of every book's feature vector against every
# other book's (both operands are the same matrix).
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.5       ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.33333333, 0.33333333,
        0.        ],
       ...,
       [0.        , 0.        , 0.33333333, ..., 1.        , 0.33333333,
        0.        ],
       [0.        , 0.        , 0.33333333, ..., 0.33333333, 1.        ,
        0.        ],
       [0.5       , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]], shape=(200, 200))

In [8]:
# Find the row position of "The Fellowship of the Ring".
book = "The Fellowship of the Ring (The Lord of the Rings, #1)"

# Use the row *position* rather than the index label: book_idx is used to
# index the similarity matrix and is compared against enumerate() positions,
# so a label would only be correct while the frame keeps its default
# RangeIndex.
title_matches = (dataset['Book-Title'] == book).to_numpy().nonzero()[0]
if title_matches.size == 0:
    # Fail with a clear message instead of an opaque IndexError.
    raise ValueError(f"Book title not found in dataset: {book!r}")

# If the title appears more than once, the first occurrence is used.
book_idx = title_matches[0]
book_idx

np.int64(78)

In [9]:
# Rank every other book by its similarity to the selected book.
# The selected book itself is dropped before sorting; the sort is stable,
# so books with equal scores keep their original row order.
similar_books = sorted(
    (
        (position, similarity)
        for position, similarity in enumerate(cosine_sim[book_idx])
        if position != book_idx
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ten highest-scoring titles.
print(f"Recommendations for '{book}':")
for idx, score in similar_books[:10]:
    title = dataset.iloc[idx]['Book-Title']
    print(f"- {title} (Score: {score:.2f})")


Recommendations for 'The Fellowship of the Ring (The Lord of the Rings, #1)':
- J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings (Score: 1.00)
- The Hobbit, or There and Back Again (Score: 1.00)
- Harry Potter and the Order of the Phoenix (Harry Potter, #5) (Score: 0.50)
- The Chronicles of Narnia (Chronicles of Narnia, #1-7) (Score: 0.50)
- Harry Potter and the Sorcerer's Stone (Harry Potter, #1) (Score: 0.50)
- Harry Potter and the Deathly Hallows (Harry Potter, #7) (Score: 0.50)
- Harry Potter and the Prisoner of Azkaban (Harry Potter, #3) (Score: 0.50)
- Harry Potter and the Goblet of Fire (Harry Potter, #4) (Score: 0.50)
- Harry Potter and the Half-Blood Prince (Harry Potter, #6) (Score: 0.50)
- Harry Potter and the Chamber of Secrets (Harry Potter, #2) (Score: 0.50)
