In [1]:
import getpass
import pandas as pd
from pymongo import MongoClient

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
password = getpass.getpass("MongoDB password: ")

MongoDB password:  ········


In [3]:
client = MongoClient(f'mongodb://book_group:{password}@macragge.reika.io:47017/?authSource=books')

In [4]:
db = client['books']
collection = db['books']

In [5]:
# Fetch data from MongoDB
query = {'random': {'$lte': 10}}
data = list(collection.find(query))  # Retrieve documents as a list of dictionaries 

In [23]:
# Create DataFrame and preview
df = pd.DataFrame(data)

# Filter the DataFrame to only include rows where 'language_code' is 'eng'
df = df[df['language_code'] == "eng"]

# Display the first few rows of the filtered DataFrame
df.head()

Unnamed: 0,_id,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,...,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,genre,random
3,66df9d085a178eb80fbd4e9f,1849412235,1,[499526],US,eng,"[{'count': '15', 'name': 'to-read'}, {'count':...",,False,4.23,...,2010,https://www.goodreads.com/book/show/9501773-th...,https://s.gr-assets.com/assets/nophoto/book/11...,9501773,31,14387351,The Adventures of Mrs Pepperpot,The Adventures of Mrs Pepperpot,children,0
8,66df9d085a178eb80fbd4fea,0394849736,8,[638089],US,eng,"[{'count': '88', 'name': 'to-read'}, {'count':...",,False,4.04,...,1981,https://www.goodreads.com/book/show/1350015.Pe...,https://s.gr-assets.com/assets/nophoto/book/11...,1350015,173,40872087,Peter Pan and Wendy,Peter Pan and Wendy,children,7
15,66df9d0a5a178eb80fbd54b2,159707487X,13,[614823],US,eng,"[{'count': '15', 'name': 'to-read'}, {'count':...",,False,4.11,...,2013,https://www.goodreads.com/book/show/17454804-a...,https://images.gr-assets.com/books/1413131666m...,17454804,89,24344930,Ariol #3: Happy as a Pig...,Ariol #3: Happy as a Pig...,children,5
16,66df9d0a5a178eb80fbd5505,,18,[355505],US,eng,"[{'count': '301', 'name': 'to-read'}, {'count'...",,False,4.36,...,2013,https://www.goodreads.com/book/show/13540858-f...,https://images.gr-assets.com/books/1331685546m...,13540858,402,19103974,"Famous (Canterwood Crest, #18)","Famous (Canterwood Crest, #18)",children,3
17,66df9d0a5a178eb80fbd5520,0545060613,11,[],US,eng,"[{'count': '33', 'name': 'to-read'}, {'count':...",,False,3.7,...,2008,https://www.goodreads.com/book/show/5944673-th...,https://images.gr-assets.com/books/1259031059m...,5944673,23,6117184,The Silly Turkey Party,The Silly Turkey Party,children,7


In [24]:
df['average_rating'] = df['average_rating'].astype(float)

In [25]:
#Dataframe/pivot table with index of title and genre as user id values of book ratings
pt = df.pivot_table(index='title', columns='genre', values='average_rating', aggfunc='mean')
pt.fillna(0,inplace=True)

In [42]:
pt

genre,children,comics_graphic,fantasy_paranormal,history_biography,mystery_thriller_crime,poetry,romance,young_adult
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"""Verified Kill"" (Assassins, #1)",0.00,0.0,0.00,0.0,0.00,0.0,3.88,0.00
"$10,000,000 Marriage Proposal",0.00,0.0,0.00,0.0,0.00,0.0,3.24,0.00
'Salem's Lot,0.00,0.0,0.00,0.0,3.99,0.0,0.00,0.00
"'Til Death (Conversion, #3)",0.00,0.0,4.19,0.0,0.00,0.0,0.00,0.00
...all five... a true story,0.00,0.0,0.00,0.0,4.07,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...
the single girls to do list,0.00,0.0,0.00,0.0,0.00,0.0,3.94,0.00
the throne of tara,0.00,0.0,3.54,0.0,0.00,0.0,0.00,0.00
the very helpful monsters,3.96,0.0,0.00,0.0,0.00,0.0,0.00,0.00
چشم بهشتی,0.00,0.0,0.00,0.0,0.00,0.0,0.00,3.71


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
similarity_scores = cosine_similarity(pt)

In [29]:
similarity_scores.shape

(5965, 5965)

In [56]:
def recommendation(book_name):
    # Fetching Index
    if book_name in list(pt.index):
        index = np.where(np.array(list(pt.index)) == book_name)[0][0]
        
        # Retrieve similarity scores for the specific book
        similarity_scores_for_book = similarity_scores[index]
        
        # Sort similar items based on the specific book's similarity scores
        similar_items = sorted(list(enumerate(similarity_scores_for_book)), reverse=True, key=lambda x: x[1])[1:9]
        
        data = []
        for i in similar_items:
            title = pt.index[i[0]]
            temp_df = df[df['title'] == title]
            item = list(temp_df.drop_duplicates('title')['title'].values)
            data.append(item)
        
    else:
        print(f"The book '{book_name}' is not found in the index.")

    return data

In [59]:
recommendation("the very helpful monsters")

[['A Chair for My Mother'],
 ['A Dog for Everyone'],
 ['A Fairy Went A-Marketing'],
 ['A Full House: An Austin Family Christmas (Austin Family, #5.6)'],
 ['A Is For Asia'],
 ['A Letter to Amy'],
 ['A Little Princess'],
 ['A Pattern Of Roses']]

In [60]:
client.close()