# Books Recommendation System

## Import the needed libraries

In [1]:
import os
import pandas as pd
import numpy as np
import tiktoken
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity


## Read the Open API Key from .env file

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
# find_dotenv() find .env automatically by walking up directories until it's found 
# load_dotenv() load the environment variables from the .env file
# override=True allows the .env file to override the system environment variables
load_dotenv(find_dotenv(), override=True)

apiKey = os.environ.get('OPENAI_API_KEY')

## Create OpenAI Client

In [3]:
client = OpenAI(api_key=apiKey)

## Read the dat file from disk

In [4]:
books_data = pd.read_csv('books_dataset.csv')
books_data



Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83
2,9780006163831,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97
3,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93
4,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,2002.0,4.15
...,...,...,...,...,...,...,...
6805,9788185300535,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,This collection of the timeless teachings of o...,1999.0,4.51
6806,9788185944609,Secrets Of The Heart,Khalil Gibran,Mysticism,,1993.0,4.08
6807,9788445074879,Fahrenheit 451,Ray Bradbury,Book burning,,2004.0,3.98
6808,9789027712059,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,History,Since the three volume edition ofHegel's Philo...,1981.0,0.00


## Drop Records with Empty Data
- As shown in when we display the loaded data records 6807 and 6808 have empty description (converted to NaN by Pandas)
- there are different ways to handle NaN values
  - Drop the records to not affect the training process
  - Fill them with a constant value
  - Interpolate NaN Values: the process of estimating missing values based on the values of neighboring data points.

For this app we will drop them using dropna method from pandas (it will remove the whole record if at lease one column has NaN value)

In [5]:
books_data = books_data.dropna()
books_data

#check the different count of the data record after removing the null values

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83
2,9780006163831,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97
3,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93
4,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,2002.0,4.15
...,...,...,...,...,...,...,...
6803,9788173031014,Journey to the East,Hermann Hesse,Adventure stories,This book tells the tale of a man who goes on ...,2002.0,3.70
6804,9788179921623,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82
6805,9788185300535,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,This collection of the timeless teachings of o...,1999.0,4.51
6808,9789027712059,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,History,Since the three volume edition ofHegel's Philo...,1981.0,0.00


In [6]:
# Select only the first 2000 book records for the analysis to reduce the computation time and cost
books_data = books_data.sort_values(by='average_rating', ascending=True).head(2000)
books_data

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
5213,9780812927634,Random House Crossword Megaomnibus,United Feature Syndication,Games,his new series contains crosswords slightly le...,1997.0,0.00
1635,9780313309335,The Fantastic Vampire,James Craig Holte,Literary Criticism,Wherever vampires existed in the imaginations ...,2002.0,0.00
6739,9781932206104,How to Meditate,Frederick Lenz,Meditation,"How to Meditate is a transcript all of Rama's,...",2004.0,0.00
4479,9780744005615,Juiced,Doug Walsh,Games,BradyGames' Juiced Official Strategy Guide inc...,2005.0,0.00
1750,9780340774779,"The Road to War, 1933-1939",Andrew Hunt,Germany,This text uses a source-based approach to stud...,2000.0,0.00
...,...,...,...,...,...,...,...
1587,9780312890216,The Starry Rift,James Tiptree,Fiction,This novel set in the far-future and filled wi...,1994.0,3.82
1211,9780192837684,The Eclogues ; The Georgics,Virgil,Agriculture,"The Eclogues, ten short pastoral poems, were c...",1999.0,3.82
1217,9780192839572,La chartreuse de Parme,Jm Stendhal,Fiction,Follows the adventures of young Fabrizio del D...,1999.0,3.82
5770,9781400079179,The Da Vinci Code,Dan Brown,Fiction,Harvard symbologist Robert Langdon and French ...,2006.0,3.82


## Calculate the Embedding Cost

In [7]:
def get_embedding(text):
    text = text.replace('\n', ' ')
    response = client.embeddings.create(model='text-embedding-3-small', input=text)
    return response.data[0].embedding

In [8]:
def calculate_embedding_cost(books_data):
    # Calculate the number of tokens in the books dataset
    data_as_list = list(books_data['description'])
    enc = tiktoken.encoding_for_model(model_name="text-embedding-3-small")
    total_tokens = sum([len(enc.encode(desc)) for desc in data_as_list])
    print(f"Total tokens: {total_tokens}")
    cost = total_tokens * (0.00002/1000)
    print(f"Cost: ${cost:.20f}")

In [9]:
calculate_embedding_cost(books_data)

Total tokens: 178031
Cost: $0.00356062000000000002


## Calculate the Embeddings and Store them Locally

In [11]:
def get_embeddings_and_save_to_csv():
    books_data['embedding'] = books_data['description'].apply(lambda x: get_embedding(x))
    books_data.to_csv('books_dataset_with_embeddings.csv', index=False)

In [14]:
get_embeddings_and_save_to_csv()

## Load the Embeddings

In [10]:
books_with_embeddings = pd.read_csv('books_dataset_with_embeddings.csv')
books_with_embeddings["embedding"] = books_with_embeddings["embedding"].apply(eval).apply(np.array)
books_with_embeddings

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating,embedding
0,9780812927634,Random House Crossword Megaomnibus,United Feature Syndication,Games,his new series contains crosswords slightly le...,1997.0,0.00,"[-0.018283704295754433, -0.015245502814650536,..."
1,9780313309335,The Fantastic Vampire,James Craig Holte,Literary Criticism,Wherever vampires existed in the imaginations ...,2002.0,0.00,"[-0.030094962567090988, 0.035030536353588104, ..."
2,9781932206104,How to Meditate,Frederick Lenz,Meditation,"How to Meditate is a transcript all of Rama's,...",2004.0,0.00,"[-0.0060596163384616375, 0.025496121495962143,..."
3,9780744005615,Juiced,Doug Walsh,Games,BradyGames' Juiced Official Strategy Guide inc...,2005.0,0.00,"[0.036799781024456024, 0.04352662339806557, -0..."
4,9780340774779,"The Road to War, 1933-1939",Andrew Hunt,Germany,This text uses a source-based approach to stud...,2000.0,0.00,"[-0.015587965957820415, 0.03728048503398895, 0..."
...,...,...,...,...,...,...,...,...
1995,9780312890216,The Starry Rift,James Tiptree,Fiction,This novel set in the far-future and filled wi...,1994.0,3.82,"[0.0023731449618935585, 0.042326465249061584, ..."
1996,9780192837684,The Eclogues ; The Georgics,Virgil,Agriculture,"The Eclogues, ten short pastoral poems, were c...",1999.0,3.82,"[0.021905047819018364, 0.014567454345524311, 0..."
1997,9780192839572,La chartreuse de Parme,Jm Stendhal,Fiction,Follows the adventures of young Fabrizio del D...,1999.0,3.82,"[-0.021207181736826897, -0.004873700439929962,..."
1998,9781400079179,The Da Vinci Code,Dan Brown,Fiction,Harvard symbologist Robert Langdon and French ...,2006.0,3.82,"[-0.008561644703149796, 0.029169591143727303, ..."


## Create Recommendations Function

In [18]:
%pip install matplotlib -q

Note: you may need to restart the kernel to use updated packages.


In [19]:
%pip install plotly -q

Note: you may need to restart the kernel to use updated packages.


In [12]:

import textwrap as tr
from typing import List, Optional

import matplotlib.pyplot as plt
import plotly.express as px
from scipy import spatial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score, precision_recall_curve
# Utility Methods
def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[List]:
    """Return the distances between a query embedding and a list of embeddings."""
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances

def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
    """Return a list of indices of nearest neighbors from a list of distances."""
    return np.argsort(distances)

In [16]:
def get_recommendations_from_title(df_embeddings, title, n=5):
    if title not in df_embeddings['title'].values:
        print(f"Title '{title}' not found in the dataset")
        return False
    
    #Get text embedding for the title from the embeddings file
    title_embedding = df_embeddings[df_embeddings['title'] == title]['embedding']
    title_embedding = title_embedding.squeeze()
    all_embeddings = list(df_embeddings['embedding'])
    
    distances = distances_from_embeddings(title_embedding, all_embeddings)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
    #print(indices_of_nearest_neighbors)
    
    recommendations = list()
    for index in indices_of_nearest_neighbors[1:n+1]:
        book = dict()
        book['title'] = df_embeddings.iloc[index]['title']
        book['description'] = df_embeddings.iloc[index]['description']
        book['distance'] = distances[index]
        recommendations.append(book)
    return recommendations
    

In [17]:
get_recommendations_from_title(books_with_embeddings, "Juiced", n=5)

[{'title': 'Medal of Honor Rising Sun',
  'description': 'Playmakers Win Championships -Online strategies for the PlayStation(R)2 computer entertainment system -Expert offensive and defensive strategies -Tips for success in every mode, including the new Owner Mode -How to unlock all hidden rewards -Profiles, statistics, and key plays for every team',
  'distance': 0.4523883173313282},
 {'title': 'Castlevania',
  'description': "BradyGames' Castlevania: Curse of Darkness Official Strategy Guide includes the following: A complete walkthrough of the entire game. Detailed maps of every area. In-depth listing of unlockables, items and weapons. Expert strategies to defeat every boss. Extensive bestiary. Game secrets revealed! Platform: PlayStation 2 Genre: Action/AdventureThis product is available for sale in North America only.",
  'distance': 0.45504904353297027},
 {'title': 'Hardball',
  'description': 'Hardball takes leaders deep inside the world of hardball competition - a world where t

## Testing

In [23]:
title = input("Enter the title of the book: ")

recommendations = get_recommendations_from_title(books_with_embeddings, title, n=5)

if recommendations:
    for i, recommendation in enumerate(recommendations):
        print(f"Recommendation {i+1}:")
        print(f"Title: {recommendation['title']}")
        print(f"Description: {recommendation['description']}")
        print(f"Distance: {recommendation['distance']}")
        print("\n")
else:
    print("Title not found in the dataset. Please try again.")

Recommendation 1:
Title: The Rule of Four
Description: Trying to decipher an ancient text that weaves a mathematical labyrinth within a love story, two researchers obtain a diary that may contain the key to the code, but when a fellow researcher is killed, they realize that the book contains a dangerous secret.
Distance: 0.4417196933140113


Recommendation 2:
Title: The Guide to Dan Brown's The Solomon Key
Description: The massive success of Dan Brown's The Da Vinci Code has readers eagerly awaiting his next novel, The Solomon Key. Using clues left by Brown in interviews, on his website, and on the cover of The Da Vinci Code, Greg Taylor takes you on an unprecedented tour of the new book before it is even released. The Guide to Dan Brown's The Solomon Key explores the topics likely to be included in Brown's next novel - including the hidden history of Washington, D.C., Freemasonry, and even the Ku Klux Klan - to give you a better understanding of the concepts behind the book. With no s

In [22]:
print(books_with_embeddings.iloc[3]['description'])

BradyGames' Juiced Official Strategy Guide includes the following: Track Maps: Each map is broken down to give you the best chance of winning. Full Car List: Get all the stats on all the hottest rides! Crew Management Tips: Get the best crew and learn what it takes to win! Rival Crew Challenges: Discover the ins and outs of every challenge and get the best strategy to crush rival crews! And More: Racing schedules, part listings, pro tips, crewmate data, and showoff tricks! Platform: PlayStation 2, Xbox, & PC Genre: Sports/Racing This title is available for sale worldwide.


## Visualize The Embeddings
- As the embeddings are just some numbers we will not be able to understand them but using some tools to visualize and show the relations between the data
- (Atlas)[https://atlas.nomic.ai/discover] is one of the tools that can do this job
- It avail APIs so you can visualize your own data

In [24]:
%pip install nomic -q

Note: you may need to restart the kernel to use updated packages.


In [25]:
from nomic import atlas

In [42]:
books = pd.read_csv('books_dataset_with_embeddings.csv')
books['embedding'] = books['embedding'].apply(eval).apply(np.array)




In [43]:

books_dict = books[['title','authors','categories']].to_dict('records')
books_dict

[{'title': 'Random House Crossword Megaomnibus',
  'authors': 'United Feature Syndication',
  'categories': 'Games'},
 {'title': 'The Fantastic Vampire',
  'authors': 'James Craig Holte',
  'categories': 'Literary Criticism'},
 {'title': 'How to Meditate',
  'authors': 'Frederick Lenz',
  'categories': 'Meditation'},
 {'title': 'Juiced', 'authors': 'Doug Walsh', 'categories': 'Games'},
 {'title': 'The Road to War, 1933-1939',
  'authors': 'Andrew Hunt',
  'categories': 'Germany'},
 {'title': 'American writers',
  'authors': 'Gale Group',
  'categories': 'Biography & Autobiography'},
 {'title': 'The Berlin Phenomenology',
  'authors': 'Georg Wilhelm Friedrich Hegel',
  'categories': 'History'},
 {'title': 'Open City 6',
  'authors': 'Thomas Beller;James Purdy;Strawberry Saroyan;Debra Garrison;Michael Cunningham',
  'categories': 'Fiction'},
 {'title': 'World Studies: Eastern Hemisphere',
  'authors': 'Heidi Hayes Jacobs;Pearson Prentice Hall;Kate Kinsella;Michal L. LeVasseur',
  'catego

In [39]:
import nomic
nomic_apiKey = os.environ.get('NOMIC_API_KEY')
nomic.login(nomic_apiKey)

NameError: name 'exit' is not defined

In [37]:
embeddings = books['embedding'].to_list()
embeddings = np.array(embeddings)


KeyError: 0

In [50]:


# Build a map using the map_data method
dataset = atlas.map_data(data=books_dict,
                         embeddings=embeddings,
                          identifier='mabdelraza2/books-map',
                          description='A map for the books data.',
                          )
dataset.maps[0] # to view map build status


[32m2024-06-02 14:59:14.756[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m918[0m - [1mCreating dataset `books-map`[0m
[32m2024-06-02 14:59:15.660[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m110[0m - [1mUploading data to Atlas.[0m
1it [00:03,  3.02s/it]
[32m2024-06-02 14:59:18.706[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1597[0m - [1mUpload succeeded.[0m
[32m2024-06-02 14:59:18.708[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m125[0m - [1m`mabdelraza2/books-map`: Data upload succeeded to dataset`[0m
[32m2024-06-02 14:59:22.136[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1301[0m - [1mCreated map `mabdelraza2/books-map` in dataset `mabdelraza2/books-map`: https://atlas.nomic.ai/data/mabdelraza2/books-map/map[0m
