In [16]:
from langchain_community.document_loaders import TextLoader # converts raw text (book description) and convert it to format that longchain can work with
from langchain.text_splitter import CharacterTextSplitter # splits whole document containing all of the descriptions into meaningful chunks (individual desc of each book)
# from langchain_openai import OpenAIEmbeddings # converting chunks into document embeddings
from langchain_chroma import Chroma # storing embeddings in vector database ChromaDB
from langchain_huggingface import HuggingFaceEmbeddings

In [17]:
import pandas as pd

books = pd.read_csv("data/books_cleaned.csv", encoding="utf-8", on_bad_lines="skip")

books["tagged_description"] = books["tagged_description"].str.replace('"', '', regex=False)
print(books['tagged_description'].head())

0    9780002005883 A NOVEL THAT READERS and critics...
1    9780002261982 A new 'Christie for Christmas' -...
2    9780006178736 A memorable, mesmerizing heroine...
3    9780006280897 Lewis' work on the nature of lov...
4    9780006280934 In The Problem of Pain, C.S. Lew...
Name: tagged_description, dtype: object


In [18]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 In The Problem of Pain, C.S. Lew..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5225,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...
5226,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...
5227,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...
5228,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...


In [19]:
books["tagged_description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 In The Problem of Pain, C.S. Lew...
                              ...                        
5225    9788172235222 On A Train Journey Home To North...
5226    9788173031014 This book tells the tale of a ma...
5227    9788179921623 Wisdom to Create a Life of Passi...
5228    9788185300535 This collection of the timeless ...
5229    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5230, dtype: object

In [20]:
books['tagged_description'].to_csv("data/tagged_description.txt", sep='\n', index=False)

In [21]:
raw_documents = TextLoader('data/tagged_description.txt', encoding='utf-8').load()
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n") #prioritise splitting on the separator rather than on chunksize
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 18, which is longer than the specified 0
Created a chunk of size 1168, which is longer than the specified 0
Created a chunk of size 1214, which is longer than the specified 0
Created a chunk of size 373, which is longer than the specified 0
Created a chunk of size 309, which is longer than the specified 0
Created a chunk of size 477, which is longer than the specified 0
Created a chunk of size 482, which is longer than the specified 0
Created a chunk of size 960, which is longer than the specified 0
Created a chunk of size 188, which is longer than the specified 0
Created a chunk of size 843, which is longer than the specified 0
Created a chunk of size 274, which is longer than the specified 0
Created a chunk of size 191, which is longer than the specified 0
Created a chunk of size 875, which is longer than the specified 0
Created a chunk of size 1088, which is longer than the specified 0
Created a chunk of size 1189, which is longer than the specified 0
Created

In [22]:
documents[2]

Document(metadata={'source': 'data/tagged_description.txt'}, page_content="9780002261982 A new 'Christie for Christmas' -- a full-length novel adapted from her acclaimed play by Charles Osborne Following BLACK COFFEE and THE UNEXPECTED GUEST comes the final Agatha Christie play novelisation, bringing her superb storytelling to a new legion of fans. Clarissa, the wife of a Foreign Office diplomat, is given to daydreaming. 'Supposing I were to come down one morning and find a dead body in the library, what should I do?' she muses. Clarissa has her chance to find out when she discovers a body in the drawing-room of her house in Kent. Desperate to dispose of the body before her husband comes home with an important foreign politician, Clarissa persuades her three house guests to become accessories and accomplices. It seems that the murdered man was not unknown to certain members of the house party (but which ones?), and the search begins for the murderer and the motive, while at the same ti

In [23]:
print(f"Number of documents: {len(documents)}")
print(f"Average document length: {sum(len(doc.page_content) for doc in documents)/len(documents)} characters")


Number of documents: 5231
Average document length: 501.87497610399544 characters


In [24]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

print("Model loaded")

Model loaded


In [25]:
from langchain.embeddings.base import Embeddings

class MySTEmbeddings(Embeddings):
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return self.model.encode(texts, show_progress_bar=True).tolist()

    def embed_query(self, text):
        return self.model.encode([text])[0].tolist()

my_embeddings = MySTEmbeddings(model)

In [26]:
from langchain.vectorstores import FAISS
import pickle

docs = documents

batch_size = 500
db_faiss = None
all_docs = []

for i in range(0, len(docs), batch_size):
    batch = docs[i:i + batch_size]
    all_docs.extend(batch)

    if db_faiss is None:
        db_faiss = FAISS.from_documents(batch, embedding=my_embeddings)
        print(f"Created FAISS base with batch {i}–{i + len(batch)}")
    else:
        db_faiss.add_documents(batch)
        print(f"Added batch {i}–{i + len(batch)}")

db_faiss.save_local("faiss_index")
with open("data/faiss_docs.pkl", "wb") as f:
    pickle.dump(all_docs, f)

print("FAISS index and documents saved")

Batches: 100%|██████████| 16/16 [00:03<00:00,  4.69it/s]


Created FAISS base with batch 0–500


Batches: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Added batch 500–1000


Batches: 100%|██████████| 16/16 [00:02<00:00,  6.01it/s]


Added batch 1000–1500


Batches: 100%|██████████| 16/16 [00:02<00:00,  7.92it/s]


Added batch 1500–2000


Batches: 100%|██████████| 16/16 [00:01<00:00,  8.37it/s]


Added batch 2000–2500


Batches: 100%|██████████| 16/16 [00:02<00:00,  7.12it/s]


Added batch 2500–3000


Batches: 100%|██████████| 16/16 [00:02<00:00,  5.57it/s]


Added batch 3000–3500


Batches: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Added batch 3500–4000


Batches: 100%|██████████| 16/16 [00:02<00:00,  6.59it/s]


Added batch 4000–4500


Batches: 100%|██████████| 16/16 [00:02<00:00,  5.93it/s]


Added batch 4500–5000


Batches: 100%|██████████| 8/8 [00:01<00:00,  5.43it/s]

Added batch 5000–5231
FAISS index and documents saved





In [27]:
query = 'A book to teach children about nature'
docs = db_faiss.similarity_search(query, k=10)
docs

[Document(id='34f58844-d233-476a-b2de-df0194aa8228', metadata={'source': 'data/tagged_description.txt'}, page_content="9780067575208 First published more than three decades ago, this reissue of Rachel Carson's award-winning classic brings her unique vision to a new generation of readers. Stunning new photographs by Nick Kelsh beautifully complement Carson's intimate account of adventures with her young nephew, Roger, as they enjoy walks along the rocky coast of Maine and through dense forests and open fields, observing wildlife, strange plants, moonlight and storm clouds, and listening to the living music of insects in the underbrush. If a child is to keep alive his inborn sense of wonder. Writes Carson, he needs the companionship of at least one adult who can share it, rediscovering with him the joy, excitement and mystery of the world we live in. The Sense of Wonder is a refreshing antidote to indifference and a guide to capturing the simple power of discovery that Carson views as es

In [28]:
books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
452,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,The Sense of Wonder,9780067575208 First published more than three ...


In [29]:
def retrieve_semantic_recomendations(query: str, top_k: int = 10) -> pd.DataFrame:
    recs = db_faiss.similarity_search(query, k=50)

    books_list = []

    for i in range(0, len(recs)):
      books_list.append(int(recs[i].page_content.strip('"').split()[0]))

    return books[books["isbn13"].isin(books_list)].head(top_k)

In [30]:
retrieve_semantic_recomendations('A book about space adventure and universe')

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
6,9780006470229,000647022X,The Gap Into Madness,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",http://books.google.com/books/content?id=4oXav...,A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0,The Gap Into Madness: Chaos and Order,9780006470229 A new-cover reissue of the fourt...
112,9780060506049,0060506040,Worlds Enough & Time,Dan Simmons,Fiction,http://books.google.com/books/content?id=EKoIG...,An extraordinary artist with few rivals in his...,2002.0,3.75,272.0,923.0,Worlds Enough & Time: Five Tales of Speculativ...,9780060506049 An extraordinary artist with few...
349,9780061053566,0061053562,The Player of Games,Iain Banks,Fiction,http://books.google.com/books/content?id=io0JA...,"Gurgeh, a champion game player, travels a hund...",1997.0,4.27,293.0,43988.0,The Player of Games,"9780061053566 Gurgeh, a champion game player, ..."
738,9780141011110,0141011114,The Fabric of the Cosmos,Brian Greene,Science,http://books.google.com/books/content?id=dpSqv...,From the bestselling author of The Elegant Uni...,2005.0,4.12,592.0,324.0,"The Fabric of the Cosmos: Space, Time and the ...",9780141011110 From the bestselling author of T...
844,9780142500378,0142500372,Enchantress from the Stars,Sylvia Louise Engdahl,Juvenile Fiction,http://books.google.com/books/content?id=rntJA...,When young Elana unexpectedly joins the team l...,2003.0,3.94,304.0,2031.0,Enchantress from the Stars,9780142500378 When young Elana unexpectedly jo...
908,9780156027595,0156027593,The Cyberiad,Stanislaw Lem,Fiction,http://books.google.com/books/content?id=kWElP...,Trurl and Klaupacius are constructor robots wh...,2002.0,4.18,295.0,7512.0,The Cyberiad,9780156027595 Trurl and Klaupacius are constru...
941,9780156306300,0156306301,Fiasco,Stanislaw Lem,Fiction,http://books.google.com/books/content?id=4IZ3P...,When a crew of earthmen--among them a space pi...,1988.0,4.12,322.0,2125.0,Fiasco,9780156306300 When a crew of earthmen--among t...
1287,9780312864743,0312864744,Ports of Call,Jack Vance,Fiction,http://books.google.com/books/content?id=_Zu4S...,A romantic tale follows a space swashbuckler a...,1999.0,3.53,300.0,395.0,Ports of Call,9780312864743 A romantic tale follows a space ...
1290,9780312872793,0312872798,Lurulu,Jack Vance,Fiction,http://books.google.com/books/content?id=Jm3au...,"Continues the adventures of Myron Tany, a rebe...",2007.0,3.58,204.0,268.0,Lurulu,9780312872793 Continues the adventures of Myro...
1299,9780312890216,0312890214,The Starry Rift,James Tiptree,Fiction,,This novel set in the far-future and filled wi...,1994.0,3.82,250.0,220.0,The Starry Rift,9780312890216 This novel set in the far-future...
