In [None]:
from langchain_community.document_loaders import TextLoader # loads raw text (the book descriptions) into a format that LangChain can work with
from langchain.text_splitter import CharacterTextSplitter # splits whole document containing all of the descriptions into meaningful chunks (individual desc of each book)
# from langchain_openai import OpenAIEmbeddings # converting chunks into document embeddings
from langchain_chroma import Chroma # storing embeddings in vector database ChromaDB
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
import pandas as pd

# Read the cleaned catalogue (lines the parser rejects are skipped) and remove
# every literal double quote from the tagged descriptions in the same pass.
books = pd.read_csv(
    "data/books_cleaned.csv",
    encoding="utf-8",
    on_bad_lines="skip",
).assign(
    tagged_description=lambda frame: frame["tagged_description"].str.replace(
        '"', "", regex=False
    )
)
print(books["tagged_description"].head())

In [None]:
# Display the loaded DataFrame to eyeball its columns and rows.
books

In [None]:
# Inspect the column that gets exported to a text file and embedded further down.
books["tagged_description"]

In [None]:
# Export one description per line as plain UTF-8 text.
#
# This is written by hand rather than with `to_csv` because the file is later
# split on "\n" and the first token of every chunk is parsed as an isbn13:
#   * no header row, so "tagged_description" never becomes a bogus document;
#   * no CSV quoting, so no stray '"' characters are reintroduced;
#   * line breaks inside a description are collapsed to a single space, so one
#     book always maps to exactly one line;
#   * explicit encoding, matching the utf-8 used when the file is read back.
tagged_lines = (
    books["tagged_description"]
    .dropna()
    .astype(str)
    .str.replace(r"\s*[\r\n]+\s*", " ", regex=True)
)
with open("data/tagged_description.txt", "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(tagged_lines) + "\n")

In [None]:
# Load the exported descriptions as a single document, then cut it at every
# newline. chunk_size=0 is intended to make the splitter prefer the separator
# over any target chunk size, so each line ends up as its own document.
raw_documents = TextLoader("data/tagged_description.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=0,
    chunk_overlap=0,
)
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Spot-check a single chunk produced by the splitter.
documents[2]

In [None]:
# Corpus summary: number of chunks and their mean length in characters.
doc_count = len(documents)
print(f"Number of documents: {doc_count}")
total_chars = sum(len(doc.page_content) for doc in documents)
print(f"Average document length: {total_chars / doc_count} characters")


In [None]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-transformers encoder used for all embeddings below.
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

print("Model loaded")

In [None]:
from langchain.embeddings.base import Embeddings

class MySTEmbeddings(Embeddings):
    """Adapter that exposes an encoder object through the ``Embeddings`` interface.

    The wrapped ``model`` must provide an ``encode`` method whose return value
    supports indexing and ``.tolist()``.
    """

    def __init__(self, model):
        """Keep a reference to the encoder used by both embedding methods."""
        self.model = model

    def embed_documents(self, texts):
        """Encode all ``texts`` in one call (progress bar on) and return them as lists."""
        vectors = self.model.encode(texts, show_progress_bar=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode one query string and return its vector as a list."""
        # The query is wrapped in a one-element batch; only that single row is returned.
        encoded_batch = self.model.encode([text])
        query_vector = encoded_batch[0]
        return query_vector.tolist()

my_embeddings = MySTEmbeddings(model)

In [None]:
from langchain.vectorstores import FAISS
import pickle

docs = documents

batch_size = 500
db_faiss = None
all_docs = []

# Index the corpus in slices of `batch_size`: the first slice creates the FAISS
# store, every later slice is appended to it. `all_docs` mirrors what was indexed.
for start in range(0, len(docs), batch_size):
    chunk = docs[start:start + batch_size]
    all_docs += chunk
    stop = start + len(chunk)

    if db_faiss is not None:
        db_faiss.add_documents(chunk)
        print(f"Added batch {start}–{stop}")
        continue

    db_faiss = FAISS.from_documents(chunk, embedding=my_embeddings)
    print(f"Created FAISS base with batch {start}–{stop}")

# Persist both the vector index and the raw documents that went into it.
db_faiss.save_local("faiss_index")
with open("data/faiss_docs.pkl", "wb") as pkl_file:
    pickle.dump(all_docs, pkl_file)

print("FAISS index and documents saved")

In [None]:
# Smoke-test the index with a free-text query, asking for 10 results.
# Note: this rebinds `docs`, which until now aliased the full document list.
query = 'A book to teach children about nature'
docs = db_faiss.similarity_search(query, k=10)
docs

In [None]:
# Look up the catalogue row for the top search hit. The first whitespace-separated
# token of a chunk is parsed as the book's isbn13; stray double quotes are stripped
# first, matching the parsing done in retrieve_semantic_recomendations.
top_isbn = int(docs[0].page_content.strip('"').split()[0])
books[books["isbn13"] == top_isbn]

In [None]:
def retrieve_semantic_recomendations(
    query: str, top_k: int = 10, candidate_k: int = 50
) -> pd.DataFrame:
    """Return the catalogue rows most similar to ``query``, best match first.

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Maximum number of rows to return.
        candidate_k: Minimum number of chunks to request from the vector
            store; raised to ``top_k`` when that is larger so the result is
            never capped below what the caller asked for.

    Returns:
        A slice of ``books`` with at most ``top_k`` rows, ordered by the rank
        the vector store gave their chunks (not by position in ``books``).
    """
    recs = db_faiss.similarity_search(query, k=max(candidate_k, top_k))

    # Map each isbn13 to the rank of its first (best) occurrence in the results.
    rank_by_isbn = {}
    for rec in recs:
        tokens = rec.page_content.strip('"').split()
        if not tokens:
            continue
        try:
            isbn = int(tokens[0])
        except ValueError:
            # Chunk does not start with an ISBN (e.g. a header line or the tail
            # of a description that spanned several lines) - nothing to look up.
            continue
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    matches = books[books["isbn13"].isin(list(rank_by_isbn))]
    # Reorder by search rank; a stable sort keeps catalogue order among rows
    # that share an isbn13.
    order = matches["isbn13"].map(rank_by_isbn).to_numpy().argsort(kind="stable")
    return matches.iloc[order].head(top_k)

In [None]:
# Example call using the default top_k.
retrieve_semantic_recomendations('A book about space adventure and universe')