In [1]:
!pip install -qq langchain langchain-community langchain-openai langchain-chroma transformers openai python-dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma



In [2]:
import pandas as pd
df = pd.read_csv("/content/books_cleaned.csv")

In [3]:
df['tagged_description'].to_csv('tagged_description.txt',sep='\n',
                        index=False,
                        header = False)

In [4]:
from langchain_core.documents import Document
raw_documents = TextLoader("tagged_description.txt").load()
lines = raw_documents[0].page_content.split("\n")
documents = [Document(page_content=line.strip()) for line in lines if line.strip()]

In [None]:
!pip install -U langchain-huggingface sentence-transformers
from langchain_huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

db_books = Chroma.from_documents(
    documents,
    embedding=embedding
)

In [10]:
query = "buddhism enlightenment consciousness religious liberation"
docs = db_books.similarity_search(query,k=10)
docs

[Document(id='69155875-a0a2-4292-a93f-b579002387f6', metadata={}, page_content='9780835608305 This new edition of critically acclaimed essays explores possible breakthroughs in the direction of reaching a liberated and enlightened consciousness. With a new preface and new final chapter. An invaluable collection of essays by the foremost religious writer in America today.'),
 Document(id='c26004d0-3761-45d6-9493-e1bb94c7f097', metadata={}, page_content='9780767901574 Drawing on three decades of learning from the spiritual masters of Asia, an American lama illuminates the sacred wisdom and practices of Buddhism and shows readers how to integrate them into their lives, relationships, and careers. Reprint. $50,000 ad/promo. Tour.'),
 Document(id='b97724dd-6e6d-43fe-93ef-739b9c1228d7', metadata={}, page_content="9780971500747 Explores the ego's expressions and inherent limitations and gives detailed explanations and instructions on how to transcend them. It expands the understanding of the 

In [11]:
df[df["isbn13"]==int(docs[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,titles_subtitles,tagged_description
4096,9780835608305,835608301,Beyond the Postmodern Mind,Huston Smith,Philosophy,http://books.google.com/books/content?id=lKnLB...,This new edition of critically acclaimed essay...,2003.0,4.21,295.0,67.0,Beyond the Postmodern Mind: The Place of Meani...,9780835608305 This new edition of critically a...


In [18]:
def retrieve_semantic_recommendations(query: str, top_k: int = 10) -> pd.DataFrame:
    docs = db_books.similarity_search(query, k=top_k)

    book_list = []
    for i in range(len(docs)):
        isbn = docs[i].page_content.split()[0].strip().strip('"')
        book_list.append(int(isbn))

    return df[df["isbn13"].isin(book_list)].head(top_k)


In [22]:
retrieve_semantic_recommendations("most meticulous accounts of Roman Empire")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,titles_subtitles,tagged_description
115,9780060510855,0060510854,Caesar,Colleen McCullough,Fiction,http://books.google.com/books/content?id=aoH_-...,A fictional portrait of Julius Caesar follows ...,2003.0,4.36,928.0,5825.0,Caesar: A Novel,9780060510855 A fictional portrait of Julius C...
671,9780140442410,0140442413,Germania,Cornelius Tacitus,History,http://books.google.com/books/content?id=-TCOs...,The Agricola is both a portrait of Julius Agri...,1970.0,3.98,174.0,3895.0,Germania,9780140442410 The Agricola is both a portrait ...
963,9780192833006,0192833006,Agricola and Germany,Tacitus,Literary Collections,http://books.google.com/books/content?id=wS7IJ...,"Cornelius Tacitus, Rome's greatest historian, ...",1999.0,3.98,224.0,25.0,Agricola and Germany,"9780192833006 Cornelius Tacitus, Rome's greate..."
1223,9780312320195,0312320191,SPQR VI: Nobody Loves a Centurion,John Maddox Roberts,Fiction,http://books.google.com/books/content?id=Wkowm...,"Like so many young men in later generations, R...",2003.0,4.13,288.0,626.0,SPQR VI: Nobody Loves a Centurion,9780312320195 Like so many young men in later ...
1753,9780380710829,038071082X,The Grass Crown,Colleen McCullough,Fiction,http://books.google.com/books/content?id=Kt3gN...,The lives of ancient Rome's men--general Gaius...,1992.0,4.29,1104.0,9397.0,The Grass Crown,9780380710829 The lives of ancient Rome's men-...
3043,9780674990333,0674990331,De officiis,Marcus Tullius Cicero;Walter Miller,Fiction,http://books.google.com/books/content?id=4lN0A...,"Cicero (Marcus Tullius, 106–43 BCE), Roman law...",1975.0,3.95,448.0,104.0,De officiis: C. Philosophical Treatises,"9780674990333 Cicero (Marcus Tullius, 106–43 B..."
3074,9780679724773,067972477X,"I, Claudius",Robert Graves,Fiction,http://books.google.com/books/content?id=cRzDT...,The emperor Claudius tells of his life during ...,1989.0,4.27,468.0,43469.0,"I, Claudius: From the Autobiography of Tiberiu...",9780679724773 The emperor Claudius tells of hi...
3182,9780688093686,068809368X,The first man in Rome,Colleen McCullough,Fiction,http://books.google.com/books/content?id=kMRmP...,A story tracing the creation of Republican Rom...,1990.0,4.1,896.0,294.0,The first man in Rome,9780688093686 A story tracing the creation of ...
3564,9780760768952,0760768951,The Conquest of Gaul,Julius Caesar;F. P. Long;Cheryl Walker,History,http://books.google.com/books/content?id=AYuHP...,Among the most durable and engaging texts in w...,2005.0,3.99,288.0,50.0,The Conquest of Gaul,9780760768952 Among the most durable and engag...
4648,9781565849426,1565849426,The Assassination of Julius Caesar,Michael Parenti,History,http://books.google.com/books/content?id=xApsW...,Parenti presents a story of popular resistance...,2004.0,4.14,276.0,628.0,The Assassination of Julius Caesar: A People's...,9781565849426 Parenti presents a story of popu...


In [24]:
len(df['categories'].unique())

480

In [23]:
categories = df['categories'].value_counts().reset_index().query("count >= 50")['categories']
categories.tolist()

['Fiction',
 'Juvenile Fiction',
 'Biography & Autobiography',
 'History',
 'Literary Criticism',
 'Religion',
 'Philosophy',
 'Comics & Graphic Novels',
 'Drama',
 'Juvenile Nonfiction',
 'Science',
 'Poetry',
 'Literary Collections']

In [25]:
categ_mapping = {'Fiction':"fiction",
 'Juvenile Fiction':"children fiction",
 'Biography & Autobiography':"Nonfiction",
 'History':"Nonfiction",
 'Literary Criticism':"Nonfiction",
 'Religion':"Nonfiction",
 'Philosophy':"Nonfiction",
 'Comics & Graphic Novels':"fiction",
 'Drama':"fiction",
 'Juvenile Nonfiction':"children Nonfiction",
 'Science':"Nonfiction",
 'Poetry':"fiction"}

In [26]:
df['simple_categ'] = df['categories'].map(categ_mapping)
df['simple_categ']

Unnamed: 0,simple_categ
0,fiction
1,
2,fiction
3,
4,
...,...
5192,
5193,
5194,
5195,Nonfiction


In [27]:
len(df[~df['simple_categ'].isna()]),len(df[df['simple_categ'].isna()])

(3743, 1454)

In [28]:
from transformers import pipeline
categories = ['fiction','Nonfiction']
classifier = pipeline("zero-shot-classification",model='facebook/bart-large-mnli',device=0)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [41]:
import numpy as np
def generate_prediction(seq,categories):
    predictions = classifier(seq,categories)
    max_index = np.argmax(predictions['scores'])
    max_label = predictions['labels'][max_index]
    return max_label
generate_prediction('budhhism spirituality',categories)

'fiction'

In [31]:
df.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'titles_subtitles', 'tagged_description',
       'simple_categ'],
      dtype='object')

In [32]:
def generate_prediction(seq,categories):
    predictions = classifier(seq,categories)
    max_index = np.argmax(predictions['scores'])
    max_label = predictions['labels'][max_index]
    return max_label

from tqdm.auto import tqdm
actual_cat = []
predicted_cat = []

for i in tqdm(range(0,300)):
    seq = df.loc[df['simple_categ']=='fiction','description'].reset_index(drop=True)[i]
    predicted_cat.append(generate_prediction(seq,categories))
    actual_cat += ["fiction"]
for i in tqdm(range(0,300)):
    seq = df.loc[df['simple_categ']=='Nonfiction','description'].reset_index(drop=True)[i]
    predicted_cat.append(generate_prediction(seq,categories))
    actual_cat += ["Nonfiction"]
eval_data = pd.DataFrame({'prediction':predicted_cat,'actual_cat':actual_cat})


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

In [33]:
isbn = []
predicted_cats = []
missing_cats = df.loc[df['simple_categ'].isna(),['isbn13','description']].reset_index(drop=True)

In [40]:
for i in tqdm(range(0,len(missing_cats))):
    seq = missing_cats['description'][i]
    predicted_cats += [generate_prediction(seq,categories)]
    isbn +=  [missing_cats['isbn13'][i]]

  0%|          | 0/1454 [00:00<?, ?it/s]

In [35]:
miss_cats = pd.DataFrame({'isbn13':isbn,'predicted_categ':predicted_cats})
miss_cats.head()

Unnamed: 0,isbn13,predicted_categ
0,9780002261982,fiction
1,9780006280897,Nonfiction
2,9780006280934,Nonfiction
3,9780006380832,Nonfiction
4,9780006470229,fiction


In [36]:
df = pd.merge(df,miss_cats, on='isbn13',how='left')
len(df)

5197

In [37]:
df["simple_categ_x"] = np.where(df["simple_categ"].isna(),df["predicted_categ"],df["simple_categ"])
df.isna().sum()

Unnamed: 0,0
isbn13,0
isbn10,0
title,0
authors,32
categories,30
thumbnail,166
description,0
published_year,0
average_rating,0
num_pages,0


In [38]:
df.drop(['predicted_categ','simple_categ'],axis=1,inplace=True)

In [39]:
df.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'titles_subtitles', 'tagged_description',
       'simple_categ_x'],
      dtype='object')

In [43]:
df['simple_categ_x'].value_counts()

Unnamed: 0_level_0,count
simple_categ_x,Unnamed: 1_level_1
fiction,2722
Nonfiction,2028
children fiction,390
children Nonfiction,57


In [51]:
df[df["categories"].str.lower().isin([
    "fantasy",
    "horror","comedy"
])]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,titles_subtitles,tagged_description,simple_categ_x
478,9780099422341,99422344,Yeats is Dead!,Joseph O'Connor,Comedy,http://books.google.com/books/content?id=DrE3I...,"In aid of Amnesty International, this is a bri...",2002.0,3.39,298.0,34.0,Yeats is Dead!: A Novel by Fifteen Irish Writers,"9780099422341 In aid of Amnesty International,...",fiction
491,9780099446729,99446723,Blackwood Farm,Anne Rice,Horror,http://books.google.com/books/content?id=cIn8T...,"Lestat Is Back, Saviour And Demon, Presiding O...",2003.0,3.86,774.0,26145.0,Blackwood Farm,"9780099446729 Lestat Is Back, Saviour And Demo...",fiction
1090,9780261102422,261102427,The Silmarillion,John Ronald Reuel Tolkien,Fantasy,http://books.google.com/books/content?id=22ePu...,Tolkien's Silmarillion is the core work of the...,1999.0,3.91,384.0,253.0,The Silmarillion,9780261102422 Tolkien's Silmarillion is the co...,fiction
2845,9780575075597,575075597,Replay,Ken Grimwood,Fantasy,http://books.google.com/books/content?id=9vmNP...,At forty-three Jeff Winston is tired of his lo...,2005.0,4.16,272.0,412.0,Replay,9780575075597 At forty-three Jeff Winston is t...,Nonfiction
2860,9780590254762,590254766,"The lion, the witch and the wardrobe",Clive Staples Lewis,Fantasy,,Four English school children enter the magic l...,1995.0,4.21,189.0,860.0,"The lion, the witch and the wardrobe",9780590254762 Four English school children ent...,Nonfiction
3288,9780739423851,739423851,Wizard's Castle,Diana Wynne Jones,Fantasy,http://books.google.com/books/content?id=hB7hA...,Howl's moving castle - Eldest of three sisters...,2002.0,4.44,376.0,439.0,Wizard's Castle,9780739423851 Howl's moving castle - Eldest of...,Nonfiction
4483,9781416502043,1416502041,The Voyage of the Jerle Shannara Trilogy,Terry Brooks,Fantasy,http://books.google.com/books/content?id=jqVsA...,When the body of a half-drowned elf is found f...,2004.0,4.26,1260.0,84.0,The Voyage of the Jerle Shannara Trilogy,9781416502043 When the body of a half-drowned ...,Nonfiction
4979,9781841494081,1841494089,The Darkness that Comes Before,R. Scott Bakker,Fantasy,http://books.google.com/books/content?id=BG8qG...,A score of centuries has passed since the Firs...,2005.0,3.79,638.0,317.0,The Darkness that Comes Before,9781841494081 A score of centuries has passed ...,Nonfiction


In [52]:
df.to_csv('books_cleaned_with_categ.csv',index=False)