In [1]:
!pip install -qq langchain langchain-community langchain-openai langchain-chroma transformers openai python-dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma



In [2]:
import pandas as pd
# Load the cleaned books CSV into the working DataFrame used by every later cell.
# NOTE(review): this is an absolute path (looks like a Colab location) — adjust
# it when running the notebook elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/books_cleaned.csv")

In [3]:
# Write one tagged description per line so the file can be read back as plain
# text and split on newlines.
#
# The lines are written directly instead of via to_csv(sep='\n'):
#   * a newline delimiter is not a valid CSV dialect, and newer Python versions
#     of the csv module are believed to reject it — NOTE(review): confirm;
#   * CSV quoting wraps any description containing a double quote in extra
#     quotes, which would end up inside the text that is embedded later.
with open('tagged_description.txt', 'w', encoding='utf-8') as out_file:
    for tagged in df['tagged_description'].dropna():
        # Collapse whitespace runs (including embedded newlines) so that every
        # description occupies exactly one line of the file.
        out_file.write(' '.join(str(tagged).split()) + '\n')

In [4]:
from langchain_core.documents import Document
# Load the exported text file, then turn every non-blank line into its own
# Document so each line is indexed as a separate entry.
raw_documents = TextLoader("tagged_description.txt").load()
lines = raw_documents[0].page_content.split("\n")

documents = []
for line in lines:
    text = line.strip()
    if text:  # skip blank lines
        documents.append(Document(page_content=text))

In [None]:
!pip install -U langchain-huggingface sentence-transformers
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model used to vectorise each description.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the Chroma vector store from the per-line documents; no persist
# directory is passed here.
db_books = Chroma.from_documents(documents=documents, embedding=embedding)

In [None]:
# Try the vector store with a sample query and inspect the ten closest documents.
query = "buddhism enlightenment consciousness religious liberation"
docs = db_books.similarity_search(query=query, k=10)
docs

In [None]:
# Look up the book behind the top hit: the first whitespace-separated token of
# the document text is the ISBN-13. Surrounding double quotes are stripped as
# well, matching retrieve_semantic_recommendations, so a quoted line does not
# make int() fail.
df[df["isbn13"] == int(docs[0].page_content.split()[0].strip('"'))]

In [18]:
def retrieve_semantic_recommendations(query: str, top_k: int = 10) -> pd.DataFrame:
    """Return the books whose tagged descriptions are most similar to ``query``.

    Runs a similarity search against ``db_books``, reads the ISBN-13 from the
    first token of each hit, and returns the matching rows of ``df``.

    Args:
        query: Free-text description of what the reader is looking for.
        top_k: Maximum number of documents to retrieve and rows to return.

    Returns:
        Up to ``top_k`` rows of ``df``, ordered from most to least similar hit.
    """
    docs = db_books.similarity_search(query, k=top_k)

    # Remember the position of the best-ranked hit for each ISBN.
    rank_by_isbn = {}
    for doc in docs:
        tokens = doc.page_content.split()
        if not tokens:
            continue
        try:
            isbn = int(tokens[0].strip('"'))
        except ValueError:
            # The line does not start with an ISBN, so it cannot be mapped
            # back to a row of df; skip it instead of failing the whole query.
            continue
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    matches = df[df["isbn13"].isin(list(rank_by_isbn))]
    # isin() keeps df's own row order; re-sort so the rows follow the
    # similarity ranking returned by the vector store.
    matches = matches.sort_values(
        by="isbn13", key=lambda col: col.map(rank_by_isbn), kind="stable"
    )
    return matches.head(top_k)


In [None]:
# Example call: default top_k, free-text query.
retrieve_semantic_recommendations(query="most meticulous accounts of Roman Empire")

In [24]:
# Number of distinct raw category values (a missing value counts as one).
df['categories'].nunique(dropna=False)

480

In [23]:
# Keep only the raw categories that occur at least 50 times.
category_counts = df['categories'].value_counts().reset_index()
categories = category_counts.loc[category_counts['count'] >= 50, 'categories']
categories.tolist()

['Fiction',
 'Juvenile Fiction',
 'Biography & Autobiography',
 'History',
 'Literary Criticism',
 'Religion',
 'Philosophy',
 'Comics & Graphic Novels',
 'Drama',
 'Juvenile Nonfiction',
 'Science',
 'Poetry',
 'Literary Collections']

In [25]:
# Collapse the frequent raw categories into a handful of simplified labels.
# Grouped by target label for readability, then inverted into the
# raw-category -> simplified-label lookup used with Series.map.
# NOTE(review): 'Literary Collections' appears in the >= 50 list printed above
# but has no entry here — confirm that leaving it unmapped is intended.
_raw_categories_by_label = {
    "fiction": ['Fiction', 'Comics & Graphic Novels', 'Drama', 'Poetry'],
    "children fiction": ['Juvenile Fiction'],
    "Nonfiction": [
        'Biography & Autobiography',
        'History',
        'Literary Criticism',
        'Religion',
        'Philosophy',
        'Science',
    ],
    "children Nonfiction": ['Juvenile Nonfiction'],
}
categ_mapping = {
    raw_category: label
    for label, raw_categories in _raw_categories_by_label.items()
    for raw_category in raw_categories
}

In [None]:
# Attach the simplified label; categories without a mapping entry become NaN.
df = df.assign(simple_categ=df['categories'].map(categ_mapping))
df['simple_categ']

In [27]:
# (rows that received a simplified label, rows still without one)
int(df['simple_categ'].notna().sum()), int(df['simple_categ'].isna().sum())

(3743, 1454)

In [None]:
from transformers import pipeline
import torch

# Candidate labels for zero-shot classification.
# NOTE(review): this rebinds `categories`, which an earlier cell used for the
# Series of frequent raw categories.
categories = ['fiction','Nonfiction']

# Use the first CUDA GPU when one is available; otherwise fall back to CPU
# (device=-1) so the notebook also runs on a runtime without a GPU.
classifier = pipeline(
    "zero-shot-classification",
    model='facebook/bart-large-mnli',
    device=0 if torch.cuda.is_available() else -1,
)


In [41]:
import numpy as np
def generate_prediction(seq,categories):
    """Classify ``seq`` with the zero-shot ``classifier`` and return the best label.

    Args:
        seq: Text to classify.
        categories: Candidate labels passed to the classifier.

    Returns:
        The entry of the classifier's ``labels`` whose score is highest.
    """
    result = classifier(seq, categories)
    scores, labels = result['scores'], result['labels']
    return labels[np.argmax(scores)]
# Quick smoke test of the classifier on a short phrase. The spelling of the
# input string is left untouched because it is the runtime input that produced
# the output shown below.
generate_prediction('budhhism spirituality',categories)

'fiction'

In [31]:
# Inspect the available columns; 'simple_categ' is the column added above.
df.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'titles_subtitles', 'tagged_description',
       'simple_categ'],
      dtype='object')

In [None]:
# NOTE(review): this repeats the generate_prediction definition from an earlier
# cell; consider keeping a single definition.
def generate_prediction(seq,categories):
    """Classify ``seq`` with the zero-shot ``classifier`` and return the best label.

    Args:
        seq: Text to classify.
        categories: Candidate labels passed to the classifier.

    Returns:
        The entry of the classifier's ``labels`` whose score is highest.
    """
    result = classifier(seq, categories)
    scores, labels = result['scores'], result['labels']
    return labels[np.argmax(scores)]

from tqdm.auto import tqdm
# Evaluate the zero-shot classifier on books whose simplified label is already
# known: classify up to EVAL_SAMPLES_PER_LABEL descriptions per label and
# record the prediction next to the known label.
EVAL_SAMPLES_PER_LABEL = 300
actual_cat = []
predicted_cat = []

for label in ("fiction", "Nonfiction"):
    # Filter once per label instead of once per row, and cap with head() so a
    # label with fewer rows than the cap does not raise a KeyError.
    label_descriptions = df.loc[
        df['simple_categ'] == label, 'description'
    ].head(EVAL_SAMPLES_PER_LABEL)
    for seq in tqdm(label_descriptions, desc=label):
        predicted_cat.append(generate_prediction(seq, categories))
        actual_cat.append(label)

eval_data = pd.DataFrame({'prediction':predicted_cat,'actual_cat':actual_cat})


In [33]:
# Accumulators for the predictions, plus the rows that still lack a simplified
# label (only the ISBN and the description are needed).
isbn, predicted_cats = [], []
needs_label = df['simple_categ'].isna()
missing_cats = df[needs_label][['isbn13','description']].reset_index(drop=True)

In [None]:
# Predict a label for every book without a simplified category.
# The accumulators are (re)created here so that re-running this cell replaces
# the results instead of appending a second copy, which would duplicate rows
# in the later merge on isbn13.
isbn = []
predicted_cats = []
for isbn13, seq in tqdm(
    zip(missing_cats['isbn13'], missing_cats['description']),
    total=len(missing_cats),
):
    predicted_cats.append(generate_prediction(seq, categories))
    isbn.append(isbn13)

In [None]:
# Pair each ISBN with its predicted label and preview the first rows.
miss_cats = pd.DataFrame(data={'isbn13': isbn, 'predicted_categ': predicted_cats})
miss_cats.iloc[:5]

In [36]:
# Left-join the predictions onto the full frame; books that already had a
# label get NaN in 'predicted_categ'. The row count is shown as a sanity check.
df = df.merge(miss_cats, how='left', on='isbn13')
df.shape[0]

5197

In [None]:
# Final label: keep the mapped label where present, otherwise use the
# zero-shot prediction. Then show the per-column missing-value counts.
df["simple_categ_x"] = df["simple_categ"].fillna(df["predicted_categ"])
df.isna().sum()

In [38]:
# The two source columns are now merged into 'simple_categ_x'; drop them.
df = df.drop(columns=['predicted_categ', 'simple_categ'])

In [39]:
# Confirm the helper columns are gone and only 'simple_categ_x' remains.
df.columns

Index(['isbn13', 'isbn10', 'title', 'authors', 'categories', 'thumbnail',
       'description', 'published_year', 'average_rating', 'num_pages',
       'ratings_count', 'titles_subtitles', 'tagged_description',
       'simple_categ_x'],
      dtype='object')

In [None]:
# Distribution of the final simplified labels.
df['simple_categ_x'].value_counts()

In [None]:
# Show books whose raw category (case-insensitive) is one of a few genre names.
genre_like = ["fantasy", "horror", "comedy"]
df.loc[df["categories"].str.lower().isin(genre_like)]

In [53]:
# Persist the frame with the simplified category column, without the index.
df.to_csv(path_or_buf='books_cleaned_with_categ.csv', index=False)