# Book Recommender
Purpose: to explore topics in Information Retrieval and RAG.

## Environment Setup

In [1]:
%%capture

# installations
!pip install --quiet sentence_transformers transformers torch peft huggingface_hub kaggle pinecone lark rank_bm25 langchain_huggingface langdetect langchain_experimental langchain_pinecone 

# THE REGS
import pandas as pd
import numpy as np
import kagglehub
import torch
import nltk
import string
import os
import time
import re

# NLP
import nltk
from langdetect import detect, DetectorFactory

# Transformers
from transformers import AutoTokenizer, AutoModel
import torch

# PINECONE
from pinecone import Pinecone
from pinecone import ServerlessSpec

# LANGCHAIN
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document

# Kaggle environment
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

## Data Setup

In [2]:
# Reformat data file so it fits into a pandas dataframe
def text_to_csv_pandas(input_file, output_file, column_names, delimiter=None):
    """
    Reads a text file into a Pandas DataFrame and saves it as a CSV file.

    Args:
        input_file (str): The path to the input text file.
        output_file (str): The path to the output CSV file.
        delimiter (str, optional): The delimiter used in the text file. Defaults to None, 
        which will split each line by whitespace if the text file is not delimited.
    """
    if delimiter is not None:
        df = pd.read_csv(input_file, sep=delimiter, names = column_names, header=None)
    else:
         df = pd.read_csv(input_file, sep=r'\s+', names = column_names, header=None)
    df.to_csv(output_file, index=False, header=True)

# Columns in the data set
columns = ['Wikipedia article ID', 
           'Freebase ID', 
           'Book title', 
           'Author', 
           'Publication date', 
           'Book genres', 
           'Plot summary']

text_to_csv_pandas('/kaggle/input/cmu-book-summary-dataset/booksummaries.txt', 'data.csv', 
                   column_names = columns, delimiter='\t')

data = pd.read_csv('/kaggle/working/data.csv')

# drop the ID columns
data.drop(columns=['Wikipedia article ID', 'Freebase ID'], inplace=True)

# preview
data.head(n=3)

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary
0,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...


## Data Cleaning

In [3]:
print("Number of NA values for each feature:\n", data.isna().sum())

Number of NA values for each feature:
 Book title             0
Author              2382
Publication date    5610
Book genres         3718
Plot summary           0
dtype: int64


### Genre Column
* Remove brackets
* Remove /m/ char sequences
* Fix utf-8 symbols
* Put genres in list format for each cell

In [4]:
# CLEAN GENRE COLUMN

# Find all genres in cell block, do not keep char sequences that begin with /m/
data['Book genres'] = data['Book genres'].apply(lambda row: re.findall(r'":\s*"([^"]+)"', str(row)))

# take care of utf-8 symbols as well (ex. \\u00e0)
data['Book genres'] = data['Book genres'].apply(lambda cell: [bytes(word, "utf-8").decode("unicode_escape") for word in cell])

# Preview
data.head(n=3)

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary
0,Animal Farm,George Orwell,1945-08-17,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,1962,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,1947,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...


### Publication date column
* The majority of the books half the publication year, therefore we will create a new column for it.

In [5]:
# regex to find dates that only contain the year
regex = r'\d{4}$'
year_only_dates = data[data['Publication date'].str.contains(regex) == True].index
print("Number of books with only the publication year: ", len(year_only_dates))

# regex to find dates that only contain year and month
regex = r'\d{4}-\d{2}$'
half_dates = data[data['Publication date'].str.contains(regex) == True].index
print("Number of books with just the year and month: ", len(half_dates))

# regex to find complete dates
regex = r'\d{4}-\d{2}-\d{2}'
full_dates = data[data['Publication date'].str.contains(regex) == True].index
print("Number of books with the full publication date: ", len(full_dates))

Number of books with only the publication year:  6799
Number of books with just the year and month:  1479
Number of books with the full publication date:  2671


In [6]:
# CREATE NEW COLUMN FOR PUBLICATION YEAR

# New column for publication year
data['Publication year'] = 0

# Fill in column with year values
data.loc[full_dates, 'Publication year'] = data.loc[full_dates, 'Publication date'].str.split("-").str[0].astype('int')
data.loc[half_dates, 'Publication year'] = data.loc[half_dates, 'Publication date'].str.split("-").str[0].astype('int')
data.loc[year_only_dates, 'Publication year'] = data.loc[year_only_dates, 'Publication date'].str.split("-").str[0].astype('int')

# Preview
data.head(n=3)

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary,Publication year
0,Animal Farm,George Orwell,1945-08-17,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca...",1945
1,A Clockwork Orange,Anthony Burgess,1962,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan...",1962
2,The Plague,Albert Camus,1947,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...,1947


### Plot summary column
* Remove books with no proper description of the plot.

In [7]:
# get number of words per plot summary
words_per_summary = data['Plot summary'].apply(lambda x: len(nltk.word_tokenize(x)))

In [8]:
data.loc[words_per_summary[words_per_summary < 10].index].head()

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary,Publication year
2045,The Kennel Murder Case,S. S. Van Dine,,"[Mystery, Fiction, Suspense]",~Plot outline description,0
3879,Slavers,Chris Pramas,2000,[Role-playing game],==Publication histor,2000
5271,Golem in the Gears,Piers Anthony,1986-02,"[Science Fiction, Speculative fiction, Fantasy...",pl:Zakochany golem,1986
5595,The Adventures of Super Diaper Baby,Dav Pilkey,2002,[Children's literature],=== Plot summary ===,2002
5693,The Deathlord of Ixia,John Grant,1992,"[Gamebook, Speculative fiction, Children's lit...",==Receptio,1992


In [9]:
# Regex pattern (e.g., any row containing 'apple' or 'grape')
pattern = r'=='

# Find matching indices
data[data['Plot summary'].str.contains(pattern, regex=True)].head()

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary,Publication year
281,Excession,Iain Banks,1996,"[Science Fiction, Speculative fiction]",The Excession of the title is a perfect black...,1996
373,The Prince,Niccolò Machiavelli,1532,"[Treatise, Non-fiction]","The work has a recognizable structure, for th...",1532
1024,Oahspe: A New Bible,,1882,[Religion],"Oahspe includes doctrinal books, and precepts...",1882
1497,Three Men in a Boat,Jerome K. Jerome,1889,"[Children's literature, Fiction]",==Reception and history== The reception by cr...,1889
1550,Area 7,Matthew Reilly,2001-10-31,"[Techno-thriller, Fiction]",The President of the United States is visitin...,2001


### Since there are multiple valid plot summaries that have 2+ equal signs, we will need to remove them.

In [10]:
# Replace substrings with 2+ equal signs
data['Plot summary'] = data['Plot summary'].str.replace(r'={2,}', '', regex=True)

In [11]:
# drop rows that have 5 terms or less
data.drop(words_per_summary[words_per_summary < 6].index.tolist(), inplace=True)

# drop rows that contain "Plot outline description" 
data.drop(data[data['Plot summary'].str.contains(pattern, regex=True)].index.tolist(), inplace=True)

# dropping rows does not automatically reset index. So we must do this manually.
data.reset_index(drop=True, inplace=True)

In [12]:
# DELETE BOOKS THAT ARE NOT IN ENGLISH
DetectorFactory.seed = 0  # for consistent results

def detect_language(text):
    try:
        lang = detect(text)
        return lang
    except Exception as e:
        return f"Error: {str(e)}"

langs = data['Plot summary'].apply(lambda x: detect_language(x))
eng_books = langs[langs == 'en'].index

print("Number of english summaries in dataset: ", len(eng_books))

data = data.loc[eng_books]
data.reset_index(drop=True, inplace=True)

Number of english summaries in dataset:  16512


In [13]:
# Check to see if there are any other common things that do not add to the summary that I should remove (for embeddings)
data['Plot summary'][0]

' Old Major, the old boar on the Manor Farm, calls the animals on the farm for a meeting, where he compares the humans to parasites and teaches the animals a revolutionary song, \'Beasts of England\'. When Major dies, two young pigs, Snowball and Napoleon, assume command and turn his dream into a philosophy. The animals revolt and drive the drunken and irresponsible Mr Jones from the farm, renaming it "Animal Farm". They adopt Seven Commandments of Animal-ism, the most important of which is, "All animals are equal". Snowball attempts to teach the animals reading and writing; food is plentiful, and the farm runs smoothly. The pigs elevate themselves to positions of leadership and set aside special food items, ostensibly for their personal health. Napoleon takes the pups from the farm dogs and trains them privately. Napoleon and Snowball struggle for leadership. When Snowball announces his plans to build a windmill, Napoleon has his dogs chase Snowball away and declares himself leader. N

In [14]:
data['Plot summary'] = data['Plot summary'].apply(lambda x: x.replace('#', ''))
data['Plot summary'] = data['Plot summary'].apply(lambda x: x.replace('*', ''))

### Note:
* Stopwords are not removed as part of the text cleaning process because we are using an LLM to embed-- which are already trained with stopwords and therefore handle the importance of them internally.

### First we determine which embedding model we will use. 
* We will use BAAI's tuned BERT model (available at [HuggingFace](https://huggingface.co/BAAI/bge-large-zh-v1.5)).

In [15]:
# Get model
model_name = "BAAI/bge-large-zh-v1.5"
model = AutoModel.from_pretrained(model_name)

print("About the model: \n\n", model.config, "\n")

# Get corresponding tokenizer/encoder
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("About the tokenizer: \n\n", tokenizer)

config.json: 0.00B [00:00, ?B/s]

2025-09-01 14:50:52.807436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756738253.114332      77 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756738253.202841      77 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

About the model: 

 BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
 



tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

About the tokenizer: 

 BertTokenizerFast(name_or_path='BAAI/bge-large-zh-v1.5', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


### How Embeddings Operate
Model: https://huggingface.co/BAAI/bge-m3

**Encoding**:
* This is a broad term that refers to the process of transforming data from one format to another. Examples include converting text into binary format, converting characters to numerical values, or compressing data. [[1]](https://medium.com/@pratiyush1/understanding-different-types-of-encoding-and-decoding-in-programming-with-practical-examples-dcbdd5215605#:~:text=Practical%20Example%201:%20Email%20Attachments%20Base64%20encoding,were%20traditionally%20designed%20to%20handle%20text%2Donly%20data)
  
**Tokenization**:
* In the context of natural language processing (NLP), tokenization is a specific type of encoding where text is broken down into smaller units called tokens. These tokens can be words, characters, or even sub-word units. [[2]](https://www.datacamp.com/blog/what-is-tokenization#:~:text=Training%20more%20people?,which%20are%20easier%20to%20analyze)

**Embeddings**:
* are advanced vector representations of tokens. They try to capture the most nuance, connections, and semantic meanings between tokens. Each embedding is generally a series of real numbers on a vector space computed by a neural network. [[3]](https://medium.com/the-research-nest/explained-tokens-and-embeddings-in-llms-69a16ba5db33)


> In short, text is converted to tokens. Tokens are assigned token IDs. These token IDs can be used to create embeddings for more nuanced numerical representation in complex models.
>
> Why are embeddings so large and complex? What do they signify?
>
> Each token’s embedding is a high-dimensional vector. This allows the model to capture a wide range of linguistic features and nuances, like the meaning of a word, its part of speech, and its relationship to other words in the sentence.
>
> * Contextual Embeddings: Unlike simpler word embeddings (like Word2Vec), BERT’s embeddings are contextual. This means the same word can have different embeddings based on its context (its surrounding words). The embeddings need to be rich and complex to capture this contextual nuance.
> 
> * In more complex models like BERT, you get the final embeddings and access to the embeddings from each layer of the neural network. Each layer captures different aspects of the language, adding to the complexity and size of the tensor.
>
> * Input for Further Tasks: These embeddings are used as input for various NLP tasks like sentiment analysis, question answering, and language translation. The richness of the embeddings allows the model to perform these tasks with a high degree of sophistication.
>
> * Model’s Internal Representation: The complexity of these tensors reflects how the model ‘understands’ language. Each dimension in the embedding can represent some abstract language feature the model learned during its training.
> [[3]](https://medium.com/the-research-nest/explained-tokens-and-embeddings-in-llms-69a16ba5db33)

## VectorStore Setup
Useful links:
* [Pinecone](https://docs.pinecone.io/integrations/langchain)
* [Langchain](https://python.langchain.com/api_reference/pinecone/vectorstores/langchain_pinecone.vectorstores.PineconeVectorStore.html)

### Step 1. Initialize a vector store

To securely handle Pinecone API keys and prevent their exposure, especially when sharing code or deploying application:

* Store the Pinecone API key as an environment variable on your system or server.

* Access this variable within your code using the appropriate method for your programming language (e.g., os.environ.get("PINECONE_API_KEY") in Python).

This keeps the key separate from your codebase and prevents it from being committed to version control.

In [18]:
pc = Pinecone(api_key=user_secrets.get_secret("PINECONE_API_KEY"))

index_name = "book-vector-store"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024, # set to embedder's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        deletion_protection="enabled",  # Defaults to "disabled"
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

### Step 2. Prepare/Create documents for the vector store

In [47]:
# create documents
book_docs = []

def chunk_list(lst, chunk_size):
    """Splits lst into chunks with length <= chunk_size"""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    
for i in range(len(data)):
    title = data['Book title'].iloc[i]
    authors = data['Author'].iloc[i]
    pub_date = data['Publication date'].iloc[i]
    pub_year = data['Publication year'].iloc[i]
    genres = data['Book genres'].iloc[i]
    doc_id = i

    plot_summary_tokens = tokenizer.tokenize(data['Plot summary'].iloc[i])
    tokenized_chunks = chunk_list(plot_summary_tokens, 512)
    text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in tokenized_chunks]

    documents = [
        Document(page_content=text_chunk, 
                 metadata={"Title": title, "Author(s)": authors, "Publication Date": pub_date, 
                           "Publication year": str(pub_year), "Genre(s)": genres, "doc_id": doc_id, "chunk": str(i)})
        for i, text_chunk in enumerate(text_chunks)
    ]

    book_docs += documents

### Step 3. Create embedding object using our model from huggingface

In [43]:
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [45]:
## create PineconeVectorStore object
vector_store = PineconeVectorStore(index=index, embedding=embedder)

### Step 4. Populate vector store

In [None]:
vector_store.add_documents(book_docs)

## Test out the search

> A similarity_search on a PineconeVectorStore object returns a list of LangChain Document objects most similar to the query provided. While the similarity_search uses a Pinecone query to find the most similar results, this method includes additional steps and returns results of a different type.
>
> The similarity_search method accepts raw text and automatically embeds it using the Embedding object provided when you initialized the PineconeVectorStore. You can also provide a k value to determine the number of LangChain Document objects to return. The default value is k=4.


In [None]:
## Example
#query = "Who is Ketanji Brown Jackson?"
    #vectorstore.similarity_search(query)
    
    # Response:
    # [
    #    Document(page_content='Ketanji Onyika Brown Jackson is an American lawyer and jurist who is an associate justice of the Supreme Court of the United...', metadata={'chunk': 0.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'}),  
    #    Document(page_content='Jackson was nominated to the Supreme Court by President Joe Biden on February 25, 2022, and confirmed by the U.S. Senate...', metadata={'chunk': 1.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'}),  
    #    Document(page_content='Jackson grew up in Miami and attended Miami Palmetto Senior High School. She distinguished herself as a champion debater...', metadata={'chunk': 3.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'}),
    #    Document(page_content='After high school, Jackson matriculated at Harvard University to study government, having applied despite her guidance...', metadata={'chunk': 5.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'})
    # ]

You can also optionally apply a metadata filter to your similarity search. The filtering query language is the same as for Pinecone queries, as detailed in [Filtering with metadata](https://docs.pinecone.io/guides/index-data/indexing-overview#metadata).

In [None]:
## Example
   # query = "Tell me more about Ketanji Brown Jackson."
   #  vectorstore.similarity_search(query, filter={'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson'})

## Test out RAG with Langchain
In RAG, you take the query as a question that is to be answered by a LLM, but the LLM must answer the question based on the information it is seeing from the vectorstore.

In [None]:
## Example
# from langchain_openai import ChatOpenAI  
# from langchain.chains import RetrievalQA  
# # completion llm  
# llm = ChatOpenAI(  
#     openai_api_key=OPENAI_API_KEY,  
#     model_name='gpt-3.5-turbo',  
#     temperature=0.0  
# )  
# qa = RetrievalQA.from_chain_type(  
#     llm=llm,  
#     chain_type="stuff",  
#     retriever=vectorstore.as_retriever()  
# )  
# qa.invoke(query)  

# Response:
# Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy...