# Vector DB Setup 

## Environment Setup

In [1]:
%%capture

# installations
!pip install --quiet sentence_transformers transformers torch peft huggingface_hub kaggle pinecone lark rank_bm25 langchain_huggingface langdetect langchain_experimental langchain_pinecone 

# THE REGS
import pandas as pd
import numpy as np
import kagglehub
import torch
import string
import os
import time
import re

# NLP
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from langdetect import detect, DetectorFactory

# Transformers
from transformers import AutoTokenizer, AutoModel
import torch

# PINECONE
from pinecone import Pinecone
from pinecone import ServerlessSpec

# LANGCHAIN
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document

# Kaggle environment
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

## Data Setup

In [2]:
# Reformat data file so it fits into a pandas dataframe
def text_to_csv_pandas(input_file, output_file, column_names, delimiter=None):
    """
    Reads a text file into a Pandas DataFrame and saves it as a CSV file.

    Args:
        input_file (str): The path to the input text file.
        output_file (str): The path to the output CSV file.
        delimiter (str, optional): The delimiter used in the text file. Defaults to None, 
        which will split each line by whitespace if the text file is not delimited.
    """
    if delimiter is not None:
        df = pd.read_csv(input_file, sep=delimiter, names = column_names, header=None)
    else:
         df = pd.read_csv(input_file, sep=r'\s+', names = column_names, header=None)
    df.to_csv(output_file, index=False, header=True)

# Columns in the data set
columns = ['Wikipedia article ID', 
           'Freebase ID', 
           'Book title', 
           'Author', 
           'Publication date', 
           'Book genres', 
           'Plot summary']

text_to_csv_pandas('/kaggle/input/cmu-book-summary-dataset/booksummaries.txt', 'data.csv', 
                   column_names = columns, delimiter='\t')

data = pd.read_csv('/kaggle/working/data.csv')

# drop the ID columns
data.drop(columns=['Wikipedia article ID', 'Freebase ID'], inplace=True)

# preview
data.head(n=3)

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary
0,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...


## Data Cleaning

In [3]:
print("Number of NA values for each feature:\n", data.isna().sum())

Number of NA values for each feature:
 Book title             0
Author              2382
Publication date    5610
Book genres         3718
Plot summary           0
dtype: int64


### Genre Column
* Remove brackets
* Remove /m/ char sequences
* Fix utf-8 symbols
* Put genres in list format for each cell

In [4]:
# CLEAN GENRE COLUMN

# Find all genres in cell block, do not keep char sequences that begin with /m/
data['Book genres'] = data['Book genres'].apply(lambda row: re.findall(r'":\s*"([^"]+)"', str(row)))

# take care of utf-8 symbols as well (ex. \\u00e0)
data['Book genres'] = data['Book genres'].apply(lambda cell: [bytes(word, "utf-8").decode("unicode_escape") for word in cell])

# Preview
data.head(n=3)

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary
0,Animal Farm,George Orwell,1945-08-17,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,1962,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,1947,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...


### Publication date column
* The majority of the books half the publication year, therefore we will create a new column for it.

In [5]:
# regex to find dates that only contain the year
regex = r'\d{4}$'
year_only_dates = data[data['Publication date'].str.contains(regex) == True].index
print("Number of books with only the publication year: ", len(year_only_dates))

# regex to find dates that only contain year and month
regex = r'\d{4}-\d{2}$'
half_dates = data[data['Publication date'].str.contains(regex) == True].index
print("Number of books with just the year and month: ", len(half_dates))

# regex to find complete dates
regex = r'\d{4}-\d{2}-\d{2}'
full_dates = data[data['Publication date'].str.contains(regex) == True].index
print("Number of books with the full publication date: ", len(full_dates))

Number of books with only the publication year:  6799
Number of books with just the year and month:  1479
Number of books with the full publication date:  2671


In [6]:
# CREATE NEW COLUMN FOR PUBLICATION YEAR

# New column for publication year
data['Publication year'] = 0

# Fill in column with year values
data.loc[full_dates, 'Publication year'] = data.loc[full_dates, 'Publication date'].str.split("-").str[0].astype('int')
data.loc[half_dates, 'Publication year'] = data.loc[half_dates, 'Publication date'].str.split("-").str[0].astype('int')
data.loc[year_only_dates, 'Publication year'] = data.loc[year_only_dates, 'Publication date'].str.split("-").str[0].astype('int')

# Preview
data.head(n=3)

Unnamed: 0,Book title,Author,Publication date,Book genres,Plot summary,Publication year
0,Animal Farm,George Orwell,1945-08-17,"[Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca...",1945
1,A Clockwork Orange,Anthony Burgess,1962,"[Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan...",1962
2,The Plague,Albert Camus,1947,"[Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...,1947


### Plot summary column
* Remove books with no proper description of the plot.

In [7]:
# DELETE BOOKS THAT ARE NOT IN ENGLISH
DetectorFactory.seed = 0  # for consistent results

def detect_language(text):
    try:
        lang = detect(text)
        return lang
    except Exception as e:
        return f"Error: {str(e)}"

langs = data['Plot summary'].apply(lambda x: detect_language(x))
eng_books = langs[langs == 'en'].index

print("Number of english summaries in dataset: ", len(eng_books))

data = data.loc[eng_books]
data.reset_index(drop=True, inplace=True)

Number of english summaries in dataset:  16520


#### Reduce number of books to upload to pinecone vectorstore

In [8]:
# Function to count sentences using NLTK
def count_sentences_nltk(text):
    sentences = sent_tokenize(text)
    return len(sentences)

# Filter rows with less than a 6 sentences
filtered_df = data[data['Plot summary'].apply(count_sentences_nltk) < 6]

# Filter out rows where genre is not an empty list
sample_data = filtered_df[filtered_df['Book genres'].apply(lambda x: len(x) > 0)]

sample_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3245 entries, 8 to 16512
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Book title        3245 non-null   object
 1   Author            3004 non-null   object
 2   Publication date  2316 non-null   object
 3   Book genres       3245 non-null   object
 4   Plot summary      3245 non-null   object
 5   Publication year  3245 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 177.5+ KB


In [9]:
# get number of words per plot summary
words_per_summary = sample_data['Plot summary'].apply(lambda x: len(nltk.word_tokenize(x)))

# take a look at what these summaries look like (less than 10 words)
sample_data.loc[words_per_summary[words_per_summary < 10].index].head()

### Since there are multiple valid plot summaries that have 2+ equal signs, we will need to remove them.

In [11]:
# Replace substrings with 2+ equal signs
sample_data.loc[:, 'Plot summary'] = sample_data['Plot summary'].str.replace(r'={2,}', '', regex=True)

# drop rows that have 5 terms or less
sample_data.drop(words_per_summary[words_per_summary < 6].index.tolist(), inplace=True)

# drop rows that contain "Plot outline description" 
sample_data.drop(sample_data[sample_data['Plot summary'].str.contains("Plot outline description", regex=True)].index.tolist(), inplace=True)

# additional text cleaning
sample_data.loc[:, 'Plot summary'] = sample_data['Plot summary'].apply(lambda x: x.replace('#', ''))
sample_data.loc[:, 'Plot summary'] = sample_data['Plot summary'].apply(lambda x: x.replace('*', ''))

# dropping rows does not automatically reset index. So we must do this manually.
sample_data.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data.drop(words_per_summary[words_per_summary < 6].index.tolist(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_data.drop(sample_data[sample_data['Plot summary'].str.contains("Plot outline description", regex=True)].index.tolist(), inplace=True)


In [12]:
# pinecone does not accept NaN values
# Replace NaN in the 'name' column with 'Unknown'
sample_data.loc[:, 'Publication date'] = sample_data['Publication date'].astype(str)
sample_data.loc[:, 'Publication date'] = sample_data['Publication date'].fillna('Unknown')
sample_data.loc[:, 'Author'] = sample_data['Author'].fillna('Unknown')
sample_data.loc[:, 'Publication date'] = sample_data['Publication date'].replace(['nan'], 'Unknown')

### Note:
* Stopwords are not removed as part of the text cleaning process because we are using an LLM to embed-- which are already trained with stopwords and therefore handle the importance of them internally.

### First we determine which embedding model we will use. 
* We will use BAAI's tuned BERT model.
* The bge-base-en model is tuned for short queries -- long contexts.

In [13]:
# Get model
model_name = "BAAI/bge-base-en"
model = AutoModel.from_pretrained(model_name)

print("About the model: \n\n", model.config, "\n")

# Get corresponding tokenizer/encoder
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("About the tokenizer: \n\n", tokenizer)

2025-09-02 20:09:33.830938: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756843773.865790     562 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756843773.877115     562 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


About the model: 

 BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
 

About the tokenizer: 

 BertTokenizerFast(name_or_path='BAAI/bge-base-en', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[S

### How Embeddings Operate
Model: https://huggingface.co/BAAI/bge-base-en

**Encoding**:
* This is a broad term that refers to the process of transforming data from one format to another. Examples include converting text into binary format, converting characters to numerical values, or compressing data. [[1]](https://medium.com/@pratiyush1/understanding-different-types-of-encoding-and-decoding-in-programming-with-practical-examples-dcbdd5215605#:~:text=Practical%20Example%201:%20Email%20Attachments%20Base64%20encoding,were%20traditionally%20designed%20to%20handle%20text%2Donly%20data)
  
**Tokenization**:
* In the context of natural language processing (NLP), tokenization is a specific type of encoding where text is broken down into smaller units called tokens. These tokens can be words, characters, or even sub-word units. [[2]](https://www.datacamp.com/blog/what-is-tokenization#:~:text=Training%20more%20people?,which%20are%20easier%20to%20analyze)

**Embeddings**:
* are advanced vector representations of tokens. They try to capture the most nuance, connections, and semantic meanings between tokens. Each embedding is generally a series of real numbers on a vector space computed by a neural network. [[3]](https://medium.com/the-research-nest/explained-tokens-and-embeddings-in-llms-69a16ba5db33)


> In short, text is converted to tokens. Tokens are assigned token IDs. These token IDs can be used to create embeddings for more nuanced numerical representation in complex models.
>
> Why are embeddings so large and complex? What do they signify?
>
> Each token’s embedding is a high-dimensional vector. This allows the model to capture a wide range of linguistic features and nuances, like the meaning of a word, its part of speech, and its relationship to other words in the sentence.
>
> * Contextual Embeddings: Unlike simpler word embeddings (like Word2Vec), BERT’s embeddings are contextual. This means the same word can have different embeddings based on its context (its surrounding words). The embeddings need to be rich and complex to capture this contextual nuance.
> 
> * In more complex models like BERT, you get the final embeddings and access to the embeddings from each layer of the neural network. Each layer captures different aspects of the language, adding to the complexity and size of the tensor.
>
> * Input for Further Tasks: These embeddings are used as input for various NLP tasks like sentiment analysis, question answering, and language translation. The richness of the embeddings allows the model to perform these tasks with a high degree of sophistication.
>
> * Model’s Internal Representation: The complexity of these tensors reflects how the model ‘understands’ language. Each dimension in the embedding can represent some abstract language feature the model learned during its training.
> [[3]](https://medium.com/the-research-nest/explained-tokens-and-embeddings-in-llms-69a16ba5db33)

## VectorStore Setup
Useful links:
* [Pinecone](https://docs.pinecone.io/integrations/langchain)
* [Langchain](https://python.langchain.com/api_reference/pinecone/vectorstores/langchain_pinecone.vectorstores.PineconeVectorStore.html)

### Step 1. Initialize a vector store

To securely handle Pinecone API keys and prevent their exposure, especially when sharing code or deploying application:

* Store the Pinecone API key as an environment variable on your system or server.

* Access this variable within your code using the appropriate method for your programming language (e.g., os.environ.get("PINECONE_API_KEY") in Python).

This keeps the key separate from your codebase and prevents it from being committed to version control.

In [14]:
pc = Pinecone(api_key=user_secrets.get_secret("PINECONE_API_KEY"))

index_name = "book-vector-store"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024, # set to embedder's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        deletion_protection="enabled",  # Defaults to "disabled"
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

### Step 2. Prepare/Create documents for the vector store

In [15]:
# create documents
book_docs = []

def chunk_list(lst, chunk_size):
    """Splits lst into chunks with length <= chunk_size"""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    
for i in range(len(sample_data)):
    title = sample_data['Book title'].iloc[i]
    authors = sample_data['Author'].iloc[i]
    pub_date = sample_data['Publication date'].iloc[i]
    pub_year = sample_data['Publication year'].iloc[i]
    genres = sample_data['Book genres'].iloc[i]
    doc_id = i

    plot_summary_tokens = tokenizer.tokenize(sample_data['Plot summary'].iloc[i])
    tokenized_chunks = chunk_list(plot_summary_tokens, 512)
    text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in tokenized_chunks]

    documents = [
        Document(page_content=text_chunk, 
                 metadata={"Title": title, "Author(s)": authors, "Publication Date": pub_date, 
                           "Publication year": str(pub_year), "Genre(s)": genres, "doc_id": doc_id, "chunk": str(i)})
        for i, text_chunk in enumerate(text_chunks)
    ]

    book_docs += documents

### Step 3. Create embedding object using our model from huggingface

In [16]:
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [17]:
## create PineconeVectorStore object
vector_store = PineconeVectorStore(index=index, embedding=embedder)

### Step 4. Populate vector store

In [21]:
%%time

print("Total number of documents to upload: ", len(book_docs))

def batch_list(data, batch_size):
    """Yield successive batch_size-sized chunks from data."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

i=0

for batch in batch_list(book_docs, 1000):
    i+=1
    print(f"Uploading batch: {i}")
    vector_store.add_documents(batch)

Total number of documents to upload:  3238
Uploading batch: 1
Uploading batch: 2
Uploading batch: 3
Uploading batch: 4
CPU times: user 21min 46s, sys: 1min 22s, total: 23min 9s
Wall time: 11min 43s
