# RAG Search and Retrieval

## Environment Setup

In [1]:
%%capture

# installations
!pip install --quiet sentence_transformers transformers torch peft huggingface_hub kaggle pinecone lark rank_bm25 langchain_huggingface langchain_experimental langchain_pinecone 

# THE REGS
import pandas as pd
import numpy as np
import kagglehub
import torch
import os
import time

# Transformers
from transformers import AutoTokenizer, AutoModel
import torch

# PINECONE
from pinecone import Pinecone
from pinecone import ServerlessSpec

# LANGCHAIN
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document

# Kaggle environment
from kaggle_secrets import UserSecretsClient
# Client for reading the secrets attached to this notebook; queried below for "PINECONE_API_KEY".
user_secrets = UserSecretsClient()

In [8]:
# Get model
# Checkpoint id; reused below when building the LangChain embedder.
# NOTE(review): in the cells shown, `model` and `tokenizer` are only inspected here —
# the embedder further down loads the checkpoint by name on its own.
model_name = "BAAI/bge-base-en"
model = AutoModel.from_pretrained(model_name)

# Print the configuration to inspect the architecture (the recorded output shows
# a BertModel with hidden_size 768 and max_position_embeddings 512).
print("About the model: \n\n", model.config, "\n")

# Get corresponding tokenizer/encoder
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The recorded output shows a BertTokenizerFast with model_max_length=512.
print("About the tokenizer: \n\n", tokenizer)

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

About the model: 

 BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.52.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
 



tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

About the tokenizer: 

 BertTokenizerFast(name_or_path='BAAI/bge-base-en', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [3]:
# get access to populated index
# The API key is read from the notebook secrets rather than being hard-coded.
pc = Pinecone(api_key=user_secrets.get_secret("PINECONE_API_KEY"))
# Name of the index to query; assumed to have been created and filled beforehand — TODO confirm.
index_name = "book-vector-store"
index = pc.Index(index_name)

In [9]:
# Build the LangChain embedding wrapper for the same checkpoint (model_name).
# The model runs on CPU and the produced vectors are left un-normalized.
# NOTE(review): the normalization setting presumably has to match the one used
# when the index was populated — confirm before changing it.
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)
embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
## create PineconeVectorStore object
# Pairs the Pinecone index with the embedder so raw-text queries can be embedded before searching.
vector_store = PineconeVectorStore(index=index, embedding=embedder)

## Test out the search

> A `similarity_search` on a `PineconeVectorStore` object returns a list of LangChain `Document` objects most similar to the query provided. While `similarity_search` uses a Pinecone query to find the most similar results, this method includes additional steps and returns results of a different type. The `similarity_search` method accepts raw text and automatically embeds it using the embedding object provided when you initialized the `PineconeVectorStore`. You can also provide a `k` value to determine the number of LangChain `Document` objects to return. The default value is `k=4`.


In [11]:
# Free-text probe; no k is passed, so the default number of documents is returned.
query = "Fantasy with dragons, elves, and romance between a prince and farm-girl."
vector_store.similarity_search(query)

[Document(id='49c5d336-7f4f-4633-868f-9d9eb7765465', metadata={'Author(s)': 'Spike Milligan', 'Genre(s)': ["Children's literature", 'Comedy'], 'Publication Date': 'Unknown', 'Publication year': '0', 'Title': 'Sir Nobonk and the Terrible Dreadful Awful Naughty Nasty Dragon', 'chunk': '0', 'doc_id': 2692.0}, page_content='the story takes place in the mythical kingdom of rotten custard, a kingdom that exists within cornwall, where knights are constantly at war with the dragons. among the knights is a 60 - year - old knight named sir nobonk, who becomes a dragon - catcher in order to save the dragons from extinction. setting forth into the nearby forest, sir nobonk successfully captures the last living dragon, and convinces the king to open a zoo to help dragons to repopulate. the plan becomes successful, and also helps humans and dragons to co - exist peacefully within the kingdom. however, the prosperity of the kingdom invokes a giant named blackmangle to attack the kingdom along with hi

In [12]:
# Second probe on a different topic, again with the default number of results.
query = "Politics and world war"
vector_store.similarity_search(query)

[Document(id='7a3fb77d-a51c-449c-89dd-ce9c8de8e112', metadata={'Author(s)': 'Rex Stout', 'Genre(s)': ['Thriller', 'Mystery', 'Speculative fiction', 'Fiction', 'Suspense'], 'Publication Date': '1934-09-17', 'Publication year': '1934', 'Title': 'The President Vanishes', 'chunk': '0', 'doc_id': 740.0}, page_content='the book concerns the mysterious disappearance of the president of the united states, who was facing a serious political crisis, perhaps even impeachment, over his handling of a foreign situation, namely the impending war ( what we would now call world war ii ). the disappearance of the president seems like a kidnapping, but no ransom is demanded. although not revealed in detail until near the end, it is fairly apparent from an early stage that the president has staged his own disappearance to counter an impending military coup staged by an upstart army of fascist " grey shirts " allied with a small coterie of industrialists ( similar to the business plot ). the aim of all thi

In [None]:
## Example (illustrative only, not executed; `vectorstore` plays the role of `vector_store` above)
# query = "Who is Ketanji Brown Jackson?"
# vectorstore.similarity_search(query)
    
    # Response:
    # [
    #    Document(page_content='Ketanji Onyika Brown Jackson is an American lawyer and jurist who is an associate justice of the Supreme Court of the United...', metadata={'chunk': 0.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'}),  
    #    Document(page_content='Jackson was nominated to the Supreme Court by President Joe Biden on February 25, 2022, and confirmed by the U.S. Senate...', metadata={'chunk': 1.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'}),  
    #    Document(page_content='Jackson grew up in Miami and attended Miami Palmetto Senior High School. She distinguished herself as a champion debater...', metadata={'chunk': 3.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'}),
    #    Document(page_content='After high school, Jackson matriculated at Harvard University to study government, having applied despite her guidance...', metadata={'chunk': 5.0, 'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson', 'title': 'Ketanji Brown Jackson', 'wiki-id': '6573'})
    # ]

You can also optionally apply a metadata filter to your similarity search. The filtering query language is the same as for Pinecone queries, as detailed in [Filtering with metadata](https://docs.pinecone.io/guides/index-data/indexing-overview#metadata).

In [None]:
## Example (illustrative only, not executed)
# query = "Tell me more about Ketanji Brown Jackson."
# vectorstore.similarity_search(query, filter={'source': 'https://en.wikipedia.org/wiki/Ketanji_Brown_Jackson'})

### Test out RAG with Langchain
In RAG, the query is posed as a question to an LLM, but the LLM must base its answer on the information retrieved from the vector store.

In [None]:
## Example
# from langchain_openai import ChatOpenAI  
# from langchain.chains import RetrievalQA  
# # completion llm  
# llm = ChatOpenAI(  
#     openai_api_key=OPENAI_API_KEY,  
#     model_name='gpt-3.5-turbo',  
#     temperature=0.0  
# )  
# qa = RetrievalQA.from_chain_type(  
#     llm=llm,  
#     chain_type="stuff",  
#     retriever=vectorstore.as_retriever()  
# )  
# qa.invoke(query)  

# Response (for a query such as "Who was Benito Mussolini?"):
# Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy...

Reference: [LangChain RAG tutorial](https://python.langchain.com/docs/tutorials/rag/)