# Splitting and Embedding Text Using LangChain (Similarity Search)

This notebook uses the latest versions of the OpenAI, LangChain, and Pinecone libraries.

In [None]:
pip install -q -r ./requirements.txt

Download [requirements.txt](https://drive.google.com/file/d/1UpURYL9kqjXfe9J8o-_Dq5KJTbQpzMef/view?usp=sharing) and place it in the same folder as this notebook before running the install cell above.

In [1]:
# Load environment variables from the .env file located by find_dotenv().
# override=True lets values from that file replace variables that are already
# set in the process environment.
# Presumably this supplies the OpenAI and Pinecone API keys — verify against your .env.
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is stated explicitly so the
# decoded text does not depend on the platform's default locale encoding.
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# Chunk sizes are measured with `len`, i.e. in characters: chunks of up to
# 100 characters, with up to 20 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [3]:
# Split the speech into chunks. The text is wrapped in a list because
# create_documents() is given a list of texts here; each resulting chunk
# exposes its text as `.page_content`.
chunks = text_splitter.create_documents([churchill_speech])
# Uncomment to inspect a whole chunk object, or only the text of one chunk:
# print(chunks[2])
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)}')

Now you have 300


#### Embedding Cost

In [4]:
def print_embedding_cost(texts, model='text-embedding-3-small',
                         price_per_1k_tokens=0.00002, encoding=None):
    """Print the token count and the estimated embedding cost of `texts`.

    Args:
        texts: Iterable of objects that expose their text as `page_content`.
        model: Embedding model name, used to select the tiktoken encoding.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is the
            value this notebook was written with; check current prices at
            https://openai.com/pricing
        encoding: Optional tokenizer object with an `encode(str)` method that
            returns a sequence of tokens. When None, the encoding is looked up
            with tiktoken.

    Returns:
        None. Two lines are printed: the total token count and the cost in USD.
    """
    if encoding is None:
        # Imported lazily so a caller-supplied encoding does not need tiktoken.
        import tiktoken
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            # tiktoken raises KeyError for model names it does not know (for
            # example an older release). cl100k_base is the encoding of the
            # text-embedding-3 models; for other models the count is an estimate.
            encoding = tiktoken.get_encoding('cl100k_base')

    # A generator avoids building a throwaway list of per-document counts.
    total_tokens = sum(len(encoding.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Estimate the cost for all chunks before any embedding request is made.
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.000096


### Creating Embeddings

In [11]:
# Optional: uncomment the two lines below to ignore warnings raised from the
# `langchain` module.
# import warnings
# warnings.filterwarnings('ignore', module='langchain')

In [5]:
from langchain_openai import OpenAIEmbeddings

# The requested vector size has to match the `dimension` of the Pinecone index
# created later in this notebook (1536). 512 works as well, provided the index
# is created with the same value.
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536,
)

In [None]:
# Embed a single chunk as a sanity check. Only the length and the first few
# components are displayed; echoing the whole vector would flood the output.
vector = embeddings.embed_query(chunks[0].page_content)
print(f'Embedding dimension: {len(vector)}')
vector[:5]

### Inserting the Embeddings into a Pinecone Index

In [8]:
# Import the Pinecone SDK and LangChain's Pinecone vector store, then create the client.
# Note: the bare name `Pinecone` refers to the LangChain vector store class;
# the SDK client class is reached through the module as `pinecone.Pinecone`.
import os
import pinecone

from langchain_community.vectorstores import Pinecone

# No API key is passed here — presumably the client reads it from the
# environment (PINECONE_API_KEY); confirm against the Pinecone client docs.
pc = pinecone.Pinecone()

In [9]:
# WARNING: destructive. This deletes every index returned by pc.list_indexes(),
# not only the one used in this notebook. Skip this cell if other indexes
# reachable with this client must be kept.
indexes = pc.list_indexes().names()
for i in indexes:
    # The same message is printed once for each index that is deleted.
    print('Deleting all indexes ... ', end='')
    pc.delete_index(i)
    print('Done')

Deleting all indexes ... Done


In [10]:
# Create the serverless index used by this notebook, unless an index with the
# same name is already present.
from pinecone import ServerlessSpec

index_name = 'churchill-speech'

if index_name in pc.list_indexes().names():
    print(f'Index {index_name} already exists!')
else:
    print(f'Creating index {index_name}')
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the vector size produced by the embedding model
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print('Index created! 😊')

Creating index churchill-speech
Index created! 😊


In [11]:
# Process the chunks, generate their embeddings with the `OpenAIEmbeddings`
# instance, insert the embeddings into the index named `index_name`, and
# return a new Pinecone vector store object.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm before re-executing.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [12]:
# Open a handle to the index and display its statistics.
# NOTE(review): the recorded output reports 268 vectors although 300 chunks
# were created earlier. Presumably the statistics had not yet caught up with
# the insert when the cell ran — re-run this cell to confirm.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 268}},
 'total_vector_count': 268}

### Asking Questions (Similarity Search)

In [13]:
# Search the vector store for the chunks most similar to the question.
# No result count is passed; the recorded output contains four documents.
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a'), Document(page_content='number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so')]


In [14]:
# Show the text of each retrieved chunk, followed by a 50-dash separator line.
for doc in result:
    print(doc.page_content, '-' * 50, sep='\n')

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a
--------------------------------------------------
number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so
--------------------------------------------------


### Answering in Natural Language using an LLM

In [15]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that phrases the final answer.
llm = ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=0.2,
)

# Retriever over the existing vector store: similarity search, top 3 chunks.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# RetrievalQA chain of type 'stuff', combining the model and the retriever.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)


In [16]:
# The instruction "Answer only from the provided input." is part of the query
# string itself, so it is passed to the chain together with the question.
query = 'Answer only from the provided input. Where should we fight?'
answer = chain.invoke(query)
# The recorded output shows `answer` as a dict with 'query' and 'result' keys.
print(answer)

{'query': 'Answer only from the provided input. Where should we fight?', 'result': 'We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and streets, we shall fight in the hills.'}


In [19]:
# A second question sent through the same chain.
# NOTE(review): no memory is configured on the chain in this notebook, so
# "at that time" presumably relies only on the retrieved chunks — confirm.
query = 'Who was the king of Belgium at that time?'
answer = chain.invoke(query)
print(answer)

{'query': 'Who was the king of Belgium at that time?', 'result': 'The king of Belgium at that time was King Leopold.'}


In [20]:
# A third question sent through the same chain; the full result dict is printed.
query = 'What about the French Armies??'
answer = chain.invoke(query)
print(answer)

{'query': 'What about the French Armies??', 'result': 'The French Armies were involved in the fighting, particularly in the area around the Somme. They were supposed to advance across the Somme in great strength.'}
