Splitting and Embedding Text Using LangChain

In [8]:
import os 
from dotenv import load_dotenv, find_dotenv 

In [9]:
# Locate a .env file with find_dotenv() and load its variables into the process
# environment; override=True lets values from the file replace variables that
# are already set. The call's return value is shown as the cell output.
load_dotenv(find_dotenv(), override=True)

True

In [10]:
# Read the whole speech into `churchill_speech` as a single string.
#
# Alternative location on the PC (disabled):
# with open('c:/Users/MP19/LangChain_Pinecone_1_Local/LangChain_Pinecone_Vector_Stores_1/Churchill_Speech.txt') as f:
#     churchill_speech=f.read()
#
# Location on the laptop (active).
# NOTE(review): no encoding is passed, so the platform default is used -
# confirm the file's encoding and consider passing it explicitly.
with open(r"c:\Users\mpari\LangChain_Pinecone_Laptop_1\LangChain_Pinecone_Vector_Stores_1\Churchill_Speech.txt") as speech_file:
    churchill_speech = speech_file.read()

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter 

# Splitter configuration: chunk_size and chunk_overlap are measured with
# length_function=len, i.e. in characters of the input string.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [12]:
# Turn the raw speech into a list of document chunks.
chunks = text_splitter.create_documents([churchill_speech])

# Show the first chunk twice - once as the document object, once as its bare
# text - each followed by a 50-dash rule, then report how many chunks exist.
print(
    chunks[0],
    "-" * 50,
    chunks[0].page_content,
    "-" * 50,
    sep="\n",
)
print(f"Now you have {len(chunks)} chunks")

page_content='When Napoleon lay at Boulogne for a year with his flat-bottomed boats and his Grand Army, he was'
--------------------------------------------------
When Napoleon lay at Boulogne for a year with his flat-bottomed boats and his Grand Army, he was
--------------------------------------------------
Now you have 25 chunks


Embedding Cost

In [13]:
def print_embedding_cost(texts, model="text-embedding-ada-002", price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for a set of documents.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (for example the chunks returned by ``create_documents``).
        model: Model name used to select the tiktoken encoding. Defaults to
            the model that was previously hard-coded in this function.
        price_per_1k_tokens: Price in USD per 1,000 tokens. Defaults to the
            value that was previously hard-coded in this function.
            NOTE(review): confirm the default against current provider pricing.

    Returns:
        None. Two lines are printed to stdout: the total token count and the
        estimated cost formatted with six decimal places.
    """
    # Kept as a function-scope import, as in the original cell.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator is enough here; no intermediate list of counts is needed.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f"Total Tokens: {total_tokens}")
    print(f"Embedding Cost in USD: {total_tokens/1000*price_per_1k_tokens:.6f}")

# Print the total token count and estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens: 476
Embedding Cost in USD: 0.000190


In [14]:
from langchain_openai import OpenAIEmbeddings 

# Embedding client built with its default settings.
# NOTE(review): presumably picks up its API credentials from the environment
# populated by load_dotenv earlier - confirm.
embedding = OpenAIEmbeddings()

# Embed a short sample string to inspect what an embedding vector looks like.
vector = embedding.embed_query("Example Text")

# Bare expression: the notebook displays the vector as the cell output.
vector

[-0.008443634957075119,
 0.008319362998008728,
 0.005619884934276342,
 -0.010625310242176056,
 -0.00746326195076108,
 0.02780945412814617,
 -0.013338596560060978,
 -0.012523920275270939,
 -0.022010063752532005,
 -0.01924845017492771,
 0.01076339092105627,
 0.029217878356575966,
 -0.0031896643340587616,
 -0.001618133275769651,
 0.008588619530200958,
 0.01618305966258049,
 0.031206239014863968,
 0.003942204173654318,
 0.02691192924976349,
 -0.006872967351227999,
 -0.009928002953529358,
 0.0020660324953496456,
 0.001537010888569057,
 0.0107081588357687,
 0.002906598849222064,
 -0.019607460126280785,
 0.006251603830605745,
 -0.023597992956638336,
 0.0015413259388878942,
 -0.013828782364726067,
 0.022562386468052864,
 -0.019179409369826317,
 -0.016555877402424812,
 -0.020574025809764862,
 -0.012931258417665958,
 0.006631325930356979,
 -0.015616928227245808,
 -0.013828782364726067,
 0.027933726087212563,
 -0.015299342572689056,
 0.001205617212690413,
 0.015589311718940735,
 0.014388009905815

In [15]:
# Embed the text of the first chunk (its page_content string, not the
# document object) using the same embedding client.
vector_2 = embedding.embed_query(chunks[0].page_content)

# Bare expression: the notebook displays the vector as the cell output.
vector_2

[-0.03119259513914585,
 -0.0027252512518316507,
 0.009472597390413284,
 -0.0063629066571593285,
 -0.02334660477936268,
 -0.003388196462765336,
 -0.031438637524843216,
 -0.026066729798913002,
 -0.009424756281077862,
 -0.018125057220458984,
 0.0006988262175582349,
 0.02691420540213585,
 0.002720125485211611,
 0.00019531679572537541,
 -0.0020110474433749914,
 0.020845182240009308,
 0.009363246150314808,
 0.010436260141432285,
 0.020585471764206886,
 -0.0049413335509598255,
 0.0012114126002416015,
 -0.006188627332448959,
 -0.003844398306682706,
 0.004503926262259483,
 -0.012144881300628185,
 -0.019861016422510147,
 0.027775350958108902,
 -0.03854650259017944,
 -0.002193870022892952,
 -0.006608948111534119,
 -0.0010038150940090418,
 -0.031712014228105545,
 -0.017222905531525612,
 -0.010197052732110023,
 -0.01504953857511282,
 -0.0178380087018013,
 -0.002311764983460307,
 0.00513269891962409,
 0.024385446682572365,
 -0.00427838834002614,
 0.015008531510829926,
 -0.001918782014399767,
 -0.001

In [17]:
# Show the text of the first chunk - the same string that was embedded as vector_2.
print(chunks[0].page_content)

# Disabled variant that passes the document object itself instead of its text.
# NOTE(review): embed_query appears to expect a plain string - confirm, and use
# chunks[0].page_content if this line is ever re-enabled.
# vector_2=embedding.embed_query(chunks[0])

When Napoleon lay at Boulogne for a year with his flat-bottomed boats and his Grand Army, he was
