In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory as one UTF-8 decoded string.
with open('churchill_speech.txt', encoding='utf-8') as speech_file:
    churchill_speech = speech_file.read()

# chunk_size and chunk_overlap are measured with len, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, length_function=len)

In [10]:
# Split the speech into chunk objects (each exposes its text as .page_content).
chunks = text_splitter.create_documents([churchill_speech])

# Uncomment to inspect one chunk (the whole object, then just its text):
#print(chunks[10])
#print(chunks[10].page_content)

# Report how many chunks the split produced.
print(f'Now you have {len(chunks)}')

Now you have 269


## Embedding cost

In [11]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of `texts` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (for example the chunks produced by the text splitter).
        model: Model name handed to ``tiktoken.encoding_for_model`` to select
            the tokenizer used for counting.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps
            the rate that used to be hard-coded; pass the current rate for the
            model you actually use.

    Returns:
        None. The two summary lines are written to stdout.
    """
    # Local import kept from the original: tiktoken is only needed by this helper.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no intermediate list is built just to be summed.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000 * price_per_1k_tokens:.6f}')

# Report the token count and estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens: 5389
Embedding Cost in USD: 0.002156


In [18]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client created with library defaults; no model or API key is passed explicitly.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment populated by load_dotenv — confirm.
embeddings = OpenAIEmbeddings()

In [19]:
# Embed the text of the first chunk as a quick check that the embeddings client works.
vector = embeddings.embed_query(chunks[0].page_content)
# NOTE(review): this prints the entire vector; consider showing len(vector) and vector[:5] instead.
print(vector)

[-0.010196815253782817, 0.0005370322619610994, -0.0005157889104308394, 0.014914542449660352, -0.01097177342541186, 0.03959083560595864, -0.025464847706206047, -0.04143985737018731, 0.011216497058557873, -0.031650915769637085, 0.017647288755134924, 0.027164315828626846, 0.011814710694466763, -0.024268420434388267, 0.004931859676388365, 0.008028293035504206, -0.0014148082130870832, 0.02439078318228385, 0.006400201009603155, -0.03069921213096532, 0.003534896069507828, -0.026661273115378678, -0.003681050383915371, 0.011848699311857119, 0.011474816138660027, -0.0031100288060719843, 0.004928460814649329, -0.044403731861851745, -0.015852648778730827, -0.003545092887555578, 0.0005892909835596514, -0.018666969628587403, -0.01990418324127361, -0.023167165016553783, -0.012807200674006957, -0.014533861552985191, 0.0016416873564418719, -0.01133885887513088, 0.013296647008976408, -0.003922374922491705, 0.029258063088646674, -0.015077691538424361, 0.005193578083890843, 0.01299074293320518, -0.0254512

### Inserting the Embeddings into a Pinecone Index

In [14]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Initialise the Pinecone client from environment variables.
# os.environ.get returns None for a missing variable, so an unset PINECONE_API_KEY
# or PINECONE_ENV is passed to init() as None instead of raising on this line.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

  from tqdm.autonotebook import tqdm


In [15]:
# Delete every index returned by pinecone.list_indexes().
# WARNING: destructive -- each listed index is removed via pinecone.delete_index.
indexes = pinecone.list_indexes()
for existing_index in indexes:
    print('Deleting all indexes...', end='')
    pinecone.delete_index(existing_index)
    print('Done')


Deleting all indexes...Done


In [17]:
index_name = 'churchill-speech'
# Create the index only when it is not already listed, so re-running this cell
# does not attempt to create it a second time.
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name}')
    # NOTE(review): dimension must equal the length of the vectors produced by `embeddings`
    # (presumably 1536 for the default OpenAI embedding model) — confirm.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done')


Creating index churchill-speech
Done


In [21]:
# Build a Pinecone-backed vector store from the chunks, using `embeddings` and the index named `index_name`.
# NOTE(review): presumably this embeds and uploads every chunk, so re-running may insert them again — confirm.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

## Asking Questions (Similarity Search)

In [22]:
query = 'Where should we fight'

# Run a similarity search for the query with default search settings,
# then print the raw list of returned documents.
results = vector_store.similarity_search(query)
print(results)

[Document(page_content='on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the'), Document(page_content='fields and in the streets, we shall fight in the hills; we shall never surrender, and even if,'), Document(page_content='When we consider how much greater would be our advantage in defending the air above this Island'), Document(page_content='front, now on that, fighting on three fronts at once, battles fought by two or three divisions')]


In [23]:
# Show each retrieved chunk's text followed by a 50-dash divider line.
for doc in results:
    print(doc.page_content, '-' * 50, sep='\n')

on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the
--------------------------------------------------
fields and in the streets, we shall fight in the hills; we shall never surrender, and even if,
--------------------------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
--------------------------------------------------
front, now on that, fighting on three fronts at once, battles fought by two or three divisions
--------------------------------------------------


In [24]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used to generate the answers.
# NOTE(review): temperature=1 presumably makes answers vary between runs;
# consider a lower value if reproducible answers matter — confirm.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Retriever over the vector store: similarity search with k=3.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

# Question-answering chain built from the LLM and the retriever, using the 'stuff' chain type.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

In [37]:
# Ask for a question interactively.
# NOTE(review): input() blocks until something is typed, so this cell cannot run
# unattended (e.g. Restart & Run All); the prompt string also has no trailing space.
query = input('What is your question about the speech?')

# Run the retrieval QA chain on the question and keep the result in `answer`.
answer = chain.run(query)

print(answer)

The French armies were involved in the fighting alongside the British armies. They were supposed to advance across the Somme in large numbers, but the overall situation was regarded as a military disaster.


In [36]:
# Print the current value of `answer` again.
print(answer)

The French armies played a significant role in the events mentioned. They were involved in the fighting alongside the British armies in France and Belgium. However, I do not have specific information about the details of their actions or their outcomes.
