In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate the nearest .env file and load it; values from the file replace any
# variables that are already set in the process environment (override=True).
load_dotenv(dotenv_path=find_dotenv(), override=True)

True

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory as a single string.
# The encoding is pinned to UTF-8 so the text decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open('churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# Splitter configured for a chunk size of 100 with an overlap of 20 between
# neighbouring chunks; sizes are measured with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [6]:
# Split the speech into document chunks using the splitter configured earlier.
chunks = text_splitter.create_documents([churchill_speech])

# Show the first two chunks, one per line, as a quick sanity check.
print(chunks[0], chunks[1], sep='\n')

page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940'
page_content='June 4, 1940\nHouse of Commons'


In [7]:
# Report how many chunks the splitter produced.
summary = f'There is a total of {len(chunks)} chunks.'
print(summary)

There is a total of 300 chunks.


In [9]:
from langchain.embeddings import OpenAIEmbeddings
# Embeddings client created with its default settings.
# NOTE(review): presumably picks up the OpenAI API key from the environment
# loaded via load_dotenv at the top of the notebook — confirm.
embedding = OpenAIEmbeddings()

In [11]:
# Embed a single word, then print the raw vector followed by its length
# (each on its own line).
vector = embedding.embed_query('embedding')
print(vector, len(vector), sep='\n')

[-0.02404028070533167, -0.013646871176978735, 0.0023664882287299112, -0.0030142360891338886, -0.006639416034802035, 0.015369291666294882, -0.011895006925773294, -0.02426110472854999, -0.01570788701178025, -0.01426517620681595, 0.013043287562424902, 0.03677441761548303, -0.00022358344457290772, 0.006135203137564192, -0.011129486811779733, 0.011755153014919905, 0.04260414649647376, 0.01751863692411922, 0.011585854410854687, -0.02302449466887556, -0.019903526689122722, -0.0010893942770261833, 0.008619463619883687, -0.018475537299441805, -0.00613152301657398, -0.006665178744378586, 0.011372392492261858, -0.028736454090244307, -0.010157865021173766, -0.02103708591049095, -0.00037723955536934174, 0.001241210094496878, -0.02211175947072565, -0.014522802371258924, -0.009767744256535295, -0.00421036155828712, 0.008913894719518914, -0.037098290614362484, 0.018372486461135602, 0.002460338066068684, 0.014184207025773554, 0.00692648642679557, 0.02243563246960511, -0.01784251178564374, -0.0101873078

In [13]:
# Embed the text of the first chunk and print only the vector's length.
first_chunk_text = chunks[0].page_content
vector = embedding.embed_query(first_chunk_text)
print(len(vector))

1536


In [27]:
import os
import pinecone
from pinecone import ServerlessSpec
from langchain.vectorstores import Pinecone 

# Pinecone client; the API key and environment name are read from environment
# variables (each is None when the variable is not set).
pc = pinecone.Pinecone(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV'),
)

In [29]:
index_name = 'churchill-speech'

# pc.list_indexes() yields index description objects, not plain strings, so a
# direct `index_name not in pc.list_indexes()` test is always true and would
# try to create the index again on every run. Compare against the list of
# index names instead so this cell is safe to re-run.
existing_index_names = pc.list_indexes().names()

if index_name not in existing_index_names:
    # dimension=1536 matches the length of the embedding vectors printed above.
    pc.create_index(
        index_name,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )

In [30]:
# Build the vector store from the chunks, using `embedding` to embed them and
# the index named by `index_name` as the destination.
# NOTE(review): presumably every run of this cell embeds and inserts the chunks
# again — confirm whether re-running duplicates vectors in the index.
vector_store = Pinecone.from_documents(
    chunks,
    embedding,
    index_name=index_name,
)

Similarity search

In [31]:
# Ask the vector store for the chunks most similar to the question.
# No result count is passed, so the store's default applies.
query = 'Where should I fight?'
result = vector_store.similarity_search(query)

# Print the text of each matching chunk, one per line.
for match in result:
    print(match.page_content)

front, now on that, fighting
shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
I return to the Army. In the long series of very fierce battles, now on this front, now on that,
Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940


In [32]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used to write the final answer.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=1,
)

# Retriever over the vector store: similarity search with k=3.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# Question-answering chain combining the model and the retriever.
# NOTE(review): the 'stuff' chain type presumably places all retrieved chunks
# into a single prompt — confirm against the langchain documentation.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [34]:
# Run a factual question through the retrieval QA chain and print the answer
# string it returns.
query = 'Who was the king of Belgium at that time?'
answer = chain.run(query)
print(answer)

The king of Belgium at that time was King Leopold.


In [35]:
# Run a more open-ended question through the same chain; `query` and `answer`
# are reassigned here, replacing the values from the previous question.
query = 'How did churchill feel about the war at that time?'
answer = chain.run(query)
print(answer)

Based on the given context, it is clear that Winston Churchill was determined and resolute in his attitude towards the war. In his speech, he emphasized the determination of the British to fight against the enemy on the beaches, indicating his strong commitment to defending his nation and its allies.
