In [1]:
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv; the value displayed below this
# cell is the call's return value.
load_dotenv()

True

In [10]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

# Read the source text file into LangChain Document objects.
loader = TextLoader("doc.txt")
doc = loader.load()

# Split into small overlapping chunks (chunk_size=50, chunk_overlap=20) so that
# neighbouring chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=20)
splt = text_splitter.split_documents(doc)

embd = OllamaEmbeddings(model="llama3")
# embed_documents expects a list of plain strings. Handing it Document objects
# makes the embedder stringify each object (text plus metadata) instead of
# embedding the chunk text, so pass only page_content.
embeddings = embd.embed_documents([chunk.page_content for chunk in splt])

# Sanity check: exactly one embedding vector per chunk.
assert len(embeddings) == len(splt), "expected one embedding per chunk"

splt, len(splt), len(embeddings)

([Document(metadata={'source': 'doc.txt'}, page_content='Hey, welcome to this document.'),
  Document(metadata={'source': 'doc.txt'}, page_content='The OpenAI API provides a simple interface to'),
  Document(metadata={'source': 'doc.txt'}, page_content='simple interface to state-of-the-art AI models'),
  Document(metadata={'source': 'doc.txt'}, page_content='AI models for natural language processing, image'),
  Document(metadata={'source': 'doc.txt'}, page_content='processing, image generation, semantic search,'),
  Document(metadata={'source': 'doc.txt'}, page_content='semantic search, and speech recognition. Follow'),
  Document(metadata={'source': 'doc.txt'}, page_content='recognition. Follow this guide to learn how to'),
  Document(metadata={'source': 'doc.txt'}, page_content='to learn how to generate human-like responses to'),
  Document(metadata={'source': 'doc.txt'}, page_content='responses to natural language prompts, create'),
  Document(metadata={'source': 'doc.txt'}, page_co

In [11]:
# Length of the first embedding vector, i.e. the embedding dimensionality.
len(embeddings[0])

4096

In [12]:
from langchain_community.vectorstores import FAISS

In [15]:
# Build a FAISS vector store from the chunked documents, embedding them with
# the same Ollama embedder, then display the resulting store object.
db = FAISS.from_documents(splt, embd)
db

<langchain_community.vectorstores.faiss.FAISS at 0x23ae8b1f8c0>

In [16]:
# Natural-language question reused by the search cells that follow.
query = "OpenAI provides its API for which type of tasks?"

# Retrieve the chunks the vector store considers closest to the question.
ss = db.similarity_search(query=query)
ss

[Document(metadata={'source': 'doc.txt'}, page_content='simple interface to state-of-the-art AI models'),
 Document(metadata={'source': 'doc.txt'}, page_content='Hey, welcome to this document.'),
 Document(metadata={'source': 'doc.txt'}, page_content='images from textual descriptions.'),
 Document(metadata={'source': 'doc.txt'}, page_content='processing, image generation, semantic search,')]

In [17]:
# Text of the first (top-ranked) search result.
ss[0].page_content

'simple interface to state-of-the-art AI models'

In [20]:
# Wrap the vector store as a retriever that uses the "mmr" search type and
# returns at most two documents, then run it on the same question.
rt = db.as_retriever(search_kwargs={"k": 2}, search_type="mmr")
rt.invoke(query)

[Document(metadata={'source': 'doc.txt'}, page_content='simple interface to state-of-the-art AI models'),
 Document(metadata={'source': 'doc.txt'}, page_content='to learn how to generate human-like responses to')]

In [21]:
# Same search as before, but each hit is returned as a (Document, score) pair.
# The recorded output lists the best match first with the smallest score, so
# the score presumably is a distance (lower = closer) - TODO confirm.
# NOTE(review): the scores are very large and a greeting line ranks second;
# it may be worth checking whether "llama3" is a suitable embedding model here.
ssc = db.similarity_search_with_score(query)
ssc

[(Document(metadata={'source': 'doc.txt'}, page_content='simple interface to state-of-the-art AI models'),
  23664.227),
 (Document(metadata={'source': 'doc.txt'}, page_content='Hey, welcome to this document.'),
  28796.668),
 (Document(metadata={'source': 'doc.txt'}, page_content='images from textual descriptions.'),
  28803.973),
 (Document(metadata={'source': 'doc.txt'}, page_content='processing, image generation, semantic search,'),
  36657.637)]

In [23]:
# Embed the question explicitly, then search the index with the raw vector
# instead of the query string.
qremb = embd.embed_query(text=query)
db.similarity_search_by_vector(embedding=qremb)

[Document(metadata={'source': 'doc.txt'}, page_content='simple interface to state-of-the-art AI models'),
 Document(metadata={'source': 'doc.txt'}, page_content='Hey, welcome to this document.'),
 Document(metadata={'source': 'doc.txt'}, page_content='images from textual descriptions.'),
 Document(metadata={'source': 'doc.txt'}, page_content='processing, image generation, semantic search,')]

In [25]:
# Persist the FAISS index together with its docstore to the current directory.
db.save_local("./")

In [27]:
# Loading SAVED FAISS
#
# load_local is a classmethod, so call it on the FAISS class rather than on the
# in-memory `db` instance: reloading must not depend on the index we are trying
# to restore already existing in the kernel.
#
# SECURITY: allow_dangerous_deserialization=True lets the loader unpickle the
# saved docstore. Pickle can execute arbitrary code, so only enable this for
# index files created locally / from a trusted source.
fs = FAISS.load_local(
    folder_path="./",
    embeddings=embd,
    allow_dangerous_deserialization=True,
)
fs.similarity_search(query)

[Document(metadata={'source': 'doc.txt'}, page_content='simple interface to state-of-the-art AI models'),
 Document(metadata={'source': 'doc.txt'}, page_content='Hey, welcome to this document.'),
 Document(metadata={'source': 'doc.txt'}, page_content='images from textual descriptions.'),
 Document(metadata={'source': 'doc.txt'}, page_content='processing, image generation, semantic search,')]