In [3]:
from xml.sax import parse

from langchain_community.document_loaders import TextLoader
from oauthlib.uri_validate import query

loader = TextLoader("speech.txt")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'speech.txt'}, page_content='War Message to Congress\nBy Woodrow Wilson\nApril 2, 1917\n\nGentlemen of the Congress:\n\nI have called the Congress into extraordinary session because there are serious, very serious, choices of policy to be made, and it is necessary that we should have a clear and definite understanding as to the matters that are to be dealt with and the way in which they are to be handled.\n\nOn the 3rd of February last, I officially laid before you the extraordinary announcement of the Imperial German Government that on and after the 1st day of February it was its purpose to put aside all restraints of law or of humanity and use its submarines to sink every vessel that sought to approach either the ports of Great Britain and Ireland or the western coasts of Europe or any of the ports controlled by the enemies of Germany within the Mediterranean.\n\nThat policy has been followed with unrelenting severity.\n\nIt is a war against all nations.

In [12]:
# web based loader
from langchain_community.document_loaders import WebBaseLoader
import  bs4
loader = WebBaseLoader(web_paths=["https://en.wikipedia.org/wiki/Human_history"])
bs_kwargs= dict(parse_only=bs4.SoupStrainer),
class_=("firstHeading mw-first-heading")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Human_history', 'title': 'Human history - Wikipedia', 'language': 'en'}, page_content='\n\n\n\nHuman history - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1\nP

In [13]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("sample.pdf")
text_documents= loader.load()
text_documents

[Document(metadata={'producer': 'Mac OS X 10.5.4 Quartz PDFContext', 'creator': 'Pages', 'creationdate': "D:20080701052447Z00'00'", 'title': 'sample', 'author': 'Philip Hutchison', 'moddate': "D:20080701052447Z00'00'", 'source': 'sample.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Sample PDFThis is a simple PDF ﬁle. Fun fun fun.\nLorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. \nCurabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget \npharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. \nInteger a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. \nVestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla \nerat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. \nVivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique \naccumsan eros qu

In [16]:
from langchain.text_splitter import  RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
document = text_splitter.split_documents(text_documents)
document[:5]


[Document(metadata={'producer': 'Mac OS X 10.5.4 Quartz PDFContext', 'creator': 'Pages', 'creationdate': "D:20080701052447Z00'00'", 'title': 'sample', 'author': 'Philip Hutchison', 'moddate': "D:20080701052447Z00'00'", 'source': 'sample.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Sample PDFThis is a simple PDF ﬁle. Fun fun fun.\nLorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. \nCurabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget \npharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. \nInteger a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. \nVestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla \nerat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. \nVivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique \naccumsan eros qu

In [18]:
# vectorize
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
db= Chroma.from_documents(document[:5],OllamaEmbeddings())


In [19]:
# vector query
query=("what data you have")
result = db.similarity_search(query)
result


[Document(metadata={'author': 'Philip Hutchison', 'creationdate': "D:20080701052447Z00'00'", 'creator': 'Pages', 'moddate': "D:20080701052447Z00'00'", 'page': 0, 'page_label': '1', 'producer': 'Mac OS X 10.5.4 Quartz PDFContext', 'source': 'sample.pdf', 'title': 'sample', 'total_pages': 1}, page_content='Sample PDFThis is a simple PDF ﬁle. Fun fun fun.\nLorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. \nCurabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget \npharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. \nInteger a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. \nVestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla \nerat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. \nVivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique \naccumsan eros qu

In [None]:
# faiss db
from langchain_community.vectorstores import FAISS
faissdb= FAISS.from_documents(text_documents[:5],OllamaEmbeddings())
query1=("what data you have")
result = faissdb.similarity_search(query1)
result


NameError: name 'faissdb' is not defined