# Data Ingestion

In [1]:
# !pip install -U langchain langchain-community langchain-openai
# !pip uninstall -y langchain langchain-core langchain-community langchainhub langchain-openai



In [2]:
## document structure


from langchain_core.documents import Document 

In [3]:
doc=Document(
    page_content="This is the content of the document.",
    metadata={"source": "generated", "author": "AI"},
)

In [4]:
doc

Document(metadata={'source': 'generated', 'author': 'AI'}, page_content='This is the content of the document.')

In [5]:
## creating a simple txt document

import os
os.makedirs("../data/text_files", exist_ok=True)

In [6]:
sample_text = {
    
    "../data/text_files/doc1.txt": """This is the content of document , it is about python programming 
and its applications in data science.
Python is a versatile language that is widely used in various fields.
keyfeatures include simplicity, readability, and a vast ecosystem of libraries.
  
it has verstaile uses including web development, data analysis, machine learning, and automation.
  
    """,

}


for file_path, content in sample_text.items():   # using 2 variables, one for key and one for value
    with open(file_path, "w",encoding="utf-8") as f:
        f.write(content)
        
print("Sample text files created.")


Sample text files created.


In [7]:
## text loader

from langchain_community.document_loaders import TextLoader

loader=TextLoader("../data/text_files/doc1.txt", encoding="utf-8")  
document=loader.load()  # it is an object of the Textloader


In [8]:
document  # automatically have the metadata and page_content

[Document(metadata={'source': '../data/text_files/doc1.txt'}, page_content='This is the content of document , it is about python programming \nand its applications in data science.\nPython is a versatile language that is widely used in various fields.\nkeyfeatures include simplicity, readability, and a vast ecosystem of libraries.\n  \nit has verstaile uses including web development, data analysis, machine learning, and automation.\n  \n    ')]

In [9]:
## Directory Loader

from langchain_community.document_loaders import DirectoryLoader

# load all text files from directory 


dir_loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",  # pattern to match files
    loader_cls=TextLoader, ## loader class to use
    loader_kwargs={'encoding':'utf-8'},
    show_progress=False
)

documents=dir_loader.load()




In [10]:
documents

[Document(metadata={'source': '..\\data\\text_files\\doc1.txt'}, page_content='This is the content of document , it is about python programming \nand its applications in data science.\nPython is a versatile language that is widely used in various fields.\nkeyfeatures include simplicity, readability, and a vast ecosystem of libraries.\n  \nit has verstaile uses including web development, data analysis, machine learning, and automation.\n  \n    ')]

In [11]:
# !pip install pymupdf

In [12]:
## Directory Loader

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# load all text files from directory 


dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",  # pattern to match files
    loader_cls=PyMuPDFLoader, ## loader class to use,better for complex pdfs
    show_progress=False
)

pdf_docs = dir_loader.load()

pdf_docs


"""we are not getting one document per PDF file.
we are getting one document per page."""

'we are not getting one document per PDF file.\nwe are getting one document per page.'

In [13]:
type(pdf_docs[0])

langchain_core.documents.base.Document

## embeddings and vectordb