In [1]:
import os
from langchain_community.document_loaders import CSVLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import List, Union

In [5]:
# Folder scanned by load_pdf_files / upload_files in the cells further down.
# NOTE: this is an absolute, machine-specific Windows path; point it at your own
# data folder before running the notebook.
directory = 'C:/Users/skrge/Documents/GitHub/llmtesting/data'

In [2]:
def load_csv_files(directory: str) -> List["Document"]:
    """
    Load every CSV file found directly inside ``directory``.

    Each matching file is read with ``CSVLoader`` and whatever its ``load()``
    call returns is appended to one combined list.

    Args:
        directory: Path of the folder to scan. Only its immediate entries are
            examined; sub-folders are not visited.

    Returns:
        The documents produced by ``CSVLoader.load()`` for all matching files,
        grouped by file in sorted file-name order. Empty when the folder holds
        no CSV file.
    """
    documents = []
    # os.listdir gives no ordering guarantee; sort so reruns yield the same order.
    for file_name in sorted(os.listdir(directory)):
        # Compare case-insensitively so names such as "DATA.CSV" are not skipped.
        if file_name.lower().endswith(".csv"):
            file_path = os.path.join(directory, file_name)
            documents.extend(CSVLoader(file_path).load())
    return documents

In [3]:
def load_pdf_files(directory: str) -> List["Document"]:
    """
    Load every PDF file found directly inside ``directory``.

    Each matching file is read with ``PyPDFLoader`` and whatever its ``load()``
    call returns is appended to one combined list.

    Args:
        directory: Path of the folder to scan. Only its immediate entries are
            examined; sub-folders are not visited.

    Returns:
        The documents produced by ``PyPDFLoader.load()`` for all matching
        files, grouped by file in sorted file-name order. Empty when the
        folder holds no PDF file.
    """
    documents = []
    # os.listdir gives no ordering guarantee; sort so reruns yield the same order.
    for file_name in sorted(os.listdir(directory)):
        # Compare case-insensitively so names such as "REPORT.PDF" are not skipped.
        if file_name.lower().endswith(".pdf"):
            file_path = os.path.join(directory, file_name)
            # Accumulate with extend: rebinding `documents` here would discard
            # the pages of every PDF except the last one visited.
            documents.extend(PyPDFLoader(file_path).load())
    return documents

In [None]:
"""
    Documentation for `split_docs`, which is defined in the next cell.

    Split documents into chunks using RecursiveCharacterTextSplitter.

    Args:
        documents (List[Document]): List of Document objects to be split.
        chunk_size (int): Maximum size of each chunk, in characters (default 400).
        chunk_overlap (int): Maximum overlap between consecutive chunks, in characters (default 40).

    Returns:
        List[Document]: List of split Document objects.
"""

In [4]:
def split_docs(documents: List[Document], chunk_size: int = 400, chunk_overlap: int = 40) -> List[Document]:
    """
    Break documents into smaller chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Document objects to split.
        chunk_size: Size limit handed to the splitter; sizes are measured
            with ``len``.
        chunk_overlap: Overlap setting handed to the splitter.

    Returns:
        The chunked documents returned by the splitter's ``split_documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(documents)
    return chunks

In [5]:
def upload_files(directory: str) -> List[Document]:
    """
    Gather the content of every supported file (CSV and PDF) in a directory.

    CSV documents are returned as loaded; PDF documents are first passed
    through ``split_docs`` (with its default settings) and the resulting
    chunks are placed after the CSV documents.
    """
    # CSV content goes in untouched and comes first in the result.
    csv_documents = load_csv_files(directory)

    # PDF content is chunked before being added.
    pdf_documents = load_pdf_files(directory)
    pdf_chunks = split_docs(pdf_documents)

    return [*csv_documents, *pdf_chunks]

In [6]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

def llama_embeddings(all_docs, model="llama3.1"):
    """
    Embed each chunk with an Ollama model via ``OllamaEmbeddings``.

    Args:
        all_docs: Iterable of chunks. Each item is either a plain string or an
            object that exposes its text as ``page_content`` (for example the
            Document objects returned by ``split_docs``).
        model: Model name passed to ``OllamaEmbeddings``.

    Returns:
        A list with one embedding per chunk, in input order. Empty when
        ``all_docs`` is empty.
    """
    embedding_model = OllamaEmbeddings(model=model)

    # NOTE(review): embed_query is kept so existing results do not change; if
    # these vectors are meant for a retrieval index, embed_documents may be the
    # intended call — confirm.
    return [
        # Use the chunk's text when it is a Document-like object, else the
        # chunk itself (already a string).
        embedding_model.embed_query(getattr(chunk, "page_content", chunk))
        for chunk in all_docs
    ]


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_community.embeddings.ollama import OllamaEmbeddings


In [7]:
from langchain_nomic import NomicEmbeddings

In [8]:
def nomic_embeddings(all_docs, model="nomic-embed-text-v1.5"):
    """
    Embed each chunk with a Nomic model via ``NomicEmbeddings``.

    Args:
        all_docs: Iterable of chunks. Each item is either a plain string or an
            object that exposes its text as ``page_content`` (for example the
            Document objects returned by ``split_docs``).
        model: Model name passed to ``NomicEmbeddings``.

    Returns:
        A list with one embedding per chunk, in input order. Empty when
        ``all_docs`` is empty.
    """
    # Initialize the NomicEmbeddings client with the specified model.
    embedding_model = NomicEmbeddings(model=model)

    # NOTE(review): embed_query is kept so existing results do not change; if
    # these vectors are meant for a retrieval index, embed_documents may be the
    # intended call — confirm.
    return [
        # Use the chunk's text when it is a Document-like object, else the
        # chunk itself (already a string).
        embedding_model.embed_query(getattr(chunk, "page_content", chunk))
        for chunk in all_docs
    ]

In [11]:
# Example usage: embed two literal strings with nomic_embeddings, then print
# the length of the second vector followed by its first 10 components.
chunks = ["This is the first chunk.", "This is the second chunk."]
embeddings = nomic_embeddings(chunks)
print(len(embeddings[1]), embeddings[1][:10])

768 [0.072021484, 0.031341553, -0.1940918, -0.05505371, 0.05355835, -0.014434814, -0.032226562, -0.0023593903, -0.019424438, -0.032226562]


In [12]:
# Example usage: same two strings, embedded with llama_embeddings instead.
# NOTE: this rebinds `chunks` and `embeddings`, replacing the values assigned by
# the Nomic example cell.
chunks = ["This is the first chunk.", "This is the second chunk."]
embeddings = llama_embeddings(chunks)
print(len(embeddings[1]), embeddings[1][:10])

4096 [-0.634800910949707, -2.7458271980285645, 2.647017240524292, 1.2148879766464233, -1.6613185405731201, -2.678941011428833, 0.4575340747833252, 1.8895249366760254, 0.12285317480564117, -1.275457501411438]


In [26]:
# Length of the second embedding vector. `embeddings` is assigned by both
# example cells, so the value depends on which of them ran most recently.
len(embeddings[1])

4096

In [15]:
# Load the PDFs from `directory`, split them with split_docs' default settings,
# and display the second resulting chunk.
pdfs = load_pdf_files(directory)
split_documents = split_docs(pdfs)
split_documents[1]

Document(metadata={'source': 'C:/Users/skrge/Documents/GitHub/llmtesting/data\\sample3.pdf', 'page': 0}, page_content='Elder patients are changing the expectations and the specifics of \nmedical services.  \nIn addition, people improve their attitude to  lifestyle, they are  \nmore sensible  to their health and want to have opportunity  to \nmonitor its condition.  \nAt the same time, medical services contain a lot of administrative \nroutine, repetitive work that can be optimized.')

In [16]:
# Collect the CSV documents and the chunked PDF documents found in `directory`.
all_docs = upload_files(directory)

In [17]:
# Peek at the last six entries; upload_files adds the PDF chunks after the CSV
# documents, so the tail of the list shows the end of the PDF output.
all_docs[-6:]

[Document(metadata={'source': 'C:/Users/skrge/Documents/GitHub/llmtesting/data\\sample3.pdf', 'page': 1}, page_content='with access. \nSo Important conditions for AI to deliver its full potential \nhealthcare will be the integration of different databases across \norganizations, strong governance to improve  data quality, and \ngreater confidence from organizations, doctors, and patients and \nthe ability to manage the related risks.  \nIn this case Data architects  and data engineers  will have'),
 Document(metadata={'source': 'C:/Users/skrge/Documents/GitHub/llmtesting/data\\sample3.pdf', 'page': 1}, page_content='significant role in defining how to record, store , structure and \nshare clinical data so that algorithms can be useful for doctors. \n \nСлайд 4 \nThe interaction  between doctors, medical management, data \nscientists and artificial intelligence specialists is an important \nfactor for successful implementation of AI in workf lows. \nAnd human -machine interactions shoul

In [18]:
# Character count of the fourth-from-last entry, for comparison with the
# default chunk_size of 400 that split_docs uses.
content_length = len(all_docs[-4].page_content)
content_length

370