In [None]:
# from langchain_core.documents import Document

In [None]:
class Document:
    """Hand-rolled stand-in for a document record: text plus metadata plus encoding."""

    def __init__(self, page_content, metadata, encoding):
        """Store the text, its metadata and the encoding it was read with."""
        self.page_content = page_content
        self.metadata = metadata
        self.encoding = encoding

    def __str__(self):
        """Render page_content and metadata in a JSON-like layout (encoding is not shown)."""
        template = '{{"page_content": {}, "metadata": {}}}'
        return template.format(self.page_content, self.metadata)

    def __repr__(self):
        """Use the same text as ``str()`` so notebook echo and ``print`` agree."""
        return str(self)

    def load(self):
        """Return this document itself, so it can be used where a loader is expected."""
        return self

In [None]:
# Build a Document by hand and print its string form.
doc = Document(
    page_content="Hello this is some document",
    metadata={"source": "manual.pdf", "page": 10, "author": "Gemini"},
    encoding="utf-8",
)
print(doc)

In [None]:
# from langchain_community.document_loaders import TextLoader

In [None]:
class TextLoader:
    """Read a single text file into a ``Document``.

    Parameters
    ----------
    file_path : path of the file to read.
    encoding  : text encoding passed to ``open``.
    """

    def __init__(self, file_path, encoding):
        self.file_path = file_path
        self.encoding = encoding

    def __str__(self):
        # NOTE: rendering the loader reads the file, because it formats the
        # result of load(); printing a loader for a missing file raises.
        return f"{self.load()}"

    def __repr__(self):
        return self.__str__()

    def load(self):
        """Read the file and wrap its text in a ``Document``.

        Returns
        -------
        Document whose metadata is ``{"source": file_path}``.

        Raises
        ------
        Exception
            If the file cannot be opened, read or decoded. The underlying
            error is attached as ``__cause__``.
        """
        try:
            with open(self.file_path, encoding=self.encoding) as f:
                loaded_text = f.read()
        except Exception as exc:
            # `except Exception` (not a bare `except:`) lets KeyboardInterrupt
            # and SystemExit propagate instead of being turned into a load error.
            raise Exception(f"couldnt load file: {self.file_path}") from exc
        return Document(
            page_content=loaded_text,
            metadata={"source": self.file_path},
            encoding=self.encoding,
        )

In [None]:
# Despite its name this holds a TextLoader, not a Document; the constructor
# only stores the path and encoding, the file is read by .load().
loaded_document = TextLoader(
    "data/text_files/context8.txt",
    encoding="utf-8",
)

In [None]:
# Read the file; the returned Document is echoed as the cell output.
loaded_document.load()

In [None]:
# from langchain_community.document_loaders import DirectoryLoader

In [None]:
import os

In [None]:
class DirectoryLoader:
    """Recursively load every file with a given suffix under a directory.

    Parameters
    ----------
    directory_path : root directory to walk.
    file_type      : filename suffix to keep (matched with ``str.endswith``).
    encoding       : passed to each loader as ``encoding=``.
    loader_cls     : callable invoked as ``loader_cls(path, encoding=...)``;
                     the object it returns must provide ``load()``.
    """

    def __init__(self, directory_path, file_type, encoding, loader_cls):
        self.directory_path = directory_path
        self.file_type = file_type
        self.encoding = encoding
        self.loader_cls = loader_cls

    def __str__(self):
        # NOTE: formatting the loader runs a full custom_load() every time.
        return f"DirectoryLoader(docs={self.custom_load()})"

    def __repr__(self):
        return self.__str__()

    def custom_load(self):
        """Walk ``directory_path`` depth-first and load every matching file.

        Returns
        -------
        list of whatever ``loader_cls(...).load()`` returns, one item per
        matching file. Entries are visited in sorted name order so the result
        does not depend on the order ``os.listdir`` happens to return.
        """
        docs = []

        def recurse(current_dir):
            # The directory being scanned is a parameter: listing
            # self.directory_path on every call would never descend.
            for entry in sorted(os.listdir(current_dir)):
                full_path = os.path.join(current_dir, entry)
                if os.path.isdir(full_path):
                    recurse(full_path)
                elif entry.endswith(self.file_type):
                    loader = self.loader_cls(full_path, encoding=self.encoding)
                    docs.append(loader.load())

        recurse(self.directory_path)
        return docs

In [None]:
# Loader for the .txt files under data/text_files, one TextLoader per file.
dir_loader_txt = DirectoryLoader(
    directory_path="data/text_files",
    file_type=".txt",
    encoding="utf-8",
    loader_cls=TextLoader,
)

In [None]:
# Load every matching text file; `docs` is reassigned by a later cell that loads PDFs.
docs=dir_loader_txt.custom_load()

In [None]:
# Printing the loader goes through __str__, which runs custom_load() again.
print(dir_loader_txt)

In [None]:
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain_community.document_loaders import DirectoryLoader
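# The PDF loaders are built on a separate class called Blob and on the pypdf library, so
# re-implementing them by hand is possible but tedious; use the library versions instead.
# Note: the DirectoryLoader import above replaces the hand-written DirectoryLoader class.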

In [None]:
# Loader for every PDF matched by the recursive glob under data/papers.
dir_loader_pdf = DirectoryLoader(
    path="data/papers",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)

In [None]:
# Run the PDF directory loader; this rebinds `docs`, which an earlier cell
# assigned from the text-file loader.
docs=dir_loader_pdf.load()

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(docs,chunk_size=1000,chunk_overlap=200):
    """Split documents into overlapping chunks and return them.

    Parameters
    ----------
    docs          : documents to split.
    chunk_size    : maximum chunk length, passed to the splitter.
    chunk_overlap : overlap between consecutive chunks, passed to the splitter.

    Returns
    -------
    The list of chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    A short summary and the first chunk are also printed.
    """
    text_spliter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Separators are tried in order, so they must go from coarsest to
        # finest. With "" first the splitter always cuts per character and
        # never prefers paragraph, line or word boundaries.
        separators=["\n\n","\n"," ",""]
    )
    splitted_chunks=text_spliter.split_documents(docs)
    print(f"split {len(docs)} documents into {len(splitted_chunks)} chunks")
    if splitted_chunks:
        print("Example:")
        print(f"{type(splitted_chunks[0])}")
        print(f"{splitted_chunks[0]}")
    return splitted_chunks


# Keep the chunks in a variable: a bare call as the last line of the cell
# would echo the entire returned list as cell output.
split_chunks = split_documents(docs,1000,200)

In [None]:
import uuid
import chromadb
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from enum import Enum,auto

class Processor(Enum):
    """Where the embedding model should run."""

    # Explicit values equal to the ones auto() assigned (1, 2).
    CPU = 1
    GPU = 2

In [None]:
from llama_cpp import Llama
import gc
import torch

In [None]:
def clean_memory(model):
    """Release a model's resources and report free VRAM when CUDA is present.

    Parameters
    ----------
    model : the model object to release, or ``None`` to only collect garbage.

    Notes
    -----
    ``del model`` only removes this function's local name; the caller's
    reference keeps the object alive. The model is therefore closed
    explicitly when it exposes a ``close()`` method. The caller should still
    drop its own reference afterwards.
    """
    if model is not None:
        close = getattr(model, "close", None)
        if callable(close):
            close()
        del model
    gc.collect()
    if torch.cuda.is_available():
        # Only touch the CUDA allocator when a CUDA device actually exists.
        torch.cuda.empty_cache()
        free=torch.cuda.mem_get_info()[0]/1024**3
        print(f"    [SYSTEM] VRAM Free: {free:.2f} GB")

In [None]:
import numpy as np

In [None]:
class EmbeddingModel:
    """Wrapper around a llama.cpp model used to embed texts.

    Parameters
    ----------
    model_path : path of the GGUF model file.
    processor  : ``Processor.CPU`` keeps all layers on the CPU; any other
                 value offloads all layers to the GPU.
    n_ctx      : context size passed to ``Llama``.
    n_batch    : batch size passed to ``Llama``.

    The model is not loaded on construction; call ``load()`` first and
    ``unload()`` when done.
    """

    def __init__(self,model_path,processor,n_ctx,n_batch):
        self.model_path=model_path
        self.processor=processor
        self.n_ctx=n_ctx
        self.n_batch=n_batch
        self.model=None

    def load(self):
        """Create the ``Llama`` instance in embedding mode."""
        # 0 keeps every layer on the CPU, -1 offloads all of them to the GPU.
        n_gpu_layers = 0 if self.processor==Processor.CPU else -1
        self.model=Llama(model_path=self.model_path,
                         n_gpu_layers=n_gpu_layers,
                         n_ctx=self.n_ctx,
                         n_batch=self.n_batch,
                         # create_embedding() requires the model to be built
                         # with embedding=True.
                         embedding=True,
                         verbose=False)

    def unload(self):
        """Release the model and forget it, so the object can be freed."""
        # Clear the attribute first so this wrapper never keeps a reference
        # to a model that has been cleaned up.
        model, self.model = self.model, None
        clean_memory(model)

    def embed(self,texts,show_progress_bar=False):
        """Embed texts one at a time.

        Parameters
        ----------
        texts             : sequence of strings; a single string is treated
                            as a one-element list.
        show_progress_bar : when true, print a progress line every 50 texts.

        Returns
        -------
        ``numpy`` array of dtype float32 with one row per input text.

        Raises
        ------
        RuntimeError
            If called before ``load()`` (or after ``unload()``).
        """
        if self.model is None:
            raise RuntimeError("EmbeddingModel.embed() called before load()")
        if isinstance(texts, str):
            # Iterating a bare string would embed it character by character.
            texts = [texts]
        total=len(texts)
        embeddings=[]
        for i, text in enumerate(texts, start=1):
            if show_progress_bar and i%50==0:
                print(f"{i}/{total} texts done")
            full_data=self.model.create_embedding(text)
            embeddings.append(full_data["data"][0]["embedding"])
        return np.array(embeddings, dtype="float32")


In [None]:
from os import path

In [None]:
# Resolve the model path once and reuse it. The previous f-string nested double
# quotes inside a double-quoted f-string, which is a SyntaxError before Python 3.12.
embedding_model_path = path.abspath(
    path.join(os.getcwd(), "../../models/gte-Qwen2-1.5B-instruct-f16.gguf")
)
print(embedding_model_path)
embeddingModel=EmbeddingModel(model_path=embedding_model_path,
                              processor=Processor.GPU,
                              n_ctx=8192*3,
                              n_batch=512)

In [None]:

# Build the Llama instance from the path and settings stored on the wrapper.
embeddingModel.load()


In [None]:
# Hand the model to clean_memory() to release it.
embeddingModel.unload()