### Connect to Google Drive

In [1]:
# Mount Google Drive inside the Colab VM so the project folder is reachable.
from google.colab import drive

drive.mount(mountpoint="/content/drive/")

Mounted at /content/drive/


### Change to the project directory

In [2]:
cd /content/drive/MyDrive/project/chat_with_data

/content/drive/MyDrive/project/chat_with_data


### Install dependencies

In [3]:
!pip install -r requirements.txt

Collecting bitsandbytes (from -r requirements.txt (line 4))
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting langchain_community (from -r requirements.txt (line 7))
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting pypdf (from -r requirements.txt (line 8))
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting fastapi (from -r requirements.txt (line 9))
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn (from -r requirements.txt (line 10))
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.

In [4]:
import warnings
warnings.filterwarnings("ignore")

### Log in to Hugging Face

In [None]:
# Read the Hugging Face token from the environment instead of hard-coding it:
# a token written into a notebook is exposed to everyone who can read the file.
# SECURITY: the token that was previously embedded in this cell must be treated
# as compromised and revoked in the Hugging Face account settings.
import os
from getpass import getpass

from huggingface_hub import login

# Fall back to an interactive, non-echoing prompt when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

### Run the application

In [None]:
# Launch the project's main.py as a shell command; its log is streamed below.
!python main.py

2025-04-21 12:43:31.682299: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745239411.702160    2689 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745239411.708296    2689 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-21 12:43:31.729270: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
config.json: 100% 878/878 [00:00<00:00, 7.03MB/s]
model.safetensors.index.json: 100% 20.9k/20.9k [00:00<00:00, 12.8MB

In [10]:
from typing import Union
import glob
import multiprocessing
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import Literal, List


In [74]:
def remove_non_utf8_characters(text):
    """Return ``text`` without characters that cannot be encoded as UTF-8.

    Only un-encodable code points (e.g. lone surrogates) are dropped. Ordinary
    non-ASCII characters such as accented letters are kept, unlike an ASCII-only
    filter which would strip them.

    Args:
        text: The string to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string (identical to the input when it is already valid).
    """
    if not isinstance(text, str):
        return text
    # Round-trip through UTF-8, silently discarding anything that cannot be encoded.
    return text.encode("utf-8", errors="ignore").decode("utf-8")

In [75]:
class PDFLoader:
    """Load PDF files into documents using a pool of worker processes."""

    def __init__(self) -> None:
        # Upper bound for the pool size: never use more workers than CPUs.
        self.num_processes = multiprocessing.cpu_count()

    def load_file(self, files):
        """Load a single PDF and clean the text of every returned document.

        Args:
            files: Path of ONE PDF file (the plural name is kept for
                backward compatibility with existing callers).

        Returns:
            The documents returned by ``PyPDFLoader(...).load()``, each with
            its ``page_content`` passed through ``remove_non_utf8_characters``.
        """
        docs = PyPDFLoader(files, extract_images=False).load()
        for doc in docs:
            doc.page_content = remove_non_utf8_characters(doc.page_content)
        return docs

    def __call__(self, files: List[str], **kwargs):
        """Load every path in ``files`` and return all documents in one list.

        Args:
            files: Paths of the PDF files to load.
            **kwargs: ``workers`` (int, optional) - requested number of worker
                processes. Defaults to 1 when omitted, ``None`` or 0, and is
                clamped to the range ``[1, cpu_count]``.

        Returns:
            A flat list of documents. Results are collected with
            ``imap_unordered``, so with more than one worker the order of
            files in the result is not guaranteed.
        """
        # A missing/None/zero value falls back to a single worker instead of
        # raising KeyError (missing key) or ValueError (Pool(processes=0)).
        requested = kwargs.get("workers") or 1
        num_processes = max(1, min(self.num_processes, requested))
        print(f"nums processes: {num_processes}")

        # Nothing to do: avoid spawning a process pool for an empty input.
        if not files:
            return []

        doc_loaded = []
        with multiprocessing.Pool(processes=num_processes) as pool:
            with tqdm(total=len(files), desc="Loading PDFs ...", unit="file") as pbar:
                for result in pool.imap_unordered(self.load_file, files):
                    doc_loaded.extend(result)
                    pbar.update(1)
        return doc_loaded

class TextSplitter:
    """Thin wrapper around ``RecursiveCharacterTextSplitter``."""

    def __init__(self,
                 separators: Union[List[str], None] = None,
                 chunk_size = 500,
                 chunk_overlap = 0) -> None:
        """Build the underlying splitter.

        Args:
            separators: Separators handed to the splitter. ``None`` (the
                default) means ``["\\n\\n", "\\n", " ", ""]``.
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        # A list literal as a default argument is created once and shared by
        # every instance; build a fresh list per call instead.
        if separators is None:
            separators = ["\n\n", "\n", " ", ""]
        self.splitter = RecursiveCharacterTextSplitter(
            # Copy so the splitter never aliases the caller's list.
            separators = list(separators),
            chunk_size = chunk_size,
            chunk_overlap = chunk_overlap)

    def __call__(self, documents):
        """Split ``documents`` and return the result of ``split_documents``."""
        return self.splitter.split_documents(documents)

class Loader:
    """Load files of a supported type and split them into chunks."""

    def __init__(self,
                file_type: Literal["pdf"] = "pdf",
                split_kwargs: Union[dict, None] = None) -> None:
        """Create the document loader and the text splitter.

        Args:
            file_type: Kind of file to load; only ``"pdf"`` is supported.
                (Previously the annotation and default were swapped, so the
                default value was the ``Literal["pdf"]`` typing object and
                ``Loader()`` always failed the check below.)
            split_kwargs: Keyword arguments for ``TextSplitter``. ``None``
                (the default) means ``{"chunk_size": 500, "chunk_overlap": 0}``.

        Raises:
            AssertionError: If ``file_type`` is not supported.
            ValueError: Same condition when assertions are disabled (``-O``).
        """
        assert file_type in ["pdf"], "file type must be pdf"
        self.file_type = file_type
        if file_type == "pdf":
            self.doc_loader = PDFLoader()
        else:
            # Only reachable when assertions are stripped with -O.
            raise ValueError("file type must be pdf")
        # Build a fresh dict per instance rather than sharing a mutable default,
        # and copy a caller-supplied dict so it is never aliased.
        if split_kwargs is None:
            split_kwargs = {"chunk_size":500,"chunk_overlap":0}
        self.doc_splitter = TextSplitter(**dict(split_kwargs))

    def load(self, files: Union[str, List[str]], workers: int = 1):
        """Load one or more files and return the split documents.

        Args:
            files: A single path or a list of paths.
            workers: Number of worker processes passed to the document loader.

        Returns:
            Whatever the splitter returns for the loaded documents.
        """
        if isinstance(files, str):
            files = [files]
        doc_loaded = self.doc_loader(files, workers = workers)
        return self.doc_splitter(doc_loaded)

    def load_dir(self, dir_path: str, workers: int = 1):
        """Load every file of the configured type directly inside ``dir_path``.

        Args:
            dir_path: Directory to search (non-recursive ``*.pdf`` glob).
            workers: Number of worker processes passed to ``load``.

        Raises:
            AssertionError: If no matching file is found.
            ValueError: If the configured file type is not supported.
        """
        if self.file_type == 'pdf':
            files = glob.glob(f"{dir_path}/*.pdf")
            assert len(files) > 0, f"No {self.file_type} files found in {dir_path}"
        else:
            raise ValueError("file_type must be pdf")
        return self.load(files, workers)

In [61]:
# Directory (relative to the working directory) that is searched for PDFs.
# NOTE(review): this name shadows the builtin dir(); it is kept because a
# later cell passes `dir` to loader.load_dir.
dir = "data"

In [63]:
# Build a loader for PDF input with the default splitting settings.
loader = Loader(file_type="pdf")

In [65]:
# Load and split every PDF in the data directory (default: one worker).
doc = loader.load_dir(dir_path=dir)

nums processes: 1


Loading PDFs ...: 100%|██████████| 2/2 [00:04<00:00,  2.07s/file]


In [25]:
# `files` was not assigned in any visible cell, so a fresh "Run All" raised
# NameError here. Rebuild it with the same glob Loader.load_dir uses, then show it.
files = glob.glob(f"{dir}/*.pdf")
files

['data/llama.pdf', 'data/attention.pdf']

In [19]:
pwd

'/content/drive/MyDrive/project/chat_with_data'

In [68]:
# Inspect one split chunk (index 5) to check the quality of the extracted text.
doc[5]

Document(metadata={'producer': 'Skia/PDF m112', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data/llama.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='a lcapabilities, f u n d a m e n t a l l y a l t e r i n g h o w m a c h i n e s i n t e r a c t w i t h h u m a n i n p u t s . A m o n g t h e m o s tsignificantd e v e l o p m e n t s i n t h i s f i e l d i s t h e e m e r g e n c e o f m u l t i m o d a l A I , w h i c h i n t e g r a t e s a n dprocesses i n f o r m a t i o n f r o m m u l t i p l e m o d a l i t i e s  s u c h a s t e x t , i m a g e s , a n daudiosimu l t a n e o u s l y . T h i s c a p a b i l i t y a l l o w s f o r a')

In [76]:
# Stand-alone PDFLoader (no splitting) for debugging the raw extraction.
# NOTE(review): `a` is not descriptive; it is kept because the next cell calls it.
a = PDFLoader()

In [77]:
# Load the two PDFs by explicit path, requesting a single worker process.
loaded = a(['data/llama.pdf', 'data/attention.pdf'], workers = 1)

nums processes: 1


Loading PDFs ...: 100%|██████████| 2/2 [00:05<00:00,  2.61s/file]


In [78]:
# Look at the first loaded document before any splitting is applied.
loaded[0]

Document(metadata={'producer': 'Skia/PDF m112', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data/llama.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content="M e t a L l a m a 4 : T h e F u t u r e o f M u l t i m o d a l A I A j i t S i n g h , B i h a r N a t i o n a l C o l l e g e , P a t n a U n i v e r s i t y , I n d i aAbstractThisresearc h p a p e r d e l v e s i n t o t h e t r a n s f o r m a t i v e c a p a b i l i t i e s o f M e t a L l a m a 4 , a c u t t i n g - e d g emultimodalA I m o d e l d e v e l o p e d b y M e t a P l a t f o r m s , I n c . B y i n t e g r a t i n g d i v e r s e d a t a t y p e s — s u c hastext,ima g e s , a n d a u d i o — M e t a L l a m a 4 r e p r e s e n t s a s i g n i f i c a n t a d v a n c e m e n t i n a r t i f i c i a lintelligence, e n h a n c i n g c o n t e x t u a l u n d e r s t a n d i n g a n d p e r f o r m a n c e a c r o s s v a r i o u s a p p l i c a t i o n s .Thisstudye v a l u a t e s t h e m o

In [82]:
# Load attention.pdf directly with PyPDFLoader (image extraction disabled),
# bypassing the PDFLoader wrapper defined in this notebook.
docs = PyPDFLoader('data/attention.pdf', extract_images = False).load()

In [83]:
# Display the loaded documents.
# NOTE(review): this prints the whole list; consider showing a single element.
docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlu