# Config

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings

import os
import shutil

# Parameters


In [15]:
# Folder of source PDFs.
# NOTE(review): no cell visible in this notebook reads this name, and the demo
# calls pass "../data" explicitly - confirm which location is intended.
pdf_data_path = "data"
# Folder the create_db_* functions pass to db.save_local() when persisting the FAISS index.
vector_db_path = "vectorstores/db_faiss"

# Functions

In [4]:
def create_db_from_text(raw_text):
    """Build a FAISS vector store from a raw text string and save it to disk.

    The text is split on newlines by ``CharacterTextSplitter`` (configured with
    ``chunk_size=512`` and ``chunk_overlap=50``, measured with ``len``), each
    chunk is embedded with ``GPT4AllEmbeddings``, and the resulting index is
    saved to the module-level ``vector_db_path``.

    Args:
        raw_text (str): Text to index.

    Returns:
        FAISS: The in-memory vector store that was just saved.

    Raises:
        ValueError: If splitting ``raw_text`` produces no chunks (for example
            an empty or whitespace-only string).
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=512,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_text(raw_text)

    # Fail early with a clear message: an empty chunk list would otherwise
    # error out inside FAISS.from_texts, after the embedding model was loaded.
    if not chunks:
        raise ValueError("raw_text produced no text chunks; nothing to index.")

    # Embeddings
    # NOTE(review): confirm the installed GPT4AllEmbeddings honors `model_file`;
    # some langchain_community releases document `model_name` instead.
    embedding_model = GPT4AllEmbeddings(model_file="../models/all-MiniLM-L6-v2-f16.gguf")

    # Build the FAISS index and persist it to vector_db_path
    db = FAISS.from_texts(texts=chunks, embedding=embedding_model)
    db.save_local(vector_db_path)
    print("Success")
    return db

def create_db_from_files(folder_path='../data'):
    """Build a FAISS vector store from the PDF files in a folder and save it.

    Files matching ``*.pdf`` in ``folder_path`` are loaded through
    ``DirectoryLoader`` with ``PyPDFLoader``, split by
    ``RecursiveCharacterTextSplitter`` (``chunk_size=512``,
    ``chunk_overlap=50``), embedded with ``GPT4AllEmbeddings``, and the index is
    saved to the module-level ``vector_db_path``.

    Args:
        folder_path (str): Folder to scan for PDF files.

    Returns:
        FAISS: The in-memory vector store that was just saved.

    Raises:
        ValueError: If no text chunks could be produced from ``folder_path``
            (for example the folder contains no matching PDF).
    """
    # Load all PDFs found in the data folder
    loader = DirectoryLoader(folder_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)

    # Fail early with a clear message: an empty chunk list would otherwise
    # error out inside FAISS.from_documents, after the embedding model was loaded.
    if not chunks:
        raise ValueError(f"No text chunks were produced from PDF files in {folder_path!r}.")

    # NOTE(review): confirm the installed GPT4AllEmbeddings honors `model_file`;
    # some langchain_community releases document `model_name` instead.
    embedding_model = GPT4AllEmbeddings(model_file="../models/all-MiniLM-L6-v2-f16.gguf")

    # Build the FAISS index and persist it to vector_db_path
    db = FAISS.from_documents(chunks, embedding=embedding_model)
    db.save_local(vector_db_path)
    print("Success")
    return db
    
def create_db_from__one_file(pdf_file):
    """Build a FAISS vector store from a single PDF file and save it to disk.

    The PDF is loaded and split in one step via ``PyPDFLoader.load_and_split``
    using ``RecursiveCharacterTextSplitter`` (``chunk_size=512``,
    ``chunk_overlap=50``), embedded with ``GPT4AllEmbeddings``, and the index is
    saved to the module-level ``vector_db_path``.

    Args:
        pdf_file (str): Path to the PDF file to index.

    Returns:
        FAISS: The in-memory vector store that was just saved.

    Raises:
        ValueError: If no text chunks could be produced from ``pdf_file``.
    """
    loader = PyPDFLoader(pdf_file)
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    pages = loader.load_and_split(splitter)

    # Fail early with a clear message: an empty chunk list would otherwise
    # error out inside FAISS.from_documents, after the embedding model was loaded.
    if not pages:
        raise ValueError(f"No text chunks were produced from {pdf_file!r}.")

    # NOTE(review): confirm the installed GPT4AllEmbeddings honors `model_file`;
    # some langchain_community releases document `model_name` instead.
    embedding_model = GPT4AllEmbeddings(model_file="../models/all-MiniLM-L6-v2-f16.gguf")

    # Build the FAISS index and persist it to vector_db_path
    db = FAISS.from_documents(pages, embedding=embedding_model)
    db.save_local(vector_db_path)
    print("Success")
    return db

In [9]:
def remove_all_vectorstores(folder_path="./vectorstores"):
    """
    Remove all files, symbolic links and subdirectories within the specified folder.

    The folder itself is kept. Symbolic links are unlinked rather than followed,
    so whatever they point to is left untouched. Filesystem errors are reported
    with ``print`` instead of being raised, and the success message is printed
    only when no such error occurred.

    Args:
    - folder_path (str): Path to the folder whose contents will be removed.
    """
    try:
        for item in os.listdir(folder_path):
            item_path = os.path.join(folder_path, item)
            # Test for links first: os.path.isdir() follows symlinks, and
            # shutil.rmtree() refuses to run on a symlink, which would abort the
            # whole cleanup. Dangling links fail both isfile() and isdir() and
            # would otherwise be skipped silently.
            if os.path.islink(item_path) or os.path.isfile(item_path):
                os.remove(item_path)
            # Real directories are removed recursively
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        print(f"All contents within {folder_path} have been removed successfully.")
    except OSError as e:
        # Best-effort cleanup: report the filesystem problem and return.
        print(f"Error occurred while removing contents: {e}")

# create_db_from_text

In [5]:
# NOTE(review): this call and create_db_from_text() below both save to the same
# vector_db_path, so the index written here is presumably replaced by the
# second call - confirm whether this call is needed in this cell.
create_db_from_files("../data")

    
## Transform raw text into a vector DB
# Sample Vietnamese paragraph (about attention / deformable convolution) used as demo input.
raw_text = """ Transformer attention thông thường thực hiện attention trên toàn bộ 
feature map nó dẫn đến độ phức tạp của thuật toán tăng cao khi spatial size của feature map tăng. 
Tác giả đưa ra một kiểu attention mới mà chỉ attend 
vào một số sample locations (sample locations này cũng không cố định mà được học trong 
quá trình training tương tự như trong deformable convolution) 
qua đó giúp giảm độ phức tạp của thuật toán và làm giảm thời gian training mô hình. """
create_db_from_text(raw_text)

Success
Success


<langchain_community.vectorstores.faiss.FAISS at 0x17dff41a610>

In [10]:
# Empty the default "./vectorstores" folder before the next demo.
remove_all_vectorstores()

All contents within ./vectorstores have been removed successfully.


# Transform one PDF file into a vector DB

In [14]:
# Index a single PDF; create_db_from__one_file saves the resulting FAISS store to vector_db_path.
pdf_file = "../data/Ebook Copywriting - Minh Xin Chào.pdf"
create_db_from__one_file(pdf_file)
# data\Ebook Copywriting - Minh Xin Chào.pdf

Success


<langchain_community.vectorstores.faiss.FAISS at 0x17d89a25f50>

In [16]:
# Empty the default "./vectorstores" folder before the next demo.
remove_all_vectorstores()

All contents within ./vectorstores have been removed successfully.


# Transform a folder of PDF files into a vector DB

In [17]:
# Index the files matching *.pdf in ../data; the FAISS store is saved to vector_db_path.
create_db_from_files('../data')

Success


<langchain_community.vectorstores.faiss.FAISS at 0x17d89971750>

In [18]:
# Final cleanup: empty the default "./vectorstores" folder.
remove_all_vectorstores()

All contents within ./vectorstores have been removed successfully.
