# Db Tool



# 1. Setup

In [1]:
!pip -q  install  loguru
!pip -q  install  sentence-transformers
!pip -q install pypdf
!pip -q  install  faiss-gpu
!pip -q  install  langchain_core==0.2.0
!pip -q  install  langchain-community==0.2.0
!pip -q  install  langchain==0.2.0

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.5/294.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.9/307.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.9/296.9 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%cd /content
%mkdir pdf
%mkdir db

/content


In [3]:
from loguru import logger # Import logger
import time
import os

### 1.2. Getting Embeddings Model from HuggingFace

For details on `HuggingFaceEmbeddings`, see the LangChain API reference: https://api.python.langchain.com/en/latest/huggingface/embeddings/langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings.html

In [4]:
# Import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
def get_embeddings(type='cpu'):
    """Build the HuggingFace embeddings model used by this notebook.

    The model is always 'intfloat/multilingual-e5-large'. The time spent
    constructing it is written to the debug log.

    :param type: 'cpu' places the model on the CPU; any other value
        selects 'cuda'.
    :return: the constructed HuggingFaceEmbeddings instance.
    """
    logger.debug('get_embeddings............')
    started_at = time.time()

    # Only the exact string 'cpu' selects the CPU; everything else means GPU.
    device = 'cpu' if type == 'cpu' else 'cuda'
    embeddings = HuggingFaceEmbeddings(
        model_name='intfloat/multilingual-e5-large',
        model_kwargs={'device': device},
    )

    elapsed_time = time.time() - started_at
    logger.debug(f'get_embeddings elapsed_time = {elapsed_time} sec')
    return embeddings

### 1.3. Function for Creating the Vector Knowledge Base

In [5]:
import os
from loguru import logger
from langchain_community.vectorstores import FAISS

def get_index_db(dir, db_file_name, device='cuda'):
    """
    Load the FAISS vector knowledge base, or build it from PDF documents.

    If ``<db_file_name>/index.faiss`` already exists, the knowledge base is
    loaded from ``db_file_name``. Otherwise every PDF found under ``dir``
    (searched recursively) is read, split into chunks, embedded, and the
    resulting knowledge base is saved to ``db_file_name``.

    :param dir: directory containing the PDF files
    :param db_file_name: directory where the vector knowledge base is stored
    :param device: value forwarded to ``get_embeddings(type=...)``; the
        default 'cuda' is the value that used to be hard-coded here
    :return: the vector knowledge base (FAISS)
    :raises ValueError: if the knowledge base has to be built but no text
        chunks could be produced from the PDFs under ``dir``
    """
    logger.debug('...get_index_db')

    # Create the embeddings model used both for loading and for building.
    logger.debug('Embeddings')
    embeddings = get_embeddings(type=device)

    # Try to load an existing vector knowledge base from disk.
    logger.debug('Загрузка векторной Базы-Знаний из файла')
    # os.path.join handles db_file_name with or without a trailing slash.
    file_path = os.path.join(db_file_name, "index.faiss")

    if os.path.exists(file_path):
        logger.debug('Уже существует векторная База-знаний')
        # NOTE(security): allow_dangerous_deserialization opts in to loading
        # pickled data (see the LangChain FAISS docs). Only load knowledge
        # bases that were created by a trusted source.
        db = FAISS.load_local(db_file_name, embeddings, allow_dangerous_deserialization=True)

    else:
        logger.debug('Еще не создана векторная База-Знаний')
        # No saved knowledge base: build a new one from the PDF documents.
        ## Document loaders: https://python.langchain.com/docs/integrations/document_loaders
        ## PyPDFLoader: https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
        from langchain_community.document_loaders import PyPDFLoader

        logger.debug(f'Document loaders. dir={dir}')
        documents = []
        # Read every PDF file in the directory tree rooted at `dir`.
        for root, _dirs, files in os.walk(dir):
            for file in files:
                # Case-insensitive match so "*.PDF" files are not skipped.
                if file.lower().endswith(".pdf"):
                    logger.debug(f'root={root} file={file}')
                    loader = PyPDFLoader(os.path.join(root, file))
                    documents.extend(loader.load())

        # Split the documents into smaller parts (chunks).
        logger.debug('Разделение на chunks')
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
        source_chunks = text_splitter.split_documents(documents)
        logger.debug(type(source_chunks))
        logger.debug(len(source_chunks))

        # Fail early with a clear message instead of an opaque error further down.
        if not source_chunks:
            raise ValueError(
                f'No text chunks were produced from PDF files under {dir!r}; '
                'cannot build the vector knowledge base'
            )

        # Log one sample chunk. The index is clamped so that small corpora
        # (fewer than 101 chunks) no longer raise IndexError.
        sample_chunk = source_chunks[min(100, len(source_chunks) - 1)]
        logger.debug(sample_chunk.metadata)
        logger.debug(sample_chunk.page_content)

        # Build the vector knowledge base from the chunks.
        logger.debug('Векторная База-Знаний')
        db = FAISS.from_documents(source_chunks, embeddings)

        # Persist the newly created knowledge base.
        logger.debug('Сохранение векторной Базы-Знаний в файл')
        db.save_local(db_file_name)

    return db

In [6]:
# Folder that holds the source PDF files.
dir = '/content/pdf'
# Folder for the saved vector knowledge base (relative to the current working directory).
db_file_name = 'db/db_systems_analyst/'
# Load the knowledge base if it was saved before, otherwise build and save it.
db = get_index_db(dir=dir, db_file_name=db_file_name)

[32m2024-10-24 06:16:33.286[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_index_db[0m:[36m15[0m - [34m[1m...get_index_db[0m
[32m2024-10-24 06:16:33.288[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_index_db[0m:[36m17[0m - [34m[1mEmbeddings[0m
[32m2024-10-24 06:16:33.291[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_embeddings[0m:[36m5[0m - [34m[1mget_embeddings............[0m
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

[32m2024-10-24 06:17:13.494[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_embeddings[0m:[36m18[0m - [34m[1mget_embeddings elapsed_time = 40.201823472976685 sec[0m
[32m2024-10-24 06:17:13.496[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_index_db[0m:[36m30[0m - [34m[1mЗагрузка векторной Базы-Знаний из файла[0m
[32m2024-10-24 06:17:13.498[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_index_db[0m:[36m40[0m - [34m[1mЕще не создана векторная База-Знаний[0m
[32m2024-10-24 06:17:13.507[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_index_db[0m:[36m48[0m - [34m[1mDocument loaders. dir=/content/pdf[0m
[32m2024-10-24 06:17:13.508[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_index_db[0m:[36m54[0m - [34m[1mroot=/content/pdf file=Fowler_UML_Distilled.pdf[0m
[32m2024-10-24 06:17:17.674[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_index_db[0m:[36m54[0m - [34m[1mroot=/content/pdf file=Redmond_Se