# Library requirements

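#### API: Langchain (LangSmith) — API key redacted. Load the key from an environment variable (e.g. `LANGCHAIN_API_KEY`) instead of storing it in the notebook, and revoke any key that was previously committed here.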

In [2]:
!pip install numpy pandas pretty_errors pypdf IPython langchain langchain-community langchain-core sentence-transformers

Collecting pretty_errors
  Using cached pretty_errors-1.2.25-py3-none-any.whl.metadata (12 kB)
Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl.metadata (7.4 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.2.1-py3-none-any.whl.metadata (8.9 kB)
Using cached pretty_errors-1.2.25-py3-none-any.whl (17 kB)
Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
   ---------------------------------------- 0.0/290.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/290.4 kB ? eta -:--:--
   ----------- ---------------------------- 81.9/290.4 kB 1.1 MB/s eta 0:00:01
   ------------------------------- -------- 225.3/290.4 kB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 290.4/290.4 kB 2.2 MB/s eta 0:00:00
Downloading langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
   -------

In [1]:
import os
import torch
from pprint import pprint
import pretty_errors
import pypdf
from IPython.display import display, Markdown

import numpy as np
import pandas as pd
import warnings

from langchain_text_splitters import (
    RecursiveCharacterTextSplitter, 
    CharacterTextSplitter
)
from langchain_community.document_loaders import (
    PyPDFLoader, 
    DirectoryLoader
)

from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_community.embeddings import GPT4AllEmbeddings
# from langchain_community.llms import CTransformers # For cpu run
from langchain.llms import Ollama
from langchain.chains import RetrievalQA, LLMChain
from langchain.memory import ConversationSummaryMemory
from langchain.prompts import PromptTemplate
# from langchain.prompts.few_shot import FewShotPromptTemplate

from sentence_transformers import SentenceTransformer

warnings.filterwarnings('ignore')

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda



# Data Storage

In [3]:
# Locations of the input PDFs and of the persisted FAISS index, relative to the
# notebook's working directory. os.path.join picks the right separator for the
# host OS; a hard-coded backslash only works on Windows.
pdf_path = os.path.join("assets", "pdf")
vector_db_path = os.path.join("assets", "vector_db")

In [4]:
def create_db_from_text(strs):
    """
    Build a FAISS vector database from a raw text string and save it to disk.

    The text is split on newlines into chunks (target size 500 characters,
    50 characters of overlap), each chunk is embedded with the local GPT4All
    embedding model, and the resulting index is written to ``vector_db_path``.

    Args:
        strs: The raw text to index.

    Returns:
        The FAISS vector store built from the text chunks.

    Raises:
        ValueError: If the text produces no chunks (e.g. it is empty), since
            an index cannot be built from zero texts.
    """

    # Initialize the text splitter
    text_splitter = CharacterTextSplitter(
        separator = "\n",
        chunk_size = 500,
        chunk_overlap = 50,
        length_function = len
    )

    # Split the text into chunks
    chunks = text_splitter.split_text(strs)
    if not chunks:
        raise ValueError("create_db_from_text: input text produced no chunks to index")

    # Load the GPT4All sentence-embedding model (a local MiniLM model, not GPT-4).
    # NOTE(review): confirm `model_file` is the keyword the installed
    # GPT4AllEmbeddings version expects for selecting the model.
    embeddings_model = GPT4AllEmbeddings(model_file = "model/all-MiniLM-L12-v2.Q8_0.gguf")

    # Create the vector database using FAISS. The constructor for a list of
    # strings is `from_texts`; `FAISS.from_text` does not exist.
    database = FAISS.from_texts(chunks, embeddings_model)
    database.save_local(vector_db_path)

    return database

In [5]:
def create_db_from_pdf(path):
    """
    Index every PDF found in a directory into a FAISS vector database.

    All ``*.pdf`` files under ``path`` are loaded with PyPDFLoader, split into
    chunks (chunk size 1024, overlap 20, measured with ``len``), embedded with
    the local GPT4All embedding model and stored in a FAISS index, which is
    also saved to ``vector_db_path``.

    Args:
        path: Directory that contains the PDF files.

    Returns:
        The FAISS vector store built from the document chunks.
    """
    # Separators handed to the splitter, listed from coarsest to finest.
    split_separators = [
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200B",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ]

    pdf_loader = DirectoryLoader(
        path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        use_multithreading=True,
        show_progress=True,
    )
    loaded_docs = pdf_loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=20,
        length_function=len,
        separators=split_separators,
    )
    doc_chunks = splitter.split_documents(loaded_docs)

    embedder = GPT4AllEmbeddings(model_file="model/all-MiniLM-L12-v2.Q8_0.gguf")

    # Build the index, then persist it so it can be reused from disk.
    vector_store = FAISS.from_documents(doc_chunks, embedder)
    vector_store.save_local(vector_db_path)
    return vector_store

# LLM Model Loader

In [18]:
# LLM served through Ollama; the same instance is later passed to chain_lang
# as the chain's language model.
model = Ollama(
    model = "llama2:13b",  # Ollama model tag to run
    num_gpu = -1,  # NOTE(review): presumably "use all available GPUs" — confirm against the Ollama docs
    num_ctx = 4096,  # context window size passed to Ollama
    temperature = 0.01,  # near-zero temperature
)

In [19]:
def prompt_gen(template):
    """
    Wrap a template string in a LangChain PromptTemplate.

    The input variables are inferred from the ``{placeholders}`` in the
    template (e.g. ``{context}`` and ``{question}``), so they always match the
    template that is passed in.

    Generation options such as temperature, max tokens or stop sequences are
    not prompt settings and are deliberately not passed here; configure them
    on the LLM object instead.

    Args:
        template: An f-string style template containing ``{placeholders}``.

    Returns:
        A PromptTemplate built from ``template``.
    """
    return PromptTemplate.from_template(template)

In [25]:
def chain_lang(pt, llm_model):
    """
    Assemble a RetrievalQA chain over the PDF vector database.

    The vector database is rebuilt from ``pdf_path`` on every call via
    ``create_db_from_pdf``, wrapped in a retriever, and combined with a
    conversation-summary memory driven by the same LLM.

    Args:
        pt: Prompt passed to the chain through ``chain_type_kwargs``.
        llm_model: LLM used both for answering and for the summary memory.

    Returns:
        The configured RetrievalQA chain (source documents are not returned).
    """
    vector_store = create_db_from_pdf(pdf_path)
    doc_retriever = VectorStoreRetriever(vectorstore=vector_store)
    summary_memory = ConversationSummaryMemory(llm=llm_model)
    qa_options = {"prompt": pt, "verbose": True}

    return RetrievalQA.from_chain_type(
        llm=llm_model,
        retriever=doc_retriever,
        memory=summary_memory,
        chain_type_kwargs=qa_options,
        return_source_documents=False,
    )

In [21]:
# Prompt template with two placeholders: {context} (retrieved passages) and
# {question} (the user's query). The Vietnamese system message says, roughly:
# "Use the following information to answer the question. If you do not know
# the answer, say you do not know; do not try to make one up."
# The four leading spaces on the second line are part of the string and show
# up in the formatted prompt.
# NOTE(review): the <|im_start|>/<|im_end|> markers look like ChatML-style
# tags — confirm the llama2:13b model configured in this notebook expects them.
template = """<|im_start|>system\nSử dụng thông tin sau đây để trả lời câu hỏi. Nếu bạn không biết câu trả lời, hãy nói không biết, đừng cố tạo ra câu trả lời.\n
    {context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant"""

In [26]:
# Build the prompt object from the template string, then assemble the QA chain.
# chain_lang rebuilds the PDF vector database on each call, so this cell is slow
# (the recorded run shows about two minutes for four files).
prompt = prompt_gen(template)
chain = chain_lang(prompt, model)

100%|██████████| 4/4 [02:00<00:00, 30.06s/it]


In [27]:
# Ask the chain a question in Vietnamese (roughly: "What is the formula of a
# Decision Tree?"). The chain was built with verbose output enabled, so the
# formatted prompt is printed while it runs.
response = chain.invoke("Decision Tree có công thức như nào là gì?")



> Entering new StuffDocumentsChain chain...


> Entering new LLMChain chain...
Prompt after formatting:
<|im_start|>system
Sử dụng thông tin sau đây để trả lời câu hỏi. Nếu bạn không biết câu trả lời, hãy nói không biết, đừng cố tạo ra câu trả lời.

    vey of current methods for constructing decision tree classiﬁers in a top-down
manner. The chapter suggests a uniﬁed algorithmic framework for presenting
thesealgorithmsanddescribesvarioussplittingcriteriaandpruningmethodolo-
gies.
Keywords: Decision tree, Information Gain, Gini Index, Gain Ratio, Pruning, Minimum
Description Length, C4.5, CART,ObliviousDecision Trees
1. Decision Trees
A decision tree is a classiﬁer expressed as a recursive partition of the in-

Chapter 9
DECISION TREES
Lior Rokach
Department of Industrial Engineering
Tel-AvivUniversity
liorr@eng.tau.ac.il
Oded Maimon
Department of Industrial Engineering
Tel-AvivUniversity
maimon@eng.tau.ac.il
Abstract DecisionTreesareconsideredtobeoneofthemostpopularapproachesforrep-


> Finished chain.

> Finished chain.


In [29]:
# Pretty-print the chain's response; in the recorded run it is a dict with
# 'history', 'query' and 'result' keys.
pprint(response)

{'history': '',
 'query': 'Decision Tree có công thức như nào là gì?',
 'result': ' A decision tree is a popular machine learning approach for '
           'classifying data. It is a recursive partition of the input data, '
           'where each internal node in the tree represents a test or query, '
           'and each leaf node represents a class label or prediction. The '
           'tree grows top-down, with each node determining which of its '
           'subtrees should be evaluated for each input.\n'
           '\n'
           'There are several methods for constructing decision trees, '
           'including:\n'
           '\n'
           '1. Information Gain: This method selects the best feature to split '
           'the data based on the amount of information gained by doing so.\n'
           '2. Gini Index: This method selects the feature that leads to the '
           'most even distribution of the data among the classes.\n'
           '3. Gain Ratio: This method selects