In [2]:
from langchain_community.document_loaders import PyPDFLoader, CSVLoader,Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings,HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.schema import Document

import requests
from bs4 import BeautifulSoup
import os





In [6]:
def select_loader(file_path):
    """Choose a document loader for ``file_path`` from its file extension.

    The extension is compared case-insensitively.

    Args:
        file_path: Path of the file to load.

    Returns:
        A loader instance constructed with ``file_path`` for ``.pdf``,
        ``.csv``, ``.docx``/``.doc`` and ``.txt`` files, or ``None`` when the
        extension is not one of those.
    """
    _, extension = os.path.splitext(file_path)
    extension = extension.lower()

    if extension == '.pdf':
        return PyPDFLoader(file_path)
    if extension == '.csv':
        return CSVLoader(file_path)
    # NOTE(review): legacy ".doc" files are routed to the same loader as
    # ".docx" -- confirm that Docx2txtLoader can actually parse that format.
    if extension in ('.docx', '.doc'):
        return Docx2txtLoader(file_path)
    if extension == ".txt":
        return TextLoader(file_path)
    # Unsupported extension: callers treat None as "skip this file".
    return None

def load_all_files(directory_path):
    """Recursively load every supported file found under ``directory_path``.

    Each file is handed to ``select_loader``; files for which it returns a
    falsy value (unsupported extension) are skipped. Directories and files
    are visited in sorted order so the result is the same on every run
    regardless of filesystem ordering.

    A file whose loader raises is reported with ``print`` and skipped, so a
    single unreadable file does not abort the whole ingestion (the same
    best-effort policy ``crawl_web`` uses for URLs).

    Args:
        directory_path: Root directory to walk.

    Returns:
        list: The concatenation of whatever each loader's ``load()`` returned
        (used downstream as LangChain documents). Empty when the directory
        is missing or contains no supported files.
    """
    combined_text = []
    for root, dirs, files in os.walk(directory_path):
        # Sorting ``dirs`` in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            loader = select_loader(file_path)
            if not loader:
                continue
            try:
                combined_text.extend(loader.load())
            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Không thể đọc {file_path}: {e}")

    return combined_text

def crawl_web(urls, timeout=10):
    """Download each URL and return the text content of every page that loads.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. A URL that fails (connection error, timeout, HTTP error
    status, ...) is reported with ``print`` and skipped, so the returned list
    can be shorter than ``urls``.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait on each request before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive server and hang the notebook.

    Returns:
        list[str]: One string per successfully fetched page, in input order,
        with text nodes separated by newlines.
    """
    combined_text = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop elements that carry no readable content (script, style).
            for tag in soup(["script", "style"]):
                tag.decompose()
            combined_text.append(soup.get_text(separator='\n'))
        except Exception as e:
            # Best effort: report the failing URL and continue with the rest.
            print(f"Không thể crawl {url}: {e}")
    return combined_text


In [7]:
# Inputs: a local folder of files plus the web pages to scrape.
directory_path = "data"
urls_to_crawl = ["https://thptbuithixuan.hcm.edu.vn/homegd3"]

# Crawl first, then read local files (same order as before, so any crawl
# error messages are printed before the files are loaded).
web_documents = crawl_web(urls_to_crawl)
documents = load_all_files(directory_path)

# crawl_web returns plain strings; wrap each one in a Document (with empty
# metadata) so it can be merged with the loader output.
processed_web_documents = [
    Document(page_content=page_text, metadata={})
    for page_text in web_documents
]

# Web pages first, local files after.
combined_documents = [*processed_web_documents, *documents]


In [8]:
# Split the combined documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(combined_documents)

# Embedding model.
# Disabled alternative:
# embedding_model = GPT4AllEmbeddings(model_file="models/all-MiniLM-L6-v2-f16.gguf")
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# Build the FAISS vector store from the chunks; later cells query it via `db`.
db = FAISS.from_documents(chunks, embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
  from tqdm.autonotebook import tqdm, trange


In [9]:
import os
from getpass import getpass

from langchain_groq import ChatGroq

# SECURITY: never store the API key in the notebook. A key was previously
# hardcoded in this cell; treat it as leaked and revoke it.
# The key is taken from the GROQ_API_KEY environment variable; if it is not
# set, prompt for it without echoing so it never lands in the saved file.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

# temperature=0 is kept from the original configuration.
llm = ChatGroq(temperature=0, model="llama3-8b-8192")
# model_file = 'vinallama-7b-chat_q5_0.gguf'
# llm = CTransformers(
#         model=model_file,
#         model_type="llama",
#         max_new_tokens=1024,
#         temperature=0.01
#     )

# prompt = "Bạn hãy giới thiệu về bản thân"
# output = llm(prompt, max_tokens=50, temperature=0.5)


# In kết quả
# print(output["choices"][0]["text"])

In [18]:
# Prompt with <|im_start|>/<|im_end|> role markers. The Vietnamese system text
# tells the model to answer from the supplied context and to say "I don't
# know" instead of inventing an answer.
template = """<|im_start|>system\nSử dụng thông tin sau đây để trả lời câu hỏi. Nếu bạn không biết câu trả lời, hãy nói tôi không biết, đừng cố tạo ra câu trả lời\n
    {context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant"""
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Retriever over the FAISS store built earlier (k=3).
# NOTE(review): could not confirm that "fuzzy" (search_kwargs) and
# max_tokens_limit (as_retriever) are recognized options -- verify they have
# any effect; they are kept unchanged here.
faiss_retriever = db.as_retriever(search_kwargs={"k": 3, "fuzzy": True}, max_tokens_limit=1024)

# "stuff" chain type with the custom prompt; only the answer is returned
# (source documents are not included in the result).
llm_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=faiss_retriever,
    chain_type_kwargs={'prompt': prompt},
    return_source_documents=False,
)

In [22]:
# The question and the "answer in Vietnamese" instruction are two separate
# sentences. Concatenating them with no separator fused them into
# "...là aitrả lời...", garbling both the retrieval query and the prompt.
question = "hiệu trưởng trường Bùi Thị Xuân là ai? " + "trả lời tiếng việt"
response = llm_chain.invoke({"query": question})
print(response['result'])

Thầy Huỳnh Thanh Phú.
