In [82]:
import os
import glob
from dotenv import load_dotenv
import json
import gradio as gr
from openai import OpenAI

In [83]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [84]:
MODEL_GPT = "gpt-4o"
MODEL_QWEN3b = 'qwen2.5:3b'
MODEL_QWEN7b = 'qwen2.5'
MODEL_LLAMA  = 'llama3.2'
db_name = "..\\vector_db"

In [85]:
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
openai = OpenAI()

In [86]:
import fitz

def pdf_to_md(pdf_path, output_md_path):
    """
    Converts a PDF file to a Markdown (.md) file.

    Args:
        pdf_path (str): Path to the input PDF file.
        output_md_path (str): Path to the output Markdown file.
    """
    header = ("CHƯƠNG", "1.", "2.", "3.", "4.", "5.", "6.", "7.", "8.", "9.")#, "a)", "b)", "c)", "d)", "đ)", "e)")
    try:
        pdf_document = fitz.open(pdf_path)

        with open(output_md_path, 'w', encoding='utf-8') as md_file:
            for page_number in range(len(pdf_document)):
                page = pdf_document.load_page(page_number)  # Load page
                text = page.get_text("text")  # Extract text
                lines = text.split('\n')

                paragraph = []
                prev_line = "start"
                for i in range(len(lines)):
                    line = lines[i].strip()
                    if line == "" and prev_line == "":
                        continue
                    elif line == "" and prev_line != "":
                        paragraph.append(line)
                    elif line.startswith(header):
                        paragraph.append("")
                        paragraph.append(line)
                    else:
                        paragraph.append(line)
                    prev_line = line

                text = "\n".join(paragraph)

                md_file.write(text)
                md_file.write("\n\n---\n\n")  

        print(f"Markdown file created at: {output_md_path}")

    except Exception as e:
        print(f"An error occurred: {e}")

pdf_path = "../knowledge-base/document/qcdt_2023_upload.pdf" 
output_md_path = "../knowledge-base/document/qcdt_2023_upload.md"  

pdf_to_md(pdf_path, output_md_path)


An error occurred: module 'fitz' has no attribute 'open'


In [87]:
folders = glob.glob("../knowledge-base/*")

text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [89]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)


In [90]:
embeddings = OpenAIEmbeddings()

In [91]:
import pandas as pd
df = pd.DataFrame([d.page_content for d in chunks], columns=["text"])

In [92]:
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [93]:
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore: {vectorstore._collection.count()} documents")

Vectorstore: 118 documents


In [94]:
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [95]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
colors = [['blue', 'green'][['document', 'question answer'].index(t)] for t in doc_types]

In [96]:
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [97]:
from langchain_core.tools import tool
from langchain.agents import initialize_agent

ranking = { 0.0: "Yếu", 2.0: "Trung bình", 2.5: "Khá", 3.2: "Giỏi", 3.6: "Xuất sắc"}

@tool
def get_ranking(grade):
    """
    Trả về xếp loại dựa vào số điểm CPA

    Args:
        grade (float): Số điểm CPA của sinh viên
    """
    grade = grade.replace("'", "\"")

    grade = json.loads(grade).get('grade')
    grade = float(grade)

    if grade > 4 or grade < 0:
        return "Số điểm không hợp lệ trên thang điểm 4"

    for key in sorted(ranking.keys(), reverse= True):
        if grade >= key:
            return ranking[key]

    return "Xuất sắc" 


In [98]:
from langchain_community.chat_models import ChatOllama

from langchain_openai import ChatOpenAI
# llm = ChatOpenAI(
#     api_key="ollama",
#     model= MODEL_QWEN7b,
#     base_url="http://localhost:11434/v1",
# )

llm = ChatOpenAI(
    model = MODEL_GPT,
    temperature= 0.7
)

tools = [get_ranking]

llm = llm.bind_tools(tools)
agent = initialize_agent(tools, llm, agent_type="zero-shot-react-description", handle_parsing_errors=True, verbose=True)

def handle_get_ranking_tool_call(tool_call):
    arguments = tool_call.get('args')
    grade = arguments.get('grade')
    hint = f"Lấy xếp loại của sinh viên dựa vào điểm số CPA cung cấp: {grade}"
    ranking = agent.run(hint)
    return ranking


In [99]:
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()

In [100]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(context=retriever, question=RunnablePassthrough())

In [101]:
from langchain.prompts import ChatPromptTemplate

template = """
Bạn hãy đưa ra các câu trả lời bằng Tiếng Việt. 
Bạn là một chuyên gia tư vấn về quy chế đào tạo
cho một đại học ở Việt Nam, Đại học Bách khoa Hà Nội.
Bạn cần tư vấn chính xác những gì bạn biết và trả lời 
thành thật những nội dung trong tài liệu bạn được cung cấp. 
Khi nội dung được hỏi không có thông tin trong tài liệu được 
cung cấp, hãy nói không có thông tin trong tài liệu.

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [102]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

In [103]:
chain = setup | prompt | llm | parser

In [105]:
def chat(message, history):
    response = llm.invoke(message)
    if response.tool_calls:
        for tool_call in response.tool_calls:
            if tool_call.get('name') == 'get_ranking':
                result = handle_get_ranking_tool_call(response.tool_calls[0])
        return result
    else:
        result = chain.invoke()
        return result.content

In [106]:
chat("xếp hạng của tôi khi đạt cpa 3.7", "haha")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m[0m
Observation: Invalid Format: Missing 'Action:' after 'Thought:
Thought:[32;1m[1;3mAction: get_ranking
Action Input: { "grade": 3.7 }[0m
Observation: [36;1m[1;3mXuất sắc[0m
Thought:[32;1m[1;3mI now know the final answer.

Final Answer: Xếp loại của sinh viên với điểm CPA 3.7 là "Xuất sắc".[0m

[1m> Finished chain.[0m


'Xếp loại của sinh viên với điểm CPA 3.7 là "Xuất sắc".'

In [107]:
# view = gr.ChatInterface(chat, type="messages").launch(share = True)