In [1]:
import requests
import msal
from dotenv import load_dotenv
import os

# Load bi·∫øn m√¥i tr∆∞·ªùng
load_dotenv()

# Azure Client ID, Tenant ID, Client Secret t·ª´ file .env
azure_client_id = os.getenv("AZURE_CLIENT_ID")
azure_tenant_id = os.getenv("AZURE_TENANT_ID")
azure_client_secret = os.getenv("AZURE_CLIENT_SECRET")

# SharePoint Site URL v√† ID c·ªßa Drive
sharepoint_site_url = "maithujsc.sharepoint.com/sites/Trainingdocument"
drive_id = "b!SJpkxkt_aECkl7ZK6YMWBTM-60BFIl5ChlC_cxyDngG7XD9-vWJITZvMeqzfYkAW"

In [15]:

def get_access_token():
    """L·∫•y token truy c·∫≠p Microsoft Graph API"""
    app = msal.ConfidentialClientApplication(
        azure_client_id,
        authority=f"https://login.microsoftonline.com/{azure_tenant_id}",
        client_credential=azure_client_secret
    )
    token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    return token["access_token"]

In [16]:
def get_files_in_folder():
    """L·∫•y danh s√°ch c√°c file PDF trong th∆∞ m·ª•c"""
    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}
    
    # URL ƒë·ªÉ l·∫•y c√°c file trong th∆∞ m·ª•c g·ªëc c·ªßa drive
    url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root/children"
    
    response = requests.get(url, headers=headers)
    
    # In th√™m th√¥ng tin debug
    print(f"Response status code: {response.status_code}")
    if response.status_code == 200:
        files = response.json().get("value", [])
        pdf_files = [file["name"] for file in files if file["name"].endswith(".pdf")]
        print(f"üìÇ T√¨m th·∫•y {len(pdf_files)} file PDF:", pdf_files)
        return pdf_files
    else:
        print("‚ùå L·ªói l·∫•y danh s√°ch file:", response.json())
        return []

In [17]:
def download_file(file_name):
    """T·∫£i file PDF t·ª´ SharePoint"""
    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}
    url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{file_name}:/content"
    
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # L∆∞u file v√†o th∆∞ m·ª•c downloads
        with open(f"downloads/{file_name}", "wb") as f:
            f.write(response.content)
        print(f"‚úÖ ƒê√£ t·∫£i file: {file_name}")
    else:
        print(f"‚ùå L·ªói t·∫£i file {file_name}: {response.json()}")

# T·∫°o th∆∞ m·ª•c "downloads" n·∫øu ch∆∞a c√≥
os.makedirs("downloads", exist_ok=True)

# L·∫•y danh s√°ch file PDF v√† t·∫£i xu·ªëng
pdf_files = get_files_in_folder()
for file in pdf_files:
    download_file(file)

Response status code: 200
üìÇ T√¨m th·∫•y 5 file PDF: ['01. Mai Thu Packaging.pdf', 'Dao tao van hoa hoi nhap Mai Thu.pdf', 'H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng b·ªô app qu·∫£n l√Ω.pdf', 'H∆Ø·ªöNG D·∫™N T·∫†O CH·ªÆ K√ù EMAIL M·ªöI (1).pdf', 'Qu·∫£n l√Ω ƒë∆°n h√†ng-phi·∫øu SX - Power Apps.pdf']
‚úÖ ƒê√£ t·∫£i file: 01. Mai Thu Packaging.pdf
‚úÖ ƒê√£ t·∫£i file: Dao tao van hoa hoi nhap Mai Thu.pdf
‚úÖ ƒê√£ t·∫£i file: H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng b·ªô app qu·∫£n l√Ω.pdf
‚úÖ ƒê√£ t·∫£i file: H∆Ø·ªöNG D·∫™N T·∫†O CH·ªÆ K√ù EMAIL M·ªöI (1).pdf
‚úÖ ƒê√£ t·∫£i file: Qu·∫£n l√Ω ƒë∆°n h√†ng-phi·∫øu SX - Power Apps.pdf


In [5]:
import fitz  # PyMuPDF
import openai
import json
import os

openai_api_key = os.getenv("OPENAI_API_KEY")

def extract_text_from_pdf(pdf_file):
    """Tr√≠ch xu·∫•t vƒÉn b·∫£n t·ª´ file PDF"""
    doc = fitz.open(pdf_file)  # M·ªü file PDF
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()  # Tr√≠ch xu·∫•t vƒÉn b·∫£n t·ª´ m·ªói trang
    return text



# T·∫°o embedding t·ª´ vƒÉn b·∫£n
def get_embedding(text):
    response = openai.Embedding.create(model="text-embedding-ada-002", input=text)
    embedding = response["data"][0]["embedding"]

    return response.data[0].embedding


In [6]:
import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_db")  # T·∫°o th∆∞ m·ª•c l∆∞u tr·ªØ c∆° s·ªü d·ªØ li·ªáu
collection = chroma_client.get_or_create_collection(name="training_docs")  # T·∫°o collection l∆∞u t√†i li·ªáu ƒë√†o t·∫°o

for file_name in pdf_files:
    pdf_path = f"downloads/{file_name}"
    text = extract_text_from_pdf(pdf_path)

    if text.strip():  # Ki·ªÉm tra xem file c√≥ n·ªôi dung kh√¥ng
        collection.add(
            documents=[text],
            embeddings=[get_embedding(text)],
            ids=[file_name]
        )
        print(f"‚úÖ ƒê√£ l∆∞u v√†o ChromaDB: {file_name}")
    else:
        print(f"‚ö†Ô∏è File {file_name} kh√¥ng c√≥ n·ªôi dung!")
        
# Ki·ªÉm tra s·ªë l∆∞·ª£ng t√†i li·ªáu trong ChromaDB
print(f"üìù S·ªë l∆∞·ª£ng t√†i li·ªáu trong ChromaDB: {collection.count()}")



NameError: name 'pdf_files' is not defined

In [None]:
def update_documents_periodically():
    """Ki·ªÉm tra v√† t·∫£i t√†i li·ªáu m·ªõi ƒë·ªãnh k·ª≥"""
    while True:
        print("ƒêang ki·ªÉm tra t√†i li·ªáu m·ªõi t·ª´ SharePoint...")
        pdf_files = get_files_in_folder() 
        for file in pdf_files:
            if file not in downloaded_files:
                download_file(file) 
                downloaded_files.append(file)
        time.sleep(7200)  

In [None]:
def update_chromadb_with_new_documents(pdf_files):
    """C·∫≠p nh·∫≠t t√†i li·ªáu m·ªõi v√†o ChromaDB"""
    for file_name in pdf_files:
        pdf_path = f"downloads/{file_name}"
        text = extract_text_from_pdf(pdf_path) 

        if text.strip():  
            collection.add(
                documents=[text],
                embeddings=[get_embedding(text)],
                ids=[file_name]
            )
            print(f"‚úÖ ƒê√£ c·∫≠p nh·∫≠t v√†o ChromaDB: {file_name}")
        else:
            print(f"‚ö†Ô∏è File {file_name} kh√¥ng c√≥ n·ªôi dung!")

In [12]:
def collect_user_feedback(query, answer):
    """Thu th·∫≠p ph·∫£n h·ªìi t·ª´ ng∆∞·ªùi d√πng v√† l∆∞u l·∫°i v√†o file"""
    feedback = input("C√¢u tr·∫£ l·ªùi n√†y c√≥ ch√≠nh x√°c kh√¥ng? (yes/no): ").lower()
    
    if feedback in ["yes", "y"]:
        print("C·∫£m ∆°n b·∫°n!")
        # L∆∞u l·∫°i ph·∫£n h·ªìi ch√≠nh x√°c n·∫øu c·∫ßn (n·∫øu b·∫°n mu·ªën l∆∞u)
        return True
    else:
        print("C·∫£m ∆°n b·∫°n ƒë√£ ph·∫£n h·ªìi! T√¥i s·∫Ω c·∫£i thi·ªán.")
        # L∆∞u l·∫°i c√°c c√¢u tr·∫£ l·ªùi kh√¥ng ch√≠nh x√°c ƒë·ªÉ ph√¢n t√≠ch
        with open("feedback_log.txt", "a", encoding="utf-8") as f:
            f.write(f"Question: {query}, Answer: {answer}, Feedback: Incorrect\n")
        return False

In [7]:
def search_in_chroma(query, top_k=3):
    embedding = get_embedding(query)
    results = collection.query(
        query_embeddings=[embedding],
        n_results=top_k
    )
    
    # Tr·∫£ v·ªÅ danh s√°ch c√°c vƒÉn b·∫£n, kh√¥ng ph·∫£i danh s√°ch con
    return [result[0] for result in results['documents']] 

In [9]:
# Tr·∫£ l·ªùi c√¢u h·ªèi d·ª±a tr√™n vƒÉn b·∫£n t·ª´ ChromaDB
def generate_answer(query):
    context = "\n".join(search_in_chroma(query))  # K·∫øt h·ª£p c√°c ƒëo·∫°n vƒÉn b·∫£n th√†nh m·ªôt chu·ªói
    if context:
        prompt = f"Tr·∫£ l·ªùi c√¢u h·ªèi sau d·ª±a tr√™n th√¥ng tin d∆∞·ªõi ƒë√¢y:\n\n{context}\n\nC√¢u h·ªèi: {query}\nTr·∫£ l·ªùi:"
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  # Ch·ªçn m√¥ h√¨nh gpt-3.5-turbo thay v√¨ text-davinci-003
            messages=[
                {"role": "system", "content": "B·∫°n l√† m·ªôt tr·ª£ l√Ω th√¥ng minh."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150
        )
        return response['choices'][0]['message']['content'].strip()
    else:
        return "Xin l·ªói, t√¥i kh√¥ng th·ªÉ t√¨m th·∫•y th√¥ng tin li√™n quan ƒë·∫øn c√¢u h·ªèi c·ªßa b·∫°n."


In [10]:
def chat():
    print("Ch√†o b·∫°n! T√¥i l√† tr·ª£ l√Ω ·∫£o c·ªßa Mai Th∆∞. B·∫°n c√≥ th·ªÉ h·ªèi t√¥i b·∫•t c·ª© c√¢u h·ªèi n√†o.")
    while True:
        user_input = input("B·∫°n: ")
        if user_input.lower() in ["exit", "quit", "bye"]:
            print("T·∫°m bi·ªát!")
            break

        print(f"\nC√¢u h·ªèi c·ªßa b·∫°n: {user_input}\n")
        answer = generate_answer(user_input) 
        
        print(f"Chatbot: {answer}")
        
        collect_user_feedback(user_input, answer)
        
        

In [13]:
if __name__ == "__main__":
    chat()


Ch√†o b·∫°n! T√¥i l√† tr·ª£ l√Ω ·∫£o c·ªßa Mai Th∆∞. B·∫°n c√≥ th·ªÉ h·ªèi t√¥i b·∫•t c·ª© c√¢u h·ªèi n√†o.

C√¢u h·ªèi c·ªßa b·∫°n: i need some informations of Take away box

Chatbot: Th√¥ng tin v·ªÅ h·ªôp ƒë·ª±ng th·ª©c ƒÉn mang v·ªÅ (Take away box) bao g·ªìm:
- V·∫≠t li·ªáu: Kraft, Ivory, gi·∫•y carton 3 l·ªõp,...
- Tr·ªçng l∆∞·ª£ng gi·∫•y: 250gsm - 400gsm,...
- Ph∆∞∆°ng ph√°p in: Offset / Flexo, 1 m√†u / nhi·ªÅu m√†u
- Ki·ªÉu d√°ng: C√≥ c·ª≠a s·ªï / kh√¥ng c√≥ c·ª≠a s·ªï
- C√°c k√≠ch th∆∞·ªõc v√† lo·∫°i h·ªôp kh√°c nhau nh∆∞ h·ªôp c·ªëc coffee mang v·ªÅ, h·ªô
C·∫£m ∆°n b·∫°n!

C√¢u h·ªèi c·ªßa b·∫°n: what about  Cup carrier tray

Chatbot: Cup carrier tray is one of the products in Mai Thu's product range. It is made from white kraft and brown kraft with a paper weight of 350gsm. The printing method used for Cup carrier tray is Offset/Flexo, and it comes in a style with 2 cups or 4 cups.
C·∫£m ∆°n b·∫°n ƒë√£ ph·∫£n h·ªìi! T√¥i s·∫Ω c·∫£i thi·ªán.

C√¢u h·ªèi c·ªßa b·∫°n: and bread b