In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
from uuid import uuid4

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

import pandas as pd
import sqlite3

In [None]:
PDF_PATH = "../files/synthetic_data_backup.pdf"
CSV_PATH = "../files/synthetic_data.csv"

# Vector Database

In [None]:
def load_pdf():
    doc_loader = PyPDFLoader(PDF_PATH)
    documents = doc_loader.load()
    for i in range(len(documents)):
        documents[i].page_content = ' '.join(documents[i].page_content.split())
    return documents

In [10]:
# Split documents into pages
def split_documents(documents: list[Document]):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False
    )
    return splitter.split_documents(documents)

In [11]:
def calculate_chunk_ids(chunks):
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
            
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        chunk.metadata["chunk_id"] = chunk_id

    return chunks

In [None]:
def laod_pdf_documents():
    documents = load_pdf()
    chunks = split_documents(documents)
    chunks = calculate_chunk_ids(chunks)
    return chunks

chunks = laod_pdf_documents()
chunks[0:3]

[Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-02-26T19:24:08+07:00', 'title': 'Dokumentasi Teknis AI Assistant X-3000', 'author': "Khoirul Ma'arif", 'subject': 'Panduan Penggunaan, Instalasi, dan Optimasi', 'moddate': '2025-02-26T19:24:08+07:00', 'source': "../files/M. Khoirul Ma'arif_synthetic_data_backup.pdf", 'total_pages': 60, 'page': 0, 'page_label': '1', 'chunk_id': "../files/M. Khoirul Ma'arif_synthetic_data_backup.pdf:0:0"}, page_content='DOKUMENTASI TEKNIS AI ASSISTANT X-3000 Panduan Penggunaan, Instalasi, dan Optimasi'),
 Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-02-26T19:24:08+07:00', 'title': 'Dokumentasi Teknis AI Assistant X-3000', 'author': "Khoirul Ma'arif", 'subject': 'Panduan Penggunaan, Instalasi, dan Optimasi', 'moddate': '2025-02-26T19:24:08+07:00', 'source': "../files/M. Khoirul Ma'arif_synthetic_data_backup.pdf", 'total_pages': 

In [13]:
# embedding function
def get_embeddings():
    # embeddings = OllamaEmbeddings(model="bge-m3")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    return embeddings

In [14]:
# embedding
embeddings = get_embeddings()

# index
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# vector store
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
uuids = [str(uuid4()) for _ in range(len(chunks))]

# add documents
vector_store.add_documents(documents=chunks, uuids=uuids)

  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


['48de61c3-2e8f-4c65-b116-33775778a028',
 'f6ea95df-3c08-4acb-bf6d-ec511721feb2',
 'e9a95a80-8d01-4b6e-aef3-753a335295c2',
 'e426f25f-d4e2-4a08-8b8a-362011ded475',
 '9db31ae2-c15e-4482-b1c3-1be45dfc1ba6',
 'bc36da9a-3e7e-46f0-8396-12c61c2d01b7',
 '978485f3-abc7-41d7-bef2-bab370582ebc',
 '561c9781-4bf2-4f5e-a7bc-063e6484f26c',
 '357635b0-cd39-418f-9c96-21fd98bfff0a',
 '95bf007a-e798-485b-9a3c-4efe4d02f7ee',
 '5b13f7aa-f466-4462-9222-c9ddee6576a9',
 'ba2fb690-395b-4882-bec7-c5125991f74c',
 'ce263586-b519-4fc6-b8b3-a0c200e5cb2a',
 'e0663f77-69b3-4278-aef9-bdd50710d6e3',
 '1b4a7caa-b627-46cf-81c0-02622abfa3e1',
 '70e0a90c-8a24-463b-afef-bee0ff7050c6',
 '1a330ece-9257-4b7a-b4ef-581da659d63d',
 '1fd9e2a3-6b3a-4ccd-a0c7-d8b74ed6e1af',
 '36ca2796-def3-4593-b5d8-d190d181d3be',
 '3b735436-bdc5-47df-964e-b4248f8b8fd5',
 'ab102098-84d7-46c2-aab0-d132e4124291',
 '4d76a15a-1a95-4b30-935c-91e0b851cf52',
 '5210e471-1300-4fdc-b989-2f4474e968de',
 '5d8a7993-1de7-4d03-8820-10028cf0529a',
 '9ebe7820-e293-

In [15]:
results = vector_store.similarity_search(
    "Perintah apa saja yang paling sering diberikan kepada AI Assistant X-3000?", 
    k=5, 
)
for res in results:
    print(f"{res.page_content} \n[{res.metadata}]")
    print()

dalam menu pengaturan. 2. AI Assistant X -3000 akan meminta pengguna untuk mengucapkan beberapa perintah standar, seperti: o "Halo AI Assistant, nyalakan lampu." o "Atur pengingat untuk rapat besok jam 10 pagi." 3. Sistem akan menganalisis pola suara dan aksen pengguna. 4. Jika berhasil, AI Assistant X -3000 akan lebih responsif terhadap suara pengguna utama. 
[{'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-02-26T19:24:08+07:00', 'title': 'Dokumentasi Teknis AI Assistant X-3000', 'author': "Khoirul Ma'arif", 'subject': 'Panduan Penggunaan, Instalasi, dan Optimasi', 'moddate': '2025-02-26T19:24:08+07:00', 'source': "../files/M. Khoirul Ma'arif_synthetic_data_backup.pdf", 'total_pages': 60, 'page': 16, 'page_label': '17', 'chunk_id': "../files/M. Khoirul Ma'arif_synthetic_data_backup.pdf:16:1"}]

BAB 3 PENGGUNAAN LANJUTAN AI ASSISTANT X-3000 Setelah AI Assistant X -3000 berhasil dikonfigurasi, pengguna dapat memanfaatkan berbagai fitur cangg

In [16]:
# save vector store
vector_store.save_local("../database/faiss_index")

# SQL Database

In [None]:
# Load CSV
csv_path = CSV_PATH
df = pd.read_csv(csv_path)

# Create SQLite database
conn = sqlite3.connect("../database/csv_database.db")
df.to_sql("synthetic_data", conn, if_exists="replace", index=False)
conn.close()

print("CSV data successfully stored in SQL database!")

CSV data successfully stored in SQL database!


In [19]:
# print df to json
df.to_json("../database/synthetic_data.json", orient="records")

In [None]:
def query_sql_database(query, db_path="../database/csv_database.db"):
    conn = sqlite3.connect(db_path)
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df

sql_result = query_sql_database("SELECT * FROM synthetic_data LIMIT 5")
sql_result

Unnamed: 0,interaction_id,user_id,timestamp,device_type,command_category,command_text,ai_response,response_time_ms,ai_confidence_score,user_satisfaction,status,error_code
0,1,447,2024-03-12 17:56:48,Smart Speaker,Information,Apa rekomendasi restoran di sekitar sini?,Akses ditolak,310,67.97,4,Error,ERR403
1,2,469,2024-12-01 06:40:13,Smartphone,Productivity,Bagikan agenda meeting dengan anggota tim,Tugas telah disimpan,315,70.04,5,Success,
2,3,113,2024-09-08 02:02:41,Smart TV,Home Automation,Atur suhu AC ke 24°C,Atur suhu AC ke 24°C berhasil dilakukan,602,87.21,3,Success,
3,4,238,2024-10-26 06:44:04,Smartphone,Entertainment,Cari podcast tentang teknologi AI,Media sedang diputar,294,90.5,5,Success,
4,5,484,2024-08-16 23:26:00,Smart Speaker,Productivity,Tampilkan laporan produktivitas minggu ini,Tugas telah disimpan,203,83.39,4,Success,
