# coba 1

In [1]:
import os
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain.chat_models import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import HumanMessage
from langchain.docstore.document import Document

In [2]:
# === Configuration ===
# Directory that is scanned for the .txt source documents.
FOLDER_DOCS = "../bahan-chatbot/txt/"
# Location the FAISS index is saved to and loaded from.
INDEX_PATH = "vectorstore_index"

In [3]:
# Use a lightweight LLM and a lightweight embedding model
model = ChatOllama(model="gemma:2b")  # change to whichever model is available
embeddings = OllamaEmbeddings(model="nomic-embed-text")  # faster

  model = ChatOllama(model="gemma:2b")  # ganti sesuai model yang ada
  embeddings = OllamaEmbeddings(model="nomic-embed-text")  # lebih cepat


In [4]:
# === Build the FAISS index ===
def build_vectorstore():
    """Read every .txt file in FOLDER_DOCS, chunk it, and build a FAISS index.

    The index is embedded with the module-level ``embeddings`` object, saved
    to INDEX_PATH, and returned.

    Returns:
        The FAISS vector store that was just built and saved.
    """
    print(f"📂 Membaca dokumen dari: {FOLDER_DOCS}")
    txt_names = [name for name in os.listdir(FOLDER_DOCS) if name.endswith(".txt")]

    # One Document per file; the file name is kept as the "source" metadata.
    raw_documents = []
    for txt_name in tqdm(txt_names, desc="📄 Membaca file TXT", unit="file"):
        txt_path = os.path.join(FOLDER_DOCS, txt_name)
        with open(txt_path, "r", encoding="utf-8") as handle:
            raw_documents.append(
                Document(page_content=handle.read(), metadata={"source": txt_name})
            )

    print("\n✂️  Memotong dokumen menjadi chunks...")
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    # Split document by document so the progress bar advances per document.
    chunked_documents = []
    for raw_document in tqdm(raw_documents, desc="🔹 Memotong teks", unit="dokumen"):
        chunked_documents += chunker.split_documents([raw_document])

    print("\n💾 Membuat FAISS index...")
    index = FAISS.from_documents(chunked_documents, embeddings)
    index.save_local(INDEX_PATH)
    print("✅ Index berhasil dibuat & disimpan!")
    return index

In [5]:
# === Load the existing index, or build a new one ===
# If INDEX_PATH already exists it is loaded as-is; otherwise the index is built
# from the documents.
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in —
# presumably because loading deserializes pickled data; only load index files
# you created yourself. Confirm against the LangChain FAISS docs.
if os.path.exists(INDEX_PATH):
    print(f"⚡ Memuat FAISS index dari {INDEX_PATH}...")
    vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
    print("✅ Index berhasil dimuat!\n")
else:
    vectorstore = build_vectorstore()

⚡ Memuat FAISS index dari vectorstore_index...
✅ Index berhasil dimuat!



In [28]:
# === Chatbot function ===
def chatbot(query):
    """Answer ``query`` using the single most similar chunk as context.

    The module-level ``vectorstore`` is searched with ``k=1``. When a chunk is
    found, the prompt tells the model to answer only from that chunk (KONTEN);
    when the search returns nothing, a generic assistant prompt is used.

    Args:
        query: The user's question.

    Returns:
        The ``content`` of the model's reply.
    """
    results = vectorstore.similarity_search(query, k=1)

    if results:
        context = results[0].page_content
        prompt = f"""
        Jawab pertanyaan berikut dengan bahasa Indonesia yang jelas.
        Jika jawabannya tidak ada di KONTEN, jawab saja: "Maaf, saya tidak menemukan informasi tersebut di dokumen."
        Jika relevan, hanya gunakan informasi dari konteks:
        KONTEN: {context}
        PERTANYAAN: {query}
        """
    else:
        prompt = f"""
        Kamu adalah asisten AI yang ramah dan menjawab dalam bahasa Indonesia.
        Pertanyaan: {query}
        """

    # Calling the chat model directly (``model([...])``) is deprecated in
    # LangChain; ``invoke`` is the supported entry point.
    response = model.invoke([HumanMessage(content=prompt)])
    return response.content

In [29]:
# Example: a single one-off call
pertanyaan = "apa itu DTSEN?"
jawaban = chatbot(pertanyaan)
print("Kamu:", pertanyaan)
print("Bot :", jawaban)

Kamu: apa itu DTSEN?
Bot : Maaf, saya tidak menemukan informasi tersebut di dokumen.


In [None]:
# === Loop Chat ===
# while True:
#     user_input = input("Kamu: ")
#     if user_input.lower() in ["exit", "quit"]:
#         break
#     print("Bot:", chatbot(user_input))

# batas suci

In [115]:
import os
import json
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain.chat_models import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import HumanMessage
from langchain.docstore.document import Document

In [24]:
# === Configuration ===
# Directory that is scanned for the .txt source documents.
FOLDER_DOCS = "../bahan-chatbot/txt/"
# Location the FAISS index is saved to and loaded from.
INDEX_PATH = "vectorstore_index"
# JSON file that records the per-file sizes captured when the index was built.
META_PATH = "index_meta.json"
# Only search hits whose score is below this value are used as context.
THRESHOLD = 0.8  # smaller = stricter

In [25]:
# Embedding model
embeddings = OllamaEmbeddings(model="nomic-embed-text")
# Chat model
model = ChatOllama(model="gemma:2b", temperature=0.1)

In [26]:
# === Read document metadata ===
def get_docs_metadata():
    """Return a ``{file name: size}`` mapping for the .txt files in FOLDER_DOCS.

    The size comes from ``os.path.getsize``; it is used as a cheap change
    marker (it could be swapped for the last-modified time).
    """
    return {
        name: os.path.getsize(os.path.join(FOLDER_DOCS, name))
        for name in os.listdir(FOLDER_DOCS)
        if name.endswith(".txt")
    }

In [27]:
# === Build the FAISS index ===
def build_vectorstore():
    """Build the FAISS index from the .txt files and record file metadata.

    Every .txt file in FOLDER_DOCS is read and chunked, the chunks are embedded
    with the module-level ``embeddings`` object, and the index is saved to
    INDEX_PATH. The result of ``get_docs_metadata()`` is then written to
    META_PATH as JSON.

    Returns:
        The FAISS vector store that was just built and saved.
    """
    print(f"📂 Membaca dokumen dari: {FOLDER_DOCS}")
    txt_names = [entry for entry in os.listdir(FOLDER_DOCS) if entry.endswith(".txt")]

    # One Document per file; the file name is kept as the "source" metadata.
    loaded = []
    for txt_name in tqdm(txt_names, desc="📄 Membaca file TXT", unit="file"):
        with open(os.path.join(FOLDER_DOCS, txt_name), "r", encoding="utf-8") as source:
            body = source.read()
        loaded.append(Document(page_content=body, metadata={"source": txt_name}))

    print("\n✂️  Memotong dokumen menjadi chunks...")
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    # Split document by document so the progress bar advances per document.
    pieces = []
    for item in tqdm(loaded, desc="🔹 Memotong teks", unit="dokumen"):
        pieces += chunker.split_documents([item])

    print("\n💾 Membuat FAISS index...")
    index = FAISS.from_documents(pieces, embeddings)
    index.save_local(INDEX_PATH)

    # Save the file metadata next to the index
    with open(META_PATH, "w") as meta_file:
        json.dump(get_docs_metadata(), meta_file)

    print("✅ Index berhasil dibuat & disimpan!")
    return index

In [28]:
# === Check whether the index needs to be rebuilt ===
def should_rebuild():
    """Decide whether the FAISS index has to be rebuilt.

    Returns:
        True when INDEX_PATH or META_PATH does not exist, when the metadata
        file cannot be read or parsed, or when the stored metadata differs
        from ``get_docs_metadata()``; False otherwise.
    """
    if not os.path.exists(INDEX_PATH) or not os.path.exists(META_PATH):
        return True
    try:
        with open(META_PATH, "r") as f:
            old_meta = json.load(f)
    except (OSError, ValueError):
        # Unreadable or corrupt metadata (json.JSONDecodeError is a ValueError):
        # treat it as "unknown state" and rebuild. Unlike a bare ``except:``,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return True

    new_meta = get_docs_metadata()
    return old_meta != new_meta

In [None]:
# === Load or rebuild the index ===
# Rebuild when should_rebuild() reports a change; otherwise load the saved index.
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in —
# presumably because loading deserializes pickled data; only load index files
# you created yourself. Confirm against the LangChain FAISS docs.
if should_rebuild():
    print("🔄 Perubahan dokumen terdeteksi. Membuat index baru...")
    vectorstore = build_vectorstore()
else:
    print(f"⚡ Memuat FAISS index dari {INDEX_PATH}...")
    vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
    print("✅ Index berhasil dimuat!\n")

In [26]:
# === Chatbot function ===
def chatbot(query):
    """Answer ``query`` from the indexed documents.

    The module-level ``vectorstore`` is searched for the 3 best hits; only
    hits whose score is below THRESHOLD are kept as context. If none remain,
    a fixed "not found" message is returned without calling the model.

    Args:
        query: The user's question.

    Returns:
        The ``content`` of the model's reply, or the fixed "not found" message.
    """
    results = vectorstore.similarity_search_with_score(query, k=3)
    # Each result is a (document, score) pair.
    relevant_texts = [r[0].page_content for r in results if r[1] < THRESHOLD]

    if not relevant_texts:
        return "Maaf, saya tidak menemukan informasi tersebut di dokumen."

    context = "\n".join(relevant_texts)

    prompt = f"""
            Jawablah pertanyaan berikut **hanya** berdasarkan informasi dari KONTEN di bawah ini.
            Kamu sangat paham terhadap hal-hal yang berkaitan tentang DTSEN (Data Tunggal Sosial Ekonomi Nasional), Kemiskinan, Bantuan Sosial, dan hal terkait itu.
            Jika jawabannya tidak ada di KONTEN, jawab: "Maaf, saya tidak menemukan informasi tersebut di dokumen."
            Gunakan bahasa Indonesia yang jelas.

            KONTEN:
            {context}

            PERTANYAAN:
            {query}
            """
    # Calling the chat model directly (``model([...])``) is deprecated in
    # LangChain; ``invoke`` is the supported entry point.
    response = model.invoke([HumanMessage(content=prompt)])
    return response.content

In [27]:
# === Example usage in the notebook ===
pertanyaan = "apa itu DTSEN?"
jawaban = chatbot(pertanyaan)
print("Kamu:", pertanyaan)
print("Bot :", jawaban)

Kamu: apa itu DTSEN?
Bot : Maaf, saya tidak menemukan informasi tersebut di dokumen.


# batas suci 2

In [13]:
import os
import json
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain.chat_models import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import HumanMessage
from langchain.docstore.document import Document

In [14]:
# === Configuration ===
# Directory that is scanned for the .txt source documents.
FOLDER_DOCS = "../bahan-chatbot/txt/"
# Location the FAISS index is saved to and loaded from.
INDEX_PATH = "vectorstore_index"
# JSON file that records the per-file sizes of the indexed documents.
META_PATH = "index_meta.json"
# Only search hits whose score is below this value are used as context.
THRESHOLD = 1  # smaller = stricter

In [15]:
# Embedding model and chat model
embeddings = OllamaEmbeddings(model="nomic-embed-text")
model = ChatOllama(model="gemma:2b", temperature=0.8)

In [16]:
# === Document metadata ===
def get_docs_metadata():
    """Return a ``{file name: size}`` mapping for the .txt files in FOLDER_DOCS.

    The size comes from ``os.path.getsize``.
    """
    sizes = {}
    for entry in os.listdir(FOLDER_DOCS):
        if not entry.endswith(".txt"):
            continue
        sizes[entry] = os.path.getsize(os.path.join(FOLDER_DOCS, entry))
    return sizes

In [17]:
# === Split documents ===
def split_documents(files):
    """Read the given files from FOLDER_DOCS and split them into chunk Documents.

    Args:
        files: File names (relative to FOLDER_DOCS) to read as UTF-8 text.

    Returns:
        A list with one Document per chunk; each carries the originating file
        name under the "source" metadata key.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    collected = []
    for source_name in tqdm(files, desc="📄 Memproses dokumen", unit="file"):
        source_path = os.path.join(FOLDER_DOCS, source_name)
        with open(source_path, "r", encoding="utf-8") as handle:
            pieces = chunker.split_text(handle.read())
        for piece in pieces:
            collected.append(Document(page_content=piece, metadata={"source": source_name}))
    return collected

In [18]:
# === Full index rebuild ===
def rebuild_index():
    """Rebuild the FAISS index from every .txt file in FOLDER_DOCS.

    The chunks come from ``split_documents``; the new index is saved to
    INDEX_PATH and the result of ``get_docs_metadata()`` is written to
    META_PATH as JSON.

    Returns:
        The FAISS vector store that was just built and saved.
    """
    txt_files = []
    for entry in os.listdir(FOLDER_DOCS):
        if entry.endswith(".txt"):
            txt_files.append(entry)
    print(f"\n♻️ Mendeteksi {len(txt_files)} file. Membuat ulang FAISS index...")

    chunks = split_documents(txt_files)

    print("\n💾 Menyimpan index baru...")
    fresh_index = FAISS.from_documents(chunks, embeddings)
    fresh_index.save_local(INDEX_PATH)

    # Record the file sizes that belong to this index build.
    with open(META_PATH, "w") as meta_file:
        json.dump(get_docs_metadata(), meta_file)

    print("✅ Index berhasil dibuat ulang!")
    return fresh_index

In [19]:
# === Append new files ===
def append_new_files(vectorstore):
    """Add files that are not yet listed in the metadata to an existing index.

    Files present in ``get_docs_metadata()`` but absent from the JSON stored at
    META_PATH are chunked with ``split_documents``, added to ``vectorstore``,
    and the index and metadata are saved again. Nothing is written when there
    are no new files.

    Args:
        vectorstore: The loaded vector store to extend.

    Returns:
        The same ``vectorstore`` object that was passed in.
    """
    with open(META_PATH, "r") as meta_file:
        known = json.load(meta_file)

    current = get_docs_metadata()
    additions = {name: size for name, size in current.items() if name not in known}
    if len(additions) == 0:
        print("✅ Tidak ada file baru untuk ditambahkan.")
        return vectorstore

    print(f"\n📂 Menambahkan {len(additions)} file baru ke index...")
    fresh_chunks = split_documents(list(additions))

    print("\n💾 Menyimpan index yang telah diperbarui...")
    vectorstore.add_documents(fresh_chunks)
    vectorstore.save_local(INDEX_PATH)

    # Only the newly added files are merged into the stored metadata.
    known.update(additions)
    with open(META_PATH, "w") as meta_file:
        json.dump(known, meta_file)

    print("✅ File baru berhasil ditambahkan.")
    return vectorstore

In [20]:
# === Change detection ===
def load_or_update_index():
    """Return an index that matches the documents currently in FOLDER_DOCS.

    A full rebuild (``rebuild_index``) happens when:
      * INDEX_PATH or META_PATH does not exist,
      * the metadata file cannot be read or parsed,
      * an already-indexed file changed size, or
      * an already-indexed file is no longer in the folder (otherwise its
        chunks would stay in the index and keep showing up as context).

    Otherwise the saved index is loaded and ``append_new_files`` adds any
    files that are not yet listed in the metadata.

    Returns:
        The rebuilt, or loaded and possibly extended, vector store.
    """
    if not os.path.exists(INDEX_PATH) or not os.path.exists(META_PATH):
        print("🆕 Index belum ada. Membuat baru...")
        return rebuild_index()

    try:
        with open(META_PATH, "r") as f:
            old_meta = json.load(f)
    except (OSError, ValueError):
        # Unreadable or corrupt metadata (json.JSONDecodeError is a ValueError):
        # the state of the saved index is unknown, so rebuild instead of crashing.
        print("♻️ Metadata index tidak dapat dibaca. Membuat ulang index...")
        return rebuild_index()
    new_meta = get_docs_metadata()

    # Check whether an already-indexed file has changed
    for file, size in new_meta.items():
        if file in old_meta and old_meta[file] != size:
            print(f"♻️ Perubahan terdeteksi di file: {file}")
            return rebuild_index()

    # Check whether an already-indexed file has been removed from the folder
    removed = [file for file in old_meta if file not in new_meta]
    if removed:
        print(f"♻️ File dihapus dari folder: {', '.join(removed)}")
        return rebuild_index()

    # Load the existing index
    # NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in;
    # only load index files you created yourself.
    print(f"⚡ Memuat FAISS index dari {INDEX_PATH}...")
    vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)

    # Add new files, if any
    return append_new_files(vectorstore)

In [21]:
# === Chatbot function ===
def chatbot(query):
    """Answer ``query`` from the indexed documents in a customer-service tone.

    The module-level ``vectorstore`` is searched for the 3 best hits; only
    hits whose score is below THRESHOLD are kept as context. If none remain,
    a fixed "not found" message is returned without calling the model.

    Args:
        query: The user's question.

    Returns:
        The ``content`` of the model's reply, or the fixed "not found" message.
    """
    results = vectorstore.similarity_search_with_score(query, k=3)
    # Each result is a (document, score) pair.
    relevant_texts = [r[0].page_content for r in results if r[1] < THRESHOLD]
    if not relevant_texts:
        return "Maaf, saya tidak menemukan informasi tersebut di dokumen."
    context = "\n".join(relevant_texts)
    prompt = f"""
Jawablah pertanyaan berikut **hanya** berdasarkan informasi dari KONTEN di bawah ini.
Kamu harus memposisikan diri sebagai suatu customer service, jadi harus ramah kepada penanya. Kalau ada yang menyapa ada minta tolong harus kamu bantu.
Kamu juga paham banyak hal tentang kementerian sosial dan badan pusat statistik yang ada di Indonesia.
Jika jawabannya tidak ada di KONTEN, jawab: "Maaf, saya tidak menemukan informasi tersebut di dokumen."
Gunakan bahasa Indonesia yang jelas.

KONTEN:
{context}

PERTANYAAN:
{query}
"""
    # Calling the chat model directly (``model([...])``) is deprecated in
    # LangChain; ``invoke`` is the supported entry point.
    response = model.invoke([HumanMessage(content=prompt)])
    return response.content

In [22]:
# === Main ===
# Build, load, or incrementally update the index; chatbot() reads this global.
vectorstore = load_or_update_index()

🆕 Index belum ada. Membuat baru...

♻️ Mendeteksi 14 file. Membuat ulang FAISS index...


📄 Memproses dokumen: 100%|██████████| 14/14 [00:00<00:00, 559.52file/s]


💾 Menyimpan index baru...





✅ Index berhasil dibuat ulang!


In [25]:
# Example: send a single question through the chatbot and print the exchange
pertanyaan = "apa itu DTSEN?"
print("Kamu:", pertanyaan)
print("Bot :", chatbot(pertanyaan))

Kamu: apa itu DTSEN?
Bot : Maaf, saya tidak menemukan informasi tersebut di dokumen.


# reset index

In [24]:
# import shutil
# import os

# INDEX_PATH = "vectorstore_index"
# META_PATH = "index_meta.json"

# # Hapus folder FAISS index
# if os.path.exists(INDEX_PATH):
#     shutil.rmtree(INDEX_PATH)
#     print(f"✅ Folder {INDEX_PATH} dihapus.")

# # Hapus metadata
# if os.path.exists(META_PATH):
#     os.remove(META_PATH)
#     print(f"✅ File {META_PATH} dihapus.")