# Proses Training untuk buat Model

In [2]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [3]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [4]:
# Build the Sastrawi (Indonesian) stemmer once and reuse it for every call
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def preprocess_text(text):
    """Normalise a piece of text before TF-IDF vectorisation.

    Steps, in order: lowercase the input, delete every character that is
    neither a word character nor whitespace, split on whitespace, stem each
    token with the module-level ``stemmer``, and join the stems with single
    spaces.

    Args:
        text: The raw string to normalise.

    Returns:
        A single string of space-separated stemmed tokens.
    """
    # Lowercase first, then strip punctuation in one pass
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    # Stem token by token (this is stemming, not dictionary lemmatisation)
    stemmed_tokens = map(stemmer.stem, without_punctuation.split())
    return ' '.join(stemmed_tokens)

In [5]:
!wget https://raw.githubusercontent.com/Muhammad-Ikhwan-Fathulloh/Artificial-Intelligence-Super-Class-Batch-1/refs/heads/main/Text_Query/data.json

--2025-03-24 15:23:37--  https://raw.githubusercontent.com/Muhammad-Ikhwan-Fathulloh/Artificial-Intelligence-Super-Class-Batch-1/refs/heads/main/Text_Query/data.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4985 (4.9K) [text/plain]
Saving to: ‘data.json’


2025-03-24 15:23:37 (60.3 MB/s) - ‘data.json’ saved [4985/4985]



In [6]:
# Load the Q&A corpus: the list stored under the "qa_corpus" key of data.json.
# The encoding is given explicitly so non-ASCII text is decoded the same way on
# every platform (the default encoding of open() is locale-dependent).
with open("data.json", "r", encoding="utf-8") as file:
    corpus = json.load(file)["qa_corpus"]

In [7]:
# Pull the raw question and answer strings out of the corpus (parallel lists)
questions = [entry["question"] for entry in corpus]
answers = [entry["answer"] for entry in corpus]

# Normalise both sides with the shared preprocessing function
preprocessed_questions = list(map(preprocess_text, questions))
preprocessed_answers = list(map(preprocess_text, answers))

# The vocabulary is learned from questions and answers together
combined_corpus = preprocessed_questions + preprocessed_answers

# Fit the TF-IDF vectorizer (fit() returns the fitted vectorizer itself)
vectorizer = TfidfVectorizer().fit(combined_corpus)

# Persist the fitted vectorizer so queries can be vectorised later
joblib.dump(vectorizer, "vectorizer.joblib")

# Only the questions are vectorised; answers are stored as plain text
question_vectors = vectorizer.transform(preprocessed_questions)

# Densify once, then pair every row with its original question/answer
dense_rows = question_vectors.toarray().tolist()
vector_data = [
    {"question": raw_question, "answer": raw_answer, "vector": row}
    for raw_question, raw_answer, row in zip(questions, answers, dense_rows)
]

# Write the records to vector.json
with open("vector.json", "w") as vector_file:
    json.dump(vector_data, vector_file, indent=4)

print("Training complete and data saved to vectorizer.joblib and vector.json")

Training complete and data saved to vectorizer.joblib and vector.json


# Proses Testing hasil Training Model

In [8]:
import json
import joblib
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

In [9]:
# Restore the two artefacts written by the training step:
# the fitted TF-IDF vectorizer ...
vectorizer = joblib.load("vectorizer.joblib")

# ... and the list of {"question", "answer", "vector"} records
with open("vector.json", "r") as vector_file:
    question_vectors = json.load(vector_file)

In [10]:
# Maximum Euclidean distance at which a stored question still counts as a match
THR = 1  # Adjust this value based on testing and requirement


def find_best_match(user_question, threshold=None):
    """Return the stored question closest to ``user_question``.

    The query goes through the same ``preprocess_text`` normalisation that was
    applied to the questions at training time, is turned into a TF-IDF vector
    with the loaded ``vectorizer``, and is compared by Euclidean distance to
    every record in ``question_vectors``.

    Args:
        user_question: Raw question text typed by the user.
        threshold: Largest distance accepted as a match. Defaults to ``THR``.

    Returns:
        A ``(question, answer, distance)`` tuple for the nearest stored
        question, or ``(None, None, float("inf"))`` when nothing qualifies.
    """
    if threshold is None:
        threshold = THR

    no_match = (None, None, float("inf"))
    if not question_vectors:
        return no_match

    # Use the training-time normalisation; otherwise inflected query words
    # never line up with the stemmed vocabulary the vectorizer was fitted on.
    user_vector = vectorizer.transform([preprocess_text(user_question)]).toarray()

    # A query with no known vocabulary becomes the zero vector, which sits at
    # distance ~1.0 from every L2-normalised TF-IDF row and could slip under
    # the threshold even though it shares no term with any question.
    if not user_vector.any():
        return no_match

    # Compare against all stored vectors in a single call
    stored_vectors = np.array([item["vector"] for item in question_vectors])
    distances = euclidean_distances(user_vector, stored_vectors)[0]

    # argmin returns the first minimum, so ties resolve to the earliest record
    best_index = int(np.argmin(distances))
    best_distance = distances[best_index]
    if best_distance > threshold:
        return no_match

    best_item = question_vectors[best_index]
    best_answer = best_item.get("answer", "Maaf, tidak ada jawaban yang tersedia.")
    return best_item["question"], best_answer, best_distance

In [11]:
# Main Q&A loop: read a question, look up the closest stored question, print its answer
print("Selamat datang di sistem tanya jawab Emerald Mabel. Ketik 'exit' untuk keluar.")

while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " still work
        user_question = input("Anda: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or kernel interrupted: leave the loop without a traceback
        print("\nTerima kasih telah menggunakan layanan kami. Sampai jumpa!")
        break

    # Nothing typed: prompt again instead of running a lookup on empty text
    if not user_question:
        continue

    if user_question.lower() == "exit":
        print("Terima kasih telah menggunakan layanan kami. Sampai jumpa!")
        break

    closest_question, closest_answer, closest_distance = find_best_match(user_question)

    # find_best_match signals "no match" with None for the question
    if closest_question is not None:
        print(f"Pertanyaan terkait: {closest_question}")
        print(f"Emerald Mabel: {closest_answer}")
    else:
        print("Maaf, kami tidak menemukan jawaban yang sesuai dengan pertanyaan Anda.")

Selamat datang di sistem tanya jawab Emerald Mabel. Ketik 'exit' untuk keluar.
Anda: apa itu emerald
Maaf, kami tidak menemukan jawaban yang sesuai dengan pertanyaan Anda.
Anda: Apakah Emerald Mabel menyediakan pengiriman
Pertanyaan terkait: Apakah Emerald Mabel menyediakan pengiriman untuk semua produk?
Emerald Mabel: Ya, kami menyediakan layanan pengiriman untuk semua produk kami. Biaya pengiriman dapat bervariasi tergantung lokasi.
Anda: exit
Terima kasih telah menggunakan layanan kami. Sampai jumpa!
