In [None]:
# Colab-only setup cell: run the google.colab user-authentication flow before
# the Vertex AI code in the following cell is executed.
from google.colab import auth
auth.authenticate_user()



In [None]:
import os
import time
from datetime import datetime
from typing import List, Tuple, Set, Optional

import vertexai
from vertexai.generative_models import GenerativeModel

# This function must live at global scope so the class below can call it.
def custom_log(level: str, message: str):
    """
    Print one log line to the console as ``<timestamp> - <level> - <message>``.

    Args:
        level (str): Severity label of the entry (for example "INFO", "ERROR",
            "WARNING", "CRITICAL").
        message (str): The text to log.
    """
    # Local wall-clock time, formatted down to the second.
    now = datetime.now()
    stamp = now.strftime("%Y-%m-%d %H:%M:%S")

    # Assemble the three fields with the shared " - " separator and emit them.
    fields = (stamp, f"{level}", f"{message}")
    print(" - ".join(fields))

# --- Keywords ---
# The allowed vocabulary used to build the dataset.
# Every phrase produced by the LLM MUST be composed only of these entries.
# Some entries are multi-word (e.g. "Air putih", "Tidak Bisa Bicara"); the
# generator lowercases all entries before using them, so the casing here does
# not affect matching.
KEYWORDS = [
    "Saya", "Mau", "Ya", "Tidak", "Cukup", "Makan", "Minum", "Tidur", "Kamu",
    "Lagi", "Bukan", "Toilet", "Halo", "Bagus", "Buruk", "Lihat", "Bicara",
    "Dengar", "Rumah", "Sakit", "Tidak Bisa Bicara", "Bermain", "Terima Kasih",
    "Sampai Jumpa", "Senang", "Sedih", "Marah", "Cinta", "Takut", "Bingung",
    "Lapar", "Nasi", "Mie", "Sup", "Ikan", "Tempe", "Ayam Goreng",
    "Telur Goreng", "Roti", "Hamburger", "Kentang goreng", "Buah-buahan",
    "Es krim", "Keripik", "Cokelat", "Permen", "Kacang", "Haus", "Air putih",
    "Susu", "Susu Cokelat", "Jus Jeruk", "Apel", "Jeruk", "Semangka", "Pisang",
    "Anggur", "Stroberi", "Alpukat", "Anjing", "Kucing", "Kelinci", "Bebek",
    "Babi", "Beruang", "Panda", "Penguin", "Gajah", "Rubah", "Koala",
    "Burung Hantu", "Katak", "MonyeT", "Sapi", "Ayam", "Tikus", "Burung",
    "Kura-kura", "Penyu", "Domba", "Ayah", "Ibu", "Kakek", "Nenek",
    "Kakak Perempuan", "Kakak Laki-Laki", "Adik Perempuan", "Adik Laki-Laki",
    "Pensil", "Rautan", "Tas Ransel", "Buku", "Gunting", "Penggaris",
    "Pensil Warna", "Kertas", "Origami", "Lem", "Handphone", "Tablet",
    "Laptop", "TV", "Komputer", "Remot Kontrol", "Kamera", "AC",
    "Kipas Angin", "Sendok", "Garpu", "Sumpit", "Piring", "Mangkuk",
    "Gelas", "Cangkir", "Pisau", "Kulkas", "Pagi", "Siang", "Sore",
    "Malam", "Waktu", "Jam", "Menit", "Sekarang", "Hari ini", "Besok",
    "Kemarin", "Seminggu", "Minggu ini", "Akhir Pekan", "Minggu Depan",
    "Minggu Lalu", "Bulan", "Bulan Ini", "Bulan Depan", "Bulan Lalu",
    "Matahari", "Terik", "Hujan", "Panas", "Dingin", "Musim", "Kemarau",
    "Berawan", "Badai", "Ini", "Besar", "Sedang", "Kecil", "Banyak",
    "Sedikit", "Tebal", "Tipis", "Kotor", "Bersih"
]

# --- Configuration ---
# Class that holds every parameter and setting used by the generator.
class Config:
    """Static settings read by DatasetGenerator (accessed as ``self.config.<NAME>``)."""

    # Google Cloud project and region passed to vertexai.init().
    PROJECT_ID = "talk-of-the-heart-aac-464912"
    LOCATION = "us-central1"
    MODEL_NAME = "gemini-2.5-pro" # LLM used to generate the data.
    # Target number of unique phrases; generation loops until this is reached.
    DATA_QUANTITY = 7000 
    BATCH_SIZE = 150 # Number of phrases requested from the LLM per API call.
    # Number of attempts made for a single batch request.
    MAX_RETRIES = 5 
    # Seconds to sleep after a batch that added new phrases.
    SUCCESS_DELAY = 0.5 
    # Seconds to sleep after an API error or a batch that added nothing.
    FAILURE_DELAY = 10 
    # Inclusive bounds on the number of keywords in an accepted phrase.
    MIN_PHRASE_LENGTH = 2
    MAX_PHRASE_LENGTH = 6 
    # Path of the final dataset file.
    OUTPUT_FILENAME = "Dataset_Key.txt" 
    # Path of the progress file rewritten after each successful batch.
    CHECKPOINT_FILENAME = "Dataset_Checkpoint.txt"

# --- Core Logic ---
# Main class that manages initialization, generation, and validation of the data.
class DatasetGenerator:
    """Build a set of unique keyword phrases by repeatedly prompting an LLM.

    Each line returned by the model is accepted only if it can be split
    entirely into entries of the allowed vocabulary, has an allowed number of
    keywords, repeats no keyword, and violates none of the logical rules.
    Progress is written to a checkpoint file after every successful batch and
    restored from it on construction.

    Attributes:
        config: Settings object (see ``Config``).
        model: The ``GenerativeModel`` used for generation.
        lowercase_keywords: De-duplicated, lowercased vocabulary, sorted
            alphabetically (used in the prompt).
        sorted_keywords: The same vocabulary sorted longest-first (used for
            matching so multi-word keywords win over their prefixes).
        final_dataset: Accepted phrases, each stored as a tuple of keywords.
    """

    def __init__(self, config: "Config", keywords: List[str]):
        """Connect to Vertex AI, prepare the vocabulary, and restore progress.

        Args:
            config: Settings object providing the attributes of ``Config``.
            keywords: The allowed vocabulary; entries are lowercased and
                de-duplicated.

        Raises:
            Exception: Whatever ``vertexai.init`` / ``GenerativeModel`` raise
                when initialization fails (re-raised after logging).
        """
        self.config = config
        self._initialize_vertexai()  # Connect to Vertex AI and load the model.

        # Lowercase the vocabulary for validation, then keep a second copy
        # ordered longest-first so the longest matching keyword is tried first.
        self.lowercase_keywords = sorted({word.lower() for word in keywords})
        self.sorted_keywords = sorted(self.lowercase_keywords, key=len, reverse=True)
        self.final_dataset: Set[Tuple[str, ...]] = set()

        self._load_from_checkpoint()
        self._precompute_logical_rules()

    @staticmethod
    def _format_record(data_tuple: Tuple[str, ...]) -> str:
        """Serialize one phrase as ``"kw1", "kw2", ...`` (without a newline)."""
        return ", ".join(f'"{keyword}"' for keyword in data_tuple)

    @staticmethod
    def _parse_record(line: str) -> Optional[Tuple[str, ...]]:
        """Parse a line written by ``_format_record`` back into a tuple.

        Returns:
            The tuple of keywords, or ``None`` for a blank or malformed line
            (one that yields an empty keyword).
        """
        stripped = line.strip()
        if not stripped:
            return None
        # Vocabulary entries contain neither commas nor double quotes, so a
        # plain split on "," followed by quote stripping is sufficient.
        tokens = tuple(part.strip().strip('"').strip() for part in stripped.split(","))
        if not all(tokens):
            return None
        return tokens

    def _write_records(self, filename: str) -> None:
        """Write the whole dataset, sorted, to *filename*.

        The data is written to ``<filename>.tmp`` first and then moved into
        place with ``os.replace``, so an interruption during the write cannot
        leave a truncated file under the real name.

        Raises:
            OSError: If writing or replacing the file fails.
        """
        temp_name = f"{filename}.tmp"
        with open(temp_name, "w", encoding="utf-8") as f:
            f.writelines(
                self._format_record(record) + "\n"
                for record in sorted(self.final_dataset)
            )
        os.replace(temp_name, filename)

    def _load_from_checkpoint(self):
        """Restore previously saved phrases from the checkpoint file, if any.

        A missing file starts a new dataset. A file that cannot be read is
        logged and ignored (the dataset starts empty). Blank or malformed
        lines are skipped. Records are trusted as written by this class; they
        are not re-validated against the current vocabulary.
        """
        checkpoint_path = self.config.CHECKPOINT_FILENAME
        if not os.path.exists(checkpoint_path):
            custom_log("INFO", "Checkpoint file not found. Starting a new dataset.")
            return

        restored: Set[Tuple[str, ...]] = set()
        try:
            with open(checkpoint_path, "r", encoding="utf-8") as f:
                for line in f:
                    record = self._parse_record(line)
                    if record is not None:
                        restored.add(record)
        except (OSError, UnicodeError) as e:
            custom_log("ERROR", f"Failed to load from checkpoint file: {e}. Starting fresh.")
            self.final_dataset = set()
            return

        self.final_dataset.update(restored)
        custom_log("INFO", f"Loaded {len(restored)} records from checkpoint file '{checkpoint_path}'.")

    def _save_to_checkpoint(self):
        """Persist everything collected so far to the checkpoint file.

        Failures are logged, not raised, so a failed checkpoint does not stop
        generation.
        """
        try:
            self._write_records(self.config.CHECKPOINT_FILENAME)
            custom_log("INFO", f"Checkpoint saved with {len(self.final_dataset)} records.")
        except OSError as e:
            custom_log("ERROR", f"Could not save checkpoint to file '{self.config.CHECKPOINT_FILENAME}': {e}")

    def _initialize_vertexai(self):
        """Initialize Vertex AI and create the generative model (``self.model``).

        Raises:
            Exception: Any initialization error, re-raised after it is logged.
        """
        try:
            custom_log("INFO", f"Initializing Vertex AI for project '{self.config.PROJECT_ID}' in location '{self.config.LOCATION}'...")
            vertexai.init(project=self.config.PROJECT_ID, location=self.config.LOCATION)
            self.model = GenerativeModel(self.config.MODEL_NAME)
            custom_log("INFO", f"Successfully initialized Generative Model '{self.config.MODEL_NAME}'.")
        except Exception as e:
            custom_log("ERROR", f"An error occurred during Vertex AI initialization: {e}")
            # Show a prominent hint, then propagate so the script stops.
            print("\n--------------------------------------------------------------")
            print("CRITICAL ERROR: Please ensure Authentication and Project ID are correct.")
            print("--------------------------------------------------------------\n")
            raise

    def _precompute_logical_rules(self):
        """Build the rule sets consulted by ``is_phrase_logically_valid``.

        Every rule is a pair of keywords that must not appear together in one
        phrase. Pairs are stored as frozensets, so the order of the two words
        is irrelevant and the check is a simple subset test.
        """
        # Words with opposite meaning.
        antonyms = [
            ("bersih", "kotor"), ("bagus", "buruk"), ("besar", "kecil"),
            ("tebal", "tipis"), ("panas", "dingin"), ("banyak", "sedikit"),
            ("ya", "tidak"), ("senang", "sedih")
        ]
        self.antonym_sets = [frozenset(pair) for pair in antonyms]

        # Word pairs that make no sense together.
        illogical_pairs = [("makan", "ac"), ("minum", "lem")]
        self.illogical_pair_sets = [frozenset(pair) for pair in illogical_pairs]

        # Word pairs that refer to conflicting points in time.
        temporal_conflicts = [
            ("kemarin", "besok"), ("kemarin", "minggu depan"), ("kemarin", "bulan depan"),
            ("minggu lalu", "besok"), ("minggu lalu", "minggu depan"), ("minggu lalu", "bulan depan"),
            ("bulan lalu", "besok"), ("bulan lalu", "minggu depan"), ("bulan lalu", "bulan depan"),
            ("sekarang", "kemarin"), ("sekarang", "minggu lalu"), ("sekarang", "bulan lalu"),
            ("sekarang", "besok"), ("hari ini", "kemarin"), ("hari ini", "minggu lalu"),
            ("hari ini", "bulan lalu"), ("hari ini", "besok"), ("minggu ini", "minggu lalu"),
            ("minggu ini", "minggu depan"), ("bulan ini", "bulan lalu"), ("bulan ini", "bulan depan")
        ]
        self.temporal_conflict_sets = [frozenset(pair) for pair in temporal_conflicts]
        custom_log("INFO", "Logical rule sets have been pre-computed for efficiency.")

    def is_phrase_logically_valid(self, keywords_list: List[str]) -> bool:
        """Return ``False`` if the phrase contains any forbidden keyword pair.

        Args:
            keywords_list: The (lowercased) keywords of one phrase.

        Returns:
            ``True`` when no antonym, temporal-conflict, or illogical pair is
            fully contained in the phrase.
        """
        keyword_set = set(keywords_list)
        rule_groups = (
            self.antonym_sets,
            self.temporal_conflict_sets,
            self.illogical_pair_sets,
        )
        for rules in rule_groups:
            if any(pair.issubset(keyword_set) for pair in rules):
                return False
        return True

    def validate_keyword_sequence(self, phrase: str) -> Optional[List[str]]:
        """Split *phrase* into vocabulary keywords, or reject it.

        This guards against model hallucination: the phrase is accepted only
        if it consists ENTIRELY of entries from the vocabulary.

        Args:
            phrase: One raw line from the model. Case and runs of whitespace
                (including tabs) are normalized before matching.

        Returns:
            The matched keywords in order (an empty list for a blank phrase),
            or ``None`` if any part of the phrase is not in the vocabulary.
        """
        remaining = " ".join(phrase.lower().split())
        found_keywords: List[str] = []
        while remaining:
            # Longest keyword first, so multi-word entries such as "air putih"
            # are matched as a unit rather than by a shorter prefix.
            for keyword in self.sorted_keywords:
                if remaining == keyword or remaining.startswith(keyword + " "):
                    found_keywords.append(keyword)
                    remaining = remaining[len(keyword):].lstrip()
                    break
            else:
                # Nothing in the vocabulary matches the start of the remainder.
                return None
        return found_keywords

    def _build_prompt(self, batch_size: int) -> str:
        """Build the strict generation prompt sent to the model.

        Args:
            batch_size: Number of lines the model is asked to produce.

        Returns:
            The prompt text. The allowed phrase length stated in the prompt is
            taken from the configuration so it matches the validation bounds.
        """
        keyword_string = ", ".join(self.lowercase_keywords)
        min_length = self.config.MIN_PHRASE_LENGTH
        max_length = self.config.MAX_PHRASE_LENGTH
        return f"""
        PERHATIKAN: Anda adalah mesin generator data presisi tinggi. Kegagalan dalam mematuhi aturan format akan membuat output tidak dapat digunakan. Ikuti semua instruksi dengan sangat teliti.

        TUGAS UTAMA: Hasilkan {batch_size} baris kombinasi kata kunci.

        <ATURAN_SANGAT_PENTING>
        1. SETIAP KATA dalam output Anda HARUS, TANPA PENGECUALIAN, berasal dari daftar kata kunci yang diizinkan ini. JANGAN TAMBAHKAN KATA APAPUN di luar daftar.
        2. Setiap baris output HANYA berisi kata kunci yang dipisahkan oleh spasi. TIDAK BOLEH ada tanda baca, nomor, atau kata pengantar seperti "Berikut adalah...".
        3. Setiap baris harus berisi {min_length} hingga {max_length} kata kunci.
        4. Jangan ulangi kata kunci dalam satu baris.
        5. Semua kombinasi harus logis dan masuk akal dalam Bahasa Indonesia. Hindari kontradiksi (misal: "panas dingin", "kemarin besok").
        </ATURAN_SANGAT_PENTING>
        
        <DAFTAR_KATA_KUNCI_YANG_DIIZINKAN>
        {keyword_string}
        </DAFTAR_KATA_KUNCI_YANG_DIIZINKAN>
        <CONTOH_OUTPUT_YANG_BENAR>
        saya lapar mau makan nasi sekarang
        adik laki-laki bermain komputer lagi
        nenek sedih hari ini
        tas ransel kotor
        ibu minum air putih
        burung hantu tidur siang
        rumah besar bersih
        </CONTOH_OUTPUT_YANG_BENAR>     
        """

    def _request_batch_with_retries(self, llm_prompt: str, limit: int) -> bool:
        """Send one prompt, retrying up to ``MAX_RETRIES`` times.

        Args:
            llm_prompt: The prompt to send.
            limit: Maximum number of new phrases to accept from the response.

        Returns:
            ``True`` as soon as an attempt adds at least one new phrase (the
            checkpoint is saved first); ``False`` if every attempt failed or
            added nothing.
        """
        max_retries = self.config.MAX_RETRIES
        for attempt in range(1, max_retries + 1):
            try:
                response = self.model.generate_content(llm_prompt)  # Call the LLM.
                llm_results = [line.strip() for line in response.text.splitlines() if line.strip()]
                newly_added_count = self._process_batch(llm_results, limit=limit)
            except Exception as e:
                # Any failure of the call (or of reading the response text)
                # is treated as retryable.
                custom_log("ERROR", f"Attempt {attempt}/{max_retries}: An API error occurred: {e}. Retrying in {self.config.FAILURE_DELAY}s...")
                time.sleep(self.config.FAILURE_DELAY)
                continue

            if newly_added_count > 0:
                custom_log("INFO", f"Success. Added {newly_added_count} new valid phrases. New total: {len(self.final_dataset)}")
                self._save_to_checkpoint()
                time.sleep(self.config.SUCCESS_DELAY)
                return True

            custom_log("WARNING", f"Attempt {attempt}/{max_retries}: Batch yielded 0 new valid phrases. Retrying...")
            time.sleep(self.config.FAILURE_DELAY)
        return False

    def generate(self):
        """Request batches until ``DATA_QUANTITY`` unique phrases are collected.

        If a batch still yields nothing after ``MAX_RETRIES`` attempts,
        generation stops early instead of looping forever; phrases collected
        so far stay in ``final_dataset``.
        """
        target = self.config.DATA_QUANTITY
        custom_log("INFO", f"Starting dataset generation. Goal: {target} unique phrases.")
        start_time = time.time()

        # Keep going until the target quantity has been reached.
        while len(self.final_dataset) < target:
            needed = target - len(self.final_dataset)
            current_batch_size = min(self.config.BATCH_SIZE, needed)

            custom_log("INFO", f"Requesting {current_batch_size} new phrases. (Current unique total: {len(self.final_dataset)}/{target})")

            llm_prompt = self._build_prompt(current_batch_size)
            if not self._request_batch_with_retries(llm_prompt, needed):
                custom_log("ERROR", f"No new phrases after {self.config.MAX_RETRIES} consecutive attempts. Stopping generation early.")
                break

        # Final summary.
        total_time = time.time() - start_time
        print("---------------------------------------------------------")
        if len(self.final_dataset) >= target:
            custom_log("INFO", "PROCESS COMPLETE!")
        else:
            custom_log("WARNING", f"PROCESS STOPPED EARLY: {len(self.final_dataset)}/{target} unique phrases collected.")
        custom_log("INFO", f"Total UNIQUE data points created: {len(self.final_dataset)}")
        custom_log("INFO", f"Total time elapsed: {total_time:.2f} seconds")

    def _process_batch(self, llm_results: List[str], limit: Optional[int] = None) -> int:
        """Validate each model line and add the valid ones to the dataset.

        Args:
            llm_results: Raw phrases returned by the model, one per item.
            limit: Optional cap on how many NEW phrases may be added from this
                batch; ``None`` means no cap.

        Returns:
            The number of phrases newly added to ``final_dataset``.
        """
        initial_count = len(self.final_dataset)
        for result in llm_results:
            if limit is not None and len(self.final_dataset) - initial_count >= limit:
                break  # Do not grow past what the caller still needs.

            valid_tokens = self.validate_keyword_sequence(result)
            if not valid_tokens:
                continue  # Out-of-vocabulary content or an empty line.

            # Hard validation criteria:
            # 1. Keyword count within the configured bounds.
            is_long_enough = self.config.MIN_PHRASE_LENGTH <= len(valid_tokens) <= self.config.MAX_PHRASE_LENGTH
            # 2. No keyword repeated inside the phrase.
            is_internal_unique = len(valid_tokens) == len(set(valid_tokens))
            # 3. No logical conflict.
            is_logical = self.is_phrase_logically_valid(valid_tokens)

            if is_long_enough and is_internal_unique and is_logical:
                # 4. The set silently drops phrases that were already collected.
                self.final_dataset.add(tuple(valid_tokens))

        return len(self.final_dataset) - initial_count

    def save_to_file(self):
        """Write the dataset to the output file and clean up the checkpoint.

        The checkpoint is removed only when the dataset reached
        ``DATA_QUANTITY``; after an early stop it is kept so a later run can
        resume. I/O failures are logged, not raised.
        """
        if not self.final_dataset:
            custom_log("WARNING", "Final dataset is empty. Nothing to save.")
            return

        try:
            self._write_records(self.config.OUTPUT_FILENAME)
        except OSError as e:
            custom_log("ERROR", f"Failed to write to file '{self.config.OUTPUT_FILENAME}': {e}")
            return
        custom_log("INFO", f"Dataset successfully saved to '{self.config.OUTPUT_FILENAME}'")

        if len(self.final_dataset) < self.config.DATA_QUANTITY:
            custom_log("WARNING", f"Dataset is incomplete ({len(self.final_dataset)}/{self.config.DATA_QUANTITY}). Keeping checkpoint file '{self.config.CHECKPOINT_FILENAME}' so the run can be resumed.")
            return

        # The run is finished, so the checkpoint is no longer needed.
        try:
            if os.path.exists(self.config.CHECKPOINT_FILENAME):
                os.remove(self.config.CHECKPOINT_FILENAME)
                custom_log("INFO", f"Checkpoint file '{self.config.CHECKPOINT_FILENAME}' removed.")
        except OSError as e:
            custom_log("ERROR", f"Could not remove checkpoint file '{self.config.CHECKPOINT_FILENAME}': {e}")


# --- Main Execution ---
def main():
    """Run the whole pipeline: configure, generate, then save the dataset.

    All errors are logged at CRITICAL level instead of being propagated.
    A NameError is reported with the actual missing name, since it can come
    from any undefined name reached during the run, not only ``KEYWORDS``.
    """
    try:
        config = Config()
        generator = DatasetGenerator(config, KEYWORDS)
        generator.generate()
        generator.save_to_file()
    except NameError as e:
        custom_log("CRITICAL", f"ERROR: A required name is not defined ({e}). Make sure KEYWORDS and all definitions above have been run.")
    except Exception as e:
        custom_log("CRITICAL", f"A critical error occurred, and the script has to exit: {e}")

if __name__ == "__main__":
    # Ensure main() only runs when the script is executed directly.
    main()

2025-09-26 04:45:57 - INFO - Initializing Vertex AI for project 'talk-of-the-heart-aac-464912' in location 'us-central1'...
2025-09-26 04:45:57 - INFO - Successfully initialized Generative Model 'gemini-2.5-pro'.
2025-09-26 04:45:57 - INFO - Checkpoint file not found. Starting a new dataset.
2025-09-26 04:45:57 - INFO - Logical rule sets have been pre-computed for efficiency.
2025-09-26 04:45:57 - INFO - Starting dataset generation. Goal: 5000 unique phrases.
2025-09-26 04:45:57 - INFO - Requesting 150 new phrases. (Current unique total: 0/5000)




2025-09-26 04:47:12 - INFO - Success. Added 119 new valid phrases. New total: 119
2025-09-26 04:47:12 - INFO - Checkpoint saved with 119 records.
2025-09-26 04:47:13 - INFO - Requesting 150 new phrases. (Current unique total: 119/5000)
2025-09-26 04:48:22 - INFO - Success. Added 127 new valid phrases. New total: 246
2025-09-26 04:48:22 - INFO - Checkpoint saved with 246 records.
2025-09-26 04:48:22 - INFO - Requesting 150 new phrases. (Current unique total: 246/5000)
2025-09-26 04:49:02 - INFO - Success. Added 113 new valid phrases. New total: 359
2025-09-26 04:49:02 - INFO - Checkpoint saved with 359 records.
2025-09-26 04:49:02 - INFO - Requesting 150 new phrases. (Current unique total: 359/5000)
2025-09-26 04:50:04 - INFO - Success. Added 118 new valid phrases. New total: 477
2025-09-26 04:50:04 - INFO - Checkpoint saved with 477 records.
2025-09-26 04:50:05 - INFO - Requesting 150 new phrases. (Current unique total: 477/5000)
2025-09-26 04:51:03 - INFO - Success. Added 44 new valid