In [22]:
import re
import json
import joblib
import numpy as np
import time
import sys
from datetime import datetime
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from scipy.sparse import hstack

# ==== FILE CONFIGURATION ====
# JSON file providing the "booster_aduan" and "booster_bukan_aduan" regex lists.
BOOSTER_JSON_PATH = "source/booster.json"
# JSON file providing "token_booster_pairs" and "regex_booster_patterns".
BOOSTER_CONFIG_PATH = "source/booster_config.json"
# Fitted text vectorizer, loaded with joblib by the main script.
VECT_PATH = "model/vectorizer-fiks-1.pkl"
# Trained classifier, loaded with joblib; must expose predict_proba.
XGB_PATH = "model/xgboost_model-fiks-1.pkl"
# NumPy array of complaint keywords; converted to a set after loading.
KEYW_PATH = "model/aduan_keywords-1.npy"
# Text file of "slang:replacement" lines read by load_slang_dict.
SLANG_PATH = "source/slang-kamus.txt"
# Minimum positive-class probability for predict_aduan to assign label 1.
THRESHOLD = 0.4

# ==== FUNGSI UTIL ====
def print_banner():
    """Print the multi-line startup banner of the CLI to stdout."""
    # The literal is emitted verbatim, including its trailing whitespace.
    banner = """
================================================================
                                                            
    🔍 SISTEM DETEKSI ADUAN TRANSPORTASI DAN LALU LINTAS OTOMATIS                     
    ════════════════════════════════════════════════════════════                

    📊 Powered by Machine Learning & NLP                     
    🚀 Enhanced CLI Interface                                 
    ⚡ Real-time Text Analysis                                
                                                            
================================================================
"""
    print(banner)

def print_separator(char="=", length=66):
    """Print a horizontal rule: ``char`` repeated ``length`` times, then a newline."""
    rule = length * char
    print(rule)

def print_loading(message="Loading"):
    """Show a short animated "loading" line on stdout, then a completion line.

    Four frames are drawn in place (carriage return, no newline), each with
    one more trailing dot than the previous and followed by a 0.3 s pause.
    The final write replaces the line with the completion message.
    """
    for trail in ("", ".", "..", "..."):
        # Trailing spaces overwrite leftovers from a previous, longer frame.
        frame = f"\r⏳ {message}{trail}   "
        sys.stdout.write(frame)
        sys.stdout.flush()
        time.sleep(0.3)
    done = f"\r✅ {message} Complete!\n"
    sys.stdout.write(done)

# ==== LOAD DATA ====
def load_json(path):
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def load_slang_dict(path=None):
    """Load the slang-to-replacement mapping from a ``key:value`` text file.

    Every line that contains a colon is split on its FIRST colon only, so a
    replacement may itself contain colons. Key and value are stripped of
    surrounding whitespace; lines without a colon or with an empty key are
    skipped.

    Loading is best-effort: if the file cannot be opened or decoded, a
    warning is written to stderr and whatever was parsed so far (possibly an
    empty dict) is returned.

    Args:
        path: File to read. Defaults to the module-level ``SLANG_PATH``.

    Returns:
        dict mapping each slang token to its replacement string.
    """
    if path is None:
        path = SLANG_PATH

    slang = {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # partition never raises, unlike unpacking split(":") on a
                # line that holds more than one colon.
                key, sep, value = line.partition(":")
                key = key.strip()
                if sep and key:
                    slang[key] = value.strip()
    except (OSError, UnicodeError) as err:
        # Keep the CLI usable without the dictionary, but say why.
        print(
            f"[PERINGATAN] Kamus slang '{path}' tidak dapat dibaca: {err}",
            file=sys.stderr,
        )
    return slang

# ==== LOGIKA BOOSTER ====
def is_booster_aduan(text, token_pairs, regex_patterns):
    """Check the rule-based boosters against ``text``.

    Token-pair rules are tried first: a pair matches when every one of its
    entries is among the tokens of the lower-cased text. Regex rules are tried
    next: each rule's ``head`` and ``tail`` are joined by ``\\s+`` and searched
    in the lower-cased text.

    Returns:
        ``(True, note)`` for the first matching rule, where ``note`` names the
        rule, or ``(False, None)`` when nothing matches.
    """
    lowered = text.lower()
    vocabulary = set(word_tokenize(lowered))

    for pair in token_pairs:
        if vocabulary.issuperset(pair):
            return True, f"[TokenPairBooster: {pair}]"

    for rule in regex_patterns:
        pattern = fr"{rule['head']}\s+{rule['tail']}"
        if re.search(pattern, lowered) is not None:
            return True, f"[RegexBooster: {rule}]"

    return False, None

def normalize_slang(text, slang_dict):
    """Lower-case and tokenize ``text``, replacing tokens found in ``slang_dict``.

    Tokens without an entry are kept unchanged; the result is the tokens
    joined by single spaces.
    """
    words = word_tokenize(text.lower())
    replaced = (slang_dict.get(word, word) for word in words)
    return " ".join(replaced)

def stem_and_check(word, stemmer, keywords):
    """Stem ``word`` and report whether the stem is a keyword.

    Returns:
        ``(hit, stem)`` where ``hit`` is 1 if the stem is in ``keywords``,
        otherwise 0.
    """
    root = stemmer.stem(word)
    hit = 1 if root in keywords else 0
    return hit, root

def preprocess(text, slang_dict, stemmer, stop_words, keywords):
    """Clean ``text`` for the classifier and count keyword hits.

    Steps: slang normalisation, removal of every character that is neither a
    word character nor whitespace, removal of digit runs, tokenization,
    stop-word filtering, and stemming of the remaining tokens.

    Returns:
        ``(cleaned, score, token_count)``: the stems joined by spaces, the
        number of stems found in ``keywords``, and the number of tokens that
        survived stop-word filtering.
    """
    normalized = normalize_slang(text, slang_dict)
    without_punct = re.sub(r"[^\w\s]", "", normalized)
    without_digits = re.sub(r"\d+", "", without_punct)

    kept = [tok for tok in word_tokenize(without_digits) if tok not in stop_words]
    checked = [stem_and_check(tok, stemmer, keywords) for tok in kept]

    stems = [stem for _, stem in checked]
    hits = sum(flag for flag, _ in checked)
    return " ".join(stems), hits, len(kept)

def is_keyword_aduan(text, keywords):
    """Return 1 if any token of the lower-cased ``text`` is in ``keywords``, else 0."""
    found = set(word_tokenize(text.lower()))
    overlap = found & keywords
    return 1 if overlap else 0

# ==== PREDIKSI UTAMA ====
def predict_aduan(text, booster_aduan, booster_bukan_aduan, token_pairs, regex_patterns,
                vectorizer, model, keywords, slang_dict, stemmer, stop_words):
    """Classify ``text`` as a complaint (1) or not (0).

    Decision order:
      1. Any regex in ``booster_aduan`` matching the lower-cased text -> 1.
      2. Any token-pair / head-tail rule (see ``is_booster_aduan``) -> 1.
      3. Otherwise the text is preprocessed, vectorized, extended with one
         keyword-presence column and scored by ``model.predict_proba``;
         the label is 1 when the probability is at least ``THRESHOLD``.
      4. When the model says 0, the ``booster_bukan_aduan`` regexes are
         checked. A match only changes the returned info string; the label
         is already 0 at that point.

    Args:
        text: Raw input text.
        booster_aduan: Iterable of regex patterns that force label 1.
        booster_bukan_aduan: Iterable of regex patterns reported for label 0.
        token_pairs: Token groups passed to ``is_booster_aduan``.
        regex_patterns: Head/tail rules passed to ``is_booster_aduan``.
        vectorizer: Object with ``transform``; applied to the cleaned text.
        model: Object with ``predict_proba``.
        keywords: Set of keyword stems.
        slang_dict: Mapping used for slang normalisation.
        stemmer: Object with ``stem``.
        stop_words: Collection of tokens to drop.

    Returns:
        An 8-tuple ``(label, prob, score, shown_text, info, char_len,
        token_count, duration)``. On the booster paths (1 and 2) ``prob`` is
        1.0, ``score`` and ``token_count`` are 0 and ``shown_text`` is the
        original text; on the model paths ``shown_text`` is the cleaned text.
        ``char_len`` is always ``len(text)`` and ``duration`` is the elapsed
        time in seconds.
    """
    text_lower = text.lower()
    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump when the system clock is adjusted (unlike time.time()).
    start = time.perf_counter()

    def elapsed():
        return time.perf_counter() - start

    for pat in booster_aduan:
        if re.search(pat, text_lower):
            return 1, 1.0, 0, text, f"[AduanBooster: {pat}]", len(text), 0, elapsed()

    ok, note = is_booster_aduan(text, token_pairs, regex_patterns)
    if ok:
        return 1, 1.0, 0, text, note, len(text), 0, elapsed()

    cleaned, senti_score, token_count = preprocess(text, slang_dict, stemmer, stop_words, keywords)
    Xv = vectorizer.transform([cleaned])
    # Extra single-column feature: does the cleaned text contain any keyword?
    Xk = np.array([is_keyword_aduan(cleaned, keywords)]).reshape(-1, 1)
    X = hstack([Xv, Xk])
    prob = model.predict_proba(X)[0, 1]
    label = int(prob >= THRESHOLD)

    info = "ML-Prediksi"
    if label == 0:
        # Only reached for negative predictions: annotate which
        # non-complaint pattern (if any) agrees with the model.
        for pat in booster_bukan_aduan:
            if re.search(pat, text_lower):
                info = f"[BukanAduanBooster: {pat}]"
                break

    return label, prob, senti_score, cleaned, info, len(text), token_count, elapsed()

# ==== MAIN PROGRAM ====
if __name__ == "__main__":
    # Interactive CLI: load all resources once, then classify each line the
    # user types until they enter "exit" or close the input stream.
    print_banner()
    print_loading("Menyiapkan model & data")

    # Regex lists that short-circuit or annotate the model's decision.
    booster_data = load_json(BOOSTER_JSON_PATH)
    booster_aduan = booster_data["booster_aduan"]
    booster_bukan_aduan = booster_data["booster_bukan_aduan"]

    # Token-pair and head/tail rules used by is_booster_aduan.
    config_data = load_json(BOOSTER_CONFIG_PATH)
    token_pairs = config_data["token_booster_pairs"]
    regex_patterns = config_data["regex_booster_patterns"]

    # NOTE(review): joblib.load and np.load(allow_pickle=True) unpickle
    # arbitrary objects; only point these paths at trusted files.
    vectorizer = joblib.load(VECT_PATH)
    model = joblib.load(XGB_PATH)
    keywords = set(np.load(KEYW_PATH, allow_pickle=True))
    slang_dict = load_slang_dict()
    stemmer = StemmerFactory().create_stemmer()
    stop_words = set(stopwords.words("indonesian")) | set(stopwords.words("english"))

    print("\n📢 Sistem siap digunakan. Ketik 'exit' untuk keluar.")

    while True:
        try:
            text = input("\n📝 Masukkan teks: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C / closed stdin: leave cleanly, no traceback.
            print("\n👋 Terima kasih. Sampai jumpa.")
            break

        if text.lower() == "exit":
            print("👋 Terima kasih. Sampai jumpa.")
            break

        if not text:
            # Nothing to classify; ask again instead of running the pipeline.
            print("⚠️ Teks kosong. Silakan masukkan teks atau ketik 'exit'.")
            continue

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        label, prob, score, cleaned, info, char_len, token_len, durasi = predict_aduan(
            text, booster_aduan, booster_bukan_aduan, token_pairs,
            regex_patterns, vectorizer, model, keywords,
            slang_dict, stemmer, stop_words
        )

        print_separator()
        print(f"🕓 Waktu Prediksi        : {timestamp}")
        print(f"🗒️  Teks Asli            : {text}")
        print(f"🧹 Teks Setelah Bersih   : {cleaned}")
        print(f"👉 Label                 : {'Aduan' if label else 'Bukan Aduan'}")
        print(f"🔢 Probabilitas          : {prob:.4f}")
        print(f"🧮 Panjang Teks          : {char_len} karakter")
        print(f"📏 Jumlah Token          : {token_len} kata")
        print(f"⏱️  Durasi Prediksi      : {durasi:.4f} detik")
        print_separator("-")



                                                            
    🔍 SISTEM DETEKSI ADUAN TRANSPORTASI DAN LALU LINTAS OTOMATIS                     
    ════════════════════════════════════════════════════════════                

    📊 Powered by Machine Learning & NLP                     
    🚀 Enhanced CLI Interface                                 
    ⚡ Real-time Text Analysis                                
                                                            

✅ Menyiapkan model & data Complete!

📢 Sistem siap digunakan. Ketik 'exit' untuk keluar.
🕓 Waktu Prediksi        : 2025-08-05 04:41:59
🗒️  Teks Asli            : COK
🧹 Teks Setelah Bersih   : cok
👉 Label                 : Bukan Aduan
🔢 Probabilitas          : 0.0262
🧮 Panjang Teks          : 3 karakter
📏 Jumlah Token          : 1 kata
💬 Skor Kata Penting     : 0
📝 Info Booster          : [BukanAduanBooster: (?:(?:jalan|lalu\s+lintas|lalin|akses|situasi|kondisi)\s+.*)?(?:kering|tanpa\s+genangan|tidak\s*tergenang|tidak\