In [106]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Concatenate, SpatialDropout1D, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout, Conv1D, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.preprocessing import RobustScaler
import re
import pickle
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.preprocessing import MinMaxScaler
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [107]:
def create_directories():
    """Create necessary directories for model and visualization artifacts"""
    directories = ['saved_models', 'visualizations']
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)

In [108]:
def clean_text(text):
    text = str(text).lower()

    # Remove URLs dan email
    text = re.sub(r'http\S+|www\S+|https\S+|\S+@\S+', '', text)
    
    # Handle a``ngka dengan konteks
    text = re.sub(r'(\d+)\s*(milyar|miliar|m)', r'\1000000000', text)
    text = re.sub(r'(\d+)\s*(juta|jt)', r'\1000000', text)
    
    # Remove special characters tapi pertahankan yang penting
    text = re.sub(r'[^\w\s+\-.,]', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def load_and_preprocess_data(file_path):
    # Baca dataset
    df = pd.read_csv(file_path)
    
    # Hapus baris yang kosong
    df = df.dropna()
    
    # Inisialisasi Sastrawi
    stemmer_factory = StemmerFactory()
    stemmer = stemmer_factory.create_stemmer()
    stopword_factory = StopWordRemoverFactory()
    stopword = stopword_factory.create_stop_word_remover()
    
    # Terapkan pembersihan teks dengan weighted concatenation yang dimodifikasi
    df['clean_description'] = (
        (df['Judul_Clean'].astype(str) + ' ' + df['Judul_Clean'].astype(str)) + ' ' +  # Bobot 2x
        df['Lokasi_Clean'].astype(str) + ' ' +  # Bobot 1x
        df['Deskripsi_Clean'].astype(str) + ' ' +  # Bobot 1x
        (df['Keywords_Clean'].astype(str) + ' ' + df['Keywords_Clean'].astype(str))  # Bobot 2x
    )
    
    # Bersihkan teks gabungan
    df['clean_description'] = df['clean_description'].apply(clean_text)
    return df

In [109]:
def prepare_features(df):
    # Tokenisasi teks dengan konfigurasi yang dioptimalkan
    tokenizer = Tokenizer(
        num_words=10000,  # Meningkatkan vocabulary size
        oov_token='<OOV>',
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True
    )
    tokenizer.fit_on_texts(df['clean_description'])
    
    # Convert teks ke sequences dengan padding yang dioptimalkan
    sequences = tokenizer.texts_to_sequences(df['clean_description'])
    padded_sequences = pad_sequences(
        sequences, 
        maxlen=300,  # Meningkatkan maksimum length
        padding='post',
        truncating='post'
    )
    
    # Ensure directory exists before saving
    create_directories()
    
    # Simpan tokenizer
    with open('saved_models/tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    # Numeric features dengan normalisasi yang ditingkatkan
    numeric_features = df[[
        'Harga_Normalized', 'Kamar_Normalized', 'WC_Normalized',
        'Parkir_Normalized', 'Luas_Tanah_Normalized', 'Luas_Bangunan_Normalized'
    ]].values
    
    # Tambahkan feature engineering
    additional_features = np.column_stack([
        numeric_features,
        df['Harga_Normalized'] / df['Luas_Bangunan_Normalized'],  # Price per sqm
        df['Luas_Bangunan_Normalized'] / df['Luas_Tanah_Normalized'],  # Building ratio
        df['Kamar_Normalized'] * df['WC_Normalized']  # Room-bathroom ratio
    ])
    
    return padded_sequences, additional_features, tokenizer

In [110]:
def create_relevance_labels(df, query_features):
    """
    Membuat label relevansi berdasarkan kecocokan fitur properti dengan query
    """
    labels = []
    
    for _, property in df.iterrows():
        score = 0
        total_weights = 0
        
        # Checking price match (weight: 0.3)
        if property['Harga'] <= query_features['max_price']:
            score += 0.3
        total_weights += 0.3
        
        # Checking location match (weight: 0.25)
        if query_features['location'] in property['Lokasi_Clean'].lower():
            score += 0.25
        total_weights += 0.25
        
        # Checking room requirements (weight: 0.15)
        if property['Kamar'] >= query_features['min_bedrooms']:
            score += 0.15
        total_weights += 0.15
        
        # Checking bathroom requirements (weight: 0.15)
        if property['WC'] >= query_features['min_bathrooms']:
            score += 0.15
        total_weights += 0.15
        
        # Checking parking requirements (weight: 0.15)
        if property['Parkir'] >= query_features['min_parking']:
            score += 0.15
        total_weights += 0.15
        
        # Normalize score
        final_score = score / total_weights
        
        # Convert to binary label with threshold
        labels.append(1 if final_score >= 0.7 else 0)
    
    return np.array(labels)

In [111]:
def prepare_training_queries():
    """
    Membuat dataset training queries yang realistis
    """
    locations = ['sleman', 'bantul', 'yogyakarta', 'jogja', 'kulon progo', 'gunung kidul']
    price_ranges = [
        {'min': 0.5, 'max': 1.0},
        {'min': 1.0, 'max': 1.5},
        {'min': 1.5, 'max': 2.0},
        {'min': 2.0, 'max': 3.0},
    ]
    
    training_queries = []
    
    for loc in locations:
        for price in price_ranges:
            for bedrooms in range(1, 6):
                for bathrooms in range(1, 4):
                    for parking in range(0, 4):
                        query_features = {
                            'location': loc,
                            'max_price': price['max'] * 1e9,  # Convert to rupiah
                            'min_bedrooms': bedrooms,
                            'min_bathrooms': bathrooms,
                            'min_parking': parking
                        }
                        
                        query = f"Cari rumah di {loc} dengan {bedrooms} kamar tidur, "
                        query += f"{bathrooms} kamar mandi, parkir {parking} mobil "
                        query += f"dibawah {price['max']} milyar"
                        
                        training_queries.append((query, query_features))
    
    return training_queries

In [112]:
def build_model(vocab_size, embedding_dim=128):
    """
    Model dengan arsitektur yang dioptimasi dan flow data yang benar,
    dengan beberapa optimasi dari model optimized namun tetap mempertahankan
    struktur inti.
    """
    # Text processing branch
    text_input = Input(shape=(300,), name='text_input')
    # Convert text indices to embeddings - tetap menggunakan 256 sesuai original
    embedding = Embedding(vocab_size, embedding_dim, mask_zero=False)(text_input)
    
    # Multiple parallel convolution layers - optimasi jumlah filter
    conv_layers = []
    for filter_size in [3, 4]:  # Tetap menggunakan 3 filter sesuai original
        conv = Conv1D(32, 3, activation='relu')(embedding)  # Kurangi filter
        pool = GlobalMaxPooling1D()(conv)
        conv_layers.append(pool)
    
    conv_concat = Concatenate()(conv_layers)
    
    # LSTM branch - tetap bidirectional sesuai original dengan optimasi ukuran
    lstm = Bidirectional(LSTM(96, return_sequences=True))(embedding)  # Optimasi ke 96 dari 128
    lstm = Bidirectional(LSTM(48))(lstm)  # Optimasi ke 48 dari 64
    
    # Combine text features dengan dropout yang dioptimasi
    text_features = Concatenate()([conv_concat, lstm])
    text_features = Dropout(0.2)(pool)
    
    # Numeric features branch - optimasi arsitektur
    numeric_input = Input(shape=(9,), name='numeric_input')
    numeric_dense = Dense(32, activation='relu')(numeric_input)
    numeric_dense = BatchNormalization()(numeric_dense)
    numeric_dense = Dropout(0.2)(numeric_dense)
    numeric_dense = Dense(48, activation='relu')(numeric_dense)  # Optimasi ke 48 dari 64
    
    # Merge all features
    merged = Concatenate()([text_features, numeric_dense])
    
    # Deep classification layers dengan optimasi
    dense = Dense(64, activation='relu')(merged)
    dense = BatchNormalization()(dense)
    dense = Dropout(0.2)(dense)
    dense = Dense(64, activation='relu')(merged)
    dense = BatchNormalization()(dense)
    dense = Dropout(0.2)(dense)
    #dense = Dense(96, activation='relu', kernel_regularizer=l2(0.005))(dense)  # Optimasi dari 128 dan l2
    #dense = BatchNormalization()(dense)
    #dense = Dropout(0.25)(dense)  # Optimasi dari 0.3
    output = Dense(1, activation='sigmoid')(dense)
    
    # Create model with multiple inputs
    model = Model(
        inputs={
            'text_input': text_input,
            'numeric_input': numeric_input
        },
        outputs=output
    )
    
    # Learning rate schedule yang dioptimasi
    initial_learning_rate = 0.001
    decay_steps = 1200  # Optimasi dari 1000
    decay_rate = 0.85   # Optimasi dari 0.9
    learning_rate_fn = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate, decay_steps, decay_rate
    )
    
    # Optimizer with gradient clipping
    optimizer = Adam(
        learning_rate=learning_rate_fn,
        clipnorm=0.8  # Optimasi dari 1.0
    )
    
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.AUC(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
            tf.keras.metrics.F1Score()
        ]
    )
    
    return model

In [113]:
def train_improved_model_batched(df, padded_sequences, numeric_features, tokenizer, batch_size=256):  # Optimasi batch size
    """
    Training dengan pendekatan batch processing menggunakan tf.data.Dataset
    dengan optimasi performa
    """
    
    def create_tf_dataset(query_data, numeric_data, batch_size, padded_sequences, tokenizer):
        """Membuat tf.data.Dataset dengan class weights dan preprocessing yang benar"""
        # Optimasi class weights
        class_weights = {0: 1.0, 1: 1.8}  # Sedikit pengurangan dari 2.0
        
        def generator_fn():
            while True:
                for query, query_features in query_data:
                    # Process query
                    clean_query = clean_text(query)
                    query_sequence = tokenizer.texts_to_sequences([clean_query])[0]
                    
                    # Create labels
                    labels = create_relevance_labels(df, query_features)
                    num_samples = len(labels)
                    
                    # Generate batches with optimized shuffling
                    indices = np.random.permutation(num_samples)
                    for start_idx in range(0, num_samples, batch_size):
                        end_idx = min(start_idx + batch_size, num_samples)
                        batch_indices = indices[start_idx:end_idx]
                        
                        # Prepare batch data with optimized dtype
                        batch_X_text = padded_sequences[batch_indices].astype(np.float32)
                        batch_X_numeric = numeric_data[batch_indices].astype(np.float32)
                        
                        batch_y = np.array(labels)[batch_indices]
                        batch_y = np.expand_dims(batch_y, axis=-1)
                        
                        # Calculate sample weights
                        sample_weights = np.array([class_weights[y[0]] for y in batch_y])
                        
                        yield (
                            {
                                'text_input': batch_X_text,
                                'numeric_input': batch_X_numeric
                            },
                            batch_y,
                            sample_weights
                        )
        
        output_signature = (
            {
                'text_input': tf.TensorSpec(shape=(None, 300), dtype=tf.float32),
                'numeric_input': tf.TensorSpec(shape=(None, 9), dtype=tf.float32)
            },
            tf.TensorSpec(shape=(None, 1), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.float32)
        )
        
        # Create optimized dataset dengan prefetch
        dataset = tf.data.Dataset.from_generator(
            generator_fn,
            output_signature=output_signature
        ).prefetch(tf.data.AUTOTUNE)
        
        return dataset

    # Generate training queries
    training_queries = prepare_training_queries()
    
    # Optimasi split ratio
    num_queries = len(training_queries)
    train_size = int(0.85 * num_queries)  # Sedikit peningkatan dari 0.8
    
    # Shuffle queries dengan seed untuk reproducibility
    np.random.seed(42)  # Tambahan untuk konsistensi
    shuffle_idx = np.random.permutation(num_queries)
    train_queries = [training_queries[i] for i in shuffle_idx[:train_size]]
    val_queries = [training_queries[i] for i in shuffle_idx[train_size:]]
    
    # Hitung total samples
    total_train_samples = sum(len(df) for _, _ in train_queries)
    total_val_samples = sum(len(df) for _, _ in val_queries)
    
    # Buat datasets
    train_dataset = create_tf_dataset(
        train_queries,
        numeric_features,
        batch_size,
        padded_sequences,
        tokenizer
    )
    
    val_dataset = create_tf_dataset(
        val_queries,
        numeric_features,
        batch_size,
        padded_sequences,
        tokenizer
    )
    
    # Build model
    vocab_size = len(tokenizer.word_index) + 1
    model = build_model(vocab_size=vocab_size)
    
    # Validasi output shape
    if model.outputs[0].shape[-1] != 1:
        raise ValueError("Model output shape harus (None, 1)")
    
    # Optimasi callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=8,  # Optimasi dari 10
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ModelCheckpoint(
            'saved_models/best_model.keras',
            monitor='val_loss',
            save_best_only=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.3,  # Optimasi dari 0.2
            patience=4,  # Optimasi dari 5
            min_lr=0.0001,
            verbose=1
        )
    ]
    
    # Train model dengan optimasi epochs
    history = model.fit(
        train_dataset,
        steps_per_epoch=total_train_samples // batch_size,
        validation_data=val_dataset,
        validation_steps=total_val_samples // batch_size,
        epochs=5,  # ATUR EPOCH KANG
        callbacks=callbacks,
        verbose=1
    )
    
    return model, history

In [114]:
class PropertyRAG:
    def __init__(self, df, model, tokenizer):
        self.df = df
        self.model = model
        self.tokenizer = tokenizer
        self.stemmer = StemmerFactory().create_stemmer()
        self.stopword_factory = StopWordRemoverFactory()  # Create factory
        self.stopword = self.stopword_factory.create_stop_word_remover()
        
        # Tambahkan TF-IDF untuk improved text matching
        self.tfidf = TfidfVectorizer(
            max_features=2000,
            ngram_range=(1, 2),
            stop_words=self.stopword_factory.get_stop_words()  # Get stop words from factory
        )
        self.tfidf.fit(df['clean_description'])
        self.tfidf_matrix = self.tfidf.transform(df['clean_description'])
    
    def process_query(self, query):
        # Extract numeric requirements dengan error handling
        try:
            kamar = int(re.findall(r'(\d+)\s*kamar tidur', query)[0]) if re.findall(r'(\d+)\s*kamar tidur', query) else 0
            wc = int(re.findall(r'(\d+)\s*kamar mandi', query)[0]) if re.findall(r'(\d+)\s*kamar mandi', query) else 0
            parkir = int(re.findall(r'(\d+)\s*mobil', query)[0]) if re.findall(r'(\d+)\s*mobil', query) else 0
            
            # Extract price range
            price_match = re.findall(r'(\d+(?:\.\d+)?)\s*(?:milyar|miliar|m|M)', query)
            max_price = float(price_match[0]) if price_match else float('inf')
        except Exception as e:
            print(f"Error in extracting numeric values: {e}")
            return None, None, None, None
        
        # Extract location dengan fuzzy matching
        locations = ['sleman', 'bantul', 'yogyakarta', 'jogja', 'kulon progo', 'gunung kidul']
        query_location = None
        max_ratio = 0
        
        for loc in locations:
            ratio = fuzz.partial_ratio(loc, query.lower())
            if ratio > max_ratio and ratio > 80:  # Threshold 80%
                max_ratio = ratio
                query_location = loc
        
        # Clean and tokenize query dengan improved preprocessing
        clean_query = self.preprocess_query(query)
        query_sequence = self.tokenizer.texts_to_sequences([clean_query])
        padded_query = pad_sequences(query_sequence, maxlen=300, padding='post', truncating='post')
        
        # Create numeric features dengan additional engineering
        numeric_query = np.zeros((1, 9))  # Updated for additional features
        if kamar: numeric_query[0][1] = kamar / 10
        if wc: numeric_query[0][2] = wc / 10
        if parkir: numeric_query[0][3] = parkir / 10
        
        # Add engineered features
        if kamar and wc:
            numeric_query[0][7] = (kamar * wc) / 100  # Room-bathroom ratio
        
        return padded_query, numeric_query, query_location, max_price
    
    def preprocess_query(self, query):
        # Enhanced query preprocessing
        query = query.lower()
        query = re.sub(r'[^a-zA-Z0-9\s]', ' ', query)
        query = re.sub(r'\s+', ' ', query).strip()
        query = self.stemmer.stem(query)
        query = self.stopword.remove(query)
        return query
    
    def enhance_query_understanding(self, query):
        """
        Meningkatkan pemahaman query dengan sinonim dan variasi bahasa
        """
        # Mapping lokasi
        location_mapping = {
            'jogja': ['yogyakarta', 'jogjakarta', 'yogya'],
            'sleman': ['depok', 'condong catur', 'godean'],
            'bantul': ['kasihan', 'banguntapan']
        }
        
        # Mapping fasilitas
        facility_mapping = {
            'kamar': ['ruang tidur', 'bedroom'],
            'wc': ['kamar mandi', 'toilet', 'bathroom'],
            'parkir': ['garasi', 'carport']
        }
        
        enhanced_query = query.lower()
        
        # Apply mappings
        for main_loc, variants in location_mapping.items():
            for var in variants:
                if var in enhanced_query:
                    enhanced_query = enhanced_query.replace(var, main_loc)
        
        for main_fac, variants in facility_mapping.items():
            for var in variants:
                if var in enhanced_query:
                    enhanced_query = enhanced_query.replace(var, main_fac)
        
        return enhanced_query
    
    def get_recommendations(self, query, top_k=5):
        # Process query dengan error handling
        result = self.process_query(query)
        if result is None:
            return pd.DataFrame()
        
        padded_query, numeric_query, query_location, max_price = result
        
        # Filter by location dan price dengan fuzzy matching
        filtered_df = self.df.copy()
        
        if query_location:
            location_mask = filtered_df['Lokasi_Clean'].apply(
                lambda x: fuzz.partial_ratio(query_location, x.lower()) > 80
            )
            filtered_df = filtered_df[location_mask]
        
        # Apply price filter
        if max_price != float('inf'):
            filtered_df = filtered_df[filtered_df['Harga'] <= (max_price * 1e9)]  # Convert to rupiah
        
        if len(filtered_df) == 0:
            return pd.DataFrame()
        
        # Calculate text similarity scores
        query_tfidf = self.tfidf.transform([self.preprocess_query(query)])
        text_similarities = cosine_similarity(query_tfidf, self.tfidf_matrix[filtered_df.index])
        
        # Get text sequences
        sequences = self.tokenizer.texts_to_sequences(filtered_df['clean_description'])
        padded_sequences = pad_sequences(sequences, maxlen=300, padding='post', truncating='post')
        
        numeric_features = np.column_stack([
            filtered_df[[
                'Harga_Normalized', 'Kamar_Normalized', 'WC_Normalized',
                'Parkir_Normalized', 'Luas_Tanah_Normalized', 'Luas_Bangunan_Normalized'
            ]].values,
            filtered_df['Harga_Normalized'] / filtered_df['Luas_Bangunan_Normalized'],
            filtered_df['Luas_Bangunan_Normalized'] / filtered_df['Luas_Tanah_Normalized'],
            filtered_df['Kamar_Normalized'] * filtered_df['WC_Normalized']
        ])
        
        # Combine model predictions with text similarity
        model_predictions = self.model.predict([numeric_features, padded_sequences])
        combined_scores = 0.7 * model_predictions.flatten() + 0.3 * text_similarities.flatten()
        
        # Add scores to DataFrame
        filtered_df['relevance_score'] = combined_scores
        
        # Sort and return recommendations
        recommendations = filtered_df.sort_values(
            ['relevance_score', 'Harga'],
            ascending=[False, True]
        ).head(top_k)
        
        return recommendations[['Judul', 'Lokasi', 'Harga', 'Kamar', 'WC', 'Parkir', 
                              'Luas_Tanah', 'Luas_Bangunan', 'Property_Link', 'relevance_score']]

In [115]:
def create_training_visualization(history, save_path='visualizations/training_history.png'):
    """
    Membuat dan menyimpan visualisasi metrics training model.
    
    Args:
        history: History object dari model.fit() Keras
        save_path: Path untuk menyimpan file visualisasi
    """
    # Pastikan direktori exists
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    
    # Set style untuk plot yang lebih baik
    plt.style.use('default')  # Menggunakan default style yang pasti ada
    
    # Buat figure dengan ukuran yang lebih besar dan resolusi tinggi
    plt.figure(figsize=(15, 6), dpi=300)
    
    # Plot training & validation loss
    plt.subplot(1, 2, 1)
    plt.plot(history['loss'], 'b-', label='Training Loss', linewidth=2)
    plt.plot(history['val_loss'], 'r--', label='Validation Loss', linewidth=2)
    plt.title('Model Loss', fontsize=12, pad=15)
    plt.xlabel('Epoch', fontsize=10)
    plt.ylabel('Loss', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(fontsize=10)
    
    # Plot training & validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history['accuracy'], 'b-', label='Training Accuracy', linewidth=2)
    plt.plot(history['val_accuracy'], 'r--', label='Validation Accuracy', linewidth=2)
    plt.title('Model Accuracy', fontsize=12, pad=15)
    plt.xlabel('Epoch', fontsize=10)
    plt.ylabel('Accuracy', fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(fontsize=10)
    
    # Atur layout dan margin
    plt.tight_layout(pad=3.0)
    
    # Debug print untuk memastikan data ada
    print("\nDebug information:")
    print(f"History keys: {list(history.keys())}")
    print(f"Number of epochs: {len(history['loss'])}")
    
    # Simpan plot dengan kualitas tinggi
    try:
        plt.savefig(save_path, 
                   format='png',
                   bbox_inches='tight',
                   pad_inches=0.2,
                   dpi=300)
        print(f"\nVisualisasi berhasil disimpan di: {save_path}")
    except Exception as e:
        print(f"Error saat menyimpan visualisasi: {str(e)}")
    finally:
        # Pastikan untuk menutup figure untuk menghemat memori
        plt.close('all')

In [None]:
def main():
    try:
        # Create necessary directories
        create_directories()
        
        print("Loading and preprocessing data...")
        # Load and preprocess data
        df = load_and_preprocess_data('processed_house_data.csv')
        
        print("Preparing features...")
        # Prepare features
        padded_sequences, numeric_features, tokenizer = prepare_features(df)
        
        print("Starting model training...")
        # Train model
        vocab_size = len(tokenizer.word_index) + 1
        model, history = train_improved_model_batched(df, padded_sequences, numeric_features, tokenizer)
        
        # Buat visualisasi menggunakan fungsi yang telah diperbaiki
        print("\nMembuat visualisasi training history...")
        create_training_visualization(history.history)
        
        # Print metrics training
        print("\nHasil Training Model:")
        print("-" * 50)
        for metric in history.history.keys():
            final_value = history.history[metric][-1]
            print(f"{metric}: {final_value:.4f}")
        
        # Initialize RAG system
        print("\nInitializing RAG system...")
        rag_system = PropertyRAG(df, model, tokenizer)
        
        return rag_system, history.history
        
    except Exception as e:
        print(f"Error terjadi selama eksekusi: {str(e)}")
        raise

In [117]:
if __name__ == "__main__":
    try:
        # Run main function
        rag_system, metrics = main()
       
        # Example queries
        test_queries = [
            "saya ingin rumah di sleman dengan 3 kamar tidur, 2 kamar mandi dan slot parkir 2 mobil",
            "cari rumah di bantul dengan budget 500 juta dengan 2 kamar tidur",
            "rekomendasi rumah di kota jogja dengan luas tanah minimal 100m2"
        ]
       
        for query in test_queries:
            print("\n" + "="*50)
            print(f"Query: {query}")
            print("="*50)
           
            recommendations = rag_system.get_recommendations(query)
           
            print("\nRekomendasi Properti:")
            for idx, row in recommendations.iterrows():
                print(f"\nProperti {idx + 1}:")
                print(f"Judul: {row['Judul']}")
                print(f"Lokasi: {row['Lokasi']}")
                print(f"Harga: Rp {row['Harga']:,}")
                print(f"Kamar Tidur: {row['Kamar']}")
                print(f"Kamar Mandi: {row['WC']}")
                print(f"Parkir: {row['Parkir']}")
                print(f"Luas Tanah: {row['Luas_Tanah']} m²")
                print(f"Luas Bangunan: {row['Luas_Bangunan']} m²")
                print("-"*30)
               
        print("\nModel Performance Metrics:")
        print("-"*50)
        for metric, value in metrics.items():
            # Check if value is a number before formatting
            if isinstance(value, (int, float)):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}: {value}")
           
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        print("Please ensure all directories and files exist and are accessible.")
        raise

Loading and preprocessing data...
Preparing features...
Starting model training...
Epoch 1/5
[1m467/468[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.6994 - auc_9: 0.6256 - f1_score: 0.2722 - loss: 0.7121 - precision_9: 0.2420 - recall_9: 0.3991
Epoch 1: val_loss improved from inf to 0.57952, saving model to saved_models/best_model.keras
[1m468/468[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.6997 - auc_9: 0.6257 - f1_score: 0.2722 - loss: 0.7117 - precision_9: 0.2421 - recall_9: 0.3986 - val_accuracy: 0.8464 - val_auc_9: 0.7184 - val_f1_score: 0.2715 - val_loss: 0.5795 - val_precision_9: 0.6707 - val_recall_9: 0.0436 - learning_rate: 9.3858e-04
Epoch 2/5
[1m468/468[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.8246 - auc_9: 0.7001 - f1_score: 0.2844 - loss: 0.6101 - precision_9: 0.4269 - recall_9: 0.1657
Epoch 2: val_loss did not improve from 0.57952
[1m468/468[0m [32m━━━━━━━━━━━━━━




Rekomendasi Properti:

Properti 68:
Judul: Rumah Cantik Estetik Limasan Di Jalan Sidomoyo Km 1 Godean Sleman
Lokasi: Sleman, Yogyakarta
Harga: Rp 785,000,000.0
Kamar Tidur: 3.0
Kamar Mandi: 2.0
Parkir: 2.0
Luas Tanah: 110.0 m²
Luas Bangunan: 75.0 m²
------------------------------

Properti 83:
Judul: Rumah Baru Mewah Fresh Limasan Modern Dalam Mini Cluster Madinah Residence Di Jalan Kaliurang Km 13 Dekat Sma N Ngaglik Sleman
Lokasi: Ngaglik, Sleman
Harga: Rp 655,000,000.0
Kamar Tidur: 3.0
Kamar Mandi: 2.0
Parkir: 2.0
Luas Tanah: 104.0 m²
Luas Bangunan: 80.0 m²
------------------------------

Properti 8:
Judul: Rumah Baru 3 Unit Mewah Limasan Harga Murah Area Perumahan Di Jalan Kaliurang Km 13 Dekat Sma N 2 Ngaglik Dan Spbu Mindi Sleman
Lokasi: Ngemplak, Sleman
Harga: Rp 650,000,000.0
Kamar Tidur: 3.0
Kamar Mandi: 2.0
Parkir: 2.0
Luas Tanah: 90.0 m²
Luas Bangunan: 65.0 m²
------------------------------

Properti 67:
Judul: Rumah Baru SHM Siap Huni, 8 Menit Ke Pemda Sleman Di Mlati Slem




Rekomendasi Properti:

Properti 53:
Judul: Rumah Cantik Scandinavian Dengan Mezanin Kekinian Dalam Perumahan Puri Ismail Dekat Giwangan Banguntapan Bantul
Lokasi: Banguntapan, Bantul
Harga: Rp 650,000,000.0
Kamar Tidur: 2.0
Kamar Mandi: 1.0
Parkir: 1.0
Luas Tanah: 98.0 m²
Luas Bangunan: 60.0 m²
------------------------------

Properti 50:
Judul: Promo Bulan November 375 Juta Saja! Rumah Cantik Mewah - Harga Murah Dalam Perumahan Fasco Village Bangunjiwo Kasihan Bantul
Lokasi: Kasihan, Bantul
Harga: Rp 350,000,000.0
Kamar Tidur: 2.0
Kamar Mandi: 1.0
Parkir: 1.0
Luas Tanah: 80.0 m²
Luas Bangunan: 36.0 m²
------------------------------

Properti 85:
Judul: RUMAH BARU SIAP HUNI CANTIK MINIMALIS HARGA EKONOMIS HOOK DALAM CLUSTER MUTIARA SEWON JALAN IMOGIRI BARAT KM 6
Lokasi: Sewon, Bantul
Harga: Rp 495,000,000.0
Kamar Tidur: 2.0
Kamar Mandi: 1.0
Parkir: 1.0
Luas Tanah: 85.0 m²
Luas Bangunan: 50.0 m²
------------------------------

Properti 6:
Judul: Minimalis Modern Cocok Untuk Gaya Hidup 