In [None]:
# Path of the source CSV; it is read with pd.read_csv() in process_and_save_data().
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
data = r"/home/whoami/Programming/information retrieval/Relevance Feedback/UU + Perpu.csv"
# Path of the Indonesian stop-word list; it is opened and split into lines further down.
stopword_id = r"/home/whoami/Programming/information retrieval/Relevance Feedback/stopwords-id.txt"

import pandas as pd
import numpy as np
import string
import re
import pickle
import os
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from joblib import Parallel, delayed
from joblib import Memory, dump, load
from sklearn.decomposition import TruncatedSVD

# joblib on-disk cache; used through the @memory.cache decorator.
cache_dir = './cacheall'
memory = Memory(cache_dir, verbose=0)

# Directory that holds every persisted pipeline artefact (created if missing).
preprocessed_dir = './preprocessed_data'
os.makedirs(preprocessed_dir, exist_ok=True)

# One file per artefact, all located inside preprocessed_dir.
(
    preprocessed_file,
    tfidf_matrix_file,
    tfidf_vectorizer_file,
    svd_model_file,
    tfidf_matrix_reduced_file,
    df_file,
) = (
    os.path.join(preprocessed_dir, artefact_name)
    for artefact_name in (
        'preprocessed_texts.pkl',
        'tfidf_matrix.pkl',
        'tfidf_vectorizer.pkl',
        'svd_model.pkl',
        'tfidf_matrix_reduced.pkl',
        'dataframe.pkl',
    )
)

# Build the Indonesian (Sastrawi) stemmer once at import time.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Read the Indonesian stop-word list, one word per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding. Surrounding whitespace is stripped and
# blank lines are dropped, because an empty or padded entry can never match a
# token and would only pollute the stop-word list.
with open(stopword_id, 'r', encoding='utf-8') as f:
    stop_words_id = [
        word
        for word in (line.strip() for line in f.read().splitlines())
        if word
    ]

def preprocess_text(text):
    """Normalise a piece of text before vectorisation.

    Steps, in this order: coerce to ``str``, collapse every run of one
    repeated character to a single occurrence, delete digit characters and
    ASCII punctuation, lowercase, then stem with the module-level ``stemmer``.

    Args:
        text: The raw text; non-string values are converted with ``str()``.

    Returns:
        The value returned by ``stemmer.stem`` for the cleaned text.
    """
    raw = text if isinstance(text, str) else str(text)

    # '.' does not match newlines, so runs of line breaks are left alone.
    # NOTE(review): this also shortens legitimate double letters ("aa" -> "a").
    collapsed = re.sub(r'(.)\1+', r'\1', raw)

    # Digits and punctuation are both plain deletions, so one pass is enough.
    punctuation_chars = set(string.punctuation)
    cleaned = ''.join(
        ch for ch in collapsed
        if not ch.isdigit() and ch not in punctuation_chars
    )

    return stemmer.stem(cleaned.lower())

@memory.cache
def preprocess_parallel(text_series):
    """Run ``preprocess_text`` over every item of ``text_series`` via joblib.

    ``n_jobs=-1`` is passed to ``Parallel``; the call is wrapped by the
    module-level ``memory.cache`` decorator.

    Args:
        text_series: Iterable of texts to preprocess.

    Returns:
        Whatever ``Parallel`` returns for the submitted tasks (one result per
        input item).
    """
    tasks = (delayed(preprocess_text)(item) for item in text_series)
    runner = Parallel(n_jobs=-1)
    return runner(tasks)

# Build (or reload) every pipeline artefact and persist it to disk.
def process_and_save_data():
    """Run the full preprocessing pipeline, reusing on-disk caches when valid.

    Stages:
      1. Cleaned/stemmed texts (``preprocessed_file``).
      2. TF-IDF vectorizer and matrix (``tfidf_vectorizer_file``,
         ``tfidf_matrix_file``).
      3. TruncatedSVD model and reduced matrix (``svd_model_file``,
         ``tfidf_matrix_reduced_file``).

    A stage is loaded from its cache only when its files exist AND every
    upstream stage was itself loaded from cache. Otherwise it is rebuilt, so a
    freshly computed TF-IDF matrix is never paired with an SVD model fitted on
    an older one. Cached texts whose length differs from the CSV row count are
    treated as stale and recomputed.

    The DataFrame (with its ``Teks`` column replaced by the preprocessed
    texts) is always written to ``df_file``.

    Returns:
        tuple: ``(df, tfidf, tfidf_matrix, svd, tfidf_matrix_reduced)``.
    """
    import time

    print("Memulai preprocessing data...")

    # Read the source CSV.
    df = pd.read_csv(data)

    # ---- Stage 1: preprocessed texts -------------------------------------
    preprocessed_texts = None
    if os.path.exists(preprocessed_file):
        print("Loading preprocessed texts from cache...")
        # NOTE: pickle.load can execute arbitrary code; this file must only
        # ever be one written by this script.
        with open(preprocessed_file, 'rb') as f:
            cached_texts = pickle.load(f)
        if len(cached_texts) == len(df):
            preprocessed_texts = cached_texts
        else:
            # Assigning a list of the wrong length to df['Teks'] would raise.
            print("Cached texts do not match the CSV row count; rebuilding...")

    texts_rebuilt = preprocessed_texts is None
    if texts_rebuilt:
        print("Preprocessing texts (this might take a while)...")
        start_time = time.time()
        preprocessed_texts = preprocess_parallel(df['Teks'].fillna(''))

        end_time = time.time()
        execution_time = end_time - start_time
        print()
        print(f"Waktu mulai: {start_time}")
        print(f"Waktu selesai: {end_time}")
        print(f"Waktu eksekusi: {execution_time} detik")
        print()

        # Persist the preprocessed texts.
        with open(preprocessed_file, 'wb') as f:
            pickle.dump(preprocessed_texts, f)

    df['Teks'] = preprocessed_texts

    # Persist the DataFrame.
    dump(df, df_file)

    # ---- Stage 2: TF-IDF ---------------------------------------------------
    tfidf_from_cache = (
        not texts_rebuilt
        and os.path.exists(tfidf_vectorizer_file)
        and os.path.exists(tfidf_matrix_file)
    )
    if tfidf_from_cache:
        print("Loading TF-IDF model and matrix from cache...")
        tfidf = load(tfidf_vectorizer_file)
        tfidf_matrix = load(tfidf_matrix_file)
    else:
        print("Creating TF-IDF matrix...")
        # Vectorizer configured with the Indonesian stop-word list.
        tfidf = TfidfVectorizer(stop_words=stop_words_id, max_df=0.85, min_df=2, ngram_range=(1, 2))
        # Fit on, and transform, the 'Teks' column.
        tfidf_matrix = tfidf.fit_transform(df['Teks'])
        # Persist model and matrix.
        dump(tfidf, tfidf_vectorizer_file)
        dump(tfidf_matrix, tfidf_matrix_file)

    # ---- Stage 3: SVD reduction --------------------------------------------
    svd_from_cache = (
        tfidf_from_cache
        and os.path.exists(svd_model_file)
        and os.path.exists(tfidf_matrix_reduced_file)
    )
    if svd_from_cache:
        print("Loading SVD model and reduced matrix from cache...")
        svd = load(svd_model_file)
        tfidf_matrix_reduced = load(tfidf_matrix_reduced_file)
    else:
        print("Reducing dimensions with SVD...")
        # Dimensionality reduction with TruncatedSVD.
        svd = TruncatedSVD(n_components=100)
        tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)
        # Persist model and matrix.
        dump(svd, svd_model_file)
        dump(tfidf_matrix_reduced, tfidf_matrix_reduced_file)

    print("Preprocessing selesai dan data telah disimpan!")

    return df, tfidf, tfidf_matrix, svd, tfidf_matrix_reduced

# Load the already-processed artefacts, building them first if any is missing.
def load_processed_data():
    """Return the pipeline artefacts, loading them from disk when possible.

    If any of the five artefact files is absent, the work is delegated to
    ``process_and_save_data()`` and its result is returned unchanged.

    Returns:
        tuple: ``(df, tfidf, tfidf_matrix, svd, tfidf_matrix_reduced)``.
    """
    # Order matters: it is both the check order and the returned tuple order.
    artefact_paths = (
        df_file,
        tfidf_vectorizer_file,
        tfidf_matrix_file,
        svd_model_file,
        tfidf_matrix_reduced_file,
    )

    if not all(os.path.exists(path) for path in artefact_paths):
        return process_and_save_data()

    print("Loading preprocessed data from cache...")
    return tuple(load(path) for path in artefact_paths)

# File that persists relevance feedback between runs (relative to the CWD).
feedback_path = 'relevance_feedback.pkl'


def load_relevance_feedback(path):
    """Load the stored relevance-feedback mapping from ``path``.

    Args:
        path: Location of the pickle file to read.

    Returns:
        The unpickled object, or an empty dict when the file does not exist
        or cannot be unpickled (empty or truncated file).
    """
    try:
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by this script.
        with open(path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return {}
    except (EOFError, pickle.UnpicklingError) as exc:
        # An interrupted write leaves an unreadable file; start fresh instead
        # of crashing at import time, but tell the user what happened.
        print(f"Warning: could not read feedback file {path!r} ({exc!r}); starting with empty feedback.")
        return {}


# Stored feedback if the file is readable, otherwise an empty dictionary.
relevance_feedback = load_relevance_feedback(feedback_path)

def search_documents(query, tfidf, svd, tfidf_matrix_reduced, df, top_n=10):
    """Rank documents against ``query`` in the SVD-reduced TF-IDF space.

    Args:
        query: Raw query text; it goes through ``preprocess_text`` first.
        tfidf: Fitted vectorizer exposing ``transform``.
        svd: Fitted reducer exposing ``transform``.
        tfidf_matrix_reduced: Reduced document matrix, one row per document.
        df: DataFrame with ``Judul`` and ``Teks`` columns, row-aligned with
            ``tfidf_matrix_reduced``.
        top_n: Maximum number of results. Values <= 0 yield an empty list.

    Returns:
        list[tuple]: ``(title, text, score, row_index)`` tuples, best first.
    """
    # Without this guard, top_n == 0 turns the slice below into [-0:], which
    # selects EVERY document instead of none.
    if top_n <= 0:
        return []

    # Apply the same preprocessing to the query as to the documents.
    cleaned_query = preprocess_text(query)

    # Project the query into TF-IDF space, then into the reduced space.
    query_vec = tfidf.transform([cleaned_query])
    query_vec_reduced = svd.transform(query_vec)

    # NOTE(review): linear_kernel is a plain dot product. It equals cosine
    # similarity only when both operands are L2-normalised, which is not
    # enforced here for the SVD output — confirm whether true cosine is wanted.
    cosine_similarities = linear_kernel(query_vec_reduced, tfidf_matrix_reduced).flatten()

    # Indices of the top_n highest scores, in descending order.
    related_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]

    results = []
    for i in related_docs_indices:
        row = df.iloc[i]
        results.append((row['Judul'], row['Teks'], cosine_similarities[i], i))

    return results

def get_feedback(results):
    """Show each result and ask the user interactively whether it is relevant.

    Args:
        results: Iterable of ``(title, text, similarity, doc_index)`` tuples.

    Returns:
        list[tuple]: ``(doc_index, relevansi)`` pairs in the order shown,
        where ``relevansi`` is 1 (relevant) or 0 (not relevant).
    """
    # Accepted answers (after lowercasing) and the judgement each one maps to.
    answer_to_label = {"1": 1, "ya": 1, "0": 0, "tidak": 0}

    collected = []
    for idx, (title, text, similarity, doc_index) in enumerate(results):
        print(f"Dokumen {idx + 1}:")
        print(f"Judul: {title}")
        print(f"Teks: {text[:200]}...")  # only the first 200 characters
        print(f"Similarity: {similarity:.4f}")
        print()

        # Keep prompting until one of the accepted answers is entered.
        relevansi = None
        while relevansi is None:
            relevansi_input = input("Apakah dokumen ini relevan? (1/ya untuk relevan, 0/tidak untuk tidak relevan): ").lower()
            relevansi = answer_to_label.get(relevansi_input)
            if relevansi is None:
                print("⚠︎⚠︎⚠︎ Input tidak valid. Masukkan 1/ya atau 0/tidak. ⚠︎⚠︎⚠︎")

        collected.append((doc_index, relevansi))
    return collected

def optimize_with_feedback(feedback, tfidf_matrix_reduced):
    """Build a query-adjustment function from relevance judgements.

    The centroids of the relevant and non-relevant documents are computed
    once here and used as the defaults of the returned function, so callers
    no longer have to recompute them. Passing centroids explicitly still
    works exactly as before.

    Args:
        feedback: Iterable of ``(doc_index, relevansi)`` pairs, where
            ``relevansi`` is 1 (relevant) or 0 (not relevant).
        tfidf_matrix_reduced: Document matrix indexable by a list of row
            indices.

    Returns:
        ``None`` when no document was marked relevant (a message is printed);
        otherwise a function
        ``adjust_query_vec(query_vec, relevant_centroid=None,
        non_relevant_centroid=None, alpha=1, beta=0.75, gamma=0.15)`` that
        returns ``alpha * query_vec + beta * relevant_centroid
        - gamma * non_relevant_centroid``.
    """
    relevant_docs = [idx for idx, relevansi in feedback if relevansi == 1]
    non_relevant_docs = [idx for idx, relevansi in feedback if relevansi == 0]

    if not relevant_docs:
        print("Tidak ada dokumen yang dianggap relevan. Pencarian ulang tidak dapat dilakukan.")
        return None

    # np.asarray(...).flatten() also copes with matrix-like means.
    default_relevant = np.asarray(tfidf_matrix_reduced[relevant_docs].mean(axis=0)).flatten()
    if non_relevant_docs:
        default_non_relevant = np.asarray(tfidf_matrix_reduced[non_relevant_docs].mean(axis=0)).flatten()
    else:
        # No negative examples: a zero vector makes the gamma term vanish.
        default_non_relevant = np.zeros(default_relevant.shape)

    def adjust_query_vec(query_vec, relevant_centroid=None, non_relevant_centroid=None,
                         alpha=1, beta=0.75, gamma=0.15):
        """Move ``query_vec`` towards the relevant and away from the non-relevant centroid."""
        if relevant_centroid is None:
            relevant_centroid = default_relevant
        if non_relevant_centroid is None:
            non_relevant_centroid = default_non_relevant
        return alpha * query_vec + beta * relevant_centroid - gamma * non_relevant_centroid

    return adjust_query_vec

# Example usage: one interactive search with a relevance-feedback re-ranking.
if __name__ == "__main__":
    # Load (or build) every pipeline artefact.
    df, tfidf, tfidf_matrix, svd, tfidf_matrix_reduced = load_processed_data()

    query = input("Masukkan Query: ")
    print(f"Query: {query}\n")
    print()

    # Step 1: obtain feedback for this query — ask the user and persist the
    # answers for a new query, or reuse what was stored for a known one.
    if query not in relevance_feedback:
        initial_results = search_documents(query, tfidf, svd, tfidf_matrix_reduced, df)
        feedback = get_feedback(initial_results)
        relevance_feedback[query] = feedback  # remember the feedback for this query
        with open(feedback_path, 'wb') as f:
            pickle.dump(relevance_feedback, f)  # write all feedback back to disk
    else:
        print("Menggunakan feedback relevansi yang telah disimpan...\n")
        feedback = relevance_feedback[query]

    # Step 2: re-rank with the feedback-adjusted query vector. This part is
    # the same whichever way the feedback was obtained.
    adjust_query_vec = optimize_with_feedback(feedback, tfidf_matrix_reduced)

    if adjust_query_vec is not None:
        relevant_docs = [idx for idx, relevansi in feedback if relevansi == 1]
        non_relevant_docs = [idx for idx, relevansi in feedback if relevansi == 0]

        # Project the preprocessed query into the reduced space.
        query_vec = tfidf.transform([preprocess_text(query)])
        query_vec_reduced = svd.transform(query_vec)

        # Centroids of the judged documents; zeros when nothing was rejected.
        relevant_mean = np.asarray(tfidf_matrix_reduced[relevant_docs].mean(axis=0)).flatten()
        if non_relevant_docs:
            non_relevant_mean = np.asarray(tfidf_matrix_reduced[non_relevant_docs].mean(axis=0)).flatten()
        else:
            non_relevant_mean = np.zeros(query_vec_reduced.shape)

        adjusted_query_vec = adjust_query_vec(query_vec_reduced, relevant_mean, non_relevant_mean)

        # Score every document against the adjusted query and keep the top 10.
        cosine_similarities = linear_kernel(adjusted_query_vec, tfidf_matrix_reduced).flatten()
        related_docs_indices = cosine_similarities.argsort()[-10:][::-1]

        # Display the re-ranked results.
        optimized_results = [
            (df.iloc[i]['Judul'], df.iloc[i]['Teks'], cosine_similarities[i])
            for i in related_docs_indices
        ]
        print("\n\n -- HASIL PENELUSURAN ULANG -- \n\n")
        for idx, (title, text, similarity) in enumerate(optimized_results):
            print(f"Dokumen {idx + 1}:")
            print(f"Judul: {title}")
            print(f"Similarity: {similarity:.4f}")
            print()

Loading preprocessed data from cache...
Query: tanah


Dokumen 1:
Judul: Peraturan Pemerintah Pengganti Undang-Undang Nomor 2 Tahun 2007
Teks: atur perintah anti undangundang republik indonesia nomor tahun tentang tangan masalah hukum dalam rangka pelaksanan rehabilitasi dan rekonstruksi wilayah dan hidup masyarakat di provinsi nangroe aceh ...
Similarity: 0.4647
Input tidak valid. Masukkan 1/ya atau 0/tidak.
Dokumen 2:
Judul: Peraturan Pemerintah Pengganti Undang-Undang Nomor 1 Tahun 1997
Teks: atur perintah anti undang undang republik indonesia nomor tahun tentang penanguhan mulai laku undang undang nomor tahun tentang bea oleh hak atas tanah dan bangun presiden republik indonesia timbang a...
Similarity: 0.2356
Dokumen 3:
Judul: Peraturan Pemerintah Pengganti Undang-Undang Nomor 1 Tahun 2008
Teks: lembar negararepublik indonesiano otonomi khusus pemerintahanpemerintah daerah papua penjelasandalam tambah lembar negara republikindonesia nomor atur pemerintahpenganti undangundang repub