In [1]:
data = r"/home/whoami/Programming/information retrieval/Relevance Feedback/hapus.csv"
stopword_id = r"/home/whoami/Programming/information retrieval/Relevance Feedback/stopwords-id.txt"

pdf_folder_Perpu = r"/home/whoami/Programming/information retrieval/Relevance Feedback/File Perpu"
pdf_folder_UU = r"/home/whoami/Programming/information retrieval/Relevance Feedback/File UU"
parent_folder_hasil = r"/home/whoami/Programming/information retrieval/Relevance Feedback/Hasil"


In [2]:
import pandas as pd
import numpy as np
import string
import re
import pickle
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from joblib import Parallel, delayed
from joblib import Memory
from sklearn.decomposition import TruncatedSVD

# Caching setup
location = './cacheall'
memory = Memory(location, verbose=0)

# Inisialisasi stemmer bahasa Indonesia
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Membaca daftar stop words bahasa Indonesia
# stopword_id = 'stopwords.txt'  # Specify the path to your stopwords file
with open(stopword_id, 'r') as f:
    stop_words_id = f.read().splitlines()


In [3]:
def preprocess_text(text):
    # Pastikan text adalah string
    if not isinstance(text, str):
        text = str(text)
    
    # Menghilangkan karakter berulang
    text = re.sub(r'(.)\1+', r'\1', text)
    
    # Menghilangkan angka
    text = ''.join([i for i in text if not i.isdigit()])
    
    # Menghilangkan tanda baca
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Mengubah teks menjadi huruf kecil
    text = text.lower()
    
    # Melakukan stemming pada teks
    text = stemmer.stem(text)
    
    return text

@memory.cache
def preprocess_parallel(text_series):
    return Parallel(n_jobs=-1)(delayed(stemmer.stem)(text) for text in text_series)

# Membaca file CSV
# data = 'data.csv'  # Specify the path to your CSV file
df = pd.read_csv(data)


In [5]:
print(df)
print("####")


                                                Judul  \
0   Peraturan Pemerintah Pengganti Undang-Undang N...   
1   Peraturan Pemerintah Pengganti Undang-Undang N...   
2   Peraturan Pemerintah Pengganti Undang-Undang N...   
3   Peraturan Pemerintah Pengganti Undang-Undang N...   
4   Peraturan Pemerintah Pengganti Undang-Undang N...   
5   Peraturan Pemerintah Pengganti Undang-Undang N...   
6   Peraturan Pemerintah Pengganti Undang-Undang N...   
7   Peraturan Pemerintah Pengganti Undang-Undang N...   
8   Peraturan Pemerintah Pengganti Undang-Undang N...   
9   Peraturan Pemerintah Pengganti Undang-Undang N...   
10  Peraturan Pemerintah Pengganti Undang-Undang N...   
11  Peraturan Pemerintah Pengganti Undang-Undang N...   
12  Peraturan Pemerintah Pengganti Undang-Undang N...   
13  Peraturan Pemerintah Pengganti Undang-Undang N...   
14  Peraturan Pemerintah Pengganti Undang-Undang N...   
15  Peraturan Pemerintah Pengganti Undang-Undang N...   
16  Peraturan Pemerintah Pengga

In [6]:
print(df['Teks'])

0         PERATURAN PEMERINTAH PENGGANTI UNDANG-UNDA...
1     PERATURAN PEMERINTAH PENGGANTI UNDANG -UNDANG ...
2     PRESIDENREPUBLIK INDONESIAPERATURAN PEMERINTAH...
3     PRESIDENREPUBLIK INDONESIAPERATURAN PEMERINTAH...
4     PRESIDENREPUBLIK INDONESIAPERATURAN PEMERINTAH...
5     PRESIDENREPUBLIK INDONESIAPERATURAN PEMERINTAH...
6     PRESIDENREPUBLIK INDONESIAPERATURAN PEMERINTAH...
7      PERATURAN PEMERINTAHPENGGANTI UNDANG-UNDANG R...
8     PERATURAN PEMERINTAH PENGGANTI UNDANG-UNDANGRE...
9     LEMBARAN NEGARAREPUBLIK INDONESIANo. 57, 2008 ...
10        LEM B A RAN   N E GARA REP U B L I K   I N...
11    LEMBARAN NEGARAREPUBLIK INDONESIANo.167, 2013 ...
12       TAMBAHAN LEMBARAN NEGARA RI  No.5588 PEMERI...
13    LEMBARAN NEGARAREPUBLIK INDONESIANo.31, 2015 H...
14      LEMBARAN NEGARA REPUBLIK INDONESIA  No.99, 2...
15       LEMBARAN NEGARA  REPUBLIK INDONESIA  No.95,...
16       LEMBARAN NEGARA  REPUBLIK INDONESIA  No.87,...
17    SALINANPRESIDENREPUBLIK INDONESIAPERATURAN

In [7]:
# Pastikan tidak ada nilai NaN dan semua nilai adalah string
df['Teks'] = preprocess_parallel(df['Teks'].fillna(''))

In [8]:
# Inisialisasi TfidfVectorizer dengan stop words bahasa Indonesia
tfidf = TfidfVectorizer(stop_words=stop_words_id, max_df=0.85, min_df=2, ngram_range=(1, 2))

# Melakukan fit dan transformasi pada kolom Teks
tfidf_matrix = tfidf.fit_transform(df['Teks'])

# Menggunakan TruncatedSVD untuk pengurangan dimensi
svd = TruncatedSVD(n_components=100)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# Path untuk menyimpan feedback relevansi
feedback_path = 'relevance_feedback.pkl'

In [9]:
# Memuat feedback relevansi jika file ada, jika tidak, buat dictionary kosong
try:
    with open(feedback_path, 'rb') as f:
        relevance_feedback = pickle.load(f)
except FileNotFoundError:
    relevance_feedback = {}

def search_documents(query, top_n=10):
    # Preprocessing query
    query = preprocess_text(query)
    
    # Transformasi query menjadi vektor tf-idf
    query_vec = tfidf.transform([query])
    query_vec_reduced = svd.transform(query_vec)
    
    # Menghitung cosine similarity antara query dan semua dokumen
    cosine_similarities = linear_kernel(query_vec_reduced, tfidf_matrix_reduced).flatten()
    
    # Mendapatkan indeks dokumen dengan similarity tertinggi
    related_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]
    
    # Mendapatkan judul, teks, dan nilai similarity dari dokumen yang relevan
    results = [(df.iloc[i]['Judul'], df.iloc[i]['Teks'], cosine_similarities[i], i) for i in related_docs_indices]
    
    return results

def get_feedback(results):
    feedback = []
    for idx, (title, text, similarity, doc_index) in enumerate(results):
        print(f"Dokumen {idx + 1}:")
        print(f"Judul: {title}")
        print(f"Teks: {text[:200]}...")  # Display only the first 200 characters
        print(f"Similarity: {similarity:.4f}")
        relevansi = int(input("Masukkan nilai relevansi (1-5): "))
        feedback.append((doc_index, relevansi))
    return feedback

def optimize_with_feedback(feedback, tfidf_matrix_reduced):
    relevant_docs = [idx for idx, relevansi in feedback if relevansi >= 3]
    non_relevant_docs = [idx for idx, relevansi in feedback if relevansi < 3]
    
    if not relevant_docs:
        print("Tidak ada dokumen yang dianggap relevan. Pencarian ulang tidak dapat dilakukan.")
        return None
    
    relevant_matrix = tfidf_matrix_reduced[relevant_docs]
    non_relevant_matrix = tfidf_matrix_reduced[non_relevant_docs] if non_relevant_docs else np.zeros(relevant_matrix.shape)
    
    # Compute the centroid of relevant and non-relevant documents
    relevant_centroid = np.asarray(relevant_matrix.mean(axis=0)).flatten()
    non_relevant_centroid = np.asarray(non_relevant_matrix.mean(axis=0)).flatten() if non_relevant_docs else np.zeros(relevant_centroid.shape)
    
    # Update query vector by moving it towards the relevant centroid and away from the non-relevant centroid
    def adjust_query_vec(query_vec, relevant_centroid, non_relevant_centroid, alpha=1, beta=0.75, gamma=0.15):
        return alpha * query_vec + beta * relevant_centroid - gamma * non_relevant_centroid
    
    return adjust_query_vec

In [10]:
# Example usage
if __name__ == "__main__":
    query = input("Masukkan Query: ")
    print(f"Query: {query}\n")
    print()
    
    if query in relevance_feedback:
        print("Menggunakan feedback relevansi yang telah disimpan...\n")
        feedback = relevance_feedback[query]
        adjust_query_vec = optimize_with_feedback(feedback, tfidf_matrix_reduced)
        
        if adjust_query_vec:
            relevant_docs = [idx for idx, relevansi in feedback if relevansi >= 3]
            non_relevant_docs = [idx for idx, relevansi in feedback if relevansi < 3]
            
            # Reprocess the query with the adjusted query vector
            query_vec = tfidf.transform([preprocess_text(query)])
            query_vec_reduced = svd.transform(query_vec)
            adjusted_query_vec = adjust_query_vec(query_vec_reduced, np.asarray(tfidf_matrix_reduced[relevant_docs].mean(axis=0)).flatten(), 
                                                  np.asarray(tfidf_matrix_reduced[non_relevant_docs].mean(axis=0)).flatten() if non_relevant_docs else np.zeros(query_vec_reduced.shape))
            
            # Compute cosine similarity with the adjusted query vector
            cosine_similarities = linear_kernel(adjusted_query_vec, tfidf_matrix_reduced).flatten()
            related_docs_indices = cosine_similarities.argsort()[-10:][::-1]
            
            # Display optimized results
            optimized_results = [(df.iloc[i]['Judul'], df.iloc[i]['Teks'], cosine_similarities[i]) for i in related_docs_indices]
            print("\n\n -- HASIL PENELUSURAN ULANG -- \n\n")
            for idx, (title, text, similarity) in enumerate(optimized_results):
                print(f"Dokumen {idx + 1}:")
                print(f"Judul: {title}")
                print(f"Similarity: {similarity:.4f}")
                print()
    else:
        initial_results = search_documents(query)
        feedback = get_feedback(initial_results)
        relevance_feedback[query] = feedback  # Simpan feedback untuk query ini
        with open(feedback_path, 'wb') as f:
            pickle.dump(relevance_feedback, f)  # Simpan feedback relevansi ke file
        adjust_query_vec = optimize_with_feedback(feedback, tfidf_matrix_reduced)
        
        if adjust_query_vec:
            relevant_docs = [idx for idx, relevansi in feedback if relevansi >= 3]
            non_relevant_docs = [idx for idx, relevansi in feedback if relevansi < 3]
            
            # Reprocess the query with the adjusted query vector
            query_vec = tfidf.transform([preprocess_text(query)])
            query_vec_reduced = svd.transform(query_vec)
            adjusted_query_vec = adjust_query_vec(query_vec_reduced, np.asarray(tfidf_matrix_reduced[relevant_docs].mean(axis=0)).flatten(), 
                                                  np.asarray(tfidf_matrix_reduced[non_relevant_docs].mean(axis=0)).flatten() if non_relevant_docs else np.zeros(query_vec_reduced.shape))
            
            # Compute cosine similarity with the adjusted query vector
            cosine_similarities = linear_kernel(adjusted_query_vec, tfidf_matrix_reduced).flatten()
            related_docs_indices = cosine_similarities.argsort()[-10:][::-1]
            
            # Display optimized results
            optimized_results = [(df.iloc[i]['Judul'], df.iloc[i]['Teks'], cosine_similarities[i]) for i in related_docs_indices]
            print("\n\n -- HASIL PENELUSURAN ULANG -- \n\n")
            for idx, (title, text, similarity) in enumerate(optimized_results):
                print(f"Dokumen {idx + 1}:")
                print(f"Judul: {title}")
                print(f"Similarity: {similarity:.4f}")
                print()


Query: pajak


Dokumen 1:
Judul: Peraturan Pemerintah Pengganti Undang-Undang Nomor 1 Tahun 1984
Teks: atur perintah ganti undang republik indonesia perpu nomor 1 tahun 1984 1 1984 tentang tangguh mulai laku undang pajak tambah nilai 1984 presiden republik indonesia timbang a bahwa laksana undang pajak...
Similarity: 0.4038
Dokumen 2:
Judul: Peraturan Pemerintah Pengganti Undang-Undang Nomor 5 Tahun 2008
Teks: lembar negararepublik indonesiano 211 2008 uang pajak npwp tata cara periksa buku ubah penjelasandalam tambah lembar negara republikindonesia nomor 4953 atur pemerintahpengganti undang republik indone...
Similarity: 0.2440
Dokumen 3:
Judul: Peraturan Pemerintah Pengganti Undang-Undang Nomor 1 Tahun 2020
Teks: lembar negara republik indonesia no 87 2020 uang stabilitas sistem uang bijak uang negara corona virus disease 2019 jelas dalam tambah lembar negara republik indonesia nomor 6 485 atur peme rintah gan...
Similarity: 0.1817
Dokumen 4:
Judul: Peraturan Pemerintah Pengganti Und

In [None]:
import shutil
import os

# Membuat folder parent jika belum ada
os.makedirs(parent_folder_hasil, exist_ok=True)

for idx, (title, text, similarity) in enumerate(optimized_results):
    found = False
    # Mencari file PDF dengan judul yang sesuai di folder1
    pdf_file = os.path.join(pdf_folder_Perpu, f"{title}.pdf")
    if os.path.isfile(pdf_file):
        found = True
    else:
        # Mencari file PDF dengan judul yang sesuai di folder2
        pdf_file = os.path.join(pdf_folder_UU, f"{title}.pdf")
        if os.path.isfile(pdf_file):
            found = True

    if found:
        # Buat folder output sesuai dengan query
        output_folder = os.path.join(parent_folder_hasil, query)
        os.makedirs(output_folder, exist_ok=True)

        # Menyalin file PDF ke folder output
        output_file = os.path.join(output_folder, f"{title}.pdf")
        shutil.copy(pdf_file, output_file)
    else:
        print(f"File PDF untuk {title} tidak ditemukan.")

print(f"Dokumen disimpan di folder {output_folder}")
