In [1]:
data = r"C:\Users\Sistem Temu Kembali Informasi\Tugas dan Latihan\Final tugas akhir\IR\UU + Perpu.csv"
stopword_id = r'C:\Users\Sistem Temu Kembali Informasi\Tugas dan Latihan\Final tugas akhir\IR\stopwords-id.txt'

pdf_folder_Perpu = r"C:\Users\Sistem Temu Kembali Informasi\Tugas dan Latihan\Final tugas akhir\PERPU\File Perpu"
pdf_folder_UU = r"C:\Users\Sistem Temu Kembali Informasi\Tugas dan Latihan\Final tugas akhir\UU\File UU"
parent_folder_hasil = r"C:\Users\Sistem Temu Kembali Informasi\Tugas dan Latihan\Final tugas akhir\Relevance Feedback\Hasil"

In [16]:
import pandas as pd
import numpy as np
import string
import re
import pickle
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from joblib import Parallel, delayed
from joblib import Memory
from sklearn.decomposition import TruncatedSVD

# Caching setup
location = './cacheall'
memory = Memory(location, verbose=0)

# Inisialisasi stemmer bahasa Indonesia
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Membaca daftar stop words bahasa Indonesia
# stopword_id = 'stopwords.txt'  # Specify the path to your stopwords file
with open(stopword_id, 'r') as f:
    stop_words_id = f.read().splitlines()

def preprocess_text(text):
    # Pastikan text adalah string
    if not isinstance(text, str):
        text = str(text)
    
    # Menghilangkan karakter berulang
    text = re.sub(r'(.)\1+', r'\1', text)
    
    # Menghilangkan angka
    text = ''.join([i for i in text if not i.isdigit()])
    
    # Menghilangkan tanda baca
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Mengubah teks menjadi huruf kecil
    text = text.lower()
    
    # Melakukan stemming pada teks
    text = stemmer.stem(text)
    
    return text

@memory.cache
def preprocess_parallel(text_series):
    return Parallel(n_jobs=-1)(delayed(stemmer.stem)(text) for text in text_series)

# Membaca file CSV
# data = 'data.csv'  # Specify the path to your CSV file
df = pd.read_csv(data)

# Pastikan tidak ada nilai NaN dan semua nilai adalah string
df['Teks'] = preprocess_parallel(df['Teks'].fillna(''))

# Inisialisasi TfidfVectorizer dengan stop words bahasa Indonesia
tfidf = TfidfVectorizer(stop_words=stop_words_id, max_df=0.85, min_df=2, ngram_range=(1, 2))

# Melakukan fit dan transformasi pada kolom Teks
tfidf_matrix = tfidf.fit_transform(df['Teks'])

# Menggunakan TruncatedSVD untuk pengurangan dimensi
svd = TruncatedSVD(n_components=100)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# Path untuk menyimpan feedback relevansi
feedback_path = 'relevance_feedback.pkl'

# Memuat feedback relevansi jika file ada, jika tidak, buat dictionary kosong
try:
    with open(feedback_path, 'rb') as f:
        relevance_feedback = pickle.load(f)
except FileNotFoundError:
    relevance_feedback = {}

def search_documents(query, top_n=10):
    # Preprocessing query
    query = preprocess_text(query)
    
    # Transformasi query menjadi vektor tf-idf
    query_vec = tfidf.transform([query])
    query_vec_reduced = svd.transform(query_vec)
    
    # Menghitung cosine similarity antara query dan semua dokumen
    cosine_similarities = linear_kernel(query_vec_reduced, tfidf_matrix_reduced).flatten()
    
    # Mendapatkan indeks dokumen dengan similarity tertinggi
    related_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]
    
    # Mendapatkan judul, teks, dan nilai similarity dari dokumen yang relevan
    results = [(df.iloc[i]['Judul'], df.iloc[i]['Teks'], cosine_similarities[i], i) for i in related_docs_indices]
    
    return results

def get_feedback(results):
    feedback = []
    for idx, (title, text, similarity, doc_index) in enumerate(results):
        print(f"Dokumen {idx + 1}:")
        print(f"Judul: {title}")
        print(f"Teks: {text[:200]}...")  # Display only the first 200 characters
        print(f"Similarity: {similarity:.4f}")
        relevansi = int(input("Masukkan nilai relevansi (1-5): "))
        feedback.append((doc_index, relevansi))
    return feedback

def optimize_with_feedback(feedback, tfidf_matrix_reduced):
    relevant_docs = [idx for idx, relevansi in feedback if relevansi >= 3]
    non_relevant_docs = [idx for idx, relevansi in feedback if relevansi < 3]
    
    if not relevant_docs:
        print("Tidak ada dokumen yang dianggap relevan. Pencarian ulang tidak dapat dilakukan.")
        return None
    
    relevant_matrix = tfidf_matrix_reduced[relevant_docs]
    non_relevant_matrix = tfidf_matrix_reduced[non_relevant_docs] if non_relevant_docs else np.zeros(relevant_matrix.shape)
    
    # Compute the centroid of relevant and non-relevant documents
    relevant_centroid = np.asarray(relevant_matrix.mean(axis=0)).flatten()
    non_relevant_centroid = np.asarray(non_relevant_matrix.mean(axis=0)).flatten() if non_relevant_docs else np.zeros(relevant_centroid.shape)
    
    # Update query vector by moving it towards the relevant centroid and away from the non-relevant centroid
    def adjust_query_vec(query_vec, relevant_centroid, non_relevant_centroid, alpha=1, beta=0.75, gamma=0.15):
        return alpha * query_vec + beta * relevant_centroid - gamma * non_relevant_centroid
    
    return adjust_query_vec

# Example usage
if __name__ == "__main__":
    query = input("Masukkan Query: ")
    print(f"Query: {query}\n")
    print()
    
    if query in relevance_feedback:
        print("Menggunakan feedback relevansi yang telah disimpan...\n")
        feedback = relevance_feedback[query]
        adjust_query_vec = optimize_with_feedback(feedback, tfidf_matrix_reduced)
        
        if adjust_query_vec:
            relevant_docs = [idx for idx, relevansi in feedback if relevansi >= 3]
            non_relevant_docs = [idx for idx, relevansi in feedback if relevansi < 3]
            
            # Reprocess the query with the adjusted query vector
            query_vec = tfidf.transform([preprocess_text(query)])
            query_vec_reduced = svd.transform(query_vec)
            adjusted_query_vec = adjust_query_vec(query_vec_reduced, np.asarray(tfidf_matrix_reduced[relevant_docs].mean(axis=0)).flatten(), 
                                                  np.asarray(tfidf_matrix_reduced[non_relevant_docs].mean(axis=0)).flatten() if non_relevant_docs else np.zeros(query_vec_reduced.shape))
            
            # Compute cosine similarity with the adjusted query vector
            cosine_similarities = linear_kernel(adjusted_query_vec, tfidf_matrix_reduced).flatten()
            related_docs_indices = cosine_similarities.argsort()[-10:][::-1]
            
            # Display optimized results
            optimized_results = [(df.iloc[i]['Judul'], df.iloc[i]['Teks'], cosine_similarities[i]) for i in related_docs_indices]
            print("\n\n -- HASIL PENELUSURAN ULANG -- \n\n")
            for idx, (title, text, similarity) in enumerate(optimized_results):
                print(f"Dokumen {idx + 1}:")
                print(f"Judul: {title}")
                print(f"Similarity: {similarity:.4f}")
                print()
    else:
        initial_results = search_documents(query)
        feedback = get_feedback(initial_results)
        relevance_feedback[query] = feedback  # Simpan feedback untuk query ini
        with open(feedback_path, 'wb') as f:
            pickle.dump(relevance_feedback, f)  # Simpan feedback relevansi ke file
        adjust_query_vec = optimize_with_feedback(feedback, tfidf_matrix_reduced)
        
        if adjust_query_vec:
            relevant_docs = [idx for idx, relevansi in feedback if relevansi >= 3]
            non_relevant_docs = [idx for idx, relevansi in feedback if relevansi < 3]
            
            # Reprocess the query with the adjusted query vector
            query_vec = tfidf.transform([preprocess_text(query)])
            query_vec_reduced = svd.transform(query_vec)
            adjusted_query_vec = adjust_query_vec(query_vec_reduced, np.asarray(tfidf_matrix_reduced[relevant_docs].mean(axis=0)).flatten(), 
                                                  np.asarray(tfidf_matrix_reduced[non_relevant_docs].mean(axis=0)).flatten() if non_relevant_docs else np.zeros(query_vec_reduced.shape))
            
            # Compute cosine similarity with the adjusted query vector
            cosine_similarities = linear_kernel(adjusted_query_vec, tfidf_matrix_reduced).flatten()
            related_docs_indices = cosine_similarities.argsort()[-10:][::-1]
            
            # Display optimized results
            optimized_results = [(df.iloc[i]['Judul'], df.iloc[i]['Teks'], cosine_similarities[i]) for i in related_docs_indices]
            print("\n\n -- HASIL PENELUSURAN ULANG -- \n\n")
            for idx, (title, text, similarity) in enumerate(optimized_results):
                print(f"Dokumen {idx + 1}:")
                print(f"Judul: {title}")
                print(f"Similarity: {similarity:.4f}")
                print()


Query: korupsi


Menggunakan feedback relevansi yang telah disimpan...



 -- HASIL PENELUSURAN ULANG -- 


Dokumen 1:
Judul: Undang-Undang Nomor 30 Tahun 2002
Similarity: 0.7687

Dokumen 2:
Judul: Undang-Undang Nomor 19 Tahun 2019
Similarity: 0.7044

Dokumen 3:
Judul: Peraturan Pemerintah Pengganti Undang-Undang Nomor 1 Tahun 2015
Similarity: 0.6410

Dokumen 4:
Judul: Undang-Undang Nomor 10 Tahun 2015
Similarity: 0.6297

Dokumen 5:
Judul: Undang-Undang Nomor 31 Tahun 1999
Similarity: 0.6082

Dokumen 6:
Judul: Undang-Undang Nomor 20 Tahun 2001
Similarity: 0.5766

Dokumen 7:
Judul: Undang-Undang Nomor 46 Tahun 2009
Similarity: 0.5546

Dokumen 8:
Judul: Undang-Undang Nomor 3 Tahun 1971
Similarity: 0.5147

Dokumen 9:
Judul: Undang-Undang Nomor 7 Tahun 2006
Similarity: 0.4243

Dokumen 10:
Judul: Undang-Undang Nomor 1 Tahun 2023
Similarity: 0.2634



In [17]:
import shutil
import os

# Membuat folder parent jika belum ada
os.makedirs(parent_folder_hasil, exist_ok=True)

for idx, (title, text, similarity) in enumerate(optimized_results):
    found = False
    # Mencari file PDF dengan judul yang sesuai di folder1
    pdf_file = os.path.join(pdf_folder_Perpu, f"{title}.pdf")
    if os.path.isfile(pdf_file):
        found = True
    else:
        # Mencari file PDF dengan judul yang sesuai di folder2
        pdf_file = os.path.join(pdf_folder_UU, f"{title}.pdf")
        if os.path.isfile(pdf_file):
            found = True

    if found:
        # Buat folder output sesuai dengan query
        output_folder = os.path.join(parent_folder_hasil, query)
        os.makedirs(output_folder, exist_ok=True)

        # Menyalin file PDF ke folder output
        output_file = os.path.join(output_folder, f"{title}.pdf")
        shutil.copy(pdf_file, output_file)
    else:
        print(f"File PDF untuk {title} tidak ditemukan.")

print(f"Dokumen disimpan di folder {output_folder}")


Dokumen disimpan di folder C:\Users\muham\OneDrive - Universitas Airlangga\Semester 6\Sistem Temu Kembali Informasi\Tugas dan Latihan\Final tugas akhir\Relevance Feedback\Hasil\korupsi


In [None]:
# # Tambahan kode untuk menghitung presisi
# # Judul dokumen hasil awal
# judul_hasil_awal = [title for title, _, _ in initial_results]

# # Judul dokumen hasil setelah relevance feedback
# judul_hasil_feedback = [title for title, _, _ in optimized_results]

# # Menghitung jumlah dokumen yang sama dalam hasil setelah feedback
# jumlah_dokumen_sama = sum([1 for doc in judul_hasil_feedback if doc in judul_hasil_awal])

# # Jumlah dokumen yang diambil setelah feedback
# total_dokumen_diambil = len(judul_hasil_feedback)

# # Menghitung presisi
# presisi = jumlah_dokumen_sama / total_dokumen_diambil

# print(f'Presisi: {presisi:.2f}')  # Output: Presisi

# # Menghitung presisi awal
# jumlah_dokumen_sama_awal = sum([1 for doc in judul_hasil_awal if doc in judul_hasil_awal])
# total_dokumen_diambil_awal = len(judul_hasil_awal)
# presisi_awal = jumlah_dokumen_sama_awal / total_dokumen_diambil_awal

# print(f'Presisi Awal: {presisi_awal:.2f}')  # Output: Presisi Awal


In [None]:
# # Definisikan bobot untuk setiap nilai relevansi
# relevance_weights = {1: -2, 2: -1, 3: 0, 4: 1, 5: 2}

# def adjust_query_vec(query_vec, relevant_docs, non_relevant_docs):
#     alpha = 0.75  # Bobot untuk dokumen relevan
#     beta = 0.25  # Bobot untuk dokumen tidak relevan

#     # Menghitung mean vector dengan bobot relevansi
#     relevant_mean_vec = np.average(
#         [tfidf_matrix_reduced[idx] for idx in relevant_docs], 
#         axis=0, 
#         weights=[relevance_weights[relevance_feedback[idx]] for idx in relevant_docs]
#     )

#     non_relevant_mean_vec = np.average(
#         [tfidf_matrix_reduced[idx] for idx in non_relevant_docs], 
#         axis=0, 
#         weights=[relevance_weights[relevance_feedback[idx]] for idx in non_relevant_docs]
#     ) if non_relevant_docs else np.zeros(query_vec.shape)

#     adjusted_query_vec = query_vec + alpha * relevant_mean_vec - beta * non_relevant_mean_vec
#     return adjusted_query_vec

# # Mengumpulkan dokumen relevan dan tidak relevan berdasarkan feedback
# relevant_docs = [idx for idx, relevansi in feedback if relevansi >= 3]
# non_relevant_docs = [idx for idx, relevansi in feedback if relevansi < 3]

# # Optimisasi query dengan feedback
# query_vec = tfidf.transform([preprocess_text(query)])
# query_vec_reduced = svd.transform(query_vec)
# adjusted_query_vec = adjust_query_vec(query_vec_reduced, relevant_docs, non_relevant_docs)

# # Menghitung kesamaan kosinus dengan vektor query yang sudah dioptimalkan
# cosine_similarities = linear_kernel(adjusted_query_vec, tfidf_matrix_reduced).flatten()
# related_docs_indices = cosine_similarities.argsort()[-10:][::-1]
