In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import re
import string
from collections import Counter
import math
import warnings
warnings.filterwarnings('ignore')


In [2]:
class BookSearchEngine:
    def __init__(self, csv_path):
        """
        Inisialisasi Search Engine dengan path dataset
        """
        self.csv_path = csv_path
        self.df = None
        self.tfidf_vectorizer = None
        self.tfidf_matrix = None
        self.bm25_vectorizer = None
        self.bm25_matrix = None
        self.learning_model = None
        self.processed_texts = None
        
    def load_data(self):
        """
        Load dan preprocessing data dari CSV
        """
        print("Loading data...")
        self.df = pd.read_csv(self.csv_path)
        
        # Cleaning data
        self.df = self.df.dropna(subset=['Book-Title', 'Book-Author', 'description'])
        
        # Combine text fields untuk search
        self.df['combined_text'] = (
            self.df['Book-Title'].fillna('') + ' ' + 
            self.df['Book-Author'].fillna('') + ' ' + 
            self.df['description'].fillna('')
        )
        
        # Preprocessing text
        self.processed_texts = [self.preprocess_text(text) for text in self.df['combined_text']]
        
        print(f"Data loaded: {len(self.df)} books")
        
    def preprocess_text(self, text):
        """
        Preprocessing teks: lowercase, remove punctuation, dll
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = str(text).lower()
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Remove extra whitespaces
        text = ' '.join(text.split())
        
        return text
    
    def build_tfidf_index(self):
        """
        Build TF-IDF index
        """
        print("Building TF-IDF index...")
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=10000,
            stop_words='english',
            ngram_range=(1, 2)
        )
        
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.processed_texts)
        
    def calculate_bm25_scores(self, query, k1=1.5, b=0.75):
        """
        Calculate BM25 scores untuk query
        """
        query_terms = self.preprocess_text(query).split()
        
        # Document frequencies
        doc_freqs = {}
        for term in query_terms:
            doc_freqs[term] = sum(1 for doc in self.processed_texts if term in doc)
        
        # Average document length
        avg_doc_len = np.mean([len(doc.split()) for doc in self.processed_texts])
        
        scores = []
        N = len(self.processed_texts)
        
        for doc in self.processed_texts:
            doc_terms = doc.split()
            doc_len = len(doc_terms)
            doc_term_freq = Counter(doc_terms)
            
            score = 0
            for term in query_terms:
                if term in doc_term_freq:
                    tf = doc_term_freq[term]
                    df = doc_freqs.get(term, 0)
                    if df > 0:
                        idf = math.log((N - df + 0.5) / (df + 0.5))
                        score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
            
            scores.append(score)
        
        return np.array(scores)
    
    def extract_features(self, query):
        """
        Extract features untuk Learning to Rank
        """
        # TF-IDF similarity
        query_processed = self.preprocess_text(query)
        query_tfidf = self.tfidf_vectorizer.transform([query_processed])
        tfidf_scores = cosine_similarity(query_tfidf, self.tfidf_matrix).flatten()
        
        # BM25 scores
        bm25_scores = self.calculate_bm25_scores(query)
        
        # Additional features
        query_terms = query_processed.split()
        
        # Exact title match
        title_exact_match = [1 if query.lower() in title.lower() else 0 
                           for title in self.df['Book-Title']]
        
        # Author match
        author_match = [1 if any(term in author.lower() for term in query_terms) else 0
                for author in self.df['Book-Author'].fillna('')]

        
        # Title term count
        title_term_count = [sum(1 for term in query_terms if term in title.lower()) 
                           for title in self.df['Book-Title']]
        
        # Combine features
        features = np.column_stack([
            tfidf_scores,
            bm25_scores,
            title_exact_match,
            author_match,
            title_term_count
        ])
        
        return features
    
    def generate_synthetic_relevance_scores(self, query):
        """
        Generate synthetic relevance scores untuk training
        (Dalam implementasi nyata, ini akan berupa human judgments)
        """
        features = self.extract_features(query)
        
        # Simple heuristic untuk synthetic scores
        tfidf_scores = features[:, 0]
        title_match = features[:, 2]
        author_match = features[:, 3]
        
        # Weighted combination
        relevance_scores = (
            0.4 * tfidf_scores + 
            0.3 * title_match + 
            0.2 * author_match + 
            0.1 * features[:, 4]  # title term count
        )
        
        # Normalize to 0-3 scale (0: not relevant, 3: highly relevant)
        if relevance_scores.max() > relevance_scores.min():
            relevance_scores = (relevance_scores - relevance_scores.min()) / (relevance_scores.max() - relevance_scores.min())
            relevance_scores = (relevance_scores * 3).round().astype(int)
        else:
            relevance_scores = np.zeros(len(relevance_scores), dtype=int)
        
        return relevance_scores
    
    def train_learning_to_rank_model(self, sample_queries=None):
        """
        Train Learning to Rank model
        """
        print("Training Learning to Rank model...")
        
        if sample_queries is None:
            # Generate sample queries from book titles and authors
            sample_queries = []
            for i in range(min(100, len(self.df))):  # Limit untuk efisiensi
                title = str(self.df.iloc[i]['Book-Title'])
                title_words = title.split()[:2]  # Ambil 2 kata pertama judul
                if len(title_words) > 0:
                    sample_queries.append(' '.join(title_words))
                
                author = str(self.df.iloc[i]['Book-Author'])
                author_words = author.split()[:1]  # Ambil 1 kata author
                if len(author_words) > 0:
                    sample_queries.append(author_words[0])
        
        # Collect training data
        X_train = []
        y_train = []
        
        for query in sample_queries[:50]:  # Limit untuk efisiensi
            try:
                features = self.extract_features(query)
                relevance_scores = self.generate_synthetic_relevance_scores(query)
                
                X_train.extend(features)
                y_train.extend(relevance_scores)
            except Exception as e:
                print(f"Error processing query '{query}': {e}")
                continue
        
        if len(X_train) > 0:
            X_train = np.array(X_train)
            y_train = np.array(y_train)
            
            # Train model
            self.learning_model = RandomForestRegressor(n_estimators=100, random_state=42)
            self.learning_model.fit(X_train, y_train)
            
            print("Learning to Rank model trained!")
        else:
            print("No training data available. Using TF-IDF fallback.")
    
    def search(self, query, top_k=10):
        """
        Search dengan Learning to Rank
        """
        if self.learning_model is None:
            # Fallback ke TF-IDF jika model belum ditraining
            return self.search_tfidf(query, top_k)
        
        try:
            # Extract features
            features = self.extract_features(query)
            
            # Predict relevance scores
            relevance_scores = self.learning_model.predict(features)
            
            # Get top-k results
            top_indices = np.argsort(relevance_scores)[::-1][:top_k]
            
            results = []
            for idx in top_indices:
                results.append({
                    'rank': len(results) + 1,
                    'score': relevance_scores[idx],
                    'title': self.df.iloc[idx]['Book-Title'],
                    'author': self.df.iloc[idx]['Book-Author'],
                    'description': str(self.df.iloc[idx]['description'])[:200] + '...',
                    'isbn': self.df.iloc[idx]['ISBN']
                })
            
            return results
        except Exception as e:
            print(f"Error in search: {e}")
            return self.search_tfidf(query, top_k)
    
    def search_tfidf(self, query, top_k=10):
        """
        Fallback search menggunakan TF-IDF
        """
        query_processed = self.preprocess_text(query)
        query_tfidf = self.tfidf_vectorizer.transform([query_processed])
        similarities = cosine_similarity(query_tfidf, self.tfidf_matrix).flatten()
        
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        results = []
        for idx in top_indices:
            results.append({
                'rank': len(results) + 1,
                'score': similarities[idx],
                'title': self.df.iloc[idx]['Book-Title'],
                'author': self.df.iloc[idx]['Book-Author'],
                'description': str(self.df.iloc[idx]['description'])[:200] + '...',
                'isbn': self.df.iloc[idx]['ISBN']
            })
        
        return results


In [3]:
class EvaluationMetrics:
    """
    Kelas untuk menghitung metrics evaluasi
    """
    
    @staticmethod
    def precision_at_k(relevant_items, retrieved_items, k):
        """
        Precision@K
        """
        if k == 0:
            return 0.0
        
        retrieved_k = retrieved_items[:k]
        relevant_retrieved = len([item for item in retrieved_k if item in relevant_items])
        
        return relevant_retrieved / k
    
    @staticmethod
    def recall_at_k(relevant_items, retrieved_items, k):
        """
        Recall@K
        """
        if len(relevant_items) == 0:
            return 0.0
        
        retrieved_k = retrieved_items[:k]
        relevant_retrieved = len([item for item in retrieved_k if item in relevant_items])
        
        return relevant_retrieved / len(relevant_items)
    
    @staticmethod
    def average_precision_at_k(relevant_items, retrieved_items, k):
        """
        Average Precision@K
        """
        if len(relevant_items) == 0:
            return 0.0
        
        retrieved_k = retrieved_items[:k]
        ap = 0.0
        relevant_count = 0
        
        for i, item in enumerate(retrieved_k):
            if item in relevant_items:
                relevant_count += 1
                precision_at_i = relevant_count / (i + 1)
                ap += precision_at_i
        
        return ap / min(len(relevant_items), k)
    
    @staticmethod
    def ndcg_at_k(relevant_items, retrieved_items, k, relevance_scores=None):
        """
        NDCG@K
        """
        if relevance_scores is None:
            # Binary relevance: relevant=1, not relevant=0
            relevance_scores = {item: 1 for item in relevant_items}
        
        retrieved_k = retrieved_items[:k]
        
        # DCG calculation
        dcg = 0.0
        for i, item in enumerate(retrieved_k):
            if item in relevance_scores:
                relevance = relevance_scores[item]
                dcg += (2**relevance - 1) / math.log2(i + 2)
        
        # IDCG calculation
        ideal_relevances = sorted([relevance_scores.get(item, 0) for item in relevant_items], reverse=True)[:k]
        idcg = 0.0
        for i, relevance in enumerate(ideal_relevances):
            idcg += (2**relevance - 1) / math.log2(i + 2)
        
        if idcg == 0:
            return 0.0
        
        return dcg / idcg
    
    @staticmethod
    def map_at_k(queries_results, k):
        """
        Mean Average Precision@K
        """
        ap_scores = []
        for relevant_items, retrieved_items in queries_results:
            ap = EvaluationMetrics.average_precision_at_k(relevant_items, retrieved_items, k)
            ap_scores.append(ap)
        
        return np.mean(ap_scores) if ap_scores else 0.0


In [4]:
def main():
    """
    Main function untuk menjalankan sistem
    """
    # Path dataset
    csv_path = "D:/IT DEL/Semester 8 (FINAL)/STBI/Information-Retrieval-Books-search-engine-main/Information-Retrieval-Books-search-engine-main/data_books_updated.csv"
    
    # Initialize search engine
    search_engine = BookSearchEngine(csv_path)
    
    try:
        # Load dan preprocess data
        search_engine.load_data()
        
        # Build indices
        search_engine.build_tfidf_index()
        
        # Train Learning to Rank model
        search_engine.train_learning_to_rank_model()
        
        print("\n" + "="*60)
        print("BOOK SEARCH ENGINE WITH LEARNING TO RANK")
        print("="*60)
        
        # Interactive search
        while True:
            print("\nMasukkan query (atau 'quit' untuk keluar):")
            query = input("Query: ").strip()
            
            if query.lower() in ['quit', 'exit', 'q']:
                break
            
            if not query:
                continue
            
            # Perform search
            results = search_engine.search(query, top_k=10)
            
            print(f"\nHasil pencarian untuk: '{query}'")
            print("-" * 50)
            
            if len(results) == 0:
                print("Tidak ada hasil yang ditemukan.")
                continue
            
            for result in results:
                print(f"{result['rank']}. {result['title']}")
                print(f"   Author: {result['author']}")
                print(f"   Score: {result['score']:.4f}")
                print(f"   Description: {result['description']}")
                print(f"   ISBN: {result['isbn']}")
                print()
            
            # Simulasi evaluasi (dengan synthetic ground truth)
            # Dalam implementasi nyata, ground truth akan disediakan secara manual
            print("\nEVALUASI METRICS:")
            print("-" * 30)
            
            # Generate synthetic relevant items (top 3 results sebagai relevant)
            relevant_items = [str(r['isbn']) for r in results[:3]]
            retrieved_items = [str(r['isbn']) for r in results]
            
            # Calculate metrics
            precision_5 = EvaluationMetrics.precision_at_k(relevant_items, retrieved_items, 5)
            recall_5 = EvaluationMetrics.recall_at_k(relevant_items, retrieved_items, 5)
            ap_5 = EvaluationMetrics.average_precision_at_k(relevant_items, retrieved_items, 5)
            ndcg_5 = EvaluationMetrics.ndcg_at_k(relevant_items, retrieved_items, 5)
            
            print(f"Precision@5: {precision_5:.4f}")
            print(f"Recall@5: {recall_5:.4f}")
            print(f"MAP@5: {ap_5:.4f}")
            print(f"NDCG@5: {ndcg_5:.4f}")
            
            # Contoh evaluasi batch untuk MAP@5
            queries_results = [(relevant_items, retrieved_items)]
            map_5 = EvaluationMetrics.map_at_k(queries_results, 5)
            print(f"Batch MAP@5: {map_5:.4f}")
    
    except FileNotFoundError:
        print(f"Error: File tidak ditemukan di path: {csv_path}")
        print("Pastikan path file sudah benar!")
    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Loading data...
Data loaded: 57555 books
Building TF-IDF index...
Training Learning to Rank model...
Learning to Rank model trained!

BOOK SEARCH ENGINE WITH LEARNING TO RANK

Masukkan query (atau 'quit' untuk keluar):


Query:  love of my life



Hasil pencarian untuk: 'love of my life'
--------------------------------------------------
1. The Haunter of the Dark: The H.P. Lovecraft Omnibus, #3
   Author: H.P. Lovecraft, August Derleth (Introduction)
   Score: 2.0000
   ISBN: 9780586063231

2. Dreams of My Russian Summers
   Author: Andreï Makine, Geoffrey Strachan (Translator)
   Score: 2.0000
   Description: Dreams of My Russian Summers, tells the poignant story of a boy growing up amid the harsh realities of Soviet life in the 1960s and '70s, and of his extraordinary love for an elegant Frenchwoman, Char...
   ISBN: 9780684852683

3. The Art of Amy Brown
   Author: Amy Brown (Annotations), Charles de Lint (Goodreads Author) (Introduction)
   Score: 2.0000
   Description: The Art of Amy Brown is the first collection of Amy Brown's wildly popular art. It was published by Chimera Publishing in 2003. The book is 157 pages of art, with a commentary by Amy Brown for every p...
   ISBN: 9780974461229

4. Everybody, Always: Becomin

Query:  quit
