In [1]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing functions below.
# NLTK >= 3.8.2 loads the Punkt tokenizer from the 'punkt_tab' package instead
# of the older pickled 'punkt' package, so both are requested to keep
# word_tokenize working on old and new NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Hansen
[nltk_data]     Dafa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Document collection: maps each document ID to its raw text.
docs = dict(
    D1="Machine learning improves search engines.",
    D2="Information retrieval techniques are evolving.",
    D3="Search engines use advanced algorithms.",
    D4="Deep learning and neural networks are popular.",
    D5="Boolean retrieval uses logical operators.",
    D6="Query processing is essential in search engines.",
    D7="Text mining and NLP are related to information retrieval.",
    D8="Search algorithms improve information discovery.",
    D9="Data science leverages machine learning.",
    D10="Ranking methods optimize search engine results.",
)


In [3]:
# Query list: (query text, Boolean operator) pairs, in evaluation order.
queries = list(zip(
    ("Search AND Engine", "Information OR Retrieval", "Machine NOT Learning"),
    ("AND", "OR", "NOT"),
))

In [4]:

# Preprocessing functions
def simple_preprocess(text):
    """Tokenize ``text`` and return its alphanumeric tokens in lowercase.

    Tokens for which ``str.isalnum()`` is false (for example punctuation)
    are dropped; the remaining tokens keep their original order.
    """
    cleaned = []
    for token in word_tokenize(text):
        if token.isalnum():
            cleaned.append(token.lower())
    return cleaned

def stemming_preprocess(text):
    """Tokenize ``text`` and return the Porter stem of each alphanumeric token.

    Tokens failing ``str.isalnum()`` are dropped; each kept token is lowercased
    before being stemmed. Token order is preserved.
    """
    stem = PorterStemmer().stem
    alnum_tokens = (tok for tok in word_tokenize(text) if tok.isalnum())
    return [stem(tok.lower()) for tok in alnum_tokens]

def lemmatization_preprocess(text):
    """Tokenize ``text`` and return the lemma of each alphanumeric token.

    Tokens failing ``str.isalnum()`` are dropped; each kept token is lowercased
    and passed to ``WordNetLemmatizer.lemmatize`` with its default arguments.
    Token order is preserved.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        if not token.isalnum():
            continue
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return lemmas

In [5]:
# Build the inverted index
def build_inverted_index(docs, preprocess_func):
    """Map every term to the set of document IDs whose text contains it.

    Args:
        docs: Mapping of document ID to raw document text.
        preprocess_func: Callable turning a text into an iterable of terms.

    Returns:
        A dict ``term -> set of document IDs``, with terms inserted in the
        order they are first encountered.
    """
    postings = {}
    for doc_id, text in docs.items():
        for term in preprocess_func(text):
            if term not in postings:
                postings[term] = set()
            postings[term].add(doc_id)
    return postings

In [None]:
# Query evaluation
def process_query(query_terms, operator, inverted_index):
    """Evaluate a Boolean query against an inverted index.

    The operator is applied left to right across all query terms, so the
    usual two-term query behaves as ``first <op> second``.

    Args:
        query_terms: Preprocessed query terms, in query order.
        operator: ``"AND"`` (intersection), ``"OR"`` (union) or ``"NOT"``
            (set difference: documents matching the first term minus the
            documents matching any later term).
        inverted_index: Mapping of term to the set of document IDs containing
            it. Terms missing from the index match no documents.

    Returns:
        A new set of matching document IDs. The result is empty when
        ``query_terms`` is empty or ``operator`` is not recognised; with a
        single term the result is that term's postings.
    """
    if operator not in ("AND", "OR", "NOT") or not query_terms:
        return set()

    postings = [inverted_index.get(term, set()) for term in query_terms]

    # Start from a copy so the sets stored in the index are never mutated
    # or handed back to the caller.
    result = set(postings[0])
    for other in postings[1:]:
        if operator == "AND":
            result &= other
        elif operator == "OR":
            result |= other
        else:  # "NOT"
            result -= other
    return result


In [None]:
# Ground truth for each query: query position -> set of relevant document IDs.
ground_truth = dict(enumerate([
    {'D10'},              # Query 1
    {'D2', 'D5', 'D7'},   # Query 2
    set(),                # Query 3
]))

In [None]:

# Evaluation metrics
def evaluate(retrieved, relevant, total_docs):
    """Compute precision, recall, accuracy and F1 for one query.

    Args:
        retrieved: Set of document IDs returned by the system.
        relevant: Set of document IDs judged relevant (ground truth).
        total_docs: Number of documents in the collection.

    Returns:
        A tuple ``(precision, recall, accuracy, f1)`` of floats.

        Empty denominators are treated as vacuously perfect: precision is 1.0
        when nothing was retrieved, recall is 1.0 when nothing is relevant,
        and accuracy is 1.0 when the collection is empty. F1 is 0.0 when both
        precision and recall are 0.
    """
    tp = len(retrieved & relevant)
    fp = len(retrieved - relevant)
    fn = len(relevant - retrieved)
    # Everything that is neither retrieved nor relevant is a true negative.
    tn = total_docs - (tp + fp + fn)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
    # Guard the empty collection instead of raising ZeroDivisionError.
    accuracy = (tp + tn) / total_docs if total_docs > 0 else 1.0
    # Fall back to a float so F1 has the same type as the other metrics.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, accuracy, f1


In [None]:
# Run every preprocessing method: method ID -> display name and preprocessing function.
methods = {
    key: {'name': label, 'preprocess': func}
    for key, label, func in (
        ('a', 'Simple', simple_preprocess),
        ('b', 'Stemming', stemming_preprocess),
        ('c', 'Lemmatization', lemmatization_preprocess),
    )
}
# Filled in below: method ID -> list of per-query metric dicts.
results = dict()

In [None]:
# Build one inverted index per preprocessing method and score every query
# against it.
for method_id, method in methods.items():
    inverted_index = build_inverted_index(docs, method['preprocess'])
    method_results = []
    
    for q_idx, (query_text, operator) in enumerate(queries):
        # Preprocess the query with the same function used for the documents,
        # so query terms are normalized the same way as the index terms.
        query_terms = []
        for term in query_text.split():
            # Operator keywords are not search terms; skip them.
            if term not in ["AND", "OR", "NOT"]:
                processed = method['preprocess'](term)
                # Keep only the first resulting token; words that preprocess
                # to nothing are dropped from the query.
                if processed:
                    query_terms.append(processed[0])
        
        # Run the query, then score it against the ground truth stored under
        # this query's position.
        retrieved = process_query(query_terms, operator, inverted_index)
        precision, recall, accuracy, f1 = evaluate(retrieved, ground_truth[q_idx], len(docs))
        
        # Metrics are stored already rounded to two decimals.
        method_results.append({
            'precision': round(precision, 2),
            'recall': round(recall, 2),
            'accuracy': round(accuracy, 2),
            'f1': round(f1, 2)
        })
    
    # One list of per-query metric dicts per method, in query order.
    results[method_id] = method_results

# Display the results: one table per preprocessing method.
for method_id, method in methods.items():
    print(f"\n=== {method['name']} Method ===")
    print(f"{'Query':<10} {'Precision':<10} {'Recall':<10} {'Accuracy':<10} {'F1-score':<10}")
    # Iterate over the stored results instead of a hard-coded count, so the
    # table follows the query list if it grows or shrinks.
    for q_idx, res in enumerate(results[method_id]):
        # Pad the row label to the same 10-character column as the header so
        # the metric columns line up.
        print(f"{'Query ' + str(q_idx + 1) + ':':<10} {res['precision']:<10} {res['recall']:<10} {res['accuracy']:<10} {res['f1']:<10}")