# Quran Query-Tafsir Ranking Tutorial

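This notebook demonstrates how to:
1. Load tafsir data
2. Extract TF-IDF features (SBERT embeddings are optional and disabled in this tutorial)
3. Train ranking models (Logistic Regression, SVM, and optionally XGBoost)
4. Evaluate using MAP, nDCG, MRR, and Recall@K metrics
5. Compare the models, save them, and rank tafsir passages for a new query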

In [None]:
# Import required libraries
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from src.data_loader import TafsirDataLoader
from src.features import FeatureExtractor
from src.models import RankingModel, train_multiple_models
from src.evaluation import RankingMetrics, print_metrics

## 1. Load Tafsir Data

We'll create sample data for demonstration. In practice, you would load your own tafsir dataset (for example with `loader.load_csv()`).

In [None]:
# Build the loader object; the split cell below also goes through it
loader = TafsirDataLoader()

# Generate a small synthetic dataset (10 queries x 5 documents each).
# For a real dataset, call loader.load_csv() here instead.
data = loader.create_sample_data(n_queries=10, n_docs_per_query=5)

# Summarise the frame, then leave head() as the last expression so the
# notebook renders it as a table.
dataset_shape = data.shape
column_names = data.columns.tolist()
print(f"Dataset shape: {dataset_shape}")
print(f"\nColumns: {column_names}")
data.head()

In [None]:
# Hold out 20% of the data for evaluation; the remainder is used for training
train_data, test_data = loader.train_test_split(test_size=0.2)

# Report the size of each split
n_train = len(train_data)
n_test = len(test_data)
print(f"Training samples: {n_train}")
print(f"Test samples: {n_test}")

## 2. Feature Extraction

Extract TF-IDF features (and optionally SBERT embeddings) from query-document pairs.

In [None]:
# Initialize feature extractor (TF-IDF only for demonstration)
feature_extractor = FeatureExtractor(use_tfidf=True, use_sbert=False)

# Fit TF-IDF on the training split only. Fitting on the full `data` frame
# would let text from the held-out split influence the fitted features that
# are later used to score that same split, which makes the reported metrics
# optimistic. Queries and passages are fitted together as one list of texts.
train_texts = train_data['query'].tolist() + train_data['tafsir_text'].tolist()
feature_extractor.fit_tfidf(train_texts)

print("Feature extractor initialized and fitted.")

In [None]:
# Training targets: the relevance column as an array
train_labels = train_data['relevance'].values

# Parallel lists of query text and passage text, one entry per training row
train_docs = train_data['tafsir_text'].tolist()
train_queries = train_data['query'].tolist()

# Turn each (query, passage) pair into a feature row
X_train = feature_extractor.extract_features(train_queries, train_docs)

train_feature_shape = X_train.shape
print(f"Training feature shape: {train_feature_shape}")

In [None]:
# Held-out targets: the relevance column as an array
test_labels = test_data['relevance'].values

# Parallel lists of query text and passage text, one entry per test row
test_docs = test_data['tafsir_text'].tolist()
test_queries = test_data['query'].tolist()

# Reuse the already-fitted extractor; nothing is re-fitted on the test split here
X_test = feature_extractor.extract_features(test_queries, test_docs)

test_feature_shape = X_test.shape
print(f"Test feature shape: {test_feature_shape}")

## 3. Train Ranking Models

Train different models: Logistic Regression, SVM, and XGBoost (XGBoost is skipped if it is not installed).

In [None]:
def _fit_ranker(model_type, done_message):
    """Create a RankingModel of `model_type`, fit it on the training features, print `done_message`, and return it."""
    ranker = RankingModel(model_type=model_type)
    ranker.fit(X_train, train_labels)
    print(done_message)
    return ranker


# Models are built and fitted one after the other, in this order
lr_model = _fit_ranker('logistic_regression', "Logistic Regression trained.")
svm_model = _fit_ranker('svm', "SVM trained.")

In [None]:
# XGBoost is optional: an ImportError while building or fitting the model
# means the cell reports it and continues without the model.
try:
    xgb_model = RankingModel(model_type='xgboost')
    xgb_model.fit(X_train, train_labels)
except ImportError:
    print("XGBoost not installed. Skipping.")
    # None is the marker that later cells check before using the model
    xgb_model = None
else:
    print("XGBoost trained.")

## 4. Evaluate Models

Compute ranking metrics: MAP, nDCG, MRR, and Recall@K.

In [None]:
# One metrics calculator is shared by every evaluation cell below
metrics_calc = RankingMetrics()

# Banner, then the Logistic Regression scores at cutoffs k = 1, 3, 5
print("=" * 50, "LOGISTIC REGRESSION", sep="\n")
lr_cutoffs = [1, 3, 5]
lr_metrics = metrics_calc.evaluate_model(
    lr_model,
    feature_extractor,
    test_data,
    k_values=lr_cutoffs,
)
print_metrics(lr_metrics)

In [None]:
# Banner, then the SVM scores on the held-out split at cutoffs k = 1, 3, 5
svm_banner = "=" * 50
print(svm_banner)
print("SVM")
svm_eval_args = (svm_model, feature_extractor, test_data)
svm_metrics = metrics_calc.evaluate_model(*svm_eval_args, k_values=[1, 3, 5])
print_metrics(svm_metrics)

In [None]:
# Evaluate XGBoost (if available)
# `xgb_model` is set to None when XGBoost could not be imported. Test for that
# sentinel explicitly rather than relying on the truthiness of the model object.
if xgb_model is not None:
    print("=" * 50)
    print("XGBOOST")
    # Same held-out split and cutoffs as the other models, so the rows in the
    # comparison table are directly comparable.
    xgb_metrics = metrics_calc.evaluate_model(
        xgb_model, feature_extractor, test_data, k_values=[1, 3, 5]
    )
    print_metrics(xgb_metrics)

## 5. Compare Models

In [None]:
# Collect (display name, metrics dict) pairs for every model that was evaluated.
evaluated_models = [
    ('Logistic Regression', lr_metrics),
    ('SVM', svm_metrics),
]
# XGBoost is optional: `xgb_model` is None when it was skipped, in which case
# `xgb_metrics` was never computed. Check the sentinel explicitly.
if xgb_model is not None:
    evaluated_models.append(('XGBoost', xgb_metrics))

# Build one column per metric. MAP and MRR are required keys; the @5 metrics
# fall back to NaN so a missing key shows up as "no value" in the table
# instead of looking like a genuine score of 0.
comparison_data = {
    'Model': [name for name, _ in evaluated_models],
    'MAP': [metrics['MAP'] for _, metrics in evaluated_models],
    'MRR': [metrics['MRR'] for _, metrics in evaluated_models],
    'nDCG@5': [metrics.get('nDCG@5', np.nan) for _, metrics in evaluated_models],
    'Recall@5': [metrics.get('Recall@5', np.nan) for _, metrics in evaluated_models],
}

# Last expression of the cell, so the notebook renders the table
comparison_df = pd.DataFrame(comparison_data)
comparison_df

## 6. Save Models

In [None]:
# Saving is left disabled in this tutorial; uncomment the two calls below to
# write the model and the feature extractor under ../data
# lr_model.save('../data/lr_model.joblib')
# feature_extractor.save('../data/feature_extractor.joblib')
save_hint = "Models can be saved using model.save() and feature_extractor.save()"
print(save_hint)

## 7. Example: Search and Rank

Demonstrate how to search for relevant tafsir passages given a query.

In [None]:
# Free-text query to rank passages against
query = "Apa makna taqwa dalam Islam?"

# Candidate pool: each distinct tafsir passage once
unique_tafsirs = data.drop_duplicates(subset=['tafsir_text'])
documents = unique_tafsirs['tafsir_text'].tolist()

# Pair the same query with every candidate passage
queries = [query] * len(documents)

# Score every (query, passage) pair with the trained Logistic Regression model
X = feature_extractor.extract_features(queries, documents)
scores = lr_model.predict_scores(X)

# Highest score first; sorted() is stable, so ties keep their original order
results = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

# Show the five best-scoring passages
print(f"Query: {query}\n")
print("Top 5 Results:")
print("-" * 50)
top_results = results[:5]
for rank, (passage, passage_score) in enumerate(top_results, start=1):
    print(f"{rank}. (Score: {passage_score:.4f})")
    print(f"   {passage}\n")