In [1]:
# IMPORT LIBRARY
import time
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import average_precision_score, roc_auc_score
import pickle
from tqdm.notebook import tqdm  

In [2]:
# KONFIGURASI PATH & PARAMETER
DATA_DIR = Path("/kaggle/input/fp-quran-relevance-features")

features_path = DATA_DIR / "query_tafsir_features (1).csv"

# folder output model di /kaggle/working
OUT_DIR = Path("/kaggle/working/models")
OUT_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
TEST_SIZE = 0.2


In [3]:
def calculate_ranking_metrics(y_true: np.ndarray, y_scores: np.ndarray) -> Dict[str, float]:
    """
    Menghitung MAP (Global Average Precision) dan AUC-ROC
    dipakai sebagai proxy metrik ranking ketika tidak ada Query ID.
    """
    map_score = average_precision_score(y_true, y_scores)
    auc_roc = roc_auc_score(y_true, y_scores)
    return {
        "MAP (Global AP)": map_score,
        "AUC-ROC": auc_roc,
    }

In [4]:
# LOAD DATA FITUR
try:
    df_features = pd.read_csv(features_path)
    print(f" Dataset fitur dimuat dari: {features_path}")
    print(f" Total pasangan query-tafsir: {len(df_features):,}")
except FileNotFoundError:
    print(f" Error: File fitur tidak ditemukan di {features_path}")
    raise SystemExit

# Kolom fitur & label
FEATURE_COLS = [
    "feat_tfidf_similarity",
    "feat_sbert_similarity",
    "feat_keyword_overlap",
]
TARGET_COL = "label"

X = df_features[FEATURE_COLS].values
y = df_features[TARGET_COL].values

print("\nDistribusi label:")
print(df_features[TARGET_COL].value_counts(normalize=True).rename("proporsi"))


 Dataset fitur dimuat dari: /kaggle/input/fp-quran-relevance-features/query_tafsir_features (1).csv
 Total pasangan query-tafsir: 184,655

Distribusi label:
label
0    0.826379
1    0.173621
Name: proporsi, dtype: float64


In [5]:
# SPLIT TRAIN/TEST + SCALING

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y,
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n Data dibagi Train/Test (80/20) dan dinormalisasi untuk model linear.")
print(f"  X_train: {X_train.shape}, X_test: {X_test.shape}")

# untuk XGBoost (imbalance-aware)
value_counts = pd.Series(y).value_counts()
scale_pos_weight = value_counts[0] / value_counts[1]
print(f"  scale_pos_weight (XGBoost) ≈ {scale_pos_weight:.2f}")



 Data dibagi Train/Test (80/20) dan dinormalisasi untuk model linear.
  X_train: (147724, 3), X_test: (36931, 3)
  scale_pos_weight (XGBoost) ≈ 4.76


In [6]:

models = {
    # SINGLE (linear, pakai data scaled)
    'LogisticRegression': LogisticRegression(
        random_state=RANDOM_SEED,
        solver='liblinear',
        class_weight='balanced'
    ),
    'SVM': SVC(
        kernel='linear',
        probability=True,
        random_state=RANDOM_SEED,
        class_weight='balanced'
        # kalau terlalu lama, bisa tambahkan: max_iter=2000
    ),

    # ENSEMBLE (tree-based, pakai data asli)
    'RandomForest': RandomForestClassifier(
        random_state=RANDOM_SEED,
        n_estimators=100,
        class_weight='balanced',
        n_jobs=-1
    ),

    # XGBoost pakai GPU
    'XGBoost': xgb.XGBClassifier(
        random_state=RANDOM_SEED,
        use_label_encoder=False,
        eval_metric='logloss',
        tree_method='gpu_hist',      # GPU
        predictor='gpu_predictor',
        scale_pos_weight=scale_pos_weight,
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8
    ),

    # LightGBM pakai GPU
    'LightGBM': lgb.LGBMClassifier(
        random_state=RANDOM_SEED,
        n_estimators=400,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        class_weight='balanced',
        device_type='gpu'   # pakai GPU di Kaggle
    )
}

results = {}
trained_models = {}

In [None]:
# TRAIN & EVALUASI PER MODEL

for name, model in models.items():
    # pilih data scaled / unscaled
    if name in ['LogisticRegression', 'SVM']:
        X_train_data = X_train_scaled
        X_test_data = X_test_scaled
    else:
        X_train_data = X_train
        X_test_data = X_test

    print(f"\n Melatih Model: {name} ...")
    start_time = time.time()

    model.fit(X_train_data, y_train)
    y_scores = model.predict_proba(X_test_data)[:, 1]

    metrics = calculate_ranking_metrics(y_test, y_scores)
    end_time = time.time()
    metrics['Training_Time (s)'] = end_time - start_time
    results[name] = metrics
    trained_models[name] = model

    print(f"{name} selesai. "
          f"MAP: {metrics['MAP (Global AP)']:.4f}, "
          f"AUC: {metrics['AUC-ROC']:.4f}, "
          f"Waktu: {metrics['Training_Time (s)']:.2f} detik")



 Melatih Model: LogisticRegression ...
LogisticRegression selesai. MAP: 0.9001, AUC: 0.9531, Waktu: 0.20 detik

 Melatih Model: SVM ...
SVM selesai. MAP: 0.8998, AUC: 0.9528, Waktu: 988.87 detik

 Melatih Model: RandomForest ...
RandomForest selesai. MAP: 0.8858, AUC: 0.9496, Waktu: 8.90 detik

 Melatih Model: XGBoost ...



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




XGBoost selesai. MAP: 0.9035, AUC: 0.9569, Waktu: 1.65 detik

 Melatih Model: LightGBM ...
[LightGBM] [Info] Number of positive: 25648, number of negative: 122076
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 147724, number of used features: 3
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 3 dense feature groups (0.56 MB) transferred to GPU in 0.001091 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LightGBM selesai. MAP: 0.9016, AUC: 0.9558, Waktu: 8.00 detik


In [12]:
feature_names = df_features.drop(columns=['label']).columns.tolist()

# RINGKASAN HASIL
df_results = pd.DataFrame(results).T
df_results_sorted = df_results.sort_values(by="MAP (Global AP)", ascending=False)

print("RINGKASAN KINERJA MODEL (Diurutkan berdasarkan MAP)")
print(df_results_sorted.to_string(float_format="%.4f"))

results_path = OUT_DIR / "model_comparison_results.csv"
df_results_sorted.to_csv(results_path, index=True)
print(f"\nTabel hasil disimpan ke: {results_path}")

def get_model_meta(name: str):
    if name in ["LogisticRegression", "SVM"]:
        return {"type": "single", "use_scaled": True}
    else:
        return {"type": "ensemble", "use_scaled": False}

best_model_name = df_results_sorted.index[0]
best_model = trained_models[best_model_name]
best_meta = get_model_meta(best_model_name)

print(f"\nBest overall model: {best_model_name} (type={best_meta['type']})")

def save_model(obj: dict, path: Path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    print(f"Disimpan: {path.name}")

# simpan semua model
for name, clf in trained_models.items():
    meta = get_model_meta(name)

    obj = {
        "model": clf,
        "scaler": scaler,
        "feature_names": feature_names,
        "model_name": name,
        "type": meta["type"],
        "use_scaled": meta["use_scaled"],
    }

    save_model(obj, OUT_DIR / f"model_{name}.pkl")

# simpan best overall
save_model(
    {
        "model": best_model,
        "scaler": scaler,
        "feature_names": feature_names,
        "best_model_name": best_model_name,
        "type": best_meta["type"],
        "use_scaled": best_meta["use_scaled"],
    },
    OUT_DIR / "best_relevance_model.pkl",
)


RINGKASAN KINERJA MODEL (Diurutkan berdasarkan MAP)
                    MAP (Global AP)  AUC-ROC  Training_Time (s)
XGBoost                      0.9035   0.9569             1.6549
LightGBM                     0.9016   0.9558             8.0048
LogisticRegression           0.9001   0.9531             0.2034
SVM                          0.8998   0.9528           988.8723
RandomForest                 0.8858   0.9496             8.9020

Tabel hasil disimpan ke: /kaggle/working/models/model_comparison_results.csv

Best overall model: XGBoost (type=ensemble)
Disimpan: model_LogisticRegression.pkl
Disimpan: model_SVM.pkl
Disimpan: model_RandomForest.pkl
Disimpan: model_XGBoost.pkl
Disimpan: model_LightGBM.pkl
Disimpan: best_relevance_model.pkl
