<a href="https://colab.research.google.com/github/MiracleCakra/Machine-Learning_Ganjil_2025/blob/main/Week07_JS07/Praktikum04_JS07_Perbedaan_3_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Praktikum 4

Pada percobaan kali ini, kita akan membandingkan ketiga model yang telah dibahas (Annoy, FAISS, dan HNSW) dan melihat perbedaan hasilnya.

In [4]:
!pip install hnswlib -q
!pip install annoy
!pip install faiss-cpu -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
n_data = 1_000_000   # try 100_000 first if RAM is limited
dim = 5
X = np.random.random((n_data, dim)).astype(np.float32)

# A single query point; each library is asked for its k nearest neighbours.
query = np.random.random((1, dim)).astype(np.float32)
k = 10

# NOTE: all intervals below are measured with time.perf_counter(), a
# monotonic high-resolution clock. time.time() is a wall clock that can be
# adjusted while the cell runs and is too coarse for the sub-millisecond
# query timings this benchmark produces.

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# Build time covers inserting every vector plus building the forest.
start = time.perf_counter()
for i in range(n_data):
    ann_index.add_item(i, X[i])
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

# Single-shot query measurement (no averaging).
start = time.perf_counter()
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
# With include_distances=True the result is indexed as (ids, distances);
# element 0 holds the neighbour ids.
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

# For the flat index the only "build" step timed here is add().
start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# Build time covers index initialisation plus inserting all vectors.
start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# The query-time ef is set outside both timed sections.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")


=== Annoy ===
Build time: 31.385603666305542 detik
Query time: 0.000194549560546875 detik
Neighbors: [424264, 219969, 781357, 928435, 758054] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.01598191261291504 detik
Query time: 0.007107257843017578 detik
Neighbors: [424264 219969 781357 928435 758054] ...

=== HNSW (hnswlib) ===
Build time: 162.89802765846252 detik
Query time: 0.00018978118896484375 detik
Neighbors: [424264 219969 781357 928435 758054] ...


## Lakukan percobaan dengan metrik jarak (distance metric) yang berbeda. Catat hasilnya pada tabel yang Anda buat sendiri, seperti pada Praktikum 1.

In [9]:
import numpy as np
import time
import pandas as pd
from annoy import AnnoyIndex
import faiss
import hnswlib

# Helper functions
def avg_time(fn, repeats=3):
    """Return the mean duration, in seconds, of calling ``fn``.

    Args:
        fn: Zero-argument callable to time; its return value is discarded.
        repeats: Number of timed calls to average over (must be >= 1).

    Returns:
        The arithmetic mean of the measured durations as a Python float.

    Raises:
        ValueError: If ``repeats`` is less than 1 (there would be nothing
            to average, and the mean would silently become NaN).
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    durations = []
    for _ in range(repeats):
        # perf_counter() is monotonic and high-resolution, so very short
        # calls are measured reliably and never come out negative.
        started = time.perf_counter()
        fn()
        durations.append(time.perf_counter() - started)
    return float(np.mean(durations))

def recall_at_k(true_idx, approx_idx, k):
    """Recall@k: share of ground-truth neighbours also returned by the approximate method.

    Both id collections are treated as sets, so duplicates count once; the
    size of their overlap is divided by ``k``.
    """
    expected = set(true_idx)
    found = set(approx_idx)
    hits = expected & found
    return len(hits) / float(k)

def safe_normalize_l2(x: np.ndarray, axis=1, eps=1e-12):
    """Scale ``x`` towards unit L2 norm along ``axis``.

    ``eps`` is added to every norm before dividing, so an all-zero vector
    yields zeros instead of a division by zero.
    """
    norms = np.linalg.norm(x, axis=axis, keepdims=True)
    guarded = norms + eps
    return x / guarded

# Core experiment
def _result_row(base, library, metric, params,
                build_time=None, query_time=None, recall=None, speedup=None):
    """Assemble one results-table row; measurements left as None stay None."""
    row = dict(base)
    row.update({
        "Library": library,
        "Metric": metric,
        "Params": params,
        "Build Time (s)": None if build_time is None else round(build_time, 6),
        "Query Time (s)": None if query_time is None else round(query_time, 6),
        f"Recall@{base['k']}": None if recall is None else round(recall, 4),
        "Speedup vs Exact": None if speedup is None else round(speedup, 2),
    })
    return row


def _bench_faiss_flat(metric_tag, data, query, dim, k, repeats):
    """Time add() and search() on an exact FAISS flat index ("L2" or inner product)."""
    index = faiss.IndexFlatL2(dim) if metric_tag == "L2" else faiss.IndexFlatIP(dim)
    t0 = time.perf_counter()
    index.add(data)  # for a flat index, build time is just the add() call
    build_time = time.perf_counter() - t0

    # One untimed search to capture the ground-truth ids, then the timed runs.
    _, ids = index.search(query, k)
    query_time = avg_time(lambda: index.search(query, k), repeats=repeats)
    return build_time, query_time, ids[0]


def _bench_annoy(annoy_metric, data, query, dim, k, repeats, n_trees):
    """Time item insertion + forest build, and queries, on an Annoy index."""
    index = AnnoyIndex(dim, annoy_metric)
    t0 = time.perf_counter()
    for item_id, vector in enumerate(data):
        index.add_item(item_id, vector)
    index.build(n_trees)
    build_time = time.perf_counter() - t0

    ids = index.get_nns_by_vector(query[0], k)
    query_time = avg_time(lambda: index.get_nns_by_vector(query[0], k), repeats=repeats)
    return build_time, query_time, ids


def _bench_hnsw(space, data, query, dim, k, repeats, M, ef_construction, ef):
    """Time init + insertion, and queries, on an hnswlib index."""
    index = hnswlib.Index(space=space, dim=dim)
    t0 = time.perf_counter()
    index.init_index(max_elements=len(data), ef_construction=ef_construction, M=M)
    index.add_items(data)
    build_time = time.perf_counter() - t0

    index.set_ef(ef)  # query-time ef is set outside the timed sections
    ids, _ = index.knn_query(query, k=k)
    query_time = avg_time(lambda: index.knn_query(query, k=k), repeats=repeats)
    return build_time, query_time, ids[0]


def _run_case(results, base, tag, library, metric, params, bench, baseline=None):
    """Run one benchmark, append its row to ``results``, return (ids, query_time).

    ``bench`` is a zero-argument callable returning
    ``(build_time, query_time, neighbour_ids)``. ``baseline`` is None for the
    exact reference run; otherwise it is the ``(ids, query_time)`` pair that
    the exact run returned, used for recall and speedup. On any failure the
    error is printed, a row of None measurements is appended, and
    ``(None, None)`` is returned.
    """
    try:
        build_time, query_time, ids = bench()
        if baseline is None:
            # The exact index is its own reference.
            recall, speedup = 1.0, 1.0
        else:
            truth_ids, exact_time = baseline
            recall = (recall_at_k(truth_ids, ids, base["k"])
                      if truth_ids is not None else None)
            speedup = (exact_time / query_time) if (exact_time and query_time) else None
        results.append(_result_row(base, library, metric, params,
                                   build_time, query_time, recall, speedup))
        return ids, query_time
    except Exception as e:
        # Deliberate best-effort: one failing library must not abort the
        # remaining benchmarks, so record an empty row and carry on.
        print(f"{tag} error:", e)
        results.append(_result_row(base, library, metric, params))
        return None, None


def run_all_experiments(
    n_data: int,
    dim: int,
    k: int,
    repeats_query: int = 3,
    annoy_trees: int = 10,
    hnsw_M: int = 16,
    hnsw_ef_construction: int = 200,
    hnsw_ef: int = 50,
    save_csv: str = None,
    save_md: str = None
):
    """
    Benchmark Annoy, FAISS (flat/exact) and HNSWlib on random data.

    Two metrics are covered: L2 (Euclidean) and inner product on
    L2-normalised vectors (i.e. cosine). For each metric the FAISS flat
    index supplies the ground truth against which recall and query speedup
    of the other two libraries are computed.

    Args:
        n_data: Number of random vectors to index.
        dim: Dimensionality of the vectors.
        k: Number of neighbours requested per query.
        repeats_query: Number of timed query repetitions to average.
        annoy_trees: Number of trees passed to Annoy's build().
        hnsw_M: ``M`` passed to hnswlib's init_index().
        hnsw_ef_construction: ``ef_construction`` passed to init_index().
        hnsw_ef: Value passed to hnswlib's set_ef() before querying.
        save_csv: Optional path; when given, the table is written as CSV.
        save_md: Optional path; when given, the table is written as Markdown.

    Returns:
        A pandas DataFrame with one row per (metric, library), ordered by
        metric (L2, then IP) and library (FAISS, Annoy, HNSWlib).

    Side effects:
        Seeds NumPy's global RNG with 42, prints progress and the Markdown
        table, and writes the optional output files.
    """
    print(f"Menyiapkan dataset: N={n_data}, dim={dim}, k={k}")
    np.random.seed(42)
    X = np.random.random((n_data, dim)).astype(np.float32)
    query = np.random.random((1, dim)).astype(np.float32)

    results = []
    # Columns shared by every row; kept first so the column order is stable.
    base = {"Dataset N": n_data, "Dim": dim, "k": k, "Repeats": repeats_query}
    annoy_params = f"trees={annoy_trees}"
    hnsw_params = f"M={hnsw_M}, efc={hnsw_ef_construction}, ef={hnsw_ef}"

    def run_suite(tag, metric, data, q, annoy_metric, hnsw_space):
        """Run FAISS (ground truth), then Annoy and HNSW, for one metric."""
        exact = _run_case(
            results, base, f"FAISS {tag}", "FAISS (Flat)", metric, "-",
            lambda: _bench_faiss_flat(tag, data, q, dim, k, repeats_query),
        )
        _run_case(
            results, base, f"Annoy {tag}", "Annoy", metric, annoy_params,
            lambda: _bench_annoy(annoy_metric, data, q, dim, k,
                                 repeats_query, annoy_trees),
            baseline=exact,
        )
        _run_case(
            results, base, f"HNSW {tag}", "HNSWlib", metric, hnsw_params,
            lambda: _bench_hnsw(hnsw_space, data, q, dim, k, repeats_query,
                                hnsw_M, hnsw_ef_construction, hnsw_ef),
            baseline=exact,
        )

    # Scenario A: L2 (Euclidean)
    print("\n--- Eksperimen: Metrik L2 (Euclidean) ---")
    run_suite("L2", "L2", X, query, "euclidean", "l2")

    # Scenario B: inner product on normalised copies (cosine similarity).
    print("\n--- Eksperimen: Metrik IP (Cosine via normalisasi) ---")
    Xn = X.copy()
    qn = query.copy()
    faiss.normalize_L2(Xn)  # normalises in place, hence the copies above
    faiss.normalize_L2(qn)
    run_suite("IP", "IP (Cosine)", Xn, qn, "angular", "ip")  # angular ~ cosine

    # Order rows by metric, then by library, using temporary sort keys.
    df = pd.DataFrame(results)
    lib_order = {"FAISS (Flat)": 0, "Annoy": 1, "HNSWlib": 2}
    metric_order = {"L2": 0, "IP (Cosine)": 1}
    df["_lib_order"] = df["Library"].map(lib_order)
    df["_met_order"] = df["Metric"].map(metric_order)
    df = (
        df.sort_values(by=["_met_order", "_lib_order"])
        .drop(columns=["_lib_order", "_met_order"])
        .reset_index(drop=True)
    )

    print("\n--- Hasil Eksperimen (Markdown) ---")
    print(df.round(6).to_markdown(index=False))

    if save_csv:
        df.to_csv(save_csv, index=False)
        print(f"\nDisimpan CSV: {save_csv}")
    if save_md:
        with open(save_md, "w", encoding="utf-8") as f:
            f.write(df.round(6).to_markdown(index=False))
        print(f"Disimpan Markdown: {save_md}")

    return df

if __name__ == "__main__":
    # Full-size run: one million 5-D points, 100 neighbours per query,
    # each query timing averaged over 3 repetitions.
    df = run_all_experiments(
        n_data=1_000_000,
        dim=5,
        k=100,
        repeats_query=3,
    )

Menyiapkan dataset: N=1000000, dim=5, k=100

--- Eksperimen: Metrik L2 (Euclidean) ---

--- Eksperimen: Metrik IP (Cosine via normalisasi) ---

--- Hasil Eksperimen (Markdown) ---
|   Dataset N |   Dim |   k |   Repeats | Library      | Metric      | Params               |   Build Time (s) |   Query Time (s) |   Recall@100 |   Speedup vs Exact |
|------------:|------:|----:|----------:|:-------------|:------------|:---------------------|-----------------:|-----------------:|-------------:|-------------------:|
|     1000000 |     5 | 100 |         3 | FAISS (Flat) | L2          | -                    |         0.015989 |         0.006236 |         1    |               1    |
|     1000000 |     5 | 100 |         3 | Annoy        | L2          | trees=10             |        23.4271   |         0.000268 |         0.99 |              23.27 |
|     1000000 |     5 | 100 |         3 | HNSWlib      | L2          | M=16, efc=200, ef=50 |       168.573    |         4.8e-05  |         1    |  