### Praktikum 4
Pada percobaan kali ini, kita akan melihat perbedaan ketiga model yang telah kita bahas dan membandingkan hasilnya.

In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
# Try n_data = 100_000 first if RAM is limited.
n_data, dim = 1_000_000, 5

# Data is drawn before the query so the random stream is consumed in a fixed order.
X = np.random.rand(n_data, dim).astype("float32")

# A single query point, kept as a (1, dim) array
query = np.random.rand(1, dim).astype("float32")

# Number of nearest neighbours requested from every index
k = 10

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# time.perf_counter() is the clock intended for measuring short durations:
# it is monotonic and uses the highest available resolution, whereas
# time.time() follows the wall clock and can be too coarse for the
# sub-millisecond query timed below.
start = time.perf_counter()
for i, vector in enumerate(X):
    # The row number doubles as the item id, so returned ids index into X.
    ann_index.add_item(i, vector)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# include_distances=True makes the result a pair; element 0 holds the ids.
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

# time.perf_counter() is monotonic and high-resolution, which matters for
# the millisecond-scale intervals measured here; time.time() tracks the
# wall clock and may be coarser or jump when the system clock is adjusted.
start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
# search() takes a 2-D batch of queries and returns one row of results per query.
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# time.perf_counter() is monotonic and high-resolution, so it is the right
# clock for the sub-millisecond query below; time.time() follows the wall
# clock and can be too coarse or jump when the system clock is adjusted.
# The build time deliberately covers both index initialisation and insertion.
start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time parameter; set outside the timed regions so it is not measured.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")


=== Annoy ===
Build time: 19.248942852020264 detik
Query time: 0.00024247169494628906 detik
Neighbors: [741391, 678185, 933814, 673809, 434007] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.005582332611083984 detik
Query time: 0.005474567413330078 detik
Neighbors: [741391 678185 933814 673809 434007] ...

=== HNSW (hnswlib) ===
Build time: 137.81605529785156 detik
Query time: 0.0002124309539794922 detik
Neighbors: [741391 678185 933814 673809 434007] ...


Lakukan percobaan dengan metrik jarak (distance metric) yang berbeda. Catat hasilnya pada tabel yang Anda buat sendiri, seperti pada Praktikum 1.

In [None]:
import numpy as np
import time
import pandas as pd
import annoy
import faiss
import hnswlib

# --- Global experiment configuration ---
K_NEIGHBORS = 10  # Number of nearest neighbours requested per query

# Every scenario that will be run: the same number of points at a low
# dimensionality and at a higher one.
EXPERIMENT_SCENARIOS = [
    {'dimensions': n_dims, 'n_points': 1_000_000}
    for n_dims in (5, 50)
]

# --- Definisi Fungsi Eksperimen untuk Setiap Library ---

def run_annoy_experiment(X, query, metric, dimensions, *, n_trees=50):
    """Build an Annoy index over ``X`` and time a single neighbour query.

    Args:
        X: Sequence of vectors to index; the vector at position ``i`` is
            stored under item id ``i``.
        query: Indexable collection of query vectors; only ``query[0]`` is
            searched.
        metric: Metric name handed to ``annoy.AnnoyIndex`` (for example
            ``'euclidean'`` or ``'angular'``).
        dimensions: Vector length handed to ``annoy.AnnoyIndex``.
        n_trees: Number of trees to build; a trade-off between build time
            and accuracy. Defaults to 50.

    Returns:
        A ``(build_time, search_time)`` tuple, both in seconds. The build
        time covers adding every item plus building the trees.
    """
    ann_index = annoy.AnnoyIndex(dimensions, metric)

    # perf_counter() is monotonic and high-resolution; time.time() follows
    # the wall clock and can be too coarse for a sub-millisecond search.
    start_build = time.perf_counter()
    for item_id, vector in enumerate(X):
        ann_index.add_item(item_id, vector)
    ann_index.build(n_trees)
    build_time = time.perf_counter() - start_build

    start_search = time.perf_counter()
    # Only the elapsed time matters here, so the neighbours are discarded.
    ann_index.get_nns_by_vector(query[0], K_NEIGHBORS)
    search_time = time.perf_counter() - start_search
    return build_time, search_time

def run_faiss_flat_experiment(X, query, metric_flag, dimensions):
    """Fill a Faiss ``IndexFlat`` (exact search) with ``X`` and time one search.

    Args:
        X: Vectors passed to ``index.add``.
        query: Query batch passed to ``index.search``.
        metric_flag: Faiss metric constant handed to ``faiss.IndexFlat``
            (for example ``faiss.METRIC_L2`` or ``faiss.METRIC_INNER_PRODUCT``).
        dimensions: Vector length handed to ``faiss.IndexFlat``.

    Returns:
        A ``(build_time, search_time)`` tuple, both in seconds. The build
        time covers only the ``add`` call.
    """
    index = faiss.IndexFlat(dimensions, metric_flag)

    # perf_counter() is monotonic and high-resolution; time.time() follows
    # the wall clock and can be too coarse for millisecond-scale intervals.
    start_build = time.perf_counter()
    index.add(X)
    build_time = time.perf_counter() - start_build

    start_search = time.perf_counter()
    # Only the elapsed time matters here, so the search result is discarded.
    index.search(query, K_NEIGHBORS)
    search_time = time.perf_counter() - start_search
    return build_time, search_time

def run_hnsw_experiment(X, query, space, dimensions, *,
                        ef_construction=200, m=16, ef=50):
    """Build an hnswlib index over ``X`` and time a single k-NN query.

    Args:
        X: Vectors passed to ``add_items``; ``len(X)`` is used as the
            index capacity (``max_elements``).
        query: Query batch passed to ``knn_query``.
        space: Space name handed to ``hnswlib.Index`` (for example ``'l2'``
            or ``'ip'``).
        dimensions: Vector length handed to ``hnswlib.Index``.
        ef_construction: Forwarded to ``init_index``. Defaults to 200.
        m: Forwarded to ``init_index`` as ``M``. Defaults to 16.
        ef: Forwarded to ``set_ef`` before the query. Defaults to 50.

    Returns:
        A ``(build_time, search_time)`` tuple, both in seconds. The build
        time covers index initialisation plus insertion of all items.
    """
    hnsw_index = hnswlib.Index(space=space, dim=dimensions)

    # perf_counter() is monotonic and high-resolution; time.time() follows
    # the wall clock and can be too coarse for a sub-millisecond search.
    start_build = time.perf_counter()
    hnsw_index.init_index(max_elements=len(X), ef_construction=ef_construction, M=m)
    hnsw_index.add_items(X)
    build_time = time.perf_counter() - start_build

    # Set outside both timed regions so it is not part of either measurement.
    hnsw_index.set_ef(ef)
    start_search = time.perf_counter()
    # Only the elapsed time matters here, so the query result is discarded.
    hnsw_index.knn_query(query, k=K_NEIGHBORS)
    search_time = time.perf_counter() - start_search
    return build_time, search_time

# --- Main experiment loop ---
all_results = []
np.random.seed(42)

for scenario in EXPERIMENT_SCENARIOS:
    dims, n_pts = scenario['dimensions'], scenario['n_points']

    print(f"\n{'='*20} SCENARIO: {n_pts} points, {dims} dimensions {'='*20}")

    # 1. Generate the dataset (data first, then the single query point)
    print("  Generating dataset...")
    X_raw = np.random.rand(n_pts, dims).astype('float32')
    query_raw = np.random.rand(1, dims).astype('float32')

    # --- Run every metric on the same raw data ---
    for metric_name in ('euclidean', 'angular'):
        print(f"\n  --- Metric: {metric_name} ---")
        # Work on copies so the normalisation below never touches the raw arrays.
        X, query = X_raw.copy(), query_raw.copy()

        is_angular = metric_name == 'angular'
        if is_angular:
            # Unit-length vectors are paired with the inner-product settings below.
            faiss.normalize_L2(X)
            faiss.normalize_L2(query)
        faiss_metric_flag = faiss.METRIC_INNER_PRODUCT if is_angular else faiss.METRIC_L2
        hnsw_space = 'ip' if is_angular else 'l2'

        # One row per library: (name stored in the results table,
        # name shown in the progress log, runner, library-specific metric argument).
        runners = (
            ('Annoy', 'Annoy', run_annoy_experiment, metric_name),
            ('Faiss-Flat(Exact)', 'Faiss-Flat', run_faiss_flat_experiment, faiss_metric_flag),
            ('HNSWlib', 'HNSWlib', run_hnsw_experiment, hnsw_space),
        )
        for table_name, label, runner, metric_arg in runners:
            # Best effort: a failing library is reported and the run continues.
            try:
                b_t, s_t = runner(X, query, metric_arg, dims)
                all_results.append([table_name, metric_name, dims, n_pts, b_t, s_t * 1000])
                print(f"    {label:<15}| Build: {b_t:.4f}s | Search: {s_t*1000:.4f}ms")
            except Exception as e:
                print(f"    {label} failed: {e}")


# --- Show the final results as a table ---
result_columns = [
    'Algorithm', 'Metric', 'Dimensions', 'Data Points', 'Build Time (s)', 'Search Time (ms)'
]
df_results = pd.DataFrame(all_results, columns=result_columns)

banner = "=" * 60
print("\n\n" + banner)
print("              HASIL AKHIR PERBANDINGAN KOMPREHENSIF")
print(banner)
print(df_results.to_string())



  Generating dataset...

  --- Metric: euclidean ---
    Annoy          | Build: 91.8219s | Search: 0.5007ms
    Faiss-Flat     | Build: 0.0058s | Search: 5.5685ms
    HNSWlib        | Build: 143.0350s | Search: 0.1233ms

  --- Metric: angular ---
    Annoy          | Build: 118.6520s | Search: 0.3872ms
    Faiss-Flat     | Build: 0.0065s | Search: 6.1119ms
    HNSWlib        | Build: 130.4886s | Search: 0.1035ms

  Generating dataset...

  --- Metric: euclidean ---
    Annoy          | Build: 68.9897s | Search: 0.3228ms
    Faiss-Flat     | Build: 0.1714s | Search: 32.8851ms
    HNSWlib        | Build: 645.5942s | Search: 0.2596ms

  --- Metric: angular ---
    Annoy          | Build: 73.2444s | Search: 0.2797ms
    Faiss-Flat     | Build: 0.1463s | Search: 22.6104ms
    HNSWlib        | Build: 563.7616s | Search: 0.2861ms


              HASIL AKHIR PERBANDINGAN KOMPREHENSIF
            Algorithm     Metric  Dimensions  Data Points  Build Time (s)  Search Time (ms)
0               A