# JS06 - ANN (APPROXIMATE NEAREST NEIGHBORS)

## Praktikum 4

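Perbandingan tiga pustaka ANN (Annoy, FAISS, dan HNSW): waktu build, waktu query, dan hasil tetangga terdekat pada 1 juta data berdimensi 5.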

In [1]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

In [2]:
# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
# Fixed seed: without it every Restart & Run All produces different data,
# so neighbor ids printed by the benchmark cells could never be reproduced.
SEED = 42
rng = np.random.default_rng(SEED)

n_data = 1_000_000   # try 100_000 first if RAM is limited
dim = 5
# Uniform samples in [0, 1), generated directly as float32 so no
# intermediate float64 array has to be allocated and converted.
X = rng.random((n_data, dim), dtype=np.float32)

# Single query point with shape (1, dim), drawn from the same generator.
query = rng.random((1, dim), dtype=np.float32)
k = 10  # number of nearest neighbors requested from every index


In [None]:
# Benchmark of three nearest-neighbor libraries with Euclidean / L2 distance.
# Each section builds an index over X, runs one k-NN query, and prints the
# elapsed build and query times plus the first five neighbor ids.
#
# Timing uses time.perf_counter(): it is monotonic and uses the highest
# resolution clock available, which matters for the sub-millisecond query
# measurements below. time.time() is a wall clock and can jump or be coarse.

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

start = time.perf_counter()
# Item ids are the row positions in X, so results are comparable across libraries.
for i, vec in enumerate(X):
    ann_index.add_item(i, vec)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is a pair; element 0 holds the ids.
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.perf_counter()
# "Build" for the flat index is just adding the vectors.
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

start = time.perf_counter()
# Index allocation is counted as part of the build time.
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is configured outside the timed region.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")

=== Annoy ===
Build time: 11.172114133834839 detik
Query time: 0.003032207489013672 detik
Neighbors: [225335, 579224, 595788, 839797, 856343] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.10020852088928223 detik
Query time: 0.04946756362915039 detik
Neighbors: [225335 579224 595788 839797 856343] ...

=== HNSW (hnswlib) ===
Build time: 64.21569991111755 detik
Query time: 0.0035521984100341797 detik
Neighbors: [225335 579224 595788 839797 856343] ...


In [4]:
# Benchmark of the same three libraries with angular / cosine similarity.
# Each section builds an index, runs one k-NN query, and prints the elapsed
# build and query times plus the first five neighbor ids.
#
# Timing uses time.perf_counter(): it is monotonic and uses the highest
# resolution clock available, which matters for the sub-millisecond query
# measurements below. time.time() is a wall clock and can jump or be coarse.

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'angular')

start = time.perf_counter()
# Item ids are the row positions in X, so results are comparable across libraries.
for i, vec in enumerate(X):
    ann_index.add_item(i, vec)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is a pair; element 0 holds the ids.
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
# The banner names the index that is actually constructed below (inner
# product); it previously said IndexFlatL2, which mislabeled the results.
print("\n=== FAISS (IndexFlatIP) ===")
# Scale every vector to unit L2 norm so that the inner product computed by
# the index equals cosine similarity. Normalization is not part of the timing.
X_norm = X / np.linalg.norm(X, axis=1, keepdims=True)
query_norm = query / np.linalg.norm(query, axis=1, keepdims=True)

faiss_index = faiss.IndexFlatIP(dim)

start = time.perf_counter()
# "Build" for the flat index is just adding the vectors.
faiss_index.add(X_norm)
build_time = time.perf_counter() - start

start = time.perf_counter()
# For an inner-product index `distances` holds similarity scores.
distances, indices = faiss_index.search(query_norm, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
# Uses the 'cosine' space; the raw (unnormalized) X and query are passed in.
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='cosine', dim=dim)

start = time.perf_counter()
# Index allocation is counted as part of the build time.
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is configured outside the timed region.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")

=== Annoy ===
Build time: 12.624775171279907 detik
Query time: 0.0003116130828857422 detik
Neighbors: [25515, 80902, 930082, 211903, 422661] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.014153242111206055 detik
Query time: 0.009865283966064453 detik
Neighbors: [ 25515  80902 930082 211903 422661] ...

=== HNSW (hnswlib) ===
Build time: 45.22749352455139 detik
Query time: 0.009543657302856445 detik
Neighbors: [ 25515  80902 930082 211903 422661] ...


In [6]:
import pandas as pd


def _timing(build: str, query: str) -> str:
    """Render a build/query timing pair the way the summary table shows it."""
    return f"Build: {build}, Query: {query}"


# One entry per benchmark run: (model, metric, parameter, build s, query s).
# Timings are kept as strings so the digits copied from the runs are shown verbatim.
_runs = (
    ("Annoy", "euclidean", "10 trees", "11.172114", "0.003032"),
    ("Annoy", "angular", "10 trees", "12.624775", "0.000311"),
    ("FAISS", "L2", "-", "0.1002085", "0.049467"),
    ("FAISS", "Inner Product", "-", "0.0141532", "0.009865"),
    ("HNSW", "l2", "M=16, ef=50", "64.21569", "0.003552"),
    ("HNSW", "cosine", "M=16, ef=50", "45.22749", "0.009543"),
)

# Every run used the same dataset size, so that column is filled in here.
data = [
    [model, metric, parameter, "1,000,000", _timing(build, query)]
    for model, metric, parameter, build, query in _runs
]

df = pd.DataFrame(
    data,
    columns=["Model", "Metric", "Parameter", "Jumlah Data", "Waktu (Build, Query)"],
)
# Plain-text rendering without the numeric row index.
summary_text = df.to_string(index=False)
print(summary_text)


Model        Metric   Parameter Jumlah Data              Waktu (Build, Query)
Annoy     euclidean    10 trees   1,000,000 Build: 11.172114, Query: 0.003032
Annoy       angular    10 trees   1,000,000 Build: 12.624775, Query: 0.000311
FAISS            L2           -   1,000,000 Build: 0.1002085, Query: 0.049467
FAISS Inner Product           -   1,000,000 Build: 0.0141532, Query: 0.009865
 HNSW            l2 M=16, ef=50   1,000,000  Build: 64.21569, Query: 0.003552
 HNSW        cosine M=16, ef=50   1,000,000  Build: 45.22749, Query: 0.009543
