### Praktikum 6
Lakukan percobaan penggunaan ANNOY, FAISS, dan HNSWLIB pada dataset sekunder berukuran besar (Micro Spotify) yang tersedia pada tautan berikut: https://www.kaggle.com/datasets/bwandowando/spotify-songs-with-attributes-and-lyrics/data. Unduh datanya dan muat file CSV-nya (pilih dataset pertama dari dua dataset yang tersedia). Pilih hanya fitur numerik, lalu lakukan normalisasi menggunakan StandardScaler. Lakukan pencarian track terdekat dengan ketiga pustaka tersebut dan bandingkan hasilnya.

In [None]:
# Install dependencies if running in a new environment like Google Colab
# !pip install kagglehub hnswlib faiss-cpu annoy

import pandas as pd
import numpy as np
import time
import kagglehub
from kagglehub import KaggleDatasetAdapter

# ANN Libraries
import faiss
from annoy import AnnoyIndex
import hnswlib

# Scikit-learn for baseline and preprocessing
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# --- 1. Load Dataset Directly from Kaggle Hub ---
# Downloads the Kaggle dataset (or reuses the local/Colab cache), locates the
# CSV on disk and reads it into the module-level DataFrame `df`.
# On any failure `df` is left as None so later steps can detect it.
from pathlib import Path

DATASET_HANDLE = "bwandowando/spotify-songs-with-attributes-and-lyrics"

# Reset first: in a notebook a `df` from an earlier run would otherwise
# survive a failed load and be picked up silently by the next step.
df = None

print("Memuat dataset dari Kaggle Hub...")
try:
    # dataset_download returns the local directory holding the dataset files.
    dataset_dir = Path(kagglehub.dataset_download(DATASET_HANDLE))

    # Discover the CSV instead of hard-coding its name: the previously
    # hard-coded "spotify_songs_with_attributes_and_lyrics.csv" is not part
    # of the dataset and made every run fail.
    csv_files = sorted(dataset_dir.rglob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"Tidak ada file CSV di dalam {dataset_dir}")

    # Prefer the file that carries the audio attributes; otherwise fall back
    # to the first CSV in name order.
    file_path = next(
        (p for p in csv_files if "attributes" in p.name.lower()),
        csv_files[0],
    )

    df = pd.read_csv(file_path)
    print("Dataset berhasil dimuat.")
    print(f"Jumlah lagu dalam dataset: {len(df)}")
except Exception as e:
    # Deliberately broad: this is the top-level boundary of a notebook cell,
    # so report the problem and let the following steps skip themselves
    # rather than aborting the whole session.
    df = None
    print(f"Gagal memuat dataset: {e}")
    print("Pastikan Anda telah melakukan autentikasi dengan akun Kaggle Anda.")
    # In Google Colab, you might need to upload your kaggle.json or authenticate.


# --- 2. Preprocess Data ---
# Builds a standardized float32 feature matrix from `df`, then runs an exact
# brute-force search plus three approximate indexes (Annoy, HNSW, FAISS IVF)
# over every row, timing build + query for each and printing a comparison.
print("\nMelakukan preprocessing data...")

# Numerical audio features used for the similarity search.
features = ['danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
k = 10  # Number of nearest neighbors to find

# `df` only exists (and is not None) when the loading step succeeded.
# Rows with a missing feature are dropped from the WHOLE frame and the index
# is reset, so row i of the feature matrix is row i of `df_clean`; the song
# title shown at the end then really belongs to the queried vector.
df_clean = None
if 'df' in locals() and df is not None:
    df_clean = df.dropna(subset=features).reset_index(drop=True)

if df_clean is None:
    print("DataFrame was not loaded successfully. Cannot proceed with preprocessing and analysis.")
elif df_clean.empty:
    # Checked before any fitting: an empty matrix would make the scaler and
    # every index below raise.
    print("DataFrame is empty, cannot display song title and neighbors.")
else:
    X = df_clean[features].to_numpy(dtype=np.float32)

    # Standardize the features for distance-based algorithms.
    # FAISS expects C-contiguous float32 input, so make that explicit
    # instead of relying on the scaler preserving dtype and layout.
    scaler = StandardScaler()
    X_scaled = np.ascontiguousarray(scaler.fit_transform(X), dtype=np.float32)
    print(f"Data siap digunakan dengan {X_scaled.shape[0]} lagu dan {X_scaled.shape[1]} fitur.")

    # --- 3. Exact Nearest Neighbor (Brute-force Baseline) ---
    print("\n--- Menjalankan Exact NN (Scikit-learn) ---")
    # perf_counter is the monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
    nn.fit(X_scaled)
    # Find neighbors for all items in the dataset
    dist_exact, idx_exact = nn.kneighbors(X_scaled)
    time_exact = time.perf_counter() - start_time
    print(f"Selesai dalam {time_exact:.3f} detik")

    # --- 4. Annoy ---
    print("\n--- Menjalankan Annoy ---")
    start_time = time.perf_counter()
    f = X_scaled.shape[1]
    index_annoy = AnnoyIndex(f, 'euclidean')
    for i, v in enumerate(X_scaled):
        index_annoy.add_item(i, v)
    index_annoy.build(20)  # More trees for better accuracy
    # Annoy has no batch query, so neighbors are looked up one vector at a time
    idx_annoy = [index_annoy.get_nns_by_vector(v, k) for v in X_scaled]
    time_annoy = time.perf_counter() - start_time
    print(f"Selesai dalam {time_annoy:.3f} detik")

    # --- 5. HNSW (hnswlib) ---
    print("\n--- Menjalankan HNSW (hnswlib) ---")
    start_time = time.perf_counter()
    p_hnsw = hnswlib.Index(space='l2', dim=X_scaled.shape[1])
    p_hnsw.init_index(max_elements=X_scaled.shape[0], ef_construction=200, M=16)
    p_hnsw.add_items(X_scaled)
    p_hnsw.set_ef(200)  # Higher ef for better accuracy
    # Find neighbors for all items at once
    idx_hnsw, dist_hnsw = p_hnsw.knn_query(X_scaled, k=k)
    time_hnsw = time.perf_counter() - start_time
    print(f"Selesai dalam {time_hnsw:.3f} detik")

    # --- 6. FAISS (IVFFlat) ---
    print("\n--- Menjalankan FAISS (IVFFlat) ---")
    start_time = time.perf_counter()
    quantizer = faiss.IndexFlatL2(X_scaled.shape[1])
    nlist = 100  # Number of clusters
    index_faiss = faiss.IndexIVFFlat(quantizer, X_scaled.shape[1], nlist, faiss.METRIC_L2)
    index_faiss.train(X_scaled)
    index_faiss.add(X_scaled)
    index_faiss.nprobe = 10  # Search in 10 nearest clusters
    # Find neighbors for all items at once
    dist_faiss, idx_faiss = index_faiss.search(X_scaled, k)
    time_faiss = time.perf_counter() - start_time
    print(f"Selesai dalam {time_faiss:.3f} detik")

    # --- 7. Final Results Comparison ---
    print("\n==============================================")
    print("              Ringkasan Waktu Proses")
    print("==============================================")
    print(f"Exact NN (Brute-force): {time_exact:>7.3f} detik")
    print(f"Annoy                 : {time_annoy:>7.3f} detik")
    print(f"HNSW (hnswlib)        : {time_hnsw:>7.3f} detik")
    print(f"FAISS (IVFFlat)       : {time_faiss:>7.3f} detik")
    print("==============================================")

    print("\nContoh perbandingan top-5 tetangga terdekat untuk lagu pertama:")
    # NOTE(review): the original read `track_name`; the published CSV appears
    # to call this column `name` -- confirm against the loaded frame. Both
    # are accepted so a naming difference does not raise KeyError after all
    # the indexes have already been built.
    title_col = next(
        (c for c in ('track_name', 'name') if c in df_clean.columns),
        None,
    )
    if title_col is not None:
        first_song_title = df_clean[title_col].iloc[0]
    else:
        first_song_title = "(judul tidak tersedia)"
    print(f"Lagu: '{first_song_title}'")
    print(f"Exact NN: {idx_exact[0][:5]}")
    print(f"Annoy:    {idx_annoy[0][:5]}")
    print(f"HNSW:     {idx_hnsw[0][:5]}")
    print(f"FAISS:    {idx_faiss[0][:5]}")

Memuat dataset dari Kaggle Hub...


  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'spotify-songs-with-attributes-and-lyrics' dataset.
Gagal memuat dataset: 'spotify_songs_with_attributes_and_lyrics.csv' is not present in the dataset files. You can access the other files of the attached dataset at '/kaggle/input/spotify-songs-with-attributes-and-lyrics'
Pastikan Anda telah melakukan autentikasi dengan akun Kaggle Anda.

Melakukan preprocessing data...
DataFrame was not loaded successfully. Cannot proceed with preprocessing and analysis.


Tugas
Jalankan kode berikut pada Google Colab dan pada aplikasi PyDroid3 (Python untuk Android) di smartphone Android. Bandingkan hasilnya dan tuliskan analisis Anda; tuliskan juga spesifikasi smartphone yang Anda gunakan :).

In [None]:
import numpy as np
import time
from sklearn.neighbors import NearestNeighbors
from annoy import AnnoyIndex
import hnswlib
import faiss
from sklearn.preprocessing import StandardScaler

# -------------------------------
# Small synthetic dataset for testing
# -------------------------------
# Benchmarks exact brute-force search against Annoy, HNSW and FAISS IVF on
# random vectors; each timing covers index construction plus querying every
# vector for its k nearest neighbors.
np.random.seed(42)
n_samples = 10000   # number of database vectors
d = 128             # vector dimensionality
X = np.random.random((n_samples, d)).astype('float32')

# Standardize the features.
# FAISS expects C-contiguous float32 input, so make that explicit instead of
# relying on the scaler preserving dtype and layout.
scaler = StandardScaler()
X_scaled = np.ascontiguousarray(scaler.fit_transform(X), dtype=np.float32)

k = 10  # number of nearest neighbors

# -------------------------------
# Exact NN (brute-force)
# -------------------------------
# perf_counter is the monotonic clock intended for measuring intervals.
start = time.perf_counter()
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(X_scaled)
dist_exact, idx_exact = nn.kneighbors(X_scaled)
time_exact = time.perf_counter() - start
print(f"Exact NN done in {time_exact:.3f} s")

# -------------------------------
# Annoy
# -------------------------------
start = time.perf_counter()
f = X_scaled.shape[1]
index_annoy = AnnoyIndex(f, 'euclidean')
for i, v in enumerate(X_scaled):
    index_annoy.add_item(i, v)
index_annoy.build(10)
# Annoy has no batch query, so neighbors are looked up one vector at a time.
idx_annoy = [index_annoy.get_nns_by_vector(v, k) for v in X_scaled]
time_annoy = time.perf_counter() - start
print(f"Annoy done in {time_annoy:.3f} s")

# -------------------------------
# HNSW
# -------------------------------
start = time.perf_counter()
p_hnsw = hnswlib.Index(space='l2', dim=d)
p_hnsw.init_index(max_elements=n_samples, ef_construction=200, M=16)
p_hnsw.add_items(X_scaled)
p_hnsw.set_ef(200)
idx_hnsw, _ = p_hnsw.knn_query(X_scaled, k=k)
time_hnsw = time.perf_counter() - start
print(f"HNSW done in {time_hnsw:.3f} s")

# -------------------------------
# FAISS IVF
# -------------------------------
start = time.perf_counter()
quantizer = faiss.IndexFlatL2(d)
nlist = 100  # number of inverted-list clusters
# The FAISS Python constructors are SWIG wrappers that take positional
# arguments only; passing nlist=/metric= as keywords raises TypeError.
index_faiss = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
index_faiss.train(X_scaled)
index_faiss.add(X_scaled)
index_faiss.nprobe = 10
_, idx_faiss = index_faiss.search(X_scaled, k)
time_faiss = time.perf_counter() - start
print(f"FAISS IVF done in {time_faiss:.3f} s")

# -------------------------------
# Print the timing summary
# -------------------------------
print("\n=== Ringkasan Waktu (detik) ===")
print(f"Exact NN : {time_exact:.3f}")
print(f"Annoy    : {time_annoy:.3f}")
print(f"HNSW     : {time_hnsw:.3f}")
print(f"FAISS    : {time_faiss:.3f}")
