# <center><b>Similarity Search</b></center>

This notebook demonstrates semantic search using vector embeddings stored in Presto/Iceberg.

**How it works:**
1. You provide a text query (e.g., "I love this product")
2. The query is converted to a vector embedding
3. Presto finds the most similar reviews using vector search
4. Results are displayed with the original review text

**Try it yourself:**
- Modify the `INPUT_TEXT` variable in the configuration cell below
- Run the cells to see similar reviews
- Experiment with different queries!

#### <b>Setup & Configuration</b>

In [None]:
# === CONFIGURATION ===
# Every value a reader is expected to tune lives in this cell.

# --- Presto endpoint ---
HOST = ''  # address of the Presto engine host
PORT = 8080
HTTP_SCHEME = 'http'  # use 'https' for TLS endpoints

# --- Authentication (optional) ---
USER = ''  # username on the Presto instance
PASSWORD = ''  # keep empty when the endpoint needs no authentication
DISABLE_SSL_VERIFICATION = True  # dev only: for self-signed certificates

# --- Table location ---
CATALOG = ''  # Presto catalog name
SCHEMA = ''  # Presto schema name
TABLE = ''  # Presto table name
TEXT_COLUMN = ''  # name of the text column

# --- Embedding model and search size ---
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
TOP_K = 10  # passed to the vector search as the number of neighbours

# --- Query: MODIFY THIS TEXT TO SEARCH FOR SIMILAR REVIEWS ---
INPUT_TEXT = ""
print(f"Query: \"{INPUT_TEXT}\"\n")


In [None]:
# === IMPORTS ===
from gettext import Catalog
import prestodb
from prestodb.exceptions import PrestoUserError
from typing import Dict, Any, Tuple, List, Optional, cast
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)



# === HELPER FUNCTIONS ===
def get_presto_connection(
    host: str = HOST,
    port: int = PORT,
    user: str = USER,
    catalog: str = CATALOG,
    schema: str = SCHEMA,
    http_scheme: str = HTTP_SCHEME,
    principal_id: str = USER,
    password: str = PASSWORD,
    disable_ssl_verification: bool = DISABLE_SSL_VERIFICATION
) -> prestodb.dbapi.Connection:
    """Create a Presto DB-API connection, optionally with basic auth.

    The default argument values are captured from the configuration
    constants at the moment this function is defined. After editing the
    configuration cell, re-run this cell too, or pass the values explicitly.

    Args:
        host: Presto host name or address.
        port: Presto port.
        user: User name sent with the connection; also the basic-auth user.
        catalog: Default catalog for the connection.
        schema: Default schema for the connection.
        http_scheme: 'http' or 'https'.
        principal_id: Accepted for backward compatibility; not used here.
        password: Basic-auth password. Authentication is only attached when
            both ``user`` and ``password`` are non-empty.
        disable_ssl_verification: When true and ``http_scheme`` is 'https',
            certificate verification is switched off on the connection's
            HTTP session (intended for self-signed development certs).

    Returns:
        The connection object returned by ``prestodb.dbapi.connect``.

    Raises:
        PrestoUserError: for any failure while building the connection. The
            original exception is kept as ``__cause__``.
    """
    try:
        # Build connection parameters
        conn_params = {
            'host': host,
            'port': port,
            'user': user,
            'catalog': catalog,
            'schema': schema,
            'http_scheme': http_scheme,
        }

        # Add basic authentication only if both credentials are provided
        if user and password:
            conn_params['auth'] = prestodb.auth.BasicAuthentication(user, password)

        conn = prestodb.dbapi.connect(**conn_params)

        # Disable SSL verification if requested (for self-signed certificates).
        # This reaches into a private attribute of the connection object.
        if disable_ssl_verification and http_scheme == 'https':
            conn._http_session.verify = False

        print(f"Connected to {http_scheme}://{host}:{port}")
        return conn
    except Exception as e:
        # NOTE(review): prestodb's query-error classes appear to read their
        # fields via ``.get`` on a dict-like payload, so a bare string would
        # make the exception fail while being formatted - confirm against the
        # installed prestodb version.
        error_payload = {
            'message': f"Connection Error: {e}",
            'errorName': 'CONNECTION_ERROR',
            'errorType': 'USER_ERROR',
        }
        # Chain the original exception so its traceback is not lost.
        raise PrestoUserError(error_payload) from e


def execute_query(conn, sql: str, fetch: bool = False):
    """Run one SQL statement on ``conn`` and optionally fetch its rows.

    Args:
        conn: Object exposing ``cursor()``; the cursor must provide
            ``execute``, ``fetchall``, ``description`` and ``close``.
        sql: Statement text handed to ``cursor.execute`` unchanged.
        fetch: When true, return the fetched rows and the cursor description.

    Returns:
        ``(rows, description)`` when ``fetch`` is true, else ``(None, None)``.

    Raises:
        prestodb.exceptions.PrestoUserError: printed, then re-raised.
        Any other exception propagates without being printed.
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        if not fetch:
            return None, None
        rows = cursor.fetchall()
        return rows, cursor.description
    except prestodb.exceptions.PrestoUserError as e:
        print(f"Query Failed: {e}")
        raise
    finally:
        # Runs on every exit path, including the early returns above.
        if cursor:
            cursor.close()

# === LOAD MODEL ===
print("Loading embedding model...")
# Bind both globals before the attempt so later cells can always reference
# them, even when loading fails.
GLOBAL_MODEL = None
GLOBAL_EMBEDDING_DIM = None
try:
    GLOBAL_MODEL = SentenceTransformer(MODEL_NAME)
    GLOBAL_EMBEDDING_DIM = GLOBAL_MODEL.get_sentence_embedding_dimension()
    print(f"✓ Model '{MODEL_NAME}' loaded (dimension: {GLOBAL_EMBEDDING_DIM})")
except Exception as e:
    print(f"✗ Model loading failed: {e}")
    # Reset both: the model may have been constructed before the failure.
    GLOBAL_MODEL = None
    GLOBAL_EMBEDDING_DIM = None

# === SEARCH FUNCTIONS ===
def embed_input_text(input_text: str) -> str:
    """Encode ``input_text`` with the global model and serialise the vector.

    The text is encoded with ``normalize_embeddings=True`` and the resulting
    vector's components are joined with commas.

    Raises:
        RuntimeError: if ``GLOBAL_MODEL`` is falsy (e.g. it failed to load).
    """
    if not GLOBAL_MODEL:
        raise RuntimeError("Embedding model not initialized")
    vector = GLOBAL_MODEL.encode(input_text, normalize_embeddings=True)
    components = vector.tolist()
    return ",".join(str(component) for component in components)

def find_top_k(vector_array_str: str, k: int) -> list:
    """Run the nearest-neighbour query and collect the first column of each row.

    Args:
        vector_array_str: Comma-separated vector components, spliced into an
            ``ARRAY[...]`` literal in the query.
        k: Value spliced into the query as the last function argument.

    Returns:
        The first element of every result row, in result order. An empty
        list is returned when the query yields nothing or fails; failures
        are printed rather than raised.
    """
    conn = get_presto_connection()

    knn_sql = f"""
    SELECT *
    FROM {CATALOG}.system.approx_nearest_neighbors(
        CAST(ARRAY[{vector_array_str}] AS array(real)),
        '{SCHEMA}.{TABLE}.embedding',
        {k}
    )
    """

    matched_ids = []
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(knn_sql)
        fetched = cursor.fetchall()
        # A falsy result (None or empty) leaves the list empty.
        matched_ids = [record[0] for record in (fetched or [])]
    except Exception as e:
        print(f" Vector search failed: {e}")
    finally:
        # Release the cursor first, then the connection opened above.
        if cursor is not None:
            cursor.close()
        if conn:
            conn.close()

    return matched_ids

# Only announce success when the embedding model actually loaded; otherwise
# the message would contradict the load failure printed earlier in this cell.
if GLOBAL_MODEL is not None:
    print(" Setup complete - ready for similarity search")
else:
    print(" Setup incomplete - embedding model is not loaded; fix the error above and re-run this cell")

#### <b>Input & Embed</b>

In [None]:

# Generate embedding
# Stop early on an empty or whitespace-only query: there is nothing
# meaningful to embed or search for.
if not INPUT_TEXT or not INPUT_TEXT.strip():
    raise ValueError("INPUT_TEXT is empty - set it in the configuration cell and re-run")

vector_array_str = embed_input_text(INPUT_TEXT)
print(f"✓ Query embedded into {GLOBAL_EMBEDDING_DIM}-dimensional vector")

#### <b>Vector Search</b>

In [None]:
import time

# perf_counter is a monotonic, high-resolution clock, so the measured
# duration is not affected by system clock adjustments.
start_time = time.perf_counter()

top_k_matches = find_top_k(vector_array_str, TOP_K)

elapsed = time.perf_counter() - start_time

if top_k_matches:
    print(f" Found {len(top_k_matches)} similar reviews in {elapsed:.3f}s")
    print(f"  Matched row IDs: {top_k_matches}")
else:
    print(" No matches found")

#### <b>Retrieve & Display Results</b>

In [None]:
# Look up the text for each matched row id, keeping the vector-search order.
similar_comments = []

if top_k_matches:
    id_list_str = ", ".join(map(str, top_k_matches))

    sql_lookup = f"""
    SELECT row_id, {TEXT_COLUMN}
    FROM {CATALOG}.{SCHEMA}.{TABLE}
    WHERE row_id IN ({id_list_str})
    """

    # Open the connection only when there is something to look up, so an
    # empty match list never leaves an unclosed connection behind.
    conn = get_presto_connection()
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(sql_lookup)
        raw_results = cursor.fetchall()

        # Map row_id -> text so results can be re-ordered below
        comment_map = {row_id: comment for row_id, comment in raw_results}

        # Preserve order from vector search; ids missing from the lookup are skipped
        similar_comments = [
            comment_map[row_id] for row_id in top_k_matches if row_id in comment_map
        ]

        print(f"✓ Retrieved {len(similar_comments)} reviews\n")

    except Exception as e:
        print(f"✗ Lookup failed: {e}")
    finally:
        if cursor is not None:
            cursor.close()
        conn.close()

# === DISPLAY RESULTS ===
print("=" * 80)
print(f"INPUT QUERY: \"{INPUT_TEXT}\"")
print("=" * 80)
print(f"\nTop {TOP_K} Most Similar Reviews:\n")

if similar_comments:
    for i, comment in enumerate(similar_comments, 1):
        print(f"{i}. {comment}")
        print("-" * 80)
else:
    print("No matching reviews found.")