In [13]:
import faiss
import pickle
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [14]:
# Directory holding the persisted search artifacts loaded by the cells below
# (FAISS index, pickled ID list, metadata CSV). The path is relative to the
# notebook's working directory.
base_dir = "./../faiss_embeddings4"
# SentenceTransformer checkpoint used to embed search queries.
# NOTE(review): presumably the same model that produced the vectors stored in
# the index -- confirm against the indexing notebook.
model_name = 'sentence-transformers/distilbert-base-nli-stsb-mean-tokens'

In [15]:
# Load the persisted FAISS index that the search functions below query.
index = faiss.read_index(f"{base_dir}/movie_index.faiss")

In [16]:
# id_list is indexed with the positions FAISS returns, giving the movie ID for
# each hit (see id_list[idx] in the search functions).
# SECURITY: pickle.load can execute arbitrary code; only load files you
# produced yourself or otherwise trust.
with open(f"{base_dir}/movie_ids.pkl", "rb") as f:
    id_list = pickle.load(f)

# One row per movie; rows are looked up by the "id" column during search.
metadata = pd.read_csv(f"{base_dir}/movie_metadata.csv")

# Query encoder; the search functions call model.encode on the query text.
model = SentenceTransformer(model_name)

In [17]:
import ast

def _as_label_set(value):
    """Coerce a filter value or an already-parsed cell into a set of labels.

    ``None`` and non-iterable values (for example the float NaN that pandas
    uses for an empty CSV cell) become an empty set. A bare string is treated
    as one label rather than being split into characters.
    """
    if value is None:
        return set()
    if isinstance(value, str):
        return {value}
    try:
        return set(value)
    except TypeError:  # not iterable, or contains unhashable items
        return set()


def _parse_list_cell(cell):
    """Parse a metadata cell holding a stringified list such as "['English']"."""
    if isinstance(cell, str):
        try:
            cell = ast.literal_eval(cell)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal: treat the cell as having no labels.
            return set()
    return _as_label_set(cell)


def metadata_filter(row, row_checker):
    """Decide whether a movie's metadata row satisfies every requested filter.

    Args:
        row: Mapping with a ``.get`` method (e.g. a dict) that may contain
            "year", "rating", "duration", "genres" and "languages". The two
            list fields may be real collections or stringified lists.
        row_checker: Mapping of optional filters, or ``None`` for no filters.
            Numeric bounds (inclusive): "min_year", "max_year", "min_rating",
            "max_rating", "min_duration", "max_duration". Label filters:
            "required_genres" / "required_languages" (row must contain at
            least one of them) and "excluded_genres" / "excluded_languages"
            (row must contain none of them). A bound set to ``None`` is
            treated as unbounded; a label filter given as a single string is
            treated as a one-element list.

    Returns:
        bool: True when the row passes all filters. Rows whose year, rating or
        duration cannot be converted to a number are rejected, as are rows
        whose rating is NaN (NaN fails every range comparison).
    """
    row_checker = row_checker or {}

    def bound(key, default):
        # An explicit None means "no bound", same as omitting the key.
        value = row_checker.get(key)
        return default if value is None else value

    # Parse numeric values; reject the row when any of them is unusable.
    try:
        year = int(row.get("year", 0))
        rating = float(row.get("rating", 0.0))
        duration = int(row.get("duration", 0))
    except (ValueError, TypeError, OverflowError):
        return False

    # Inclusive range checks.
    if not bound("min_year", float("-inf")) <= year <= bound("max_year", float("inf")):
        return False
    if not bound("min_rating", float("-inf")) <= rating <= bound("max_rating", float("inf")):
        return False
    if not bound("min_duration", float("-inf")) <= duration <= bound("max_duration", float("inf")):
        return False

    # Label checks: "required" means at least one overlap, "excluded" means none.
    label_fields = (
        ("genres", "required_genres", "excluded_genres"),
        ("languages", "required_languages", "excluded_languages"),
    )
    for field, required_key, excluded_key in label_fields:
        labels = _parse_list_cell(row.get(field, "[]"))
        required = _as_label_set(row_checker.get(required_key))
        excluded = _as_label_set(row_checker.get(excluded_key))
        if required and not (labels & required):
            return False
        if labels & excluded:
            return False

    return True


In [18]:
# === DEFINE QUERY FUNCTION WITH OFFSET LOGIC ===
def search_movies(query, top_k=10, search_batch_size=100, row_checker=None):
    """Return up to ``top_k`` nearest movies for ``query`` that pass the metadata filter.

    The query is embedded with the notebook-level ``model`` and searched
    against the notebook-level FAISS ``index``. Because filtering happens after
    retrieval, the search is repeated with a neighbour count that grows by
    ``search_batch_size`` each round, and only the newly uncovered tail of each
    result list is inspected, until enough rows pass or the index is exhausted.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return; values <= 0 yield ``[]``.
        search_batch_size: How many additional neighbours to fetch per round.
        row_checker: Filter mapping forwarded to ``metadata_filter``; ``None``
            (the default) applies no filters.

    Returns:
        list[dict]: Items of the form ``{"id": movie_id, "metadata": row_dict}``
        in the order FAISS returned them.

    Raises:
        ValueError: If ``search_batch_size`` is smaller than 1 (the loop could
            otherwise never advance).
    """
    if search_batch_size < 1:
        raise ValueError("search_batch_size must be a positive integer")
    row_checker = {} if row_checker is None else row_checker

    results = []
    # Never request more neighbours than exist: FAISS pads missing results with
    # label -1, which would otherwise index the *last* entry of id_list.
    searchable = min(len(id_list), index.ntotal)
    if top_k <= 0 or searchable == 0:
        return results

    query_embedding = model.encode([query], normalize_embeddings=True).astype("float32")

    # Build the id -> row lookup once per call instead of scanning the whole
    # DataFrame for every candidate. drop_duplicates keeps the first row per
    # id, matching the previous "first match" behaviour.
    meta_by_id = metadata.drop_duplicates(subset="id").set_index("id", drop=False)

    offset = 0
    while len(results) < top_k and offset < searchable:
        fetch_size = min(offset + search_batch_size, searchable)
        _, neighbours = index.search(query_embedding, fetch_size)

        # Entries before `offset` were already examined in earlier rounds.
        for idx in neighbours[0][offset:]:
            idx = int(idx)
            if not 0 <= idx < len(id_list):
                continue  # padding label or index/id_list size mismatch
            movie_id = id_list[idx]
            if movie_id not in meta_by_id.index:
                continue  # indexed vector without a metadata row
            row = meta_by_id.loc[movie_id].to_dict()
            if metadata_filter(row, row_checker):
                results.append({
                    "id": movie_id,
                    "metadata": row
                })
                if len(results) == top_k:
                    break

        offset += search_batch_size

    return results




In [19]:
# === EXAMPLE USAGE ===
# Search by title while rejecting any movie tagged with at least one of the
# excluded genres (Drama, Gangster, Crime).
results = search_movies("the godfather", top_k=10, row_checker={"excluded_genres": ["Drama", "Gangster", "Crime"]})
# Print each hit with a 1-based position, its ID, and its full metadata dict.
for i, res in enumerate(results, 1):
    print(f"{i}. ID: {res['id']}, Result: {res['metadata']}")

1. ID: tt0045168, Result: {'id': 'tt0045168', 'title': 'The Dream of Zorro', 'year': 1952, 'duration': 93, 'MPA': nan, 'rating': 5.1, 'votes': '103', 'meta_score': 0.0, 'description': 'An old gentleman, a direct descendant of Zorro, has a single son named Raimundo in whom there is no trace of proud pride.', 'Movie_Link': 'https://www.imdb.com/title/tt0045168', 'writers': "['Mario Amendola', 'Sandro Continenza', 'Ruggero Maccari']", 'directors': "['Mario Soldati']", 'stars': "['Walter Chiari', 'Delia Scala', 'Vittorio Gassman', 'Carlo Ninchi', 'Umberto Aquilino', 'Anna Arena', 'Sandro Bianchi', 'Pietro Capanna', 'Giorgio Costantini', 'Juan de Landa']", 'budget': nan, 'opening_weekend_gross': nan, 'gross_worldwide': nan, 'gross_us_canada': nan, 'release_date': 1952.0, 'countries_origin': "['Italy']", 'filming_locations': '[]', 'production_companies': "['Industrie Cinematografiche Sociali (ICS)']", 'awards_content': '[]', 'genres': "['Adventure', 'Western']", 'languages': "['Italian']"}
2

In [20]:
def search_movies_dual_query_fast(
    positive_query,
    negative_query=None,
    top_k=10,
    search_batch_size=200,
    row_checker=None,
    alpha=1.0,
    beta=1.0
):
    """Rank movies that are close to one query and far from another.

    A single FAISS search fetches up to ``search_batch_size`` candidates for
    ``positive_query``. Candidates that pass ``metadata_filter`` are scored as
    ``alpha * positive_similarity - beta * negative_similarity`` and the
    ``top_k`` highest scores are returned.

    NOTE(review): the score treats the values FAISS returns as similarities
    (larger is better), which presumably means the index uses inner product
    over normalized vectors -- confirm; with an L2 index the values are
    distances and the ranking would be inverted.

    Args:
        positive_query: Text the results should resemble.
        negative_query: Optional text the results should *not* resemble. When
            omitted or empty, the negative similarity is 0.0 for every hit.
        top_k: Maximum number of results to return; values <= 0 yield ``[]``.
        search_batch_size: Number of candidates fetched from the index.
        row_checker: Filter mapping forwarded to ``metadata_filter``; ``None``
            (the default) applies no filters.
        alpha: Weight of the positive similarity.
        beta: Weight of the negative similarity.

    Returns:
        list[dict]: Items with keys "id", "positive_similarity",
        "negative_similarity", "score" and "metadata", sorted by descending
        score.

    Raises:
        ValueError: If ``search_batch_size`` is smaller than 1.
    """
    if search_batch_size < 1:
        raise ValueError("search_batch_size must be a positive integer")
    row_checker = {} if row_checker is None else row_checker

    # Never request more neighbours than exist: FAISS pads missing results with
    # label -1, which would otherwise index the *last* entry of id_list and
    # carry a meaningless sentinel score.
    candidate_count = min(search_batch_size, len(id_list), index.ntotal)
    if top_k <= 0 or candidate_count == 0:
        return []

    # Encode queries
    pos_embed = model.encode([positive_query], normalize_embeddings=True).astype("float32")
    neg_embed = None
    if negative_query:
        neg_embed = model.encode([negative_query], normalize_embeddings=True).astype("float32")

    # Search once for top-N potentially good matches
    D_pos, I_pos = index.search(pos_embed, candidate_count)

    # Build the id -> row lookup once per call instead of scanning the whole
    # DataFrame for every candidate. drop_duplicates keeps the first row per
    # id, matching the previous "first match" behaviour.
    meta_by_id = metadata.drop_duplicates(subset="id").set_index("id", drop=False)

    scored_results = []

    for rank, idx in enumerate(I_pos[0]):
        idx = int(idx)
        if not 0 <= idx < len(id_list):
            continue  # padding label or index/id_list size mismatch
        movie_id = id_list[idx]
        if movie_id not in meta_by_id.index:
            continue  # indexed vector without a metadata row
        row = meta_by_id.loc[movie_id].to_dict()
        if not metadata_filter(row, row_checker):
            continue

        pos_sim = float(D_pos[0][rank])

        # Similarity to the negative query is computed by hand from the stored
        # vector, so no second index search is needed.
        # NOTE(review): index.reconstruct requires an index type that can
        # return stored vectors -- confirm for the index in use.
        neg_sim = 0.0
        if neg_embed is not None:
            embedding = index.reconstruct(idx)
            neg_sim = float(np.dot(embedding, neg_embed[0]))

        score = alpha * pos_sim - beta * neg_sim

        scored_results.append({
            "id": movie_id,
            "positive_similarity": pos_sim,
            "negative_similarity": neg_sim,
            "score": score,
            "metadata": row
        })

    # Sort and return top_k
    sorted_results = sorted(scored_results, key=lambda x: x["score"], reverse=True)
    return sorted_results[:top_k]


In [27]:
# Dual-query demo: rank the 100 nearest matches for "spiderman" while
# penalising similarity to "tom holland" (score = alpha*pos - beta*neg).
# An empty row_checker applies no metadata filters.
results = search_movies_dual_query_fast(
    positive_query="spiderman",
    negative_query="tom holland",
    top_k=10,
    row_checker={},
    search_batch_size = 100,
    alpha=1.0,
    beta=1.0
)


In [28]:
# Print each hit from the previous cell with a 1-based position, its ID, and
# its full metadata dict.
for i, res in enumerate(results, 1):
    print(f"{i}. ID: {res['id']}, Result: {res['metadata']}")

1. ID: tt0948470, Result: {'id': 'tt0948470', 'title': 'The Amazing Spider-Man', 'year': 2012, 'duration': 136, 'MPA': 'PG-13', 'rating': 6.9, 'votes': '729K', 'meta_score': 66.0, 'description': 'After Peter Parker is bitten by a genetically altered spider, he gains newfound, spider-like powers and ventures out to save the city from the machinations of a mysterious reptilian foe.', 'Movie_Link': 'https://www.imdb.com/title/tt0948470', 'writers': "['James Vanderbilt', 'Alvin Sargent', 'Steve Kloves']", 'directors': "['Marc Webb']", 'stars': "['Andrew Garfield', 'Emma Stone', 'Rhys Ifans', 'Irrfan Khan', 'Denis Leary', 'Martin Sheen', 'Sally Field', 'Campbell Scott', 'Embeth Davidtz', 'Chris Zylka']", 'budget': '$230,000,000 (estimated)', 'opening_weekend_gross': '$62,004,688', 'gross_worldwide': '$758,707,722', 'gross_us_canada': '$262,782,352', 'release_date': 2012.0, 'countries_origin': "['United States']", 'filming_locations': "['Universal Studios Hollywood - 1000 Universal Studios B

In [33]:
# Positive-only demo: no negative_query is given, so the negative similarity
# is 0.0 for every hit and the score reduces to alpha * positive_similarity.
results = search_movies_dual_query_fast(
    positive_query="war ww2 brad pitt",
    top_k=10,
    row_checker={},
    search_batch_size = 100,
    alpha=1.0,
    beta=1.0
)

In [34]:
# Print each hit from the previous cell with a 1-based position, its ID, and
# its full metadata dict.
for i, res in enumerate(results, 1):
    print(f"{i}. ID: {res['id']}, Result: {res['metadata']}")

1. ID: tt0116130, Result: {'id': 'tt0116130', 'title': 'Down Periscope', 'year': 1996, 'duration': 92, 'MPA': 'PG-13', 'rating': 6.2, 'votes': '28K', 'meta_score': 39.0, 'description': 'Lt. Cmdr. Tom Dodge is assigned as Captain to the USS Stingray, an old diesel driven submarine that has seen better days.', 'Movie_Link': 'https://www.imdb.com/title/tt0116130', 'writers': "['Hugh Wilson', 'Andrew Kurtzman', 'Eliot Wald']", 'directors': "['David S. Ward']", 'stars': "['Kelsey Grammer', 'Lauren Holly', 'Rob Schneider', 'Harry Dean Stanton', 'Bruce Dern', 'William H. Macy', 'Ken Hudson Campbell', 'Toby Huss', 'Duane Martin', 'Jonathan Penner']", 'budget': '$31,000,000 (estimated)', 'opening_weekend_gross': '$7,231,087', 'gross_worldwide': '$37,553,752', 'gross_us_canada': '$25,785,603', 'release_date': 1996.0, 'countries_origin': "['United States']", 'filming_locations': "['U.S. Navy Submarine Base, New London, Connecticut, USA (opening credits)']", 'production_companies': "['Sprockets Mu