In [1]:
import faiss
import pickle
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [2]:
# Directory that holds the FAISS index and its companion id/metadata files.
base_dir = './../faiss_embeddings5'
# Sentence-transformers checkpoint used to embed text queries.
model_name = "sentence-transformers/all-MiniLM-L12-v2"

In [3]:
# Load the FAISS index from disk. The search cells below map the positions it
# returns to movie ids through `id_list`.
index = faiss.read_index(f"{base_dir}/movie_index.faiss")

In [4]:
# Sequence of movie ids; the search cells index into it with the positions
# returned by the FAISS index.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load artifacts you produced or otherwise trust.
with open(f"{base_dir}/movie_ids.pkl", mode="rb") as ids_file:
    id_list = pickle.load(ids_file)

# Movie metadata table; rows are looked up through its "id" column.
metadata = pd.read_csv(f"{base_dir}/movie_metadata.csv")

# Sentence encoder used to embed the text queries.
model = SentenceTransformer(model_name)

In [5]:
import ast

def metadata_filter(row, row_checker):
    """Return True when a movie metadata row satisfies every constraint.

    Parameters
    ----------
    row : mapping
        One metadata record. The keys read are ``"year"``, ``"rating"``,
        ``"duration"``, ``"genres"`` and ``"languages"``. Missing numeric keys
        default to 0; missing list keys default to an empty collection.
        List-valued cells may be real collections or stringified literals
        such as ``"['English']"``.
    row_checker : mapping
        Optional constraints. Recognised keys:
        ``min_year``/``max_year``, ``min_rating``/``max_rating``,
        ``min_duration``/``max_duration`` (inclusive bounds) and
        ``required_genres``/``excluded_genres``,
        ``required_languages``/``excluded_languages``.
        A "required" filter passes when the row shares at least one value
        with it; an "excluded" filter passes when it shares none.

    Returns
    -------
    bool
        False as soon as any constraint fails. A row whose numeric fields
        cannot be converted (e.g. NaN year/duration, non-numeric text) is
        rejected, and a NaN rating fails the range comparison.
    """

    def filter_values(name):
        # Accept a list/tuple/set, a single bare string, or None for a filter.
        values = row_checker.get(name)
        if values is None:
            return set()
        if isinstance(values, str):
            # A bare string is one label, not an iterable of characters.
            return {values}
        return set(values)

    def cell_values(cell):
        # Convert a metadata cell into a set of labels without ever raising.
        if isinstance(cell, str):
            try:
                cell = ast.literal_eval(cell)
            except (ValueError, SyntaxError):
                return set()
        if isinstance(cell, str):
            # A quoted scalar such as "'English'" is one label.
            return {cell}
        try:
            return set(cell)
        except TypeError:
            # NaN / None / numbers (empty CSV cells) or unhashable items.
            return set()

    # Numeric fields: a value that cannot be converted rejects the row.
    try:
        year = int(row.get("year", 0))
        rating = float(row.get("rating", 0.0))
        duration = int(row.get("duration", 0))
    except (ValueError, TypeError):
        return False

    numeric_bounds = (
        (year, "min_year", "max_year"),
        (rating, "min_rating", "max_rating"),
        (duration, "min_duration", "max_duration"),
    )
    for value, low_key, high_key in numeric_bounds:
        low = row_checker.get(low_key, float("-inf"))
        high = row_checker.get(high_key, float("inf"))
        if not (low <= value <= high):
            return False

    label_filters = (
        ("genres", "required_genres", "excluded_genres"),
        ("languages", "required_languages", "excluded_languages"),
    )
    for column, required_key, excluded_key in label_filters:
        labels = cell_values(row.get(column, "[]"))
        required = filter_values(required_key)
        excluded = filter_values(excluded_key)
        if required and not (labels & required):
            return False
        if labels & excluded:
            return False

    return True


In [6]:
# === DEFINE QUERY FUNCTION WITH OFFSET LOGIC ===
def search_movies(query, top_k=10, search_batch_size=100, row_checker=None):
    """Semantic search over the movie index with metadata post-filtering.

    The query is embedded with the module-level ``model`` and searched in the
    module-level FAISS ``index``. Because filtering happens after retrieval,
    the search is repeated with a window that grows by ``search_batch_size``
    each round until ``top_k`` rows pass ``metadata_filter`` or every entry in
    ``id_list`` has been examined. Only the hits beyond the previous window
    (``offset``) are inspected in each round.

    Parameters
    ----------
    query : str
        Free-text query.
    top_k : int
        Maximum number of results to return.
    search_batch_size : int
        How many additional neighbours to request per round; must be positive.
    row_checker : dict or None
        Constraints forwarded to ``metadata_filter``; ``None`` means no
        constraints.

    Returns
    -------
    list of dict
        Items shaped ``{"id": movie_id, "metadata": row_dict}`` in the order
        the index returned them.

    Raises
    ------
    ValueError
        If ``search_batch_size`` is not positive (the offset would never
        advance).
    """
    if row_checker is None:
        row_checker = {}
    if search_batch_size <= 0:
        raise ValueError("search_batch_size must be a positive integer")

    query_embedding = model.encode([query], normalize_embeddings=True).astype("float32")

    results = []
    offset = 0
    total = len(id_list)

    while len(results) < top_k and offset < total:
        fetch_size = min(search_batch_size + offset, total)
        _, labels = index.search(query_embedding, fetch_size)

        for idx in labels[0][offset:]:
            # FAISS pads missing neighbours with -1; id_list[-1] would
            # silently return the last movie instead of "no result".
            if idx < 0:
                continue

            movie_id = id_list[idx]
            matches = metadata[metadata["id"] == movie_id]
            if matches.empty:
                # Id present in the index but absent from the metadata table.
                continue

            row = matches.iloc[0].to_dict()
            if metadata_filter(row, row_checker):
                results.append({
                    "id": movie_id,
                    "metadata": row
                })
                if len(results) >= top_k:
                    break

        offset += search_batch_size

    return results




In [7]:
# === EXAMPLE USAGE ===
# Query for "the godfather" while excluding its own genres.
results = search_movies(
    "the godfather",
    top_k=10,
    row_checker={"excluded_genres": ["Drama", "Gangster", "Crime"]},
)
for position, hit in enumerate(results, start=1):
    print(f"{position}. ID: {hit['id']}, Result: {hit['metadata']}")

1. ID: tt0033434, Result: {'id': 'tt0033434', 'title': 'Brivido', 'year': 1941, 'duration': 82, 'MPA': nan, 'rating': 0.0, 'votes': nan, 'meta_score': 0.0, 'description': nan, 'Movie_Link': 'https://www.imdb.com/title/tt0033434', 'writers': "['Mino Caudana', 'Alessandro De Stefani', 'Giacomo Gentilomo']", 'directors': "['Giacomo Gentilomo']", 'stars': "['Umberto Melnati', 'María Mercader', 'Carlo Campanini', 'Clara Calamai', 'Sandro Ruffini', 'Andrea Checchi', 'Pina Renzi', 'Ernesto Almirante', 'Nino Crisman', 'Giacomo Moschini']", 'budget': nan, 'opening_weekend_gross': nan, 'gross_worldwide': nan, 'gross_us_canada': nan, 'release_date': 1941.0, 'countries_origin': "['Italy']", 'filming_locations': "['Titanus Studios, Rome, Lazio, Italy (Studio)']", 'production_companies': "['Industria Cinematografica Italiana (INCINE)']", 'awards_content': '[]', 'genres': "['Comedy', 'Mystery', 'Thriller']", 'languages': "['Italian']"}
2. ID: tt0047986, Result: {'id': 'tt0047986', 'title': 'Destinazi

In [8]:
def search_movies_dual_query_fast(
    positive_query,
    negative_query=None,
    top_k=10,
    search_batch_size=200,
    row_checker=None,
    alpha=1.0,
    beta=1.0
):
    """Rank movies close to ``positive_query`` and far from ``negative_query``.

    A single FAISS search retrieves up to ``search_batch_size`` candidates for
    the positive query. Each candidate that passes ``metadata_filter`` is
    scored as ``alpha * positive_similarity - beta * negative_similarity``,
    where the negative similarity is the dot product between the candidate's
    stored vector (``index.reconstruct``) and the negative query embedding.

    NOTE(review): the distances returned by ``index.search`` are used directly
    as similarities (higher is better). That presumably means the index is an
    inner-product index over normalised vectors -- confirm against the code
    that built it. ``index.reconstruct`` must also be supported by the index
    type in use.

    Parameters
    ----------
    positive_query : str
        Text the results should be similar to.
    negative_query : str or None
        Text the results should be dissimilar to; falsy values disable the
        penalty (negative similarity is then 0.0).
    top_k : int
        Number of results to return after sorting by score.
    search_batch_size : int
        Size of the candidate pool; clamped to ``len(id_list)``.
    row_checker : dict or None
        Constraints forwarded to ``metadata_filter``; ``None`` means no
        constraints.
    alpha, beta : float
        Weights of the positive and negative similarity.

    Returns
    -------
    list of dict
        Items with keys ``id``, ``positive_similarity``,
        ``negative_similarity``, ``score`` and ``metadata``, sorted by
        descending score and truncated to ``top_k``.
    """
    if row_checker is None:
        row_checker = {}

    # Never request more neighbours than the collection holds.
    fetch_size = min(search_batch_size, len(id_list))
    if fetch_size <= 0:
        return []

    # Encode queries
    pos_embed = model.encode([positive_query], normalize_embeddings=True).astype("float32")
    neg_embed = None
    if negative_query:
        neg_embed = model.encode([negative_query], normalize_embeddings=True).astype("float32")

    # Search once for the candidate pool
    D_pos, I_pos = index.search(pos_embed, fetch_size)

    scored_results = []

    for raw_sim, raw_idx in zip(D_pos[0], I_pos[0]):
        idx = int(raw_idx)
        # FAISS pads missing neighbours with -1; id_list[-1] would return the
        # last movie and reconstruct(-1) is not a valid lookup.
        if idx < 0:
            continue

        movie_id = id_list[idx]
        matches = metadata[metadata["id"] == movie_id]
        if matches.empty:
            # Id present in the index but absent from the metadata table.
            continue

        row = matches.iloc[0].to_dict()
        if not metadata_filter(row, row_checker):
            continue

        pos_sim = float(raw_sim)

        # The negative similarity is computed from the stored vector so no
        # second index search is needed.
        neg_sim = 0.0
        if neg_embed is not None:
            embedding = index.reconstruct(idx)
            neg_sim = float(np.dot(embedding, neg_embed[0]))

        scored_results.append({
            "id": movie_id,
            "positive_similarity": pos_sim,
            "negative_similarity": neg_sim,
            "score": alpha * pos_sim - beta * neg_sim,
            "metadata": row
        })

    # Sort and return top_k
    scored_results.sort(key=lambda item: item["score"], reverse=True)
    return scored_results[:top_k]


In [9]:
# Rank "spiderman" matches while penalising similarity to "tom holland".
results = search_movies_dual_query_fast(
    "spiderman",
    negative_query="tom holland",
    top_k=10,
    search_batch_size=100,
    row_checker={},
    alpha=1.0,
    beta=1.0,
)


In [10]:
# Print each hit with its 1-based rank.
for position, hit in enumerate(results, start=1):
    print(f"{position}. ID: {hit['id']}, Result: {hit['metadata']}")

1. ID: tt6320628, Result: {'id': 'tt6320628', 'title': 'Spider-Man: Far from Home', 'year': 2019, 'duration': 129, 'MPA': 'PG-13', 'rating': 7.4, 'votes': '591K', 'meta_score': 69.0, 'description': 'Peter Parker, the beloved superhero Spider-Man, faces four destructive elemental monsters while on holiday in Europe. Soon, he receives help from Mysterio, a fellow hero with mysterious origins.', 'Movie_Link': 'https://www.imdb.com/title/tt6320628', 'writers': "['Chris McKenna', 'Erik Sommers', 'Stan Lee']", 'directors': "['Jon Watts']", 'stars': "['Tom Holland', 'Samuel L. Jackson', 'Jake Gyllenhaal', 'Marisa Tomei', 'Jon Favreau', 'Zendaya', 'Jacob Batalon', 'Tony Revolori', 'Angourie Rice', 'Remy Hii']", 'budget': '$160,000,000 (estimated)', 'opening_weekend_gross': '$92,579,212', 'gross_worldwide': '$1,132,705,055', 'gross_us_canada': '$391,283,774', 'release_date': 0.0, 'countries_origin': "['United States', 'Czech Republic', 'Australia', 'Canada', 'Italy']", 'filming_locations': "['P

In [17]:
# Positive-only query: no negative_query is given, so the negative similarity
# stays at 0.0 and the score is alpha times the positive similarity.
results = search_movies_dual_query_fast(
    "best movies with bradd pitt",
    top_k=10,
    search_batch_size=100,
    row_checker={},
    alpha=1.0,
    beta=1.0,
)

In [18]:
# Print each hit with its 1-based rank.
for position, hit in enumerate(results, start=1):
    print(f"{position}. ID: {hit['id']}, Result: {hit['metadata']}")

1. ID: tt0035586, Result: {'id': 'tt0035586', 'title': 'The Young Mr. Pitt', 'year': 1942, 'duration': 118, 'MPA': 'Approved', 'rating': 6.7, 'votes': '462', 'meta_score': 0.0, 'description': 'This biopic tells the story of the life of Pitt The Younger, who became Prime Minister of Great Britain at the age of twenty-four.', 'Movie_Link': 'https://www.imdb.com/title/tt0035586', 'writers': "['Viscount Castlerosse', 'Sidney Gilliat', 'Frank Launder']", 'directors': "['Carol Reed']", 'stars': "['Robert Donat', 'Geoffrey Atkins', 'Jean Cadell', 'Robert Morley', 'Phyllis Calvert', 'Raymond Lovell', 'Agnes Lauchlan', 'John Mills', 'Felix Aylmer', 'Ian McLean']", 'budget': nan, 'opening_weekend_gross': nan, 'gross_worldwide': nan, 'gross_us_canada': nan, 'release_date': 1942.0, 'countries_origin': "['United Kingdom']", 'filming_locations': "['Gaumont-British Studios, London, England, UK (studio: made at the Gaumont-British Studios, London.)']", 'production_companies': "['Twentieth Century-Fox 