In [1]:
import faiss
import pickle
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [5]:
# Directory that holds the prebuilt artifacts loaded below (FAISS index,
# pickled id list, metadata CSV). The path is relative to the working directory.
base_dir = "./../faiss_embeddings2"
# Sentence-Transformers checkpoint used to embed queries.
# NOTE(review): presumably the same model that produced the stored index
# vectors — confirm, otherwise similarities are meaningless.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

In [6]:
# Load the serialized FAISS index that the search functions query.
index = faiss.read_index(f"{base_dir}/movie_index.faiss")

In [7]:
# id_list is indexed with the labels returned by index.search, i.e. it maps a
# FAISS row position to a movie id.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load this pickle if it comes from a trusted source.
with open(f"{base_dir}/movie_ids.pkl", "rb") as f:
    id_list = pickle.load(f)

# Per-movie metadata; the search functions look rows up by the "id" column.
metadata = pd.read_csv(f"{base_dir}/movie_metadata.csv")

# Encoder used to turn free-text queries into vectors.
model = SentenceTransformer(model_name)

In [8]:
import ast

def metadata_filter(row, row_checker):
    """Decide whether one movie record satisfies the constraints in ``row_checker``.

    Parameters
    ----------
    row : dict-like
        A single movie record. The keys ``"year"``, ``"rating"``,
        ``"duration"``, ``"genres"`` and ``"languages"`` are read via
        ``row.get``; missing numeric keys default to 0 and missing list keys
        to ``"[]"``. List fields may be real iterables or stringified Python
        lists such as ``"['English']"``.
    row_checker : dict
        Optional constraints. Recognised keys are ``min_year``/``max_year``,
        ``min_rating``/``max_rating``, ``min_duration``/``max_duration``
        (inclusive bounds) and ``required_genres``, ``excluded_genres``,
        ``required_languages``, ``excluded_languages`` (iterables of values).
        Keys that are absent impose no constraint.

    Returns
    -------
    bool
        ``True`` when every range check passes, the row shares at least one
        value with each non-empty ``required_*`` collection (any-match, not
        all-match) and shares no value with any ``excluded_*`` collection.
        ``False`` otherwise, including when a numeric field cannot be
        converted (for example a NaN year).
    """

    def parse_list(cell):
        """Return ``cell`` as a set, or an empty set when it is not list-like."""
        if isinstance(cell, str):
            try:
                cell = ast.literal_eval(cell)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Not a Python literal at all -> treat as "no values".
                return set()
        if isinstance(cell, str):
            # A literal that is itself a single string ("'Drama'") is one
            # value, not a collection of characters.
            return {cell}
        try:
            return set(cell)
        except TypeError:
            # Missing cells read from CSV arrive as float NaN (or None), and a
            # literal may be a scalar or hold unhashable items; none of these
            # can be turned into a set of values.
            return set()

    # Numeric bounds; an absent key leaves that side of the range open.
    min_year = row_checker.get("min_year", float("-inf"))
    max_year = row_checker.get("max_year", float("inf"))
    min_rating = row_checker.get("min_rating", float("-inf"))
    max_rating = row_checker.get("max_rating", float("inf"))
    min_duration = row_checker.get("min_duration", float("-inf"))
    max_duration = row_checker.get("max_duration", float("inf"))

    # Membership constraints; an absent key means "nothing required/excluded".
    required_genres = set(row_checker.get("required_genres", []))
    excluded_genres = set(row_checker.get("excluded_genres", []))
    required_languages = set(row_checker.get("required_languages", []))
    excluded_languages = set(row_checker.get("excluded_languages", []))

    # Reject the row outright if its numeric fields cannot be converted.
    # int(nan) raises ValueError and int(inf) raises OverflowError.
    try:
        year = int(row.get("year", 0))
        rating = float(row.get("rating", 0.0))
        duration = int(row.get("duration", 0))
    except (ValueError, TypeError, OverflowError):
        return False

    # Inclusive range checks. A NaN rating fails every comparison, so such
    # rows are rejected here as well.
    year_check = min_year <= year <= max_year
    rating_check = min_rating <= rating <= max_rating
    duration_check = min_duration <= duration <= max_duration

    genres = parse_list(row.get("genres", "[]"))
    languages = parse_list(row.get("languages", "[]"))

    # "Required" means at least one overlap; "excluded" means no overlap.
    genre_inclusion_check = not required_genres or bool(genres & required_genres)
    genre_exclusion_check = not (genres & excluded_genres)

    language_inclusion_check = not required_languages or bool(languages & required_languages)
    language_exclusion_check = not (languages & excluded_languages)

    return (
        year_check and
        rating_check and
        duration_check and
        genre_inclusion_check and
        genre_exclusion_check and
        language_inclusion_check and
        language_exclusion_check
    )


In [9]:
# === DEFINE QUERY FUNCTION WITH OFFSET LOGIC ===
def search_movies(query, top_k=10, search_batch_size=100, row_checker=None):
    """Return up to ``top_k`` nearest movies for ``query`` that pass the metadata filter.

    The query is embedded with the module-level ``model`` and looked up in the
    module-level FAISS ``index``. Because filtering happens after retrieval,
    the search is repeated with a window that grows by ``search_batch_size``
    each round; only the newly revealed tail of each result list is examined.

    Parameters
    ----------
    query : str
        Free-text query to embed.
    top_k : int
        Maximum number of results to return.
    search_batch_size : int
        How many additional neighbours to request per round.
    row_checker : dict or None
        Constraints forwarded to ``metadata_filter``; ``None`` means no
        constraints.

    Returns
    -------
    list of dict
        Items of the form ``{"id": movie_id, "metadata": row_dict}`` in the
        order FAISS returned them. The list may be shorter than ``top_k``
        when too few movies pass the filter.
    """
    # Avoid sharing one mutable default dict between calls.
    if row_checker is None:
        row_checker = {}

    query_embedding = model.encode([query], normalize_embeddings=True).astype("float32")

    total = len(id_list)
    results = []
    offset = 0

    # Stop once enough hits are collected or the whole collection was scanned.
    # (An empty collection never enters the loop, so no k=0 search is issued.)
    while len(results) < top_k and offset < total:
        fetch_size = min(offset + search_batch_size, total)
        _, I = index.search(query_embedding, fetch_size)

        # Positions before `offset` were already examined in earlier rounds.
        for idx in I[0][offset:]:
            # FAISS pads the label array with -1 when it finds fewer than k
            # neighbours; id_list[-1] would silently return the last movie.
            if idx < 0:
                continue

            movie_id = id_list[idx]
            row = metadata[metadata["id"] == movie_id].iloc[0].to_dict()
            if metadata_filter(row, row_checker):
                results.append({
                    "id": movie_id,
                    "metadata": row
                })
                if len(results) == top_k:
                    break

        offset += search_batch_size

    return results




In [10]:
# === EXAMPLE USAGE ===
# Query "the godfather" while rejecting any movie tagged with one of these genres.
genre_blocklist = ["Drama", "Gangster", "Crime"]
results = search_movies(
    "the godfather",
    top_k=10,
    row_checker={"excluded_genres": genre_blocklist},
)

# Print each hit as "<rank>. ID: <movie id>, Result: <metadata dict>".
for position, hit in enumerate(results, start=1):
    print(f"{position}. ID: {hit['id']}, Result: {hit['metadata']}")

1. ID: tt0127751, Result: {'id': 'tt0127751', 'title': 'Recoil', 'year': 1998, 'duration': 96, 'MPA': 'R', 'rating': 5.3, 'votes': '765', 'meta_score': 0.0, 'description': "A mafia Godfather exacts revenge on the policeman responsible for his son's death during a bank robbery. The officer then unleashes his fury in retaliation for the death of his family.", 'Movie_Link': 'https://www.imdb.com/title/tt0127751', 'writers': "['Art Camacho', 'Richard Preston Jr.']", 'directors': "['Art Camacho']", 'stars': "['Gary Daniels', 'Gregory McKinney', 'Thomas Kopache', 'Billy Maddox', 'John Sanderford', 'Robin Curtis', 'Kelli McCarty', 'Maurice Lamont', 'Richard Foronjy', 'Michael Alaimo']", 'budget': nan, 'opening_weekend_gross': nan, 'gross_worldwide': nan, 'gross_us_canada': nan, 'release_date': 1998.0, 'countries_origin': "['United States']", 'filming_locations': "['1010 S. Flower St, Los Angeles, California, USA']", 'production_companies': "['PM Entertainment Group']", 'awards_content': '[]',

In [11]:
def search_movies_dual_query_fast(
    positive_query,
    negative_query=None,
    top_k=10,
    search_batch_size=200,
    row_checker=None,
    alpha=1.0,
    beta=1.0
):
    """Rank movies near ``positive_query`` while penalising closeness to ``negative_query``.

    A single FAISS search fetches up to ``search_batch_size`` candidates for
    the positive query. Candidates that pass ``metadata_filter`` are scored as
    ``alpha * positive_similarity - beta * negative_similarity`` and the
    ``top_k`` best scores are returned. There is no second retrieval round,
    so fewer than ``top_k`` results come back when the filter rejects most of
    the candidate pool.

    NOTE(review): the value returned by ``index.search`` is used directly as
    the positive similarity. That is only a similarity (higher is better) for
    an inner-product index over normalised vectors — confirm the index type.

    Parameters
    ----------
    positive_query : str
        Text the results should be close to.
    negative_query : str or None
        Text the results should be far from. ``None`` or an empty string
        disables the penalty.
    top_k : int
        Maximum number of results to return.
    search_batch_size : int
        Size of the candidate pool fetched from the index.
    row_checker : dict or None
        Constraints forwarded to ``metadata_filter``; ``None`` means no
        constraints.
    alpha, beta : float
        Weights of the positive and negative similarity in the score.

    Returns
    -------
    list of dict
        Items with the keys ``"id"``, ``"positive_similarity"``,
        ``"negative_similarity"``, ``"score"`` and ``"metadata"``, sorted by
        descending score.
    """
    # Avoid sharing one mutable default dict between calls.
    if row_checker is None:
        row_checker = {}

    total = len(id_list)
    if total == 0:
        return []

    # Asking for more neighbours than exist only yields -1 padding.
    fetch_size = min(search_batch_size, total)

    pos_embed = model.encode([positive_query], normalize_embeddings=True).astype("float32")
    neg_vector = None
    if negative_query:
        neg_vector = model.encode([negative_query], normalize_embeddings=True).astype("float32")[0]

    # One search for the whole candidate pool.
    D_pos, I_pos = index.search(pos_embed, fetch_size)

    scored_results = []

    for raw_sim, idx in zip(D_pos[0], I_pos[0]):
        # FAISS marks "no neighbour found" with -1. Without this guard
        # id_list[-1] would return the last movie and reconstruct(-1) would
        # be called with an invalid key.
        if idx < 0:
            continue

        movie_id = id_list[idx]
        row = metadata[metadata["id"] == movie_id].iloc[0].to_dict()
        if not metadata_filter(row, row_checker):
            continue

        pos_sim = float(raw_sim)

        # The negative similarity is computed by hand from the stored vector,
        # which avoids a second index search.
        neg_sim = 0.0
        if neg_vector is not None:
            embedding = index.reconstruct(int(idx))  # needs a plain int, not a numpy integer
            neg_sim = float(np.dot(embedding, neg_vector))

        score = alpha * pos_sim - beta * neg_sim

        scored_results.append({
            "id": movie_id,
            "positive_similarity": pos_sim,
            "negative_similarity": neg_sim,
            "score": score,
            "metadata": row
        })

    # Best score first; the sort is stable, so ties keep retrieval order.
    scored_results.sort(key=lambda item: item["score"], reverse=True)
    return scored_results[:top_k]


In [12]:
# Retrieve 100 candidates for "spiderman" and rank them by
# alpha * positive_similarity - beta * negative_similarity, where the negative
# query is "tom holland". An empty row_checker applies no metadata constraints.
results = search_movies_dual_query_fast(
    positive_query="spiderman",
    negative_query="tom holland",
    top_k=10,
    row_checker={},
    search_batch_size = 100,
    alpha=1.0,
    beta=1.0
)


In [13]:
# Print each hit as "<rank>. ID: <movie id>, Result: <metadata dict>".
for position, hit in enumerate(results, start=1):
    movie_id = hit["id"]
    details = hit["metadata"]
    print(f"{position}. ID: {movie_id}, Result: {details}")

1. ID: tt0948470, Result: {'id': 'tt0948470', 'title': 'The Amazing Spider-Man', 'year': 2012, 'duration': 136, 'MPA': 'PG-13', 'rating': 6.9, 'votes': '729K', 'meta_score': 66.0, 'description': 'After Peter Parker is bitten by a genetically altered spider, he gains newfound, spider-like powers and ventures out to save the city from the machinations of a mysterious reptilian foe.', 'Movie_Link': 'https://www.imdb.com/title/tt0948470', 'writers': "['James Vanderbilt', 'Alvin Sargent', 'Steve Kloves']", 'directors': "['Marc Webb']", 'stars': "['Andrew Garfield', 'Emma Stone', 'Rhys Ifans', 'Irrfan Khan', 'Denis Leary', 'Martin Sheen', 'Sally Field', 'Campbell Scott', 'Embeth Davidtz', 'Chris Zylka']", 'budget': '$230,000,000 (estimated)', 'opening_weekend_gross': '$62,004,688', 'gross_worldwide': '$758,707,722', 'gross_us_canada': '$262,782,352', 'release_date': 2012.0, 'countries_origin': "['United States']", 'filming_locations': "['Universal Studios Hollywood - 1000 Universal Studios B

In [14]:
# Same search with the actor name folded into the positive query and no
# negative query, so the negative similarity is 0.0 and beta has no effect.
results = search_movies_dual_query_fast(
    positive_query="spiderman tom holland",
    top_k=10,
    row_checker={},
    search_batch_size = 100,
    alpha=1.0,
    beta=1.0
)

In [15]:
# Print each hit as "<rank>. ID: <movie id>, Result: <metadata dict>".
for position, hit in enumerate(results, start=1):
    movie_id = hit["id"]
    details = hit["metadata"]
    print(f"{position}. ID: {movie_id}, Result: {details}")

1. ID: tt10872600, Result: {'id': 'tt10872600', 'title': 'Spider-Man: No Way Home', 'year': 2021, 'duration': 148, 'MPA': 'PG-13', 'rating': 8.2, 'votes': '941K', 'meta_score': 71.0, 'description': "With Spider-Man's identity now revealed, Peter asks Doctor Strange for help. When a spell goes wrong, dangerous foes from other worlds start to appear.", 'Movie_Link': 'https://www.imdb.com/title/tt10872600', 'writers': "['Chris McKenna', 'Erik Sommers', 'Stan Lee']", 'directors': "['Jon Watts']", 'stars': "['Tom Holland', 'Zendaya', 'Benedict Cumberbatch', 'Jacob Batalon', 'Jon Favreau', 'Jamie Foxx', 'Willem Dafoe', 'Alfred Molina', 'Benedict Wong', 'Tony Revolori']", 'budget': '$200,000,000 (estimated)', 'opening_weekend_gross': '$260,138,569', 'gross_worldwide': '$1,952,732,181', 'gross_us_canada': '$814,866,759', 'release_date': 2021.0, 'countries_origin': "['United States']", 'filming_locations': "['Iceland']", 'production_companies': "['Columbia Pictures', 'Pascal Pictures', 'Marvel 

In [16]:
# Positive-only query that mixes a topic ("war ww2") with an actor name; there
# is no negative query and no metadata constraints.
results = search_movies_dual_query_fast(
    positive_query="war ww2 brad pitt",
    top_k=10,
    row_checker={},
    search_batch_size = 100,
    alpha=1.0,
    beta=1.0
)

In [17]:
# Print each hit as "<rank>. ID: <movie id>, Result: <metadata dict>".
for position, hit in enumerate(results, start=1):
    movie_id = hit["id"]
    details = hit["metadata"]
    print(f"{position}. ID: {movie_id}, Result: {details}")

1. ID: tt0076102, Result: {'id': 'tt0076102', 'title': 'The Biggest Battle', 'year': 1978, 'duration': 90, 'MPA': 'PG', 'rating': 4.7, 'votes': '619', 'meta_score': 0.0, 'description': 'How World War II affected the lives of a German family and an American family, both of whom had sons and fathers fighting in the war.', 'Movie_Link': 'https://www.imdb.com/title/tt0076102', 'writers': "['Umberto Lenzi', 'Cesare Frugoni']", 'directors': "['Umberto Lenzi']", 'stars': "['Helmut Berger', 'Samantha Eggar', 'Giuliano Gemma', 'John Huston', 'Stacy Keach', 'Ray Lovelock', 'Aldo Massasso', 'Venantino Venantini', 'Ida Galli', 'Edwige Fenech']", 'budget': nan, 'opening_weekend_gross': nan, 'gross_worldwide': nan, 'gross_us_canada': nan, 'release_date': 1978.0, 'countries_origin': "['Italy', 'West Germany', 'Yugoslavia']", 'filming_locations': "['Venice, California, USA (beach scenes)']", 'production_companies': "['Dania Film', 'National Cinematografica']", 'awards_content': '[]', 'genres': "['Dram