<a href="https://colab.research.google.com/github/Morris136/Homework-/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Input files: the movie-plot table and its precomputed embedding matrix.
PLOTS_CSV_PATH = 'movie_plots.csv'
EMBEDDINGS_NPY_PATH = 'movie_plots_embeddings.npy'

# Table of movies; the code below reads its 'title' and 'plot' columns.
df = pd.read_csv(PLOTS_CSV_PATH)

# Expected shape: (num_items, embedding_dim).
# NOTE(review): presumably row i corresponds to df row i -- confirm against
# whatever produced the .npy file.
embeddings = np.load(EMBEDDINGS_NPY_PATH)

def find_top_k_similar(embedding, embeddings_matrix, df, top_k=5):
    """
    Find the top_k items most similar to the given embedding.

    Similarity is cosine similarity between ``embedding`` and each row of
    ``embeddings_matrix``. Row ``i`` of the matrix is looked up as positional
    row ``i`` of ``df`` to obtain the title.

    Args:
        embedding (array-like): Single embedding vector, shape (embedding_dim,)
        embeddings_matrix (np.array): All item embeddings, shape (num_items, embedding_dim)
        df (pd.DataFrame): Dataset with a 'title' column, positionally aligned
            with the rows of ``embeddings_matrix``
        top_k (int): Maximum number of similar items to return. Values <= 0
            yield an empty list (same convention as ``heapq.nlargest``).

    Returns:
        List of tuples: [(title, similarity_score), ...] sorted descending by
        similarity. Contains at most ``top_k`` entries (fewer if there are
        fewer items). Items with equal scores keep their original row order.
    """
    # Guard first: with top_k == 0 the slice ``[-0:]`` would select the WHOLE
    # array, and a negative top_k would select an arbitrary tail of it.
    if top_k <= 0:
        return []

    # sklearn's cosine_similarity expects 2D inputs; asarray also lets callers
    # pass a plain list/tuple instead of an ndarray.
    query = np.asarray(embedding).reshape(1, -1)

    # Similarity of the query against every stored embedding, shape (num_items,)
    similarities = cosine_similarity(query, embeddings_matrix)[0]

    # Sort by descending score. Negating + a stable sort makes the order of
    # tied scores deterministic (ascending row index) rather than depending
    # on the default, non-stable sort algorithm.
    top_k_idx = np.argsort(-similarities, kind="stable")[:top_k]

    # Select the column once, then index positionally; this avoids building a
    # full row Series (and any cross-column dtype coercion) for every hit.
    titles = df['title']
    results = [(titles.iloc[i], similarities[i]) for i in top_k_idx]

    return results

# --- Example usage ---

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Which row of df to use as the query, and how many neighbours to list.
# Kept in one place so the printed header cannot drift from what is computed.
query_index = 0
num_results = 5

# Embed the query movie's plot text. encode() is given a one-element list,
# so take element [0] to get the single embedding back.
# NOTE(review): assumes `embeddings` was produced with this same model, so the
# vectors are comparable -- confirm. If so, the query movie itself is expected
# to appear among the results.
query_row = df.iloc[query_index]
example_text = query_row['plot']
example_embedding = model.encode([example_text])[0]

top_similar = find_top_k_similar(example_embedding, embeddings, df, top_k=num_results)

print(f"Top {num_results} movies similar to: '{query_row['title']}'\n")
for title, score in top_similar:
    print(f"{title} (similarity: {score:.4f})")
