In [1]:
import pandas as pd
from scipy.sparse import load_npz
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Directory that holds the precomputed genome artefacts. Kept in one place so
# the notebook can be pointed at another location by editing a single line.
GENOME_DIR = '/kaggle/input/movielens/v1_genome/v1_genome'

# Sparse matrix indexed by movie row (rows) and tag column (columns).
tag_matrix = load_npz(f'{GENOME_DIR}/tag_matrix.npz')
# Lookup table for the matrix columns (col_index, tag_id, tag).
tag_vocab = pd.read_csv(f'{GENOME_DIR}/tag_index.csv')
# Lookup table for the matrix rows (row_index, movie_id, title).
movie_index = pd.read_csv(f'{GENOME_DIR}/movie_index.csv')


In [3]:
# Sanity check: matrix dimensions next to the first rows of the movie lookup table.
tag_matrix.shape, movie_index.head()

((9734, 1094),
    row_index  movie_id                               title
 0          0         1                    Toy Story (1995)
 1          1         2                      Jumanji (1995)
 2          2         3             Grumpier Old Men (1995)
 3          3         4            Waiting to Exhale (1995)
 4          4         5  Father of the Bride Part II (1995))

In [4]:
# Get the top-N most similar movies for a given title

def similar_movies(title, top_n):
    """Return the ``top_n`` movies most similar to ``title`` by tag cosine similarity.

    Parameters
    ----------
    title : str
        Movie title, matched case-insensitively against ``movie_index['title']``.
    top_n : int
        Maximum number of similar movies to return.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``title`` and ``similarity``, ordered from most to least
        similar. ``None`` (after printing a message) when the title is unknown.
    """
    row = movie_index[movie_index['title'].str.lower() == title.lower()]
    if row.empty:
        print("Movie not found")
        return None
    movie_row_index = int(row['row_index'].values[0])

    target = tag_matrix[movie_row_index]
    similarity_scores = cosine_similarity(target, tag_matrix).flatten()

    # Remove the query movie by its row index rather than assuming it is ranked
    # first. A movie with an identical tag vector ties with it at 1.0 (and an
    # all-zero vector scores 0 against everything), so the query is not
    # guaranteed to sit at position 0 of the ranking.
    ranked = similarity_scores.argsort()[::-1]
    similar_indices = ranked[ranked != movie_row_index][:top_n]

    # NOTE(review): .iloc is positional, so this assumes movie_index rows are
    # stored in row_index order -- confirm for the full table.
    result = movie_index.iloc[similar_indices][["title"]].copy()
    result["similarity"] = similarity_scores[similar_indices]

    return result

In [5]:
# Demo: nearest neighbours of Toy Story by tag similarity.
similar_movies('Toy Story (1995)', top_n=20)

Unnamed: 0,title,similarity
2771,Toy Story 2 (1999),0.94841
4333,"Monsters, Inc. (2001)",0.944849
2065,"Bug's Life, A (1998)",0.92862
5447,Finding Nemo (2003),0.928589
9025,Toy Story 3 (2010),0.926867
7973,Ratatouille (2007),0.916357
6793,"Incredibles, The (2004)",0.903782
3811,Shrek (2001),0.903104
4604,Ice Age (2002),0.892367
8692,Up (2009),0.891387


In [6]:
# Peek at the tag lookup table (one row per matrix column).
tag_vocab.head(5)

Unnamed: 0,col_index,tag_id,tag
0,0,22,aardman
1,1,112,secret service
2,2,167,hillarious
3,3,270,christian
4,4,362,mummy


In [7]:
# Tag names ordered by col_index, so that tag_names[c] can be looked up with a
# matrix column number c.
# NOTE(review): assumes col_index runs 0..n_tags-1 with no gaps -- confirm.
tag_names = tag_vocab.sort_values('col_index')['tag'].to_numpy()

In [8]:
def top_shared_tags(row_a, row_b, k):
    """Names of the k tags that two movies have most strongly in common.

    The rows are read directly from ``tag_matrix``'s ``indptr`` / ``indices`` /
    ``data`` arrays, treating ``indptr`` as row pointers (i.e. a CSR layout is
    assumed). The strength of a shared tag is the product of its stored value
    in both rows; tags are returned strongest first.
    """
    def stored_entries(row):
        # Column ids and values stored for a single matrix row.
        start, stop = tag_matrix.indptr[row], tag_matrix.indptr[row + 1]
        return tag_matrix.indices[start:stop], tag_matrix.data[start:stop]

    cols_a, vals_a = stored_entries(row_a)
    cols_b, vals_b = stored_entries(row_b)

    # Columns present in both rows, plus where each one sits in either row.
    shared_cols, pos_a, pos_b = np.intersect1d(cols_a, cols_b, return_indices=True)

    strength = vals_a[pos_a] * vals_b[pos_b]
    # Descending by strength, truncated to the k strongest.
    strongest = np.argsort(strength)[::-1][:k]

    return [str(tag_names[col]) for col in shared_cols[strongest]]



In [9]:
# Rows 0 and 4333 correspond to Toy Story (1995) and Monsters, Inc. (2001)
# according to the tables printed earlier in this notebook.
top_shared_tags(0, 4333, k=3)


['animation', 'animated', 'kids']

In [10]:
def row_by_title(title: str) -> int:
    """Look up a movie's ``row_index`` by title, ignoring case.

    Returns the ``row_index`` of the first matching row of ``movie_index``.
    Raises ValueError when no title matches.
    """
    title_matches = movie_index["title"].str.lower() == title.lower()
    hits = movie_index.loc[title_matches, "row_index"]
    if len(hits) == 0:
        raise ValueError(f"Movie not found: {title}")
    return int(hits.iloc[0])


In [11]:
def recommend_with_why(title: str, top_n: int = 5, k_tags: int = 3):
    """
    Find the top_n movies closest to ``title`` and explain each match.

    Returns a DataFrame with columns ``title``, ``similarity`` and ``why``,
    where ``why`` is a comma-separated string of up to k_tags tag names that
    the recommended movie shares with the query movie. An unknown title
    surfaces as the ValueError raised by row_by_title.
    """
    query_row = row_by_title(title)

    # Cosine similarity of the query movie against every row of the tag matrix.
    scores = cosine_similarity(tag_matrix[query_row], tag_matrix).ravel()

    # Best match first, with the query movie itself filtered out by row index.
    ranking = scores.argsort()[::-1]
    neighbours = ranking[ranking != query_row][:top_n]

    table = movie_index.iloc[neighbours][["title"]].copy()
    table["similarity"] = scores[neighbours]

    explanations = []
    for other_row in neighbours:
        shared = top_shared_tags(query_row, other_row, k_tags)
        explanations.append(", ".join(shared))
    table["why"] = explanations

    return table


In [12]:
# Demo: Toy Story neighbours together with the shared tags that explain each match.
recommend_with_why("Toy Story (1995)", top_n=20)


Unnamed: 0,title,similarity,why
2771,Toy Story 2 (1999),0.94841,"toys, kids and family, computer animation"
4333,"Monsters, Inc. (2001)",0.944849,"animation, animated, kids"
2065,"Bug's Life, A (1998)",0.92862,"kids and family, animation, animated"
5447,Finding Nemo (2003),0.928589,"kids, animated, animation"
9025,Toy Story 3 (2010),0.926867,"toys, computer animation, animation"
7973,Ratatouille (2007),0.916357,"animation, animated, computer animation"
6793,"Incredibles, The (2004)",0.903782,"animation, animated, computer animation"
3811,Shrek (2001),0.903104,"kids, animation, computer animation"
4604,Ice Age (2002),0.892367,"kids, animation, animated"
8692,Up (2009),0.891387,"kids, animated, animation"


In [13]:
def build_taste_vector(fav_movies):
    """Average the tag vectors of a user's favourite movies into one taste vector.

    Parameters
    ----------
    fav_movies : iterable of str
        Titles, matched case-insensitively against ``movie_index['title']``.
        Titles that are not found are reported on stdout and skipped.

    Returns
    -------
    The column-wise mean of the matched rows of ``tag_matrix``, i.e. whatever
    ``tag_matrix[indices].mean(axis=0)`` yields (for a SciPy sparse matrix this
    is presumably a dense 1 x n_tags ``numpy.matrix`` -- confirm).

    Raises
    ------
    ValueError
        If none of the given titles is found.
    """
    # Lower-case the title column once instead of once per favourite.
    titles_lower = movie_index["title"].str.lower()
    indices = []

    for movie in fav_movies:
        match = movie_index[titles_lower == movie.lower()]
        if match.empty:
            # Name the title so the user can see which favourite was skipped.
            print(f"Movie not found: {movie}")
            continue
        # Only the first matching row is used if a title occurs more than once.
        indices.append(int(match.iloc[0]["row_index"]))

    if not indices:
        raise ValueError("No Valid movies found.")

    vectors = tag_matrix[indices]
    taste_vec = vectors.mean(axis=0)

    return taste_vec
    

In [17]:
def recommend_for_user(fav_movies, top_n=10):
    """Recommend movies that are close to the average taste of a list of favourites.

    Parameters
    ----------
    fav_movies : iterable of str
        Favourite titles; passed to ``build_taste_vector`` to form the taste
        vector and then excluded from the recommendations.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``similarity``, most similar first.

    Raises
    ------
    Whatever ``build_taste_vector`` raises (a ValueError when no favourite is
    found).
    """
    # Materialise once: the titles are consumed twice below, which would
    # silently exhaust a generator.
    fav_movies = list(fav_movies)

    taste_vec = build_taste_vector(fav_movies)

    # np.asarray normalises whatever array-like build_taste_vector returns
    # (e.g. an np.matrix) into a plain ndarray for cosine_similarity.
    sims = cosine_similarity(np.asarray(taste_vec), tag_matrix).ravel()

    # Exclude the seeds with the same case-insensitive title match that
    # build_taste_vector uses. An exact-case match would let a favourite typed
    # in a different case be averaged in and then recommended straight back.
    wanted = {movie.lower() for movie in fav_movies}
    is_seed = movie_index["title"].str.lower().isin(wanted)
    seed_rows = movie_index.loc[is_seed, "row_index"].to_numpy()

    ranked_idx = sims.argsort()[::-1]
    ranked_idx = ranked_idx[~np.isin(ranked_idx, seed_rows)][:top_n]

    recs = movie_index.iloc[ranked_idx][["title"]].copy()
    recs["similarity"] = sims[ranked_idx]

    return recs

In [18]:
# Demo: recommendations seeded by three animated favourites.
favorites = [
    "Toy Story (1995)",
    "Finding Nemo (2003)",
    "Monsters, Inc. (2001)"
]

recommend_for_user(favorites, top_n=10)


Unnamed: 0,title,similarity
2065,"Bug's Life, A (1998)",0.9551
7973,Ratatouille (2007),0.949706
2771,Toy Story 2 (1999),0.946657
9025,Toy Story 3 (2010),0.932415
3811,Shrek (2001),0.932234
4604,Ice Age (2002),0.931415
6793,"Incredibles, The (2004)",0.92473
8966,How to Train Your Dragon (2010),0.916575
346,"Lion King, The (1994)",0.913185
8692,Up (2009),0.909701


In [22]:
# Demo: recommendations seeded by science-fiction favourites.
# NOTE(review): titles must equal the movie_index spelling (ignoring case).
# The saved output reports four of these five as not found;
# "The Matrix (1999)" is one of them because the index stores that film as
# "Matrix, The (1999)" (it appears among the recommendations instead).
favorites = [
    "Inception (2010)",
    "Interstellar (2014)",
    "The Matrix (1999)",
    "Blade Runner 2049 (2017)",
    "Ex Machina (2015)"
]

recommend_for_user(favorites, top_n=10)


Movie not found
Movie not found
Movie not found
Movie not found


Unnamed: 0,title,similarity
2271,"Matrix, The (1999)",0.903733
49,"Usual Suspects, The (1995)",0.903624
9239,Source Code (2011),0.900826
3739,Memento (2000),0.895985
7880,"Prestige, The (2006)",0.88789
302,"Shawshank Redemption, The (1994)",0.883282
9543,Looper (2012),0.883246
7693,Inside Man (2006),0.881336
2441,"Sixth Sense, The (1999)",0.878963
2624,Fight Club (1999),0.876476
