In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

### V1: keyword-based recommendations (sparse one-hot keywords + cosine similarity)

In [2]:
# Load the raw keywords table from a path relative to the notebook's working directory.
# As the preview shows, `keywords` arrives as a *string* holding a Python-literal list
# of {'id': ..., 'name': ...} dicts, so it still has to be parsed before use.
keywords_df = pd.read_csv('../datasets/keywords.csv')
# Last expression of the cell: render the first rows as a sanity check.
keywords_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [3]:
# Parse the stringified keyword lists into real Python lists of dicts.
# The column is overwritten in place, so the isinstance guard is what keeps this cell
# idempotent: on a re-run the values are already lists, and passing a list to
# ast.literal_eval raises ValueError ("malformed node or string").
keywords_df['keywords'] = keywords_df['keywords'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
# Keep only the human-readable keyword names: one list of strings per row.
keywords_df['keyword_names'] = keywords_df['keywords'].apply(lambda x: [item['name'] for item in x])

In [4]:
# One-hot encode the keyword lists: one row per entry of keywords_df['keyword_names'],
# one column per distinct keyword name seen during fitting.
# sparse_output=True makes the binarizer return a SciPy sparse matrix instead of a
# dense array.
mlb_v1 = MultiLabelBinarizer(sparse_output=True)
# Rows come out in the same order as keywords_df, which is what lets later code map a
# matrix row position back to a movie via keywords_df.iloc.
keywords_matrix_v1 = mlb_v1.fit_transform(keywords_df['keyword_names'])

In [5]:
def get_similar_movies_sparse_v1(movie_idx, n_recommendations=5):
    """Return the movies whose keyword sets are most similar to a given movie.

    Similarity is the cosine similarity between the query movie's row of
    ``keywords_matrix_v1`` and every row of that matrix. Only one row of
    similarities is computed, never the full movie-by-movie matrix.

    Parameters
    ----------
    movie_idx : int
        Row position of the query movie in ``keywords_matrix_v1`` (and, by
        construction, in ``keywords_df``). Negative positions count from the
        end, as with normal Python indexing.
    n_recommendations : int, default 5
        Maximum number of similar movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id`` (values of ``keywords_df['id']``) and
        ``similarity``, ordered from most to least similar, with at most
        ``n_recommendations`` rows. The query movie itself is never included.
        The relative order of movies with equal similarity is whatever
        ``np.argsort`` produces.

    Raises
    ------
    IndexError
        If ``movie_idx`` is outside the range of available rows.
    ValueError
        If ``n_recommendations`` is negative.
    """
    n_movies = keywords_matrix_v1.shape[0]
    if not -n_movies <= movie_idx < n_movies:
        raise IndexError(
            f"movie_idx {movie_idx} is out of range for {n_movies} movies"
        )
    if n_recommendations < 0:
        raise ValueError("n_recommendations must be non-negative")
    # Normalise negative positions so both the row slice and the self-exclusion
    # comparison below work (a slice like [-1:0] would otherwise be empty).
    movie_idx = movie_idx % n_movies

    # Get similarities for one movie
    sim_scores = cosine_similarity(
        keywords_matrix_v1[movie_idx:movie_idx + 1],
        keywords_matrix_v1
    ).flatten()

    # Take one candidate more than requested so that n_recommendations remain
    # after the query movie is dropped from the list.
    similar_indices = np.argsort(sim_scores)[-n_recommendations - 1:][::-1]

    # Remove the movie itself by index, not by position: it is not guaranteed to
    # rank first (other movies can tie at 1.0) or even to be among the candidates
    # (a movie without keywords scores 0 against everything, itself included).
    # Cap afterwards, because nothing was removed in that second case.
    similar_indices = similar_indices[similar_indices != movie_idx][:n_recommendations]
    # Look the scores up from the filtered indices so both columns stay aligned.
    similar_scores = sim_scores[similar_indices]

    return pd.DataFrame({
        'movie_id': keywords_df.iloc[similar_indices]['id'].values,
        'similarity': similar_scores
    })

In [6]:
# Smoke test: five nearest keyword neighbours of the movie in row 0
# (id 862 in the keywords_df preview).
get_similar_movies_sparse_v1(0, 5)

Unnamed: 0,movie_id,similarity
0,118051,0.471405
1,14499,0.3849
2,273578,0.333333
3,84023,0.333333
4,163710,0.333333
