In [12]:
import tensorflow_datasets as tfds
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf

In [13]:
# Define the genre mapping (example mapping for 21 genres, modify as needed)
genre_mapping = {
    0: 'Action', 1: 'Adventure', 2: 'Animation', 3: 'Children', 4: 'Comedy', 
    5: 'Crime', 6: 'Documentary', 7: 'Drama', 8: 'Fantasy', 9: 'Film-Noir', 
    10: 'Horror', 11: 'Musical', 12: 'Mystery', 13: 'Romance', 14: 'Sci-Fi', 
    15: 'Thriller', 16: 'War', 17: 'Western', 18: 'IMAX', 19: 'Documentary', 
    20: 'Family'
}

In [22]:
# Load the MovieLens dataset
data, info = tfds.load('movielens/100k-movies', split='train', as_supervised=False, with_info=True)

# Convert dataset to a DataFrame
movies = tfds.as_dataframe(data)

# Convert 'movie_id' to string format (removing b prefix from byte string)
movies['movie_id'] = movies['movie_id'].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else str(x))
print(movies.head(),"\n\n")
print(movies.describe())

  movie_genres movie_id                      movie_title
0          [4]     1681           b'You So Crazy (1994)'
1       [4, 7]     1457   b'Love Is All There Is (1996)'
2       [1, 3]      500          b'Fly Away Home (1996)'
3          [0]      838  b'In the Line of Duty 2 (1987)'
4          [7]     1648       b'Niagara, Niagara (1997)' 


       movie_genres movie_id            movie_title
count          1682     1682                   1682
unique          868     1682                   1664
top             [7]     1681  b"Ulee's Gold (1997)"
freq            376        1                      2


In [15]:
# Preprocess the genres: Convert genre IDs to genre names
def map_genres(genre_ids):
    return ', '.join([genre_mapping[genre_id] for genre_id in genre_ids])

In [16]:
# Apply genre processing
movies['genres'] = movies['movie_genres'].apply(lambda x: map_genres(x.numpy()) if isinstance(x, tf.Tensor) else map_genres(x))

In [17]:
# Content-Based Filtering using TF-IDF on genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

In [18]:
# Compute cosine similarity between movies based on genres
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
# Define the recommendation function
def recommend_movie(movie_id, cosine_sim=cosine_sim):
    # Ensure the movie_id is valid and in string format
    movie_id = str(movie_id)
    
    if movie_id not in movies['movie_id'].values:
        return f"Movie ID {movie_id} not found in the dataset."

    idx = movies.index[movies['movie_id'] == movie_id].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Get top 5 similar movies

    movie_indices = [i[0] for i in sim_scores]
    return movies['movie_title'].iloc[movie_indices].tolist()

# Recommend movies similar to a specific movie (e.g., with movie_id = 1681 as a string)
print(recommend_movie(1681))

[b'To Wong Foo, Thanks for Everything! Julie Newmar (1995)', b'Reckless (1995)', b'Senseless (1998)', b'Live Nude Girls (1995)', b'Vermin (1998)']


In [20]:
print(recommend_movie(11))

[b'Albino Alligator (1996)', b'Purple Noon (1960)', b'Playing God (1997)', b'Seven (Se7en) (1995)', b'Hard Eight (1996)']
