In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.optimizers import Adam

In [2]:
data = pd.read_csv('/content/movies.csv')

In [3]:
# Preprocessing
# Split the genres into a list for each movie
data['genres'] = data['genres'].apply(lambda x: x.split('|'))

In [4]:
# One-hot encode the genres using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres'])

In [9]:
# Create a model to learn embeddings for the movies
input_dim = genres_encoded.shape[1]
embedding_dim = input_dim  # Change embedding_dim to match the output dimension

model = Sequential([
    Dense(128, activation='relu', input_shape=(input_dim,)),
    Dense(64, activation='relu'),
    Dense(embedding_dim, activation='relu') # Output layer now has the correct dimension
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')


In [11]:
# Train the model
model.fit(genres_encoded, genres_encoded, epochs=50, batch_size=256, verbose=1)


Epoch 1/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0314
Epoch 2/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0164
Epoch 3/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0163
Epoch 4/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0165
Epoch 5/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0163
Epoch 6/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0164
Epoch 7/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0163
Epoch 8/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0163
Epoch 9/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0164
Epoch 10/50
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - lo

<keras.src.callbacks.history.History at 0x7c5152a86410>

In [12]:
# Generate embeddings for all movies
movie_embeddings = model.predict(genres_encoded)


[1m2738/2738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step


In [13]:
# Function to recommend movies based on cosine similarity
def recommend_movies(movie_title, top_n=5):
    # Find the index of the input movie
    idx = data[data['title'] == movie_title].index[0]

    # Compute cosine similarities
    similarities = cosine_similarity([movie_embeddings[idx]], movie_embeddings)[0]

    # Get the indices of the most similar movies
    similar_indices = similarities.argsort()[-top_n-1:][::-1]
    similar_indices = [i for i in similar_indices if i != idx][:top_n]  # Exclude the input movie

    # Fetch the titles of the recommended movies
    recommended_movies = data.iloc[similar_indices]['title'].values
    return recommended_movies

In [14]:
# Example usage
movie_to_recommend = "Toy Story (1995)"
recommended = recommend_movies(movie_to_recommend, top_n=5)

print(f"Movies similar to '{movie_to_recommend}':")
for movie in recommended:
    print(movie)


Movies similar to 'Toy Story (1995)':
Pil's Adventures (2021)
Moana (2016)
The Good Dinosaur (2015)
Soul (2020)
Chickenhare and the Hamster of Darkness (2022)
