### Dataset

https://grouplens.org/datasets/movielens/

In [None]:
from google.colab import drive

# Colab-only step: attach Google Drive so that files under
# /content/drive/MyDrive are readable from the notebook.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install pandas
!pip install numpy
!pip install keras
!pip install tensorflow
!pip install sklearn

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate
import tensorflow as tf
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Restrict TensorFlow to the first GPU when one is present. On a CPU-only
# runtime the list is empty, so indexing it unconditionally would raise
# IndexError; in that case TensorFlow's default device placement is kept.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_visible_devices(physical_devices[0], 'GPU')

### Movie recommendation using Keras

In [None]:
# Load the MovieLens dataset
ratingsDF = pd.read_csv('/content/drive/MyDrive/ml-data-set/ratings.csv')

print(f"Number of rows in the original dataset: {len(ratingsDF)}")
# Preprocess the data
# Map every distinct raw userId to a 0-based position (order of first
# appearance in the file).
user_ids = ratingsDF['userId'].unique()
user2idx = {user_id: i for i, user_id in enumerate(user_ids)}

# NOTE(review): movie2idx is built the same way, but it is never applied to the
# movieId column in this cell, so movieId keeps its raw MovieLens value.
movie_ids = ratingsDF['movieId'].unique()
movie2idx = {movie_id: i for i, movie_id in enumerate(movie_ids)}

# Replace userId with its user2idx position. This happens before any
# filtering, so the surviving ids are not contiguous afterwards.
ratingsDF['userId'] = ratingsDF['userId'].map(user2idx)

# Filter movies with at least 10 votes
movie_counts = ratingsDF.groupby('movieId')['userId'].count()
popular_movies = movie_counts[movie_counts >= 10].index
ratingsDF = ratingsDF[ratingsDF['movieId'].isin(popular_movies)]

# Filter users who have voted for at least 50 movies
# (counted after the movie filter above has been applied)
user_counts = ratingsDF.groupby('userId')['movieId'].count()
active_users = user_counts[user_counts >= 50].index
ratingsDF = ratingsDF[ratingsDF['userId'].isin(active_users)]

# Keep only the 1000 movies with the highest number of non-null `rating` rows.
movieIndex = ratingsDF.groupby("movieId").count().sort_values(by= \
"rating",ascending=False)[0:1000].index
ratingsDF2 = ratingsDF[ratingsDF.movieId.isin(movieIndex)]
# Bare expression in the middle of the cell: its result is not displayed
# (only the last expression of a cell is rendered).
ratingsDF2.count()

# Draw 1000 of the remaining users at random with a fixed seed (2018).
# NOTE(review): sample(n=1000) presumably raises if fewer than 1000 users are
# left after the filters above - confirm for smaller datasets.
userIndex = ratingsDF2.groupby("userId").count().sort_values(by= \
"rating",ascending=False).sample(n=1000, random_state=2018).index
ratingsDF3 = ratingsDF2[ratingsDF2.userId.isin(userIndex)]
# Same as above: not displayed because it is not the cell's last expression.
ratingsDF3.count()

# One-column lookup frames holding the distinct movie / user ids that survived.
movies = ratingsDF3.movieId.unique()
moviesDF = pd.DataFrame(data=movies,columns=['originalMovieId'])

users = ratingsDF3.userId.unique()
usersDF = pd.DataFrame(data=users,columns=['originalUserId'])

# Join ratingsDF3 against the distinct ids taken from ratingsDF3 itself and
# drop the helper column again.
# NOTE(review): this looks like it keeps the same set of rows and only rebuilds
# the index / row order - confirm before removing, since the row order feeds
# the seeded train/test split later on.
ratingsDF3 = ratingsDF3.merge(moviesDF,left_on='movieId', \
right_on='originalMovieId')
ratingsDF3.drop(labels='originalMovieId', axis=1, inplace=True)

ratingsDF3 = ratingsDF3.merge(usersDF,left_on='userId', \
right_on='originalUserId')
ratingsDF3.drop(labels='originalUserId', axis=1, inplace=True)

# Number of distinct movies / users left in the filtered frame.
# NOTE(review): these are counts, not the largest id value. userId was indexed
# before filtering and movieId is the raw id, so id values can exceed them.
n_items = ratingsDF3['movieId'].nunique()
n_users = ratingsDF3['userId'].nunique()

print(f"Number of rows in filtered dataset: {len(ratingsDF3)}")
ratingsDF3.head()

Number of rows in the original dataset: 25000095
Number of rows in filtered dataset: 128327


Unnamed: 0,userId,movieId,rating,timestamp
0,85,1,5.0,945462775
1,85,2,3.0,945462968
2,85,19,1.0,945669825
3,85,21,4.0,945668950
4,85,34,4.0,945461880


Once we have filtered our dataset, we must split it into a training set and a test set: 20% of the data is used for testing, while the remaining 80% is used for training.

In [None]:
# Hold out 20% of the filtered ratings for evaluation; the fixed seed keeps
# the split identical between runs.
train, test = train_test_split(
    ratingsDF3,
    test_size=0.2,
    random_state=42,
)

Now we create our neural network model. The model consists of three parts:
*   Input layers (one takes the movie ID and one takes the user ID)
*   Embedding layers (each ID is mapped to a 5-dimensional vector)
*   Output layer (this layer is tasked with giving the predicted values)

First, we create the user and movie embedding vectors, which are then concatenated and passed through 2 hidden layers. The first hidden layer consists of 128 neurons, while the second consists of 32 neurons. The output layer consists of a single neuron, which gives the predicted rating that the user would give to the movie.








In [None]:
def custom_activation(x, min_val=0.0, max_val=5.0):
    """Sigmoid activation rescaled to the open interval (min_val, max_val).

    Used on the output neuron so that predictions stay inside the rating
    scale instead of being unbounded.

    Args:
        x: Pre-activation tensor.
        min_val: Lower bound of the output range (default 0.0).
        max_val: Upper bound of the output range (default 5.0).

    Returns:
        A tensor shaped like `x` holding sigmoid(x) * (max_val - min_val) + min_val.
    """
    return tf.keras.backend.sigmoid(x) * (max_val - min_val) + min_val

In [None]:
# The embedding tables are indexed directly by the id values stored in
# ratingsDF3. Those ids are not contiguous (movieId is the raw MovieLens id and
# userId was indexed before filtering), so each table needs max(id) + 1 rows.
# Sizing by the number of distinct ids (n_items / n_users) leaves most ids out
# of range for the lookup.
n_movie_rows = int(ratingsDF3['movieId'].max()) + 1
n_user_rows = int(ratingsDF3['userId'].max()) + 1

# creating movie embedding path
movie_input = Input(shape=[1], name="Movie-Input")
movie_embedding = Embedding(n_movie_rows, 5, name="Movie-Embedding")(movie_input)
movie_vec = Flatten(name="Flatten-Movies")(movie_embedding)
# creating user embedding path
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(n_user_rows, 5, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)
# concatenate features
conc = Concatenate()([movie_vec, user_vec])
# add fully-connected-layers
fc1 = Dense(128, activation='relu')(conc)
fc2 = Dense(32, activation='relu')(fc1)
# single output neuron; custom_activation keeps the prediction within the rating range
out = Dense(1, activation=custom_activation)(fc2)
# Creating model and compiling it
# Input order is [user, movie]; fit/evaluate/predict must pass ids in that order.
model = Model([user_input, movie_input], out)
model.compile('adam', 'mean_squared_error')

In [None]:
# Train on the 80% split. Inputs are passed as NumPy arrays in the model's
# [user, movie] input order, the same way the evaluate and predict cells do,
# rather than relying on Keras accepting pandas Series.
history = model.fit(
    [np.array(train.userId), np.array(train.movieId)],
    np.array(train.rating),
    epochs=20,
    verbose=1,
)
history

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7e48a51be890>

In [None]:
# Score the held-out split. The model was compiled with a single MSE loss and
# no extra metrics, so evaluate() returns that loss value.
test_inputs = [np.array(test.userId), np.array(test.movieId)]
test_targets = np.array(test.rating)
mse = model.evaluate(test_inputs, test_targets)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.9621791839599609


In [None]:
def _print_movie_rows(rows, value_col, value_label):
    """Print id, `value_col`, title and genres for every row of `rows`.

    Titles and genres are looked up in the global `moviesDF`; rows whose
    movieId has no entry there are skipped.
    """
    for _, row in rows.iterrows():
        movie_id = int(row['movieId'])
        movie = moviesDF[moviesDF['movieId'] == movie_id]
        if movie.empty:
            continue
        movie_name = movie['title'].values[0]
        movie_genre = movie['genres'].values[0]
        print(f'Movie ID: {movie_id}, {value_label}: {row[value_col]}, Movie Title: {movie_name}, Genre: {movie_genre}')


# First we must separate movies that the user watched and didn't watch.
# The user is drawn from the rating rows, so users with more ratings are more
# likely to be picked.
user_id = random.choice(ratingsDF3['userId'].tolist())  # Example user ID

user_watched_movies = ratingsDF3[ratingsDF3['userId'] == user_id]

# Candidates are the distinct movies this user has not rated. Each candidate is
# paired with the selected user's id, so the predictions are for this user and
# not for whichever other users happened to rate those movies.
all_movie_ids = ratingsDF3['movieId'].drop_duplicates()
unwatched_ids = all_movie_ids[~all_movie_ids.isin(user_watched_movies['movieId'])]
unwatched_movies = pd.DataFrame({
    'userId': [user_id] * len(unwatched_ids),
    'movieId': np.array(unwatched_ids),
})

# We predict the rating for unwatched movies. The frame above is a new object
# (not a slice of ratingsDF3), so adding a column raises no SettingWithCopyWarning.
if len(unwatched_movies):
    predictions = model.predict([np.array(unwatched_movies['userId']), np.array(unwatched_movies['movieId'])])
    # predict() returns one column per output neuron; flatten to one value per row
    unwatched_movies['predicted_rating'] = np.array(predictions).reshape(-1)
else:
    unwatched_movies['predicted_rating'] = np.array([], dtype=float)

unwatched_movies = unwatched_movies.sort_values(by='predicted_rating', ascending=False)
recommended_movies = unwatched_movies[['movieId','predicted_rating']].drop_duplicates(subset=['movieId'])

user_watched_movies = user_watched_movies.sort_values(by='rating', ascending=False)

# Movie metadata (title, genres); replaces the earlier id-only moviesDF.
moviesDF = pd.read_csv('/content/drive/MyDrive/ml-data-set/movies.csv')

# Print user watched movies
print('User Watched Movies:')
_print_movie_rows(user_watched_movies.head(10), 'rating', 'Rating')

# Print recommended movies
print('Recommended Movies:')
_print_movie_rows(recommended_movies.head(10), 'predicted_rating', 'Predicted Rating')

User Watched Movies:
Movie ID: 1, Rating: 5.0, Movie Title: Toy Story (1995), Genre: Adventure|Animation|Children|Comedy|Fantasy
Movie ID: 551, Rating: 5.0, Movie Title: Nightmare Before Christmas, The (1993), Genre: Animation|Children|Fantasy|Musical
Movie ID: 1028, Rating: 5.0, Movie Title: Mary Poppins (1964), Genre: Children|Comedy|Fantasy|Musical
Movie ID: 1721, Rating: 5.0, Movie Title: Titanic (1997), Genre: Drama|Romance
Movie ID: 1372, Rating: 5.0, Movie Title: Star Trek VI: The Undiscovered Country (1991), Genre: Action|Mystery|Sci-Fi
Movie ID: 1073, Rating: 5.0, Movie Title: Willy Wonka & the Chocolate Factory (1971), Genre: Children|Comedy|Fantasy|Musical
Movie ID: 968, Rating: 5.0, Movie Title: Night of the Living Dead (1968), Genre: Horror|Sci-Fi|Thriller
Movie ID: 899, Rating: 5.0, Movie Title: Singin' in the Rain (1952), Genre: Comedy|Musical|Romance
Movie ID: 2161, Rating: 5.0, Movie Title: NeverEnding Story, The (1984), Genre: Adventure|Children|Fantasy
Movie ID: 1391

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unwatched_movies['predicted_rating'] = model.predict([np.array(unwatched_movies['userId']), np.array(unwatched_movies['movieId'])])


### Item-based Collaborative Filtering

In [None]:
# Movie x user rating matrix: one row per movieId, one column per userId.
# Pairs without a rating are filled with 0.
ratingsDF4 = (
    ratingsDF3
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
ratingsDF4.head()

userId,85,178,205,427,522,656,671,843,903,1307,...,160237,161003,161172,161279,162012,162016,162097,162098,162334,162468
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,5.0,0.0,0.0,3.0,4.0,5.0,0.0,...,0.0,2.5,0.0,3.5,0.0,0.0,0.0,0.0,4.5,0.0
2,3.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0


In [None]:
# Build the sparse rating matrix from the rating columns only, then expose
# movieId as a regular column so that a row's position can be mapped back to
# its movie.
# If this cell is re-run, movieId is already a column: move it back into the
# index first, otherwise it would be included in the matrix as if it were a
# rating and reset_index would add a stray `index` column.
if 'movieId' in ratingsDF4.columns:
    ratingsDF4 = ratingsDF4.set_index('movieId')
csr_data = csr_matrix(ratingsDF4.values)
ratingsDF4 = ratingsDF4.reset_index()

In [None]:
# Brute-force nearest-neighbour search over the movie rows of csr_data using
# cosine distance; n_jobs=-1 uses all available cores.
knn = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn.fit(csr_data)

In [None]:
def get_movie_recommendation(movie_name, n_movies_to_reccomend=1):
    """Return the movies whose rating vectors are closest to `movie_name`.

    Relies on the notebook globals `moviesDF` (movieId/title/genres),
    `ratingsDF4` (rating matrix with a `movieId` column and a positional
    index), `csr_data` (sparse form of that matrix) and the fitted `knn`.

    Args:
        movie_name: Exact title as it appears in `moviesDF['title']`.
        n_movies_to_reccomend: Number of neighbours to return (default 1).

    Returns:
        A DataFrame indexed 1..k with columns movieId, title, genre and
        distance, ordered from nearest to farthest. If the title is unknown,
        or the movie has no row in the rating matrix, the string
        "No movies found. Please check your input" is returned instead.

    Raises:
        ValueError: If `n_movies_to_reccomend` is smaller than 1.
    """
    if n_movies_to_reccomend < 1:
        raise ValueError("n_movies_to_reccomend must be at least 1")

    not_found = "No movies found. Please check your input"

    movie_list = moviesDF[moviesDF['title'] == movie_name]
    if movie_list.empty:
        return not_found

    # The title may exist in the catalogue while the movie was filtered out of
    # the rating matrix; treat that as "not found" instead of an IndexError.
    query_id = movie_list.iloc[0]['movieId']
    matrix_rows = ratingsDF4[ratingsDF4['movieId'] == query_id].index
    if len(matrix_rows) == 0:
        return not_found
    movie_idx = matrix_rows[0]

    # Ask for one extra neighbour because the query movie is normally returned
    # as its own nearest neighbour; never ask for more rows than the matrix has.
    n_neighbors = min(n_movies_to_reccomend + 1, csr_data.shape[0])
    distances, indices = knn.kneighbors(csr_data[movie_idx], n_neighbors=n_neighbors)

    ranked = sorted(
        zip(indices.ravel().tolist(), distances.ravel().tolist()),
        key=lambda pair: pair[1],
    )
    # Drop the query movie by row position rather than assuming it is first,
    # which would remove the wrong entry when distances tie.
    ranked = [pair for pair in ranked if pair[0] != movie_idx][:n_movies_to_reccomend]

    recommend_frame = []
    for row_pos, distance in ranked:
        rec_movie_id = int(ratingsDF4.iloc[row_pos]['movieId'])
        match = moviesDF[moviesDF['movieId'] == rec_movie_id]
        if match.empty:
            # no metadata for this id; nothing meaningful to show
            continue
        recommend_frame.append({
            'movieId': rec_movie_id,
            'title': match['title'].values[0],
            'genre': match['genres'].values[0],
            'distance': distance,
        })

    # Index by the number of rows actually produced, which can be smaller than
    # the number requested.
    return pd.DataFrame(
        recommend_frame,
        index=range(1, len(recommend_frame) + 1),
        columns=['movieId', 'title', 'genre', 'distance'],
    )

# Print user watched movies (the user's ten highest-sorted rows)
top_watched = user_watched_movies.head(10)

print('User Watched Movies:')
for _, row in top_watched.iterrows():
    movie_id = int(row['movieId'])
    rating = row['rating']
    movie = moviesDF[moviesDF['movieId'] == movie_id]
    if movie.empty:
        continue
    movie_name = movie['title'].values[0]
    movie_genre = movie['genres'].values[0]
    print(f'Movie ID: {movie_id}, Rating: {rating}, Movie Title: {movie_name}, Genre: {movie_genre}')

# For each of those movies, print its nearest neighbour in the rating matrix.
print("Recommended movies:")
for _, row in top_watched.iterrows():
    movie_id = int(row['movieId'])
    movie = moviesDF[moviesDF['movieId'] == movie_id]
    if movie.empty:
        continue
    movie_name = movie['title'].values[0]
    movie_genre = movie['genres'].values[0]
    recommended_movies = get_movie_recommendation(movie_name)
    # get_movie_recommendation returns a message string when the lookup fails;
    # calling .iloc on that string would raise, so report it and move on.
    if isinstance(recommended_movies, str) or recommended_movies.empty:
        print(f"No recommendation available for: {movie_name}")
        continue
    best_match = recommended_movies.iloc[0]
    recommended_movie_id = best_match['movieId']
    recommended_movie_name = best_match['title']
    recommended_movie_genre = best_match['genre']
    distance = best_match['distance']
    print(f"Movie ID: {recommended_movie_id}, Movie Title: {recommended_movie_name}, Genre: {recommended_movie_genre}, Distance: {distance}")


User Watched Movies:
Movie ID: 1, Rating: 5.0, Movie Title: Toy Story (1995), Genre: Adventure|Animation|Children|Comedy|Fantasy
Movie ID: 6016, Rating: 5.0, Movie Title: City of God (Cidade de Deus) (2002), Genre: Action|Adventure|Crime|Drama|Thriller
Movie ID: 51255, Rating: 5.0, Movie Title: Hot Fuzz (2007), Genre: Action|Comedy|Crime|Mystery
Movie ID: 8874, Rating: 5.0, Movie Title: Shaun of the Dead (2004), Genre: Comedy|Horror
Movie ID: 58559, Rating: 5.0, Movie Title: Dark Knight, The (2008), Genre: Action|Crime|Drama|IMAX
Movie ID: 1148, Rating: 5.0, Movie Title: Wallace & Gromit: The Wrong Trousers (1993), Genre: Animation|Children|Comedy|Crime
Movie ID: 1219, Rating: 5.0, Movie Title: Psycho (1960), Genre: Crime|Horror
Movie ID: 2115, Rating: 5.0, Movie Title: Indiana Jones and the Temple of Doom (1984), Genre: Action|Adventure|Fantasy
Movie ID: 1208, Rating: 5.0, Movie Title: Apocalypse Now (1979), Genre: Action|Drama|War
Movie ID: 364, Rating: 5.0, Movie Title: Lion King, T