In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential, Model, load_model
from keras.layers import Embedding, Reshape, Activation, Input, Dense, Flatten, Dropout
from keras.layers.merge import Dot, multiply, concatenate
from keras.utils import data_utils, np_utils

try:
  shutil.rmtree("./sample_data")
except:
  pass

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
ratings_data = pd.read_csv("./drive/MyDrive/movie-recommendation-small/ratings.csv")
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
user_ids = list(ratings_data.userId.unique())
user_to_encoded = {value : index for index, value in enumerate(user_ids)}
encoded_to_user = {index : value for index, value in enumerate(user_ids)}

movie_ids = list(ratings_data.movieId.unique())
movie_to_encoded = {value : index for index, value in enumerate(movie_ids)}
encoded_to_movie = {index : value for index, value in enumerate(movie_ids)}

ratings_data["user"] = ratings_data.userId.map(user_to_encoded)
ratings_data["movie"] = ratings_data.movieId.map(movie_to_encoded)
max_user = len(user_to_encoded) + 1
max_movies = len(movie_to_encoded) + 1
ratings_data.rating = ratings_data.rating.values.astype(np.float32)

print(f"Number of users : {max_user}")
print(f"Number of movies : {max_movies}")

Number of users : 611
Number of movies : 9725


In [5]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,1
2,1,6,4.0,964982224,0,2
3,1,47,5.0,964983815,0,3
4,1,50,5.0,964982931,0,4


In [6]:
ratings_data = ratings_data.sample(frac=1, random_state = 69)
min_rating = min(ratings_data.rating)
max_rating = max(ratings_data.rating)

x = ratings_data[["user", "movie"]].values
y = ratings_data.rating.apply(lambda x : (x - min_rating) / (max_rating - min_rating)).values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((80668, 2), (80668,), (20168, 2), (20168,))

In [7]:
def create_model(type_of_model, max_work, max_user):
  embedding_dimensions = 30
  bias = 1

  movie_inputs = Input(shape=(1,), dtype='int32')
  mi = Embedding(max_work, embedding_dimensions, name="work")(movie_inputs)
  mi_bias = Embedding(max_work, bias, name="work_bias")(movie_inputs)

  user_inputs = Input(shape=(1,), dtype='int32')
  ui = Embedding(max_user, embedding_dimensions, name="user")(user_inputs)
  ui_bias = Embedding(max_user, bias, name="user_bias")(user_inputs)

  output_layer = multiply([mi, ui])
  output_layer = concatenate([output_layer, ui_bias, mi_bias])
  output_layer = Dropout(0.3)(output_layer)
  output_layer = Flatten()(output_layer)

  if type_of_model == "with_bias":
    output_layer = Dense(1)(output_layer)

  elif type_of_model == "with_neural_network":
    output_layer = Dense(64, activation="relu")(output_layer)
    output_layer = Dense(1)(output_layer)


  colab_filt_recommendation_model = Model(inputs=[movie_inputs, user_inputs], outputs= output_layer)
  colab_filt_recommendation_model.compile(loss='mae',
                                          optimizer='adam',
                                          metrics=["mae"])
  return colab_filt_recommendation_model

In [8]:
def to_numpy(series):
    return np.array([[element] for element in series])

In [10]:
model = create_model("with_neural_network", max_movies, max_user)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
work (Embedding)                (None, 1, 30)        291750      input_1[0][0]                    
__________________________________________________________________________________________________
user (Embedding)                (None, 1, 30)        18330       input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
x = [(x_train[:, 1], x_train[:, 0])]
history = model.fit(x, y_train, epochs = 30, validation_split=0.2, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
model.save("collaborative-filtering-recommendation-system.h5", save_format = "h5")

In [None]:
model.save_weights("./drive/MyDrive/movie-recommendation-small/collaborative-filtering-recommendation-system-weights.h5", save_format = "h5")

In [11]:
model.load_weights("./drive/MyDrive/movie-recommendation-small/collaborative-filtering-recommendation-system-weights.h5")
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
work (Embedding)                (None, 1, 30)        291750      input_1[0][0]                    
__________________________________________________________________________________________________
user (Embedding)                (None, 1, 30)        18330       input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
test_data = [x_test[:, 1], x_test[:, 0]]

predictions = model.predict(test_data)
test_performance = mean_absolute_error(y_test, predictions)

print(" Test Mae model 2 : %s " % test_performance)

 Test Mae model 2 : 0.09219341930733438 


In [67]:
mids = [6537, 6548, 41566, 6595, 26712]
ratings = [2.5, 3, 2, 2.6, 2.2]

In [68]:
import warnings
warnings.filterwarnings("ignore")

movie_df = pd.read_csv("./drive/MyDrive/movie-recommendation-small/movies.csv")
user_id = ratings_data.userId.sample(1).iloc[0]

movies_watched_by_user = ratings_data[ratings_data.userId == user_id]
movies_watched_by_user.drop("timestamp", axis = 1, inplace=True)

for i in range(len(mids)):
  movies_watched_by_user = movies_watched_by_user.append({"userId": user_id, "movieId": mids[i], "rating": ratings[i], "user": user_to_encoded[user_id], "movie": movie_to_encoded[mids[i]]}, ignore_index = True)

for column in list(movies_watched_by_user):
  if column != "rating":
    movies_watched_by_user[column] = movies_watched_by_user[column].astype(int)

movies_not_watched = movie_df[~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)]["movieId"]
movies_not_watched = list(set(movies_not_watched).intersection(set(movie_to_encoded.keys())))
movies_not_watched = [[movie_to_encoded.get(x)] for x in movies_not_watched]

user_encoder = user_to_encoded.get(user_id)
user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched))

ratings = model.predict([user_movie_array[:, 1], user_movie_array[:, 0]]).flatten()

top_ratings_indices = ratings.argsort()[-10:][::-1]
rat_norm = ratings[top_ratings_indices]
rating_denorm = rat_norm * (max_rating - min_rating) + min_rating
recommended_movie_ids = [encoded_to_movie.get(movies_not_watched[x][0]) for x in top_ratings_indices]

recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
count = 0
m, g, r = [], [], []
output = {}
for row in recommended_movies.itertuples():
  m.append(row.title)
  r.append(round(rating_denorm[count], 1))
  g.append(row.genres)
  # print(row.title, ":", row.genres, "\tratings : ", round(rating_denorm[count], 1))
  count += 1

output["movies"] = m
output["ratings"] = r
output["genres"] = g
output

{'genres': ['Comedy|Fantasy|Horror',
  'Action|Adventure|Comedy',
  'Horror|Mystery',
  'Mystery|Thriller',
  'Comedy|Horror',
  'Horror|Thriller',
  'Crime|Drama|Film-Noir|Thriller',
  'Action|Sci-Fi|Thriller',
  'Action|Crime',
  'Drama|Romance|Sci-Fi'],
 'movies': ['Dead Alive (Braindead) (1992)',
  'Austin Powers: The Spy Who Shagged Me (1999)',
  "Jacob's Ladder (1990)",
  'Old Boy (2003)',
  'Zombie Strippers! (2008)',
  'Films to Keep You Awake: The Christmas Tale (Películas para no dormir: Cuento de navidad) (2005)',
  'Drive (2011)',
  'Chronicle (2012)',
  'The Raid: Redemption (2011)',
  'Her (2013)'],
 'ratings': [5.0, 5.0, 5.0, 4.9, 4.9, 4.9, 4.8, 4.8, 4.8, 4.8]}