# Movie Recommender based on NN

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, concatenate
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import load_model
import pickle
import matplotlib.pyplot as plt

# Connect with Google Drive
# Colab-only setup: mount Drive and switch the working directory to the
# project folder, so the relative paths used below ('ml-latest/...',
# 'best_model_try2.h5', the *.pkl files) resolve inside it.
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/RecSys/')

Mounted at /content/drive


In [None]:
def save_history(history, filename='training_history.pkl'):
    """Pickle the per-epoch training metrics to ``filename``.

    Args:
        history: Either the object returned by ``model.fit`` (anything with a
            ``.history`` dict attribute) or the plain metrics dict itself.
        filename: Destination path; an existing file is overwritten.
    """
    # Accept both forms: a metrics dict has no ``.history`` attribute of its
    # own, so fall back to the argument itself in that case.
    metrics = getattr(history, 'history', history)
    with open(filename, 'wb') as file:
        pickle.dump(metrics, file)

def load_history(filename='training_history.pkl'):
    """Return the object that was pickled to ``filename``.

    NOTE: unpickling can execute arbitrary code; only load files you created.
    """
    with open(filename, mode='rb') as handle:
        restored = pickle.load(handle)
    return restored

def save_movie_embeddings(movie_embeddings, filename='movie_embeddings.pkl'):
    """Pickle ``movie_embeddings`` to ``filename``, overwriting any existing file."""
    with open(filename, mode='wb') as handle:
        pickle.dump(movie_embeddings, handle)

def load_movie_embeddings(filename='movie_embeddings.pkl'):
    """Return the embeddings object that was pickled to ``filename``.

    NOTE: unpickling can execute arbitrary code; only load files you created.
    """
    with open(filename, mode='rb') as handle:
        embeddings = pickle.load(handle)
    return embeddings

In [None]:
def plot_history(history):
    """Plot training and validation loss per epoch.

    Args:
        history: Either the object returned by ``model.fit`` (anything with a
            ``.history`` dict attribute) or the metrics dict itself, e.g. the
            value returned by ``load_history``. The dict must contain the
            keys ``'loss'`` and ``'val_loss'``.
    """
    # Accept both forms: a metrics dict has no ``.history`` attribute of its
    # own, so fall back to the argument itself in that case.
    metrics = getattr(history, 'history', history)

    # Plot training & validation loss values
    plt.plot(metrics['loss'])
    plt.plot(metrics['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# Load the dataset
def load_data():
    """Read the ratings and movies CSV files from the ``ml-latest/`` folder.

    The first row of each file is skipped and the columns are named
    explicitly instead.

    Returns:
        tuple: ``(ratings, movies)``. ``ratings`` has the columns ``user_id``,
        ``movie_id``, ``rating`` and ``timestamp``; ``movies`` is reduced to
        ``movie_id`` and ``title``.
    """
    rating_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
    movie_columns = ['movie_id', 'title', 'genres']

    ratings = pd.read_csv('ml-latest/ratings.csv', names=rating_columns, skiprows=1, sep=',')
    all_movie_fields = pd.read_csv('ml-latest/movies.csv', names=movie_columns, skiprows=1, sep=',')

    # Only the id and the title are kept; the genres column is dropped.
    movies = all_movie_fields[['movie_id', 'title']]
    return ratings, movies

# Standardize the ratings
def standardize_ratings(train_ratings, val_ratings):
    """Min-max scale the ``rating`` column of both splits.

    The scaler is fitted on the training split only and then applied unchanged
    to the validation split. Despite the function name this is min-max scaling
    (``MinMaxScaler``), not z-score standardisation.

    Args:
        train_ratings: DataFrame with a ``rating`` column; used to fit the scaler.
        val_ratings: DataFrame with a ``rating`` column; only transformed.

    Returns:
        tuple: ``(train_ratings, val_ratings, scaler)``. The two frames are new
        objects with the scaled ``rating`` column; the inputs are left
        untouched. ``scaler`` is the fitted ``MinMaxScaler``, needed later to
        map predictions back to the original scale.
    """
    scaler = MinMaxScaler()

    scaled_train = scaler.fit_transform(train_ratings[['rating']]).ravel()
    scaled_val = scaler.transform(val_ratings[['rating']]).ravel()

    # ``assign`` builds new frames instead of writing into the frames handed
    # in (typically slices produced by ``train_test_split``), which avoids
    # pandas' SettingWithCopyWarning and keeps the caller's data unchanged.
    train_scaled = train_ratings.assign(rating=scaled_train)
    val_scaled = val_ratings.assign(rating=scaled_val)

    return train_scaled, val_scaled, scaler

# Unstandardize the ratings
def unstandardize_ratings(ratings, scaler):
    """Map scaled ratings back through ``scaler.inverse_transform``.

    Args:
        ratings: Array-like of scaled values in any shape; it is reshaped into
            a single column before being handed to the scaler.
        scaler: Fitted object exposing ``inverse_transform``.

    Returns:
        1-D array with one value per input element.
    """
    as_column = np.array(ratings).reshape(-1, 1)
    restored = scaler.inverse_transform(as_column)
    return restored.flatten()

# Preprocess the data
def preprocess_data(ratings, movies):
    """Replace raw user and movie ids with dense 0-based indices.

    Indices follow the order in which each id first appears in ``ratings``.
    The ``user_id`` and ``movie_id`` columns of ``ratings`` are overwritten in
    place. ``movies`` is accepted but not used.

    Returns:
        tuple: ``(ratings, num_users, num_movies, user_id_to_index,
        movie_id_to_index)`` where the two dicts map raw id -> index.
    """
    lookups = {}
    for column in ('user_id', 'movie_id'):
        distinct_ids = ratings[column].unique().tolist()
        lookups[column] = dict(zip(distinct_ids, range(len(distinct_ids))))

    # Both lookup tables exist before either column is overwritten.
    for column, lookup in lookups.items():
        ratings[column] = ratings[column].map(lookup)

    user_id_to_index = lookups['user_id']
    movie_id_to_index = lookups['movie_id']
    return (ratings, len(user_id_to_index), len(movie_id_to_index),
            user_id_to_index, movie_id_to_index)

# Build the neural network model
def build_model(num_users, num_movies, embedding_size=50):
    """Build and compile the rating-prediction network.

    Architecture: one embedding table for users and one for movies (both of
    width ``embedding_size``), flattened and concatenated, followed by a
    128-unit ReLU layer and a single linear output unit. The model is compiled
    with Adam (learning rate 0.00015) and mean-squared-error loss.

    Args:
        num_users: Number of rows in the user embedding table.
        num_movies: Number of rows in the movie embedding table.
        embedding_size: Width of both embedding tables.

    Returns:
        The compiled Keras ``Model`` taking ``[user_input, movie_input]``.
    """
    # User branch: one id per sample -> flat embedding vector
    user_ids = Input(shape=(1,), name='user_input')
    user_lookup = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding')(user_ids)
    user_features = Flatten(name='user_flatten')(user_lookup)

    # Movie branch, same shape as the user branch. The layer name
    # 'movie_embedding' is looked up again in get_movie_embeddings.
    movie_ids = Input(shape=(1,), name='movie_input')
    movie_lookup = Embedding(input_dim=num_movies, output_dim=embedding_size, name='movie_embedding')(movie_ids)
    movie_features = Flatten(name='movie_flatten')(movie_lookup)

    merged = concatenate([user_features, movie_features], axis=-1, name='concat')
    hidden = Dense(128, activation='relu', name='dense')(merged)
    predicted_rating = Dense(1, activation='linear', name='output')(hidden)

    network = Model(inputs=[user_ids, movie_ids], outputs=predicted_rating)
    network.compile(optimizer=Adam(learning_rate=0.00015), loss='mse')

    return network

# Train the model
def train_model(model, ratings, epochs=10, batch_size=256, checkpoint_filepath='best_model_try2.h5',
                test_size=0.2, random_state=42, early_stopping_patience=2, lr_patience=2):
    """Fit ``model`` on an internal train/validation split of ``ratings``.

    The ratings are split, the ``rating`` column is scaled via
    ``standardize_ratings`` (fitted on the training part only), and the model
    is trained with early stopping, learning-rate reduction on plateau and a
    best-only checkpoint, all monitoring ``val_loss``.

    Args:
        model: Compiled Keras model taking ``[user_ids, movie_ids]`` as input.
        ratings: DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        checkpoint_filepath: Where the best model (lowest ``val_loss``) is saved.
        test_size: Fraction of ``ratings`` held out for validation.
        random_state: Seed of the split, so it can be reproduced later.
        early_stopping_patience: Epochs without ``val_loss`` improvement
            before training stops.
        lr_patience: Epochs without ``val_loss`` improvement before the
            learning rate is multiplied by 0.1.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.

    Note:
        The fitted scaler is not returned. To unscale predictions later,
        repeat the split with the same ``test_size`` / ``random_state`` and
        call ``standardize_ratings`` again.
    """
    # Manually split the data into training and validation sets
    train_ratings, val_ratings = train_test_split(ratings, test_size=test_size, random_state=random_state)
    # Scale the ratings; the scaler itself is not needed during training
    train_ratings, val_ratings, _scaler = standardize_ratings(train_ratings, val_ratings)

    user_input_train = train_ratings['user_id'].values
    movie_input_train = train_ratings['movie_id'].values
    targets_train = train_ratings['rating'].values

    user_input_val = val_ratings['user_id'].values
    movie_input_val = val_ratings['movie_id'].values
    targets_val = val_ratings['rating'].values

    # Learning rate scheduler
    # NOTE(review): with lr_patience == early_stopping_patience (the defaults)
    # both callbacks react after the same number of stagnant epochs, so the
    # reduced learning rate never gets an epoch to run. Pass a smaller
    # lr_patience (or a larger early_stopping_patience) to make it effective.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=lr_patience, min_lr=1e-6)

    # Early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=early_stopping_patience,
                                   restore_best_weights=True)

    # Model checkpoint to save the model when validation loss improves
    model_checkpoint = ModelCheckpoint(filepath=checkpoint_filepath, monitor='val_loss',
                                       save_best_only=True, verbose=1)

    # Train with early stopping, learning rate reduction on plateau and checkpointing
    history = model.fit([user_input_train, movie_input_train], targets_train,
                        epochs=epochs, batch_size=batch_size,
                        validation_data=([user_input_val, movie_input_val], targets_val),
                        callbacks=[early_stopping, reduce_lr, model_checkpoint],
                        validation_freq=1)

    return history

# Generate movie embeddings
def get_movie_embeddings(model, num_movies):
    """Return the first weight array of the model's ``'movie_embedding'`` layer.

    ``num_movies`` is accepted but not used.
    """
    embedding_weights = model.get_layer('movie_embedding').get_weights()
    return embedding_weights[0]

# Find similar movies using learned embeddings
def find_similar_movies_nn(movie_ids, movie_embeddings, movie_id_to_index, movie_index_to_id, k=10,
                           metric='dot'):
    """Return the ids of the ``k`` movies most similar to the given ones.

    The embeddings of ``movie_ids`` are averaged into one query vector, every
    movie is scored against it, and the best-scoring movies that are not part
    of the query are returned.

    Args:
        movie_ids: Raw ids of the movies to start from.
        movie_embeddings: 2-D array with one embedding row per movie index.
        movie_id_to_index: Maps raw movie id -> row in ``movie_embeddings``.
        movie_index_to_id: Inverse mapping, row -> raw movie id.
        k: Number of ids to return.
        metric: ``'dot'`` (default, the original behaviour) scores by raw dot
            product, which favours embeddings with a large norm.
            ``'cosine'`` divides by the vector norms so only direction counts.

    Returns:
        List of raw movie ids, best match first. Empty if ``movie_ids`` is empty.

    Raises:
        KeyError: If an id in ``movie_ids`` is not in ``movie_id_to_index``.
        ValueError: If ``metric`` is not ``'dot'`` or ``'cosine'``.
    """
    if metric not in ('dot', 'cosine'):
        raise ValueError(f"metric must be 'dot' or 'cosine', got {metric!r}")

    movie_indices = [movie_id_to_index[movie_id] for movie_id in movie_ids]
    if not movie_indices:
        # Averaging zero vectors would give NaN and an arbitrary ranking.
        return []

    avg_movie_vec = np.mean(movie_embeddings[movie_indices], axis=0)

    similarities = np.dot(movie_embeddings, avg_movie_vec)
    if metric == 'cosine':
        norms = np.linalg.norm(movie_embeddings, axis=1) * np.linalg.norm(avg_movie_vec)
        # Zero-length vectors have no direction: score them 0 instead of
        # dividing by zero.
        similarities = np.divide(similarities, norms,
                                 out=np.zeros(similarities.shape, dtype=float),
                                 where=norms > 0)

    similar_indices = np.argsort(similarities)[::-1]

    # Set membership keeps the exclusion test O(1) per candidate.
    excluded = set(movie_indices)
    similar_movie_ids = [movie_index_to_id[idx] for idx in similar_indices if idx not in excluded][:k]
    return similar_movie_ids

In [None]:
# Main function
def main():
    """Run the full pipeline: load, re-index, train, extract embeddings.

    Also prints the movies most similar to a fixed example selection.

    Returns:
        tuple: ``(history, model, movie_embeddings)``.
    """
    raw_ratings, movies = load_data()
    (ratings, num_users, num_movies,
     user_id_to_index, movie_id_to_index) = preprocess_data(raw_ratings, movies)

    # Reverse lookup: embedding row -> original movie id
    movie_index_to_id = {index: movie_id for movie_id, index in movie_id_to_index.items()}

    model = build_model(num_users, num_movies)
    history = train_model(model, ratings)
    movie_embeddings = get_movie_embeddings(model, num_movies)

    # Example movie IDs liked by the user, used as the similarity query
    movie_ids = [1, 2, 3]
    similar_movie_ids = find_similar_movies_nn(
        movie_ids, movie_embeddings, movie_id_to_index, movie_index_to_id
    )
    print(f"Similar movies: {similar_movie_ids}")

    return history, model, movie_embeddings

if __name__ == '__main__':
    history, model, movie_embeddings = main()

    # Hand over the History object returned by training: the helpers read its
    # ``.history`` dict themselves.
    save_history(history, 'training_history.pkl')
    plot_history(history)

    # Save the movie embeddings to a file
    save_movie_embeddings(movie_embeddings, 'movie_embeddings.pkl')

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.03489, saving model to best_model_try2.h5


  saving_api.save_model(


Epoch 2/10
Epoch 2: val_loss improved from 0.03489 to 0.03295, saving model to best_model_try2.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.03295 to 0.03198, saving model to best_model_try2.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.03198 to 0.03137, saving model to best_model_try2.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.03137 to 0.03110, saving model to best_model_try2.h5
Epoch 6/10
Epoch 6: val_loss improved from 0.03110 to 0.03106, saving model to best_model_try2.h5
Epoch 7/10
Epoch 7: val_loss did not improve from 0.03106
Epoch 8/10
Epoch 8: val_loss did not improve from 0.03106
Similar movies: [286897, 364, 3114, 68954, 953, 58559, 8961, 6377, 60069, 588]


ValueError: not enough values to unpack (expected 3, got 2)

## Now, let's test our data during inference time.

Here is the code for the main evaluation steps.

Let's load the model from the folder

In [None]:
# Restore the checkpoint written during training. ModelCheckpoint saved it with
# save_best_only=True on val_loss, so this is the best model seen.
model = load_model('best_model_try2.h5')

### Let's get the validation data
1. load the data
2. preprocess data
3. split data into val and train data and standardize it
4. Make data compatible with format of keras model
5. Print the standardized target values, user ids and movie ids to verify the data

In [None]:
# Load the raw ratings and movie tables. The train/validation split is done
# further down, after the ids have been re-indexed, so it is not repeated here.
ratings, movies = load_data()

In [None]:
# Re-index user and movie ids to 0-based embedding rows.
# NOTE(review): the mappings are rebuilt from the data rather than loaded, so
# they match the trained model only if the same ratings file (same row order)
# is used as during training; confirm.
ratings, num_users, num_movies, user_id_to_index, movie_id_to_index = preprocess_data(ratings, movies)

In [None]:
# Manually split the data into training and validation sets.
# Same test_size and random_state as in train_model, so this reproduces the
# split used for training.
train_ratings, val_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
# Scale the ratings; the fitted scaler is kept to unscale predictions later
train_ratings, val_ratings, scaler = standardize_ratings(train_ratings, val_ratings)

In [None]:
# Pull the two model inputs and the targets out of the validation frame
user_input_val, movie_input_val, targets_val = (
    val_ratings[column].values for column in ('user_id', 'movie_id', 'rating')
)

In [None]:
# If you want you can print both arrays to see that they contain the ids for the user and movie
# user_input_val
# movie_input_val
# The corresponding validation targets are the scaled ratings:
targets_val

array([0.33333333, 0.33333333, 0.        , ..., 0.77777778, 0.44444444,
       0.66666667])

### Predictions & Mean Absolute Error
Compute predictions with trained model and evaluate interpretable performance of network with unstandardized data

In [None]:
# Make predictions for every (user, movie) pair in the validation set.
# The outputs are on the same scaled rating range as targets_val.
predictions = model.predict([user_input_val, movie_input_val])



In [None]:
# Inspect the raw model outputs (one scaled rating per row)
predictions

array([[0.5346134 ],
       [0.6167196 ],
       [0.48491555],
       ...,
       [0.7675249 ],
       [0.40146667],
       [0.6773647 ]], dtype=float32)

As you can see, our model predicts quite accurately. Considering a 5-star rating system with a step size of 0.5, the predictions are on average only about one step away from the target value (see the mean absolute errors computed below).

Hence, we can use our movie_embeddings to calculate the cosine similarity.

In [None]:
    # Calculate Mean Absolute Error on the scaled ratings, so this number is
    # not expressed in stars.
    # NOTE(review): this cell is indented as if copied from inside a function;
    # confirm the notebook front end tolerates that, or dedent it.
    mae = mean_absolute_error(targets_val, predictions)
    print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 0.1324289928327861


In [None]:
# Calculate unstandardized Mean Absolute Error
# Map both the targets and the predictions back to the original rating scale
unstandardized_targets_val = unstandardize_ratings(targets_val, scaler)
unstandardized_predictions = unstandardize_ratings(predictions, scaler)
# Calculate Mean Absolute Error, now in units of the original ratings
mae = mean_absolute_error(unstandardized_targets_val, unstandardized_predictions)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 0.5959304687828529


What I found is that standardizing the ratings makes no difference in performance: with or without it, the mean absolute error is about 0.59.