In [1]:
import gc
import string
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import jaccard_score
import matplotlib.pyplot as plt
import seaborn as sns 

from tqdm.notebook import tqdm
import tensorflow as tf
import keras

# Data Loading and Preprocessing

In [2]:
master_df = pd.read_csv("../input/movie-lens-preprocessed-data/movie_lens_master.csv", low_memory=False)
master_df.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code,movie_id,movie_title,...,fantasy,file_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
0,196,242,3,881250949,49,M,writer,55105,242,Kolya,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,0,302,L.A. Confidential,...,0,1,0,0,1,0,0,1,0,0


In [3]:
def get_embeddings():
    # Fetch the glove embedding dictionary
    embedding_dict = {}
    with open("../input/glove6b/glove.6B.50d.txt", "r", encoding="utf8") as f:
        for line in tqdm(f):
            placeholder = line.split()
            word = placeholder[0]
            values = np.array(placeholder[1:], dtype=np.float32)
            embedding_dict[word] = values
    
    # Return the embedding dict
    return embedding_dict

glove_50_embeds = get_embeddings()

0it [00:00, ?it/s]

In [4]:
def clean_text(text_input):
    """
    Text Cleaning Function
    """
    # strip and lowercase
    text_input = text_input.strip()
    text_input = text_input.lower()
    
    # split the text
    text_input = text_input.split()
    
    # remove punctuations
    punct = string.punctuation
    table = str.maketrans("", "", punct)
    text_input =" ".join([word.translate(table) for word in text_input])
    text_input = text_input.strip()
    
    # Return the text
    return text_input

# Append the cleaned title as well
master_df["cleaned_movie_title"] = master_df["movie_title"].apply(clean_text)

# Modelling 

In [5]:
# Lets define a simple tensorflow model
def collaborative_model(num_users, num_items):
    # Create the input layers
    input_user_id = tf.keras.layers.Input(shape=(1,), name="input_1")
    input_item_id = tf.keras.layers.Input(shape=(1,), name="input_2")
    
    # Create the embedding layers 
    embedding_user_gmf = tf.keras.layers.Embedding(input_dim=num_users, output_dim=64,
                                                   embeddings_initializer="he_normal",
                                                   embeddings_regularizer=tf.keras.regularizers.l2(1e-6))(input_user_id)
    embedding_item_gmf = tf.keras.layers.Embedding(input_dim=num_items, output_dim=64,
                                                   embeddings_initializer="he_normal",
                                                   embeddings_regularizer=tf.keras.regularizers.l2(1e-6))(input_item_id)
    
    embedding_user_mlp = tf.keras.layers.Embedding(input_dim=num_users, output_dim=64,
                                                   embeddings_initializer="he_normal",
                                                   embeddings_regularizer=tf.keras.regularizers.l2(1e-6))(input_user_id)
    embedding_item_mlp = tf.keras.layers.Embedding(input_dim=num_items, output_dim=64,
                                                   embeddings_initializer="he_normal",
                                                   embeddings_regularizer=tf.keras.regularizers.l2(1e-6))(input_item_id)
    
    # GMF
    flatten_user_gmf = tf.keras.layers.Flatten()(embedding_user_gmf)
    flatten_item_gmf = tf.keras.layers.Flatten()(embedding_item_gmf)
    gmf_embed = tf.keras.layers.Multiply()([flatten_user_gmf, flatten_item_gmf])
    
    # MLP
    flatten_user_mlp = tf.keras.layers.Flatten()(embedding_user_mlp)
    flatten_item_mlp = tf.keras.layers.Flatten()(embedding_item_mlp)
    mlp_embed = tf.keras.layers.Concatenate()([flatten_user_mlp, flatten_item_mlp])
    
    # MLP Dense layers
    mlp_x = tf.keras.layers.Dense(units=256, activation="relu")(mlp_embed)
    mlp_x = tf.keras.layers.BatchNormalization()(mlp_x)
    mlp_x = tf.keras.layers.Dropout(0.3)(mlp_x)
    
    mlp_x = tf.keras.layers.Dense(units=128, activation="relu")(mlp_x)
    mlp_x = tf.keras.layers.BatchNormalization()(mlp_x)
    mlp_x = tf.keras.layers.Dropout(0.2)(mlp_x)
    
    mlp_x = tf.keras.layers.Dense(units=64, activation="relu")(mlp_x)
    mlp_x = tf.keras.layers.BatchNormalization()(mlp_x)
    mlp_x = tf.keras.layers.Dropout(0.1)(mlp_x)
    
    # Final merge 
    merged = tf.keras.layers.Concatenate()([gmf_embed, mlp_x])
    
    # Create the dense net
    x = tf.keras.layers.Dense(units=1, kernel_initializer="lecun_uniform")(merged)
    
    # Create the model
    model = tf.keras.models.Model(inputs=[input_user_id, input_item_id], outputs=[x])
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=tf.keras.metrics.RootMeanSquaredError())
    
    # Returnt the model
    return model

# Metrics and helper functions

In [6]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [7]:
def title_to_vec(string_in, spatial_dim=50):
    split_data = string_in.split()
    matrix = np.zeros(spatial_dim, dtype=np.float32)
    for word in split_data:
        if word in glove_50_embeds.keys():
            matrix += glove_50_embeds[word]
        else:
            matrix += 0.0
    
    return matrix

In [8]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """
    Return precision and recall at k metrics for each user
    """
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, true_rating, pred_rating in predictions:
        user_est_true[uid].append((pred_rating, true_rating))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_rating >= threshold) for (_, true_rating) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((pred_rating >= threshold) for (pred_rating, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_rating >= threshold) and (pred_rating >= threshold))
                              for (pred_rating, true_rating) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [9]:
def IOU(y_true, y_pred): 
    """
    Calculates the IOU over the y_true and y_preds based upon the length
    """
    # Handle the length change
    if len(y_pred) > len(y_true):
        y_pred = y_pred[:len(y_true)]
      
    # Convert to set to do set operations
    set_y_true = set(list(y_true))
    set_y_pred = set(list(y_pred))
    
    # Set operation
    intersection = set_y_pred.intersection(set_y_true)
    union = set_y_pred.union(set_y_true)
    
    # return the output
    return len(intersection) / len(union), len(intersection), len(union)

# Engine

In [10]:
class Neural_Collaborative_Recommeder_Engine(object):
    def __init__(self, models, train_data, val_data, unique_data):
        # Init the instance variables
        self.models       = models
        self.train_data   = train_data
        self.val_data     = val_data
        self.unique_data  = unique_data
        self.total_rec = None
        
    def get_recommendation(self, user_id, top_many=20, pp=False):
        # Calculate the movies not watched
        movies_watched = self.train_data[self.train_data["user_id"] == user_id].item_id.values
        movies_not_watched = list(set(self.unique_data.item_id.tolist()) - set(movies_watched.tolist()))
        
        # True labels
        true_lbls = self.val_data[self.val_data["user_id"] == user_id]["movie_title"].drop_duplicates().values
        self.total_rec = top_many # len(true_lbls)
        
        # If movies unwatched list is empty 
        if len(movies_not_watched) == 0:
            return "User has watched all movies, no new movies to recommend (get a life bro!)"
        
        # Input to the model
        X = {
            "input_1" : tf.constant(np.array([(user_id - 1)] * len(movies_not_watched))),
            "input_2" : tf.constant(np.array(movies_not_watched) - 1)
        }
        
        # Predict from the mode the ratings
        if len(self.models) == 1:
            preds_labels = self.models[0].predict(X)
        else:
            # Perform ensemble to get the ratings
            preds_labels = np.zeros((len(movies_not_watched), 1))
            for model in self.models:
                preds_labels += model.predict(X)
            preds_labels /= len(self.models)
        
        # Collect the index 
        placeholder = zip(preds_labels.tolist(), movies_not_watched)
        placeholder = sorted(placeholder, key=lambda x:x[0][0], reverse=True)
        index = [placeholder[i][1] for i in range(0,  self.total_rec)]
        
        # Collect the movies at those positions 
        placeholder = self.unique_data.set_index("item_id")
        
        # After processing
        if pp:
            final_rec, index = self.post_processing_glove(placeholder=placeholder,
                                                          movie_ids=index,
                                                          user_id=user_id)
        else:
            final_rec = placeholder.loc[index].movie_title.values[:self.total_rec]
        
        # Return the recommendations
        return final_rec[:self.total_rec], index, true_lbls[:self.total_rec]
    
    def post_processing_glove(self, placeholder, movie_ids, user_id):
        # Fetch the movie ids and the cleaned movie_names
        # Choose the highest cosine similarity movies
        dict_final = {}
        
        # Train and val data
        recommend_data = placeholder.loc[movie_ids]
        watched_movies = self.train_data[self.train_data["user_id"] == user_id].item_id
        
        # Make the cosine similarity matrix 
        matrix_csim = cosine_similarity(X=placeholder.loc[watched_movies].values[:, -50:],
                                        Y=placeholder.loc[movie_ids].values[:, -50:])
        
        # Chose the top few movies
        for index, (item_id, csim_curr) in enumerate(zip(watched_movies, matrix_csim)):
            # Get the recommendations
            zipped_recc = zip(matrix_csim[index], recommend_data["movie_title"], recommend_data["movie_id"])
            dict_final[placeholder.loc[item_id]["movie_title"]] = sorted(zipped_recc, key = lambda x : x[0], reverse=True)
                             
        # Argsort and send the top 2 from each case
        final_list = []
        final_ids = []
        for i in range(6):
            for value in dict_final.values():
                _, movie, ids = value[i]
                if movie not in final_list:
                    final_list.append(movie)
                    final_ids.append(ids)
            if len(final_list) > self.total_rec:
                break
                    
        # Return the movie
        return final_list[:self.total_rec], final_ids[:self.total_rec]
            
    
    def get_score(self, user_id, y_pred, y_pred_ids):
        # For MAP @ K
        y_true = self.val_data[self.val_data["user_id"] == user_id]["movie_title"].values
        y_pred = np.array(y_pred)
        
        # For IOU
        y_true_ids = list(self.val_data[self.val_data["user_id"] == user_id]["item_id"].values)
        y_pred_ids = list(y_pred_ids)
        
        # Calculate the IOU
        score_iou, _, _ = IOU(y_true=y_true_ids, y_pred=y_pred_ids)
            
        # Return the scores
        return score_iou, y_true.tolist()

In [11]:
# Make the cosine similarity dataframe
csim_df = master_df[["item_id", "movie_title", "cleaned_movie_title", "movie_id"]].drop_duplicates()
list_csim_data = []
for movie in csim_df["cleaned_movie_title"]:
    list_csim_data.append(title_to_vec(movie))

# Convert to array
csim_data = np.asarray(list_csim_data)

# Append the similarity vector to the dataframe 
csim_df = pd.concat((csim_df.reset_index(drop=True), pd.DataFrame(csim_data)), axis=1)
print("NaN value count : %d" % csim_df.isna().sum().any())

NaN value count : 0


# Five fold training and validation

In [12]:
def model_machine(pp=False):
    # Perform 5-fold scores
    for fold in range(5):
        train_df = pd.read_csv("../input/mvlens-split-data/fold_%d_train.csv" % (fold + 1), low_memory=False)
        val_df = pd.read_csv("../input/mvlens-split-data/fold_%d_test.csv" % (fold + 1), low_memory=False)
        
        # Select the important features 
        features_to_use = ["user_id", "item_id"]
        target_variable = "rating"

        # Preprocess the data
        train_label = train_df[target_variable] - 1
        train_data  = train_df[features_to_use] - 1

        val_label = val_df[target_variable] - 1
        val_data  = val_df[features_to_use] - 1

        # Create dataset
        train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1" : train_data["user_id"].values,
                                                             "input_2": train_data["item_id"].values},
                                                             train_label.values)).batch(256)
        val_dataset = tf.data.Dataset.from_tensor_slices(({"input_1" : val_data["user_id"].values,
                                                           "input_2": val_data["item_id"].values},
                                                           val_label.values)).batch(256)
        
        test_dataset = tf.data.Dataset.from_tensor_slices(({"input_1" : val_data["user_id"].values,
                                                           "input_2": val_data["item_id"].values})).batch(256)

        # Train and save
        num_users = len(np.unique(master_df["user_id"]))
        num_items = len(np.unique(master_df["item_id"]))
        model_curr = collaborative_model(num_users=num_users, num_items=num_items)
        history = model_curr.fit(train_dataset, validation_data=val_dataset, epochs=6, verbose=False)
        
        # Save weights
        model_curr.save("model_fold_%d.h5" % (fold))
        
        # Model accuracy train and val
        model_metrics = history.history
        
        # Predict the validation_data to calculate the precision_recall
        val_preds = model_curr.predict(test_dataset)
        
        # Zip the predictions
        prediction_for_map = zip(val_df["user_id"].values, val_df["rating"].values, val_preds.reshape(-1).tolist())
        pre, rec = precision_recall_at_k(predictions=prediction_for_map, k=10, threshold=3.5)
        mean_pre = sum(p for p in pre.values()) / len(pre)
        mean_rec = sum(r for r in rec.values()) / len(rec)
        
        # Create the Engine to make predictions
        all_movies_id = csim_df["item_id"]
        all_movies_names = csim_df["movie_title"]
        engine = Neural_Collaborative_Recommeder_Engine(models=[model_curr],
                                                        train_data=train_df,
                                                        val_data=val_df,
                                                        unique_data=csim_df)
        
        # Collect the scores
        scores_iou = []
        for user in val_df.user_id.drop_duplicates():
            y_preds, y_preds_ids, _ = engine.get_recommendation(user_id=user, top_many=10, pp=pp)
            scores = engine.get_score(y_pred=y_preds, user_id=user, y_pred_ids=y_preds_ids)
            scores_iou.append(scores[0])
        
        # Print the metrics
        print("Fold-%d RMSE  || Train_RMSE : %.4f | Val_RMSE : %.4f" % (fold + 1,
                                                                       model_metrics["root_mean_squared_error"][-1],
                                                                       model_metrics["val_root_mean_squared_error"][-1]))
        
        print("Fold-%d IOU   || Mean : %.4f | Max : %.4f" % (fold + 1,np.mean(scores_iou),
                                                           np.max(scores_iou),))
        
        print("Fold-%d MAP@K || Mean : %.4f" % (fold + 1, np.mean(mean_pre)))
        
        print("Fold-%d MAR@K || Mean : %.4f" % (fold + 1, np.mean(mean_rec)))
        
        print("\n")

In [13]:
model_machine()

Fold-1 RMSE  || Train_RMSE : 0.7999 | Val_RMSE : 0.9658
Fold-1 IOU   || Mean : 0.0282 | Max : 0.1667
Fold-1 MAP@K || Mean : 0.4940
Fold-1 MAR@K || Mean : 0.1008


Fold-2 RMSE  || Train_RMSE : 0.8128 | Val_RMSE : 0.9570
Fold-2 IOU   || Mean : 0.0257 | Max : 0.2500
Fold-2 MAP@K || Mean : 0.4532
Fold-2 MAR@K || Mean : 0.0985


Fold-3 RMSE  || Train_RMSE : 0.8016 | Val_RMSE : 0.9485
Fold-3 IOU   || Mean : 0.0265 | Max : 0.3333
Fold-3 MAP@K || Mean : 0.3719
Fold-3 MAR@K || Mean : 0.1006


Fold-4 RMSE  || Train_RMSE : 0.8106 | Val_RMSE : 0.9443
Fold-4 IOU   || Mean : 0.0255 | Max : 0.2000
Fold-4 MAP@K || Mean : 0.3263
Fold-4 MAR@K || Mean : 0.0850


Fold-5 RMSE  || Train_RMSE : 0.8073 | Val_RMSE : 0.9391
Fold-5 IOU   || Mean : 0.0263 | Max : 0.3333
Fold-5 MAP@K || Mean : 0.3569
Fold-5 MAR@K || Mean : 0.0908




In [14]:
model_machine(pp=True)

Fold-1 RMSE  || Train_RMSE : 0.8052 | Val_RMSE : 0.9613
Fold-1 IOU   || Mean : 0.0259 | Max : 0.1667
Fold-1 MAP@K || Mean : 0.4993
Fold-1 MAR@K || Mean : 0.0933


Fold-2 RMSE  || Train_RMSE : 0.8111 | Val_RMSE : 0.9604
Fold-2 IOU   || Mean : 0.0269 | Max : 0.2000
Fold-2 MAP@K || Mean : 0.4798
Fold-2 MAR@K || Mean : 0.1036


Fold-3 RMSE  || Train_RMSE : 0.8035 | Val_RMSE : 0.9531
Fold-3 IOU   || Mean : 0.0275 | Max : 0.3333
Fold-3 MAP@K || Mean : 0.4270
Fold-3 MAR@K || Mean : 0.1275


Fold-4 RMSE  || Train_RMSE : 0.8151 | Val_RMSE : 0.9422
Fold-4 IOU   || Mean : 0.0255 | Max : 0.2000
Fold-4 MAP@K || Mean : 0.3577
Fold-4 MAR@K || Mean : 0.0872


Fold-5 RMSE  || Train_RMSE : 0.8057 | Val_RMSE : 0.9405
Fold-5 IOU   || Mean : 0.0270 | Max : 1.0000
Fold-5 MAP@K || Mean : 0.3355
Fold-5 MAR@K || Mean : 0.0774




# List of movies recommended by single user

In [15]:
def compare_true_rec(model, user_id, num_folds_to_use, train_data, val_data, top_many=10):
    print("User_id : %d" % user_id)
    # Start the folds
    model_list = []
    for fold in range(num_folds_to_use):
        # Load the model weights
        model.load_weights("model_fold_%d.h5" % (fold))
        model_list.append(model)
        
    # Make the recommendation engine
    all_movies_id = csim_df["item_id"]
    all_movies_names = csim_df["movie_title"]
    
    # Engine
    engine = Neural_Collaborative_Recommeder_Engine(models=model_list,
                                                    train_data=train_data,
                                                    val_data=val_data,
                                                    unique_data=csim_df)
    
    y_preds, _, true_lbls = engine.get_recommendation(user_id=user_id,
                                                      top_many=10,
                                                      pp=True)
    y_preds_pp, _, _ = engine.get_recommendation(user_id=user_id,
                                                 top_many=10,
                                                 pp=False)
    
    data_frame = pd.DataFrame({
        "true" : true_lbls,
        "recd_normal" : y_preds,
        "recd_post_process" : y_preds_pp,
    })
    
    return data_frame

In [16]:
# Declerations 
num_users = len(np.unique(master_df["user_id"]))
num_items = len(np.unique(master_df["item_id"]))
model_in = collaborative_model(num_users=num_users, num_items=num_items)
train_data = pd.read_csv("../input/mvlens-split-data/fold_1_train.csv", low_memory=False)
val_data = pd.read_csv("../input/mvlens-split-data/fold_1_test.csv", low_memory=False)
user_id = np.random.choice(val_data["user_id"].tolist(), size=1)[0]

# Get the dataframe
df = compare_true_rec(model=model_in, user_id=user_id,
                      num_folds_to_use=5,
                      train_data=train_data,
                      val_data=val_data)

User_id : 15


In [17]:
df.head(10)

Unnamed: 0,true,recd_normal,recd_post_process
0,Toy Story,Beautiful Thing,Schindler's List
1,Twelve Monkeys,"Shawshank Redemption, The",It's a Wonderful Life
2,Dead Man Walking,"Silence of the Lambs, The","Silence of the Lambs, The"
3,"White Balloon, The","Sound of Music, The",To Kill a Mockingbird
4,Angels and Insects,Apollo 13,"Shawshank Redemption, The"
5,"Birdcage, The",To Kill a Mockingbird,Beautiful Thing
6,"Godfather, The",Winnie the Pooh and the Blustery Day,Good Will Hunting
7,Big Night,Schindler's List,"Sound of Music, The"
8,"Ghost and the Darkness, The",It's a Wonderful Life,Winnie the Pooh and the Blustery Day
9,Star Trek: First Contact,Good Will Hunting,Apollo 13
