In [1]:
import gc
import string
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import jaccard_score
import matplotlib.pyplot as plt
import seaborn as sns 

from tqdm.notebook import tqdm
import tensorflow as tf

In [2]:
master_df = pd.read_csv("../input/movie-lens-preprocessed-data/movie_lens_master.csv", low_memory=False)
master_df.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code,movie_id,movie_title,...,fantasy,file_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
0,196,242,3,881250949,49,M,writer,55105,242,Kolya,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,0,302,L.A. Confidential,...,0,1,0,0,1,0,0,1,0,0


In [3]:
def get_embeddings():
    # Fetch the glove embedding dictionary
    embedding_dict = {}
    with open("../input/glove6b/glove.6B.50d.txt", "r", encoding="utf8") as f:
        for line in tqdm(f):
            placeholder = line.split()
            word = placeholder[0]
            values = np.array(placeholder[1:], dtype=np.float32)
            embedding_dict[word] = values
    
    # Return the embedding dict
    return embedding_dict

glove_50_embeds = get_embeddings()

0it [00:00, ?it/s]

In [4]:
def clean_text(text_input):
    """
    Text Cleaning Function
    """
    # strip and lowercase
    text_input = text_input.strip()
    text_input = text_input.lower()
    
    # split the text
    text_input = text_input.split()
    
    # remove punctuations
    punct = string.punctuation
    table = str.maketrans("", "", punct)
    text_input =" ".join([word.translate(table) for word in text_input])
    text_input = text_input.strip()
    
    # Return the text
    return text_input

# Append the cleaned title as well
master_df["cleaned_movie_title"] = master_df["movie_title"].apply(clean_text)

In [5]:
# Lets define a simple tensorflow model
def collaborative_model(num_users, num_items):
    # Create the input layers
    input_user_id = tf.keras.layers.Input(shape=(1,), name="input_1")
    input_item_id = tf.keras.layers.Input(shape=(1,), name="input_2")
    
    # Create the embedding layers 
    embedding_user = tf.keras.layers.Embedding(input_dim=num_users, output_dim=128)(input_user_id)
    embedding_item = tf.keras.layers.Embedding(input_dim=num_items, output_dim=128)(input_item_id)
    
    # Concat the embedding embedding_item
    flatten_user = tf.keras.layers.Flatten()(embedding_user)
    flatten_item = tf.keras.layers.Flatten()(embedding_item)
    
    # Multiply the layers 
    mul_embed = tf.keras.layers.Multiply()([flatten_user, flatten_item])
    
    # Create the dense net
    x = tf.keras.layers.Dense(units=1, kernel_initializer="lecun_uniform")(mul_embed)
    
    # Create the model
    model = tf.keras.models.Model(inputs=[input_user_id, input_item_id], outputs=[x])
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss=tf.keras.losses.MSE,
                  metrics=tf.keras.metrics.RootMeanSquaredError())
    
    # Returnt the model
    return model

In [6]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [7]:
def title_to_vec(string_in, spatial_dim=50):
    split_data = string_in.split()
    matrix = np.zeros(spatial_dim, dtype=np.float32)
    for word in split_data:
        if word in glove_50_embeds.keys():
            matrix += glove_50_embeds[word]
        else:
            matrix += 0.0
    
    return matrix

In [8]:
def map_at_k(y_true, y_pred, k):
    """
    Calculate the mean average precision @K
    """
    if len(y_pred) > k:
        y_pred = y_pred[:k]
        
    score = 0.0
    num_hits = 0.0
    
    for index, pred in enumerate(y_pred):
        if pred in y_true and pred not in y_pred[:index]:
            num_hits += 1.0
            score += num_hits / (index + 1.0)
            
    return score / min(len(y_true), k)

In [9]:
def IOU(y_true, y_pred): 
    """
    Calculates the IOU over the y_true and y_preds based upon the length
    """
    # Handle the length change
    if len(y_pred) > len(y_true):
        y_pred = y_pred[:len(y_true)]
      
    # Convert to set to do set operations
    set_y_true = set(list(y_true))
    set_y_pred = set(list(y_pred))
    
    # Set operation
    intersection = set_y_pred.intersection(set_y_true)
    union = set_y_pred.union(set_y_true)
    
    # return the output
    return len(intersection) / len(union), len(intersection), len(union)

In [10]:
class Neural_Collaborative_Recommeder_Engine(object):
    def __init__(self, model, train_data, val_data, unique_data):
        # Init the instance variables
        self.model        = model
        self.train_data   = train_data
        self.val_data     = val_data
        self.unique_data  = unique_data
        
    def get_recommendation(self, user_id, top_many=50, pp=False):
        # Calculate the movies not watched
        movies_watched = self.train_data[self.train_data["user_id"] == user_id].item_id.values
        movies_not_watched = list(set(self.unique_data.item_id.tolist()) - set(movies_watched.tolist()))
        
        # If movies unwatched list is empty 
        if len(movies_not_watched) == 0:
            return "User has watched all movies, no new movies to recommend (get a life bro!)"
        
        # Predict from the mode the ratings
        X = {
            "input_1" : tf.constant(np.array([(user_id - 1)] * len(movies_not_watched))),
            "input_2" : tf.constant(np.array(movies_not_watched) - 1)
        }
        pred_rating = self.model.predict(X)
        
        # argmax
        preds_labels = np.argmax(pred_rating, axis = 1)
        
        # Collect the index 
        index = np.argsort(preds_labels)[::-1][:top_many] + 1
        
        # Collect the index 
        placeholder = zip(preds_labels, movies_not_watched)
        placeholder = sorted(placeholder, key=lambda x:x[0], reverse=True)
        index = [placeholder[i][1] for i in range(0, top_many)]
        
        # Collect the movies at those positions 
        placeholder = self.unique_data.set_index("item_id")
        
        # After processing
        if pp:
            final_rec, index = self.post_processing_glove(placeholder=placeholder,
                                                   movie_ids=index,
                                                   user_id=user_id)
        else:
            final_rec = placeholder.loc[index].movie_title.values[:10]
        
        # Return the recommendations
        return final_rec, index
    
    def post_processing_glove(self, placeholder, movie_ids, user_id):
        # Fetch the movie ids and the cleaned movie_names
        # Choose the highest cosine similarity movies
        dict_final = {}
        
        # Train and val data
        recommend_data = placeholder.loc[movie_ids]
        watched_movies = self.train_data[self.train_data["user_id"] == user_id].item_id
        
        # Make the cosine similarity matrix 
        matrix_csim = cosine_similarity(X=placeholder.loc[watched_movies].values[:, -50:],
                                        Y=placeholder.loc[movie_ids].values[:, -50:])
        
        # Chose the top few movies
        for index, (item_id, csim_curr) in enumerate(zip(watched_movies, matrix_csim)):
            # Get the recommendations
            zipped_recc = zip(matrix_csim[index], recommend_data["movie_title"], recommend_data["movie_id"])
            dict_final[placeholder.loc[item_id]["movie_title"]] = sorted(zipped_recc, key = lambda x : x[0], reverse=True)
                             
        # Argsort and send the top 2 from each case
        final_list = []
        final_ids = []
        for value in dict_final.values():
            _, movie, ids = value[0]
            if movie not in final_list:
                final_list.append(movie)
                final_ids.append(ids)
                
        for value in dict_final.values():
            _, movie, ids = value[1]
            if movie not in final_list:
                final_list.append(movie)
                final_ids.append(ids)
                    
        # Return the movie
        return final_list, final_ids
            
    
    def get_score(self, user_id, y_pred, y_pred_ids):
        # For MAP @ K
        y_true = self.val_data[self.val_data["user_id"] == user_id]["movie_title"].values
        y_pred = np.array(y_pred)
        
        # For IOU
        y_true_ids = list(self.val_data[self.val_data["user_id"] == user_id]["item_id"].values)
        y_pred_ids = list(y_pred_ids)
            
        # Calculate the map
        score_mapk = map_at_k(y_true, y_pred, k=y_true.shape[0])
        
        # Calculate the IOU
        score_iou, len_1, len_2 = IOU(y_true=y_true_ids, y_pred=y_pred_ids)
            
        # Return the scores
        return score_iou, score_mapk, len_1, len_2, y_true.tolist()

In [11]:
# Make the cosine similarity dataframe
csim_df = master_df[["item_id","movie_title", "cleaned_movie_title", "movie_id"]].drop_duplicates()
list_csim_data = []
for movie in tqdm(csim_df["cleaned_movie_title"]):
    list_csim_data.append(title_to_vec(movie))

# Convert to array
csim_data = np.asarray(list_csim_data)

# Append the similarity vector to the dataframe 
csim_df = pd.concat((csim_df.reset_index(drop=True), pd.DataFrame(csim_data)), axis=1)
print("NaN value count : %d" % csim_df.isna().sum().any())
csim_df.head(2)

  0%|          | 0/1682 [00:00<?, ?it/s]

NaN value count : 0


Unnamed: 0,item_id,movie_title,cleaned_movie_title,movie_id,0,1,2,3,4,5,...,40,41,42,43,44,45,46,47,48,49
0,242,Kolya,kolya,242,-1.0081,-0.16537,0.42593,-0.022535,0.81007,-0.066713,...,-0.50072,0.63497,0.3375,-0.77,-0.38261,-0.31691,0.54004,-0.044697,-0.30532,0.34273
1,302,L.A. Confidential,la confidential,302,1.75379,1.23369,-1.512195,0.74263,-0.91908,-1.27909,...,1.65999,-1.17336,1.52652,-0.72051,-1.4365,-1.06,-1.4333,0.3043,1.290844,1.19421


In [12]:
def model_machine(data):
    # Define the split criterion
    fold_type = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)
    splits = fold_type.split(X=data, y=data.user_id)
    
    # Perform 5-fold scores
    for fold, (train_idx, val_idx) in enumerate(splits):
        train_data = master_df.iloc[train_idx]
        val_data = master_df.iloc[val_idx]
        
        # Select the important features 
        features_to_use = ["user_id", "item_id"]
        target_variable = "rating"

        # Preprocess the data
        train_label_pipe = train_data[target_variable] - 1
        train_data_pipe  = train_data[features_to_use] - 1

        val_label_pipe = val_data[target_variable] - 1
        val_data_pipe  = val_data[features_to_use] - 1

        # Create dataset
        train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1" : train_data_pipe["user_id"].values,
                                                             "input_2": train_data_pipe["item_id"].values},
                                                             train_label_pipe.values)).batch(256)
        val_dataset = tf.data.Dataset.from_tensor_slices(({"input_1" : val_data_pipe["user_id"].values,
                                                           "input_2": val_data_pipe["item_id"].values},
                                                           val_label_pipe.values)).batch(256)

        # Train and save
        num_users = len(np.unique(data["user_id"]))
        num_items = len(np.unique(data["item_id"]))
        model_curr = collaborative_model(num_users=num_users, num_items=num_items)
        history = model_curr.fit(train_dataset, validation_data=val_dataset, epochs=5, verbose=False)
        
        # Model accuracy train and val
        model_metrics = history.history
        
        
        # Create the Engine to make predictions
        all_movies_id = csim_df["item_id"]
        all_movies_names = csim_df["movie_title"]
        engine = Neural_Collaborative_Recommeder_Engine(model=model_curr,
                                                        train_data=train_data,
                                                        val_data=val_data,
                                                        unique_data=csim_df)
        
        # Collect the scores
        scores_iou = []
        scores_map = []
        len_one = []
        len_two = []
        for user in tqdm(val_data.user_id.drop_duplicates()):
            y_preds, y_preds_ids = engine.get_recommendation(user_id=user, top_many=10, pp=True)
            scores = engine.get_score(y_pred=y_preds, user_id=user, y_pred_ids=y_preds_ids)
            scores_iou.append(scores[0])
            scores_map.append(scores[1])
            len_one.append(scores[2])
            len_two.append(scores[3])
        
        # Print the metrics
        print("Fold-%d RMSE || Train_RMSE : %.4f | Val_RMSE : %.4f" % (fold + 1,
                                                                         model_metrics["root_mean_squared_error"][-1],
                                                                         model_metrics["val_root_mean_squared_error"][-1]))
        
        print("Fold-%d IOU || Mean : %.4f | Max : %.4f | Min : %.4f" % (fold + 1,
                                                                            np.mean(scores_iou),
                                                                            np.max(scores_iou),
                                                                            np.min(scores_iou)))
        
        print("Fold-%d Intersection || Mean : %.4f | Max : %.4f | Min : %.4f" % (fold + 1,
                                                                                 np.mean(len_one),
                                                                                 np.max(len_one),
                                                                                 np.min(len_one)))
        
        print("Fold-%d Union || Mean : %.4f | Max : %.4f | Min : %.4f" % (fold + 1,
                                                                          np.mean(len_two),
                                                                          np.max(len_two),
                                                                          np.min(len_two)))
        
        print("Fold-%d MAP@K || Mean : %.4f | Max : %.4f | Min : %.4f" % (fold + 1,
                                                                            np.mean(scores_map),
                                                                            np.max(scores_map),
                                                                            np.min(scores_map)))

In [13]:
model_machine(master_df)

  0%|          | 0/943 [00:00<?, ?it/s]

Fold-1 RMSE || Train_RMSE : 0.8509 | Val_RMSE : 0.9203
Fold-1 IOU || Mean : 0.0163 | Max : 0.2500 | Min : 0.0000
Fold-1 Intersection || Mean : 0.5854 | Max : 10.0000 | Min : 0.0000
Fold-1 Union || Mean : 28.9958 | Max : 154.0000 | Min : 7.0000
Fold-1 MAP@K || Mean : 0.0101 | Max : 0.2333 | Min : 0.0000


  0%|          | 0/943 [00:00<?, ?it/s]

Fold-2 RMSE || Train_RMSE : 0.8507 | Val_RMSE : 0.9270
Fold-2 IOU || Mean : 0.0166 | Max : 0.1852 | Min : 0.0000
Fold-2 Intersection || Mean : 0.5790 | Max : 10.0000 | Min : 0.0000
Fold-2 Union || Mean : 28.9862 | Max : 158.0000 | Min : 7.0000
Fold-2 MAP@K || Mean : 0.0100 | Max : 0.2381 | Min : 0.0000


  0%|          | 0/943 [00:00<?, ?it/s]

Fold-3 RMSE || Train_RMSE : 0.8355 | Val_RMSE : 0.9164
Fold-3 IOU || Mean : 0.0173 | Max : 0.2500 | Min : 0.0000
Fold-3 Intersection || Mean : 0.5790 | Max : 10.0000 | Min : 0.0000
Fold-3 Union || Mean : 28.9873 | Max : 158.0000 | Min : 7.0000
Fold-3 MAP@K || Mean : 0.0110 | Max : 0.2857 | Min : 0.0000


  0%|          | 0/943 [00:00<?, ?it/s]

Fold-4 RMSE || Train_RMSE : 0.8465 | Val_RMSE : 0.9249
Fold-4 IOU || Mean : 0.0190 | Max : 0.1818 | Min : 0.0000
Fold-4 Intersection || Mean : 0.6649 | Max : 10.0000 | Min : 0.0000
Fold-4 Union || Mean : 28.9099 | Max : 156.0000 | Min : 7.0000
Fold-4 MAP@K || Mean : 0.0129 | Max : 0.2500 | Min : 0.0000


  0%|          | 0/943 [00:00<?, ?it/s]

Fold-5 RMSE || Train_RMSE : 0.8267 | Val_RMSE : 0.9199
Fold-5 IOU || Mean : 0.0157 | Max : 0.1852 | Min : 0.0000
Fold-5 Intersection || Mean : 0.5695 | Max : 10.0000 | Min : 0.0000
Fold-5 Union || Mean : 28.9926 | Max : 155.0000 | Min : 7.0000
Fold-5 MAP@K || Mean : 0.0100 | Max : 0.2000 | Min : 0.0000
