In [1]:
import gc
import string
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import matplotlib.pyplot as plt
import seaborn as sns 

from tqdm import tqdm
import surprise

In [2]:
master_df = pd.read_csv("movie_lens_master.csv", low_memory=False)
master_df.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code,movie_id,movie_title,...,fantasy,file_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
0,196,242,3,881250949,49,M,writer,55105,242,Kolya,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,0,302,L.A. Confidential,...,0,1,0,0,1,0,0,1,0,0


In [3]:
def get_embeddings():
    # Fetch the glove embedding dictionary
    embedding_dict = {}
    with open("glove\glove.6B.50d.txt", "r", encoding="utf8") as f:
        for line in tqdm(f):
            placeholder = line.split()
            word = placeholder[0]
            values = np.array(placeholder[1:], dtype=np.float32)
            embedding_dict[word] = values
    
    # Return the embedding dict
    return embedding_dict

glove_50_embeds = get_embeddings()

400000it [00:07, 50951.41it/s]


In [4]:
def clean_text(text_input):
    """
    Text Cleaning Function
    """
    # strip and lowercase
    text_input = text_input.strip()
    text_input = text_input.lower()
    
    # split the text
    text_input = text_input.split()
    
    # remove punctuations
    punct = string.punctuation
    table = str.maketrans("", "", punct)
    text_input =" ".join([word.translate(table) for word in text_input])
    text_input = text_input.strip()
    
    # Return the text
    return text_input

# Append the cleaned title as well
master_df["cleaned_movie_title"] = master_df["movie_title"].apply(clean_text)

In [5]:
def title_to_vec(string_in, spatial_dim=50):
    split_data = string_in.split()
    matrix = np.zeros(spatial_dim, dtype=np.float32)
    for word in split_data:
        if word in glove_50_embeds.keys():
            matrix += glove_50_embeds[word]
        else:
            matrix += 0.0
    
    return matrix

In [6]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """
    Return precision and recall at k metrics for each user
    """
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_rating, pred_rating, _ in predictions:
        user_est_true[uid].append((pred_rating, true_rating))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_rating >= threshold) for (_, true_rating) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((pred_rating >= threshold) for (pred_rating, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_rating >= threshold) and (pred_rating >= threshold))
                              for (pred_rating, true_rating) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [7]:
def IOU(y_true, y_pred): 
    """
    Calculates the IOU over the y_true and y_preds based upon the length
    """
    # Handle the length change
    len_min = min(len(y_true), len(y_pred))

    # Truncate the lenght
    y_true = y_true[:len_min]
    y_pred = y_pred[:len_min]
      
    # Convert to set to do set operations
    set_y_true = set(list(y_true))
    set_y_pred = set(list(y_pred))
    
    # Set operation
    intersection = set_y_pred.intersection(set_y_true)
    union = set_y_pred.union(set_y_true)
    
    if len(union) == 0:
        return 0, len(intersection), len(union)
    else:
        # return the output
        return len(intersection) / len(union), len(intersection), len(union)

In [8]:
def SURPRISE_DATASET(data, is_train=True):
    """
    Creates a trainset compatiable with surprise models
    """
    # Create the reader scale 
    reader = surprise.Reader()

    # Load from dataframe
    dataset = surprise.Dataset.load_from_df(data[['user_id', 'item_id', 'rating']], reader=reader)

    # Convert to train dataset
    dataset = dataset.build_full_trainset()
    
    # Check if test or not
    if is_train == False:
        # Create the test set and return
        dataset = dataset.build_testset()
        
    # return the dataset
    return dataset

In [9]:
class Neural_Collaborative_Recommeder_Engine(object):
    def __init__(self, models, train_data, val_data, unique_data):
        # Init the instance variables
        self.models       = models
        self.train_data   = train_data
        self.val_data     = val_data
        self.unique_data  = unique_data
        self.total_rec = None
        
    def get_recommendation(self, user_id, top_many=20, pp=False):
        # Calculate the movies not watched
        movies_watched = self.train_data[self.train_data["user_id"] == user_id].item_id.values
        movies_not_watched = list(set(self.unique_data.item_id.tolist()) - set(movies_watched.tolist()))
        
        # True labels
        true_lbls = self.val_data[self.val_data["user_id"] == user_id]["movie_title"].drop_duplicates().values
        self.total_rec = top_many # len(true_lbls)
        
        # If movies unwatched list is empty 
        if len(movies_not_watched) == 0:
            return "User has watched all movies, no new movies to recommend (get a life bro!)"
        
        # Input to the model
        X = zip([(user_id - 1)] * len(movies_not_watched), list(movies_not_watched))
        
        X = pd.DataFrame({
            "user_id" : [(user_id - 1)] * len(movies_not_watched),
            "item_id" : list(movies_not_watched),
            "rating" : [0] * len(movies_not_watched)
        })
        
        X_dataset = SURPRISE_DATASET(data=X, is_train=False)
        
        # Predict from the mode the ratings
        if len(self.models) == 1:
            preds_labels = [i[3] for i in self.models[0].test(X_dataset)]
            item_ids = [i[1] for i in self.models[0].test(X_dataset)]
        else:
            # Perform ensemble to get the ratings
            preds_labels = np.zeros((len(movies_not_watched), 1))
            for model in self.models:
                preds_labels += np.array([i[3] for i in model.test(X_dataset)]).reshape(-1, 1)
            preds_labels /= len(self.models)
            item_ids = [i[1] for i in self.models[0].test(X_dataset)]
        
        # Collect the index 
        placeholder = zip(preds_labels, item_ids)
        placeholder = sorted(placeholder, key=lambda x:x[0], reverse=True)
        index = [placeholder[i][1] for i in range(0,  self.total_rec)]
        
        # Collect the movies at those positions 
        placeholder = self.unique_data.set_index("item_id")
        
        # After processing
        if pp:
            final_rec, index = self.post_processing_glove(placeholder=placeholder,
                                                          movie_ids=index,
                                                          user_id=user_id)
        else:
            final_rec = placeholder.loc[index].movie_title.values[:self.total_rec]
        
        # Return the recommendations
        return final_rec[:self.total_rec], index, true_lbls[:self.total_rec]
    
    def post_processing_glove(self, placeholder, movie_ids, user_id):
        # Fetch the movie ids and the cleaned movie_names
        # Choose the highest cosine similarity movies
        dict_final = {}
        
        # Train and val data
        recommend_data = placeholder.loc[movie_ids]
        watched_movies = self.train_data[self.train_data["user_id"] == user_id].item_id
        
        # Make the cosine similarity matrix 
        matrix_csim = cosine_similarity(X=placeholder.loc[watched_movies].values[:, -50:],
                                        Y=placeholder.loc[movie_ids].values[:, -50:])
        
        # Chose the top few movies
        for index, (item_id, csim_curr) in enumerate(zip(watched_movies, matrix_csim)):
            # Get the recommendations
            zipped_recc = zip(matrix_csim[index], recommend_data["movie_title"], recommend_data["movie_id"])
            dict_final[placeholder.loc[item_id]["movie_title"]] = sorted(zipped_recc, key = lambda x : x[0], reverse=True)
                             
        # Argsort and send the top 2 from each case
        final_list = []
        final_ids = []
        for i in range(6):
            for value in dict_final.values():
                _, movie, ids = value[i]
                if movie not in final_list:
                    final_list.append(movie)
                    final_ids.append(ids)
            if len(final_list) > self.total_rec:
                break
                    
        # Return the movie
        return final_list[:self.total_rec], final_ids[:self.total_rec]
            
    
    def get_score(self, user_id, y_pred, y_pred_ids, t=3.5):
        # For IOU
        y_true_ids = list(self.val_data[(self.val_data["user_id"] == user_id) & (self.val_data["rating"] >= t)]["item_id"].values)
        y_pred_ids = list(y_pred_ids)
        
        # Calculate the IOU
        score_iou, _, _ = IOU(y_true=y_true_ids, y_pred=y_pred_ids)
            
        # Return the scores
        return score_iou

In [10]:
# Make the cosine similarity dataframe
csim_df = master_df[["item_id", "movie_title", "cleaned_movie_title", "movie_id"]].drop_duplicates()
list_csim_data = []
for movie in csim_df["cleaned_movie_title"]:
    list_csim_data.append(title_to_vec(movie))

# Convert to array
csim_data = np.asarray(list_csim_data)

# Append the similarity vector to the dataframe 
csim_df = pd.concat((csim_df.reset_index(drop=True), pd.DataFrame(csim_data)), axis=1)
print("NaN value count : %d" % csim_df.isna().sum().any())

NaN value count : 0


In [11]:
def model_machine(pp=False):
    model_list = []
    # Perform 5-fold scores
    for fold in range(5):
        train_df = pd.read_csv("mvlens_split/fold_%d_train.csv" % (fold + 1), low_memory=False)
        val_df = pd.read_csv("mvlens_split/fold_%d_test.csv" % (fold + 1), low_memory=False)
        
        # Create the datasets
        train_dataset = SURPRISE_DATASET(data=train_df)
        train_dataset_rmse = SURPRISE_DATASET(data=train_df, is_train=False)
        val_dataset = SURPRISE_DATASET(data=val_df, is_train=False)
        
        # Create surprise model and train it 
        model_knnbaseline = surprise.prediction_algorithms.knns.KNNBaseline(verbose=False)
        model_knnbaseline.fit(train_dataset)
        model_list.append(model_knnbaseline)
        
        # Fetch the predictions
        preds_curr = model_knnbaseline.test(val_dataset)
        preds_train = model_knnbaseline.test(train_dataset_rmse)
        
        # rmse score
        rmse_score_val = surprise.accuracy.rmse(preds_curr, verbose=False)
        rmse_score_train = surprise.accuracy.rmse(preds_train, verbose=False)
        
        # Zip the predictions
        pre, rec = precision_recall_at_k(predictions=preds_curr, k=10, threshold=3.5)
        mean_pre = sum(p for p in pre.values()) / len(pre)
        mean_rec = sum(r for r in rec.values()) / len(rec)
        
        # Create the Engine to make predictions
        all_movies_id = csim_df["item_id"]
        all_movies_names = csim_df["movie_title"]
        engine = Neural_Collaborative_Recommeder_Engine(models=[model_knnbaseline],
                                                        train_data=train_df,
                                                        val_data=val_df,
                                                        unique_data=csim_df)
        
        # Collect the scores
        scores_iou = []
        for user in tqdm(val_df.user_id.drop_duplicates()):
            y_preds, y_preds_ids, _ = engine.get_recommendation(user_id=user, top_many=10, pp=pp)
            scores = engine.get_score(y_pred=y_preds, user_id=user, y_pred_ids=y_preds_ids)
            scores_iou.append(scores)
        
        # Print the metrics
        print("Fold-%d RMSE  || Train_RMSE : %.4f | Val_RMSE : %.4f" % (fold + 1,
                                                                       rmse_score_train,
                                                                       rmse_score_val))
        
        print("Fold-%d IOU   || Mean : %.4f | Max : %.4f" % (fold + 1,np.mean(scores_iou),
                                                           np.max(scores_iou),))
        
        print("Fold-%d MAP@K || Mean : %.4f" % (fold + 1, np.mean(mean_pre)))
        
        print("Fold-%d MAR@K || Mean : %.4f" % (fold + 1, np.mean(mean_rec)))
        
        print("\n")
        
    return model_list

In [12]:
model_list = model_machine()

100%|██████████| 459/459 [02:15<00:00,  3.39it/s]
Fold-1 RMSE  || Train_RMSE : 0.7348 | Val_RMSE : 0.9418
Fold-1 IOU   || Mean : 0.0055 | Max : 0.1111
Fold-1 MAP@K || Mean : 0.7672
Fold-1 MAR@K || Mean : 0.4430


100%|██████████| 653/653 [03:05<00:00,  3.53it/s]
Fold-2 RMSE  || Train_RMSE : 0.7344 | Val_RMSE : 0.9346
Fold-2 IOU   || Mean : 0.0035 | Max : 0.3333
Fold-2 MAP@K || Mean : 0.7352
Fold-2 MAR@K || Mean : 0.5055


100%|██████████| 869/869 [04:00<00:00,  3.62it/s]
Fold-3 RMSE  || Train_RMSE : 0.7357 | Val_RMSE : 0.9292
Fold-3 IOU   || Mean : 0.0033 | Max : 0.3333
Fold-3 MAP@K || Mean : 0.7039
Fold-3 MAR@K || Mean : 0.5553


100%|██████████| 923/923 [04:50<00:00,  3.18it/s]
Fold-4 RMSE  || Train_RMSE : 0.7365 | Val_RMSE : 0.9260
Fold-4 IOU   || Mean : 0.0019 | Max : 0.3333
Fold-4 MAP@K || Mean : 0.6988
Fold-4 MAR@K || Mean : 0.5649


100%|██████████| 927/927 [05:14<00:00,  2.95it/s]Fold-5 RMSE  || Train_RMSE : 0.7376 | Val_RMSE : 0.9299
Fold-5 IOU   || Mean : 0.0027 | Max : 0.333

In [13]:
_ = model_machine(pp=True)

100%|██████████| 459/459 [02:41<00:00,  2.84it/s]
Fold-1 RMSE  || Train_RMSE : 0.7348 | Val_RMSE : 0.9418
Fold-1 IOU   || Mean : 0.0055 | Max : 0.2500
Fold-1 MAP@K || Mean : 0.7672
Fold-1 MAR@K || Mean : 0.4430


100%|██████████| 653/653 [04:24<00:00,  2.47it/s]
Fold-2 RMSE  || Train_RMSE : 0.7344 | Val_RMSE : 0.9346
Fold-2 IOU   || Mean : 0.0022 | Max : 0.1765
Fold-2 MAP@K || Mean : 0.7352
Fold-2 MAR@K || Mean : 0.5055


100%|██████████| 869/869 [04:46<00:00,  3.03it/s]
Fold-3 RMSE  || Train_RMSE : 0.7357 | Val_RMSE : 0.9292
Fold-3 IOU   || Mean : 0.0029 | Max : 0.2000
Fold-3 MAP@K || Mean : 0.7039
Fold-3 MAR@K || Mean : 0.5553


100%|██████████| 923/923 [05:03<00:00,  3.04it/s]
Fold-4 RMSE  || Train_RMSE : 0.7365 | Val_RMSE : 0.9260
Fold-4 IOU   || Mean : 0.0015 | Max : 0.1250
Fold-4 MAP@K || Mean : 0.6988
Fold-4 MAR@K || Mean : 0.5649


100%|██████████| 927/927 [05:01<00:00,  3.07it/s]Fold-5 RMSE  || Train_RMSE : 0.7376 | Val_RMSE : 0.9299
Fold-5 IOU   || Mean : 0.0022 | Max : 0.250

In [14]:
def compare_true_rec(model_list, user_id, num_folds_to_use, train_data, val_data, top_many=10):
    print("User_id : %d" % user_id)
        
    # Make the recommendation engine
    all_movies_id = csim_df["item_id"]
    all_movies_names = csim_df["movie_title"]
    
    # Engine
    engine = Neural_Collaborative_Recommeder_Engine(models=model_list,
                                                    train_data=train_data,
                                                    val_data=val_data,
                                                    unique_data=csim_df)
    
    y_preds, _, true_lbls = engine.get_recommendation(user_id=user_id,
                                                      top_many=10,
                                                      pp=True)
    y_preds_pp, _, _ = engine.get_recommendation(user_id=user_id,
                                                 top_many=10,
                                                 pp=False)
    
    data_frame = pd.DataFrame({
        "true" : true_lbls,
        "recd_normal" : y_preds,
        "recd_post_process" : y_preds_pp,
    })
    
    return data_frame

In [15]:
# Declerations 
num_users = len(np.unique(master_df["user_id"]))
num_items = len(np.unique(master_df["item_id"]))
train_data = pd.read_csv("mvlens_split//fold_1_train.csv", low_memory=False)
val_data = pd.read_csv("mvlens_split/fold_1_test.csv", low_memory=False)
user_id = np.random.choice(val_data["user_id"].tolist(), size=1)[0]

# Get the dataframe
df = compare_true_rec(model_list=model_list, user_id=user_id,
                      num_folds_to_use=5,
                      train_data=train_data,
                      val_data=val_data)

User_id : 239


In [16]:
df.head(10)

Unnamed: 0,true,recd_normal,recd_post_process
0,Babe,"Great Day in Harlem, A",Santa with Muscles
1,Dead Man Walking,Pather Panchali,"Saint of Fort Washington, The"
2,Richard III,Some Mother's Son,Some Mother's Son
3,Ed Wood,Star Kid,Pather Panchali
4,Pulp Fiction,Bitter Sugar,Bitter Sugar
5,Quiz Show,Anna,Anna
6,"Fugitive, The","Saint of Fort Washington, The","Great Day in Harlem, A"
7,"Hudsucker Proxy, The",Santa with Muscles,Aiqing wansui
8,Blade Runner,Someone Else's America,Someone Else's America
9,Terminator 2: Judgment Day,Aiqing wansui,Star Kid
