In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import interp

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import classification_report,confusion_matrix, roc_curve, auc

In [16]:
class CodebustersRecommender():
    """ 
    Implements a three stage recommendation system with a gradient boosting classifier, 
    a content-based recommender, and collaborative filter
    """
    def __init__(self):
        return None
        
    def fit(self, df, movies, df_ratings):
        """
        df -> dataframe of entire movie database
        movies -> dataframe of movies metadata
        df_ratings -> dataframe of movies ratings
        fits all three steps
        """
        # 0) init
        self.df = df
        self.movies = movies
        self.df_ratings = df_ratings
        
        # 1) fit gradient boosting classifier
        print("fit step 1: gradient boosting classifier")
        self.recommendation_without_user_info(self.df)
        
        # 2) fit content-based recommender
        print("fit step 2: content-based filter")
        self.content_based_recommendation(self.movies)
        
        # 3) fit collaborative recommender
        print("fit step 3: collaborative filter")
        self.collaborative_recommendation(self.df, self.df_ratings)
        
        print("Fitting done")
        return self

    def predict(self, userID, age, gender, number_recommendations):
        """
        predicts depending on quality of user profile
        """
        
        # get userID, movie name, user age, and user gender, and user profile quality
        user_id = userID
        number_recommendations = number_recommendations
        name = self.get_user_movie(self.df, user_id)
        age = age
        gender = gender
        
        # user profile quality
        user_profile_quality = self.df[self.df.userID==userID].nunique().movieID
        
        # 1) predict with gradient boosting classifier
        # if no user ratings: needs  age, gender, number_recommendations
        if np.isnan(user_profile_quality):
            print("Predict with classifier without user info :")
            # concat user age and gender with movie information, and make predictions
            # e.g. user age 25 and male
            X = df[['age', 'gender', 'year', 'genre1', 'genre2', 'genre3']]
            # set age
            X['age'] = age
            dummyvars = pd.get_dummies(X[['gender', 'genre1', 'genre2', 'genre3']])
            # set gender
            dummyvars['gender_F'] = 0
            dummyvars['gender_M'] = 0
            if gender=='M':
                dummyvars['gender_M'] = 1
            elif gender=='F':
                dummyvars['gender_F'] = 1
            # append the dummy variables to df
            X = pd.concat([X[['age', 'year']], dummyvars], axis = 1).as_matrix()

            # make predictions
            y_pred = self.gbclf.predict(X=X)

            # concat predictions to movie information
            df_pred = pd.concat([df[['movieID', 'name']], pd.DataFrame(y_pred, index=df.index, columns=['pred_rating'])], axis = 1)
            # shuffle 5 random movies with rating 1
            df_pred = df_pred[df_pred.pred_rating==1]
            recommendation = df_pred.drop('pred_rating', axis=1).sample(number_recommendations, random_state=10).set_index('movieID')

            return recommendation
        
        # 2) predict with content-based recommender
        # if <=20 user ratings: needs userID, name, number_recommendations
        if user_profile_quality < 20:
            print("Predict with content-based filter:")
            # get name of movie user already rated
            name = self.get_user_movie(df=df, user_ID=user_id)
            # Build a 1-dimensional array with movie titles
            indices = pd.Series(self.movies.index, index=self.movies['name'])
            # Ranks movies according to similarity to requested movie
            idx = indices[name]
            sim_scores = list(enumerate(self.cosine_sim[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[1:(number_recommendations+1)]
            movie_indices = [i[0] for i in sim_scores]
            return self.movies.name.iloc[movie_indices]
        
        # 3) predict with collaborative recommender
        # if >20 user ratings: needs user_id, number_recommendations
        if user_profile_quality >= 20:
            print("Predict with collaborative filter:")
            movies  = self.top_k_movies(self.item_similarity, user_id, number_recommendations)
            return df.loc[movies[0]-1, 'name']

    
    def recommendation_without_user_info(self, df):
        '''
        function returns 5 random movies which have been recommended by a gradient boosting classifier
        without user information

        @param df: movie dataset 'allData'
        '''
        # fit
        # ---------------------------------------------------
        # User information before any movie ratings
        X = df[['age', 'gender', 'year', 'genre1', 'genre2', 'genre3']]
        y = df['rating'].as_matrix()

        # Preprocessing
        # One hot encoding
        dummyvars = pd.get_dummies(X[['gender', 'genre1', 'genre2', 'genre3']])
        # append the dummy variables to df
        X = pd.concat([X[['age', 'year']], dummyvars], axis = 1).as_matrix()

        print("GradientBoostingClassifier")
        self.gbclf = GradientBoostingClassifier(n_estimators=100)
        self.gbclf.fit(X=X, y=y)
        
        return self
    
    
    # look up a movie that the user has already rated
    def get_user_movie(self, df, user_ID):
        '''
        returns a rated movie from userID

        @param df: movie dataset 'allData'
        @param user_id: target user_ID
        '''
        # return data from random sampled row of user
        df_liked = df[df.rating==1]
        movie = df[df['userID']==747].sample(1).name

        # strip space at the end before return
        return movie.item().rstrip()

    # Fits the content-based recommender: cosine similarity between movies based on their genres
    def content_based_recommendation(self, movies):
        '''
        Recommends number of similar movie based on movie title and similarity to movies in movie database

        @param movies: pandas dataframe with movie dataset with columns (movieID, name, genres_concat)
        @param name: movie title as string
        @param number_recommendations: number of recommendations returned as integer
        '''
        # fit
        # ---------------------------------------------------
        # Preprocessing for tf-idf vectorization
        # Strip space at the end of string
        movies['name'] = movies['name'].str.rstrip()
        # Concat genres into one string
        movies['genres_concat'] = movies[['genre1', 'genre2', 'genre3']].astype(str).apply(' '.join, axis=1)
        # Remove nans in string and strip spaces at the end
        movies['genres_concat'] = movies['genres_concat'].str.replace('nan','').str.rstrip()

        # Create tf_idf matrix sklearn TfidfVectorizer
        tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
        tfidf_matrix = tf.fit_transform(movies['genres_concat'])

        # calculate similarity matrix with cosine distance of tf_idf values
        self.cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


    def fast_similarity(self, m_ratings, kind='user', epsilon=1e-9):
        '''
        compute the similarity
        '''
        # epsilon -> small number for handling dived-by-zero errors
        if kind == 'user':
            sim = m_ratings.dot(m_ratings.T) + epsilon
        elif kind == 'item':
            sim = m_ratings.T.dot(m_ratings) + epsilon
        norms = np.array([np.sqrt(np.diagonal(sim))])
        return (sim / norms / norms.T)

    def top_k_movies(self, similarity, movie_idx, k=6):
        return [np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

    def collaborative_recommendation(self, all_data, df_ratings):
        '''
        Recommends number of similar movies based on user item similarity

        @param df_ratings: rating file from MovieLens dataset
        @param userID: userID
        @param number_recommendations: number of recommendations returned as integer
        @param number_recommendations: number of recommendations returned

        '''
        # fit
        # ---------------------------------------------------
        # Below code creates two new columns for user id and movie id to facilitate the creation of the user item matrix
        from itertools import cycle
        n_users = df_ratings.userID.unique().shape[0]
        n_movies = df_ratings.movieID.unique().shape[0]

        l_users = cycle(list(range(n_users)))
        l_movies = list(range(n_movies))
        df_ratings['user_id'] = df_ratings['userID'].astype("int")
        df_ratings['movie_id'] = df_ratings['movieID'].astype("int")
        df_ratings['movieID'] = df_ratings['movieID'].astype("int")
        #df_ratings['movie_id2'] = df_ratings['movie_id'].astype("str")
        current_idm = 1
        current_idu = 747
        indm = 1
        indu = 1
        listMID = list(df_ratings["movieID"])
        for idx, row in df_ratings.iterrows():
            new_idm = int(df_ratings.loc[idx, 'movieID'])
            #intialize the  foudn movie id in list
            foundm = False
            for k in range(1465):
                if new_idm in listMID:
                    #get the index
                    lind = listMID.index(new_idm)
                    #update the movie_id
                    df_ratings.loc[lind, 'movie_id'] = indm
                    #now set that list item to zero
                    listMID[lind]=0
                    foundm = True
                else:
                    #break and fetch a new row
                    break
            #increment the indicator
            if foundm:
                indm+=1
            #current_idm = new_idm

            #there is a bit of logic problem here...
            new_idu = int(df_ratings.loc[idx, 'userID'])
            if new_idu==current_idu:
                df_ratings.loc[idx, 'user_id'] = indu
            else:
                indu+=1
                current_idu = new_idu
                df_ratings.loc[idx, 'user_id'] = indu

        ## construct a user item matrix
        m_ratings = np.zeros((n_users, n_movies))
        for row in df_ratings.itertuples():
            #row[3] will be user rating row[4] user_id and row[5] movie_id  
            m_ratings[row[4]-1, row[5]-1] = row[3]

        # get item similarity matrix
        self.item_similarity = self.fast_similarity(m_ratings, kind='item')
