In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
np.seterr(divide='ignore', invalid='ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movies-1m-dataset/ml-1m/users.dat
/kaggle/input/movies-1m-dataset/ml-1m/ratings.dat
/kaggle/input/movies-1m-dataset/ml-1m/README
/kaggle/input/movies-1m-dataset/ml-1m/movies.dat


In [39]:
class Data:
    def __init__(self,movies_file,ratings_file,users_file):
        self.movies_file = movies_file
        self.ratings_file = ratings_file
        self.users_file = users_file
        self.movies()
        self.users()
        self.ratings()
        self.generate_data_matrix()
        self.calculate_user_bias()
        self.calculate_movie_weights()
        self.calculate_unbiased_data_matrix()

    def calculate_unbiased_data_matrix(self):
        self.unbiased_data_matrix = self.data_matrix - np.expand_dims(self.bias,axis =-1)          
    
    def calculate_movie_weights(self):
        self.movie_weights = np.log(np.ones(self.data_matrix.shape[1])*self.data_matrix.shape[0]/np.sum(np.isnan(self.data_matrix),axis = 0))

    def calculate_user_bias(self):
        self.bias = np.nanmean(self.data_matrix, axis=1)

    def generate_data_matrix(self):
        # Generate data matrix
        self.data_matrix = self.ratings.pivot(index='UserID',columns='MovieID',values='Rating')
        self.data_matrix = self.data_matrix.fillna(np.nan)
        self.data_matrix = np.array(self.data_matrix)

    def movies(self):
        # Extract movies data
        with open(self.movies_file,'rb') as f:
            data=[]
            for line in f.readlines():
                line = line.decode('unicode_escape')
                line = line.strip()
                line = line.split('::')
                data.append([line[0],line[1],line[2].split('|')])
            self.movies = pd.DataFrame(data,columns=['MovieID','Title','Genre'])
            self.ids_to_index = {}
            for i in self.movies.index:
                self.ids_to_index[self.movies['MovieID'][i]] = i
            self.movies['MovieID'] = self.movies['MovieID'].astype(int)

    def ratings(self):
        # Extract ratings data
        with open(self.ratings_file,'rb') as f:
            data=[]
            for line in f.readlines():
                line = line.decode('unicode_escape')
                line = line.strip()
                line = line.split('::')
                data.append(line)
            self.ratings = pd.DataFrame(data,columns=['UserID','MovieID','Rating','Timestamp'])
            self.ratings['Rating'] = self.ratings['Rating'].astype(int)
            self.ratings['UserID'] = self.ratings['UserID'].astype(int)
            self.ratings['MovieID'] = self.ratings['MovieID'].astype(int)

    def users(self):
        # Extract users data
        with open(self.users_file,'rb') as f:
            data=[]
            for line in f.readlines():
                line = line.decode('unicode_escape')
                line = line.strip()
                line = line.split('::')
                data.append(line)
            self.users = pd.DataFrame(data,columns=['UserID','Gender','Age','Occupation','Zip-code'])
            self.users['UserID'] = self.users['UserID'].astype(int)
            
    def get_movies(self,ids,ratings):
#         print(self.movies['Title'][ids])
#         movies = []
#         for j,i in enumerate(ids):
#             movies.append((self.movies['Title'][i],self.movies['Genre'][i],ratings[j]))
        movies =pd.DataFrame({'Title':self.movies['Title'][ids],'genre':self.movies['Genre'][ids],'ratings':ratings})

        return movies

In [40]:
%%time
d=Data('/kaggle/input/movies-1m-dataset/ml-1m/movies.dat','/kaggle/input/movies-1m-dataset/ml-1m/ratings.dat','/kaggle/input/movies-1m-dataset/ml-1m/users.dat')

CPU times: user 5.39 s, sys: 198 ms, total: 5.59 s
Wall time: 5.5 s


In [41]:
class Item_recommendation_system:
    def __init__(self,data):
        self.data = data
        
    def calculate_similarity_biased_user(self,custom_ratings):
        user_ratings_matrix = self.data.data_matrix
        common_ratings_mask = ~np.isnan(custom_ratings)*~np.isnan(user_ratings_matrix)
        dot_product = np.nansum(custom_ratings* user_ratings_matrix, axis=1)
        norm_custom = np.sqrt(np.nansum(custom_ratings*common_ratings_mask*custom_ratings, axis=1))
        norm_users = np.sqrt(np.nansum(user_ratings_matrix*common_ratings_mask*user_ratings_matrix, axis=1))
        similarity = dot_product / (norm_custom * norm_users)
        similarity[np.isnan(similarity) | np.isinf(similarity)] = 0
        return similarity

    def calculate_similarity_user(self,custom_ratings):
        user_ratings_matrix = self.data.unbiased_data_matrix
        common_ratings_mask = ~np.isnan(custom_ratings)*~np.isnan(user_ratings_matrix)
        dot_product = np.nansum(custom_ratings* user_ratings_matrix, axis=1)
        norm_custom = np.sqrt(np.nansum(custom_ratings*common_ratings_mask*custom_ratings, axis=1))
        norm_users = np.sqrt(np.nansum(user_ratings_matrix*common_ratings_mask*user_ratings_matrix, axis=1))
        similarity = dot_product / (norm_custom * norm_users)
        similarity[np.isnan(similarity) | np.isinf(similarity)] = 0
        return similarity
    
    def calculate_similarity_weighted_user(self,custom_ratings):
        user_ratings_matrix = self.data.unbiased_data_matrix
        common_ratings_mask = ~np.isnan(custom_ratings)*~np.isnan(user_ratings_matrix)
        dot_product = np.nansum(custom_ratings* user_ratings_matrix*self.data.movie_weights, axis=1)
        norm_custom = np.sqrt(np.nansum(common_ratings_mask*custom_ratings*self.data.movie_weights*custom_ratings, axis=1))
        norm_users = np.sqrt(np.nansum(common_ratings_mask*user_ratings_matrix*self.data.movie_weights*user_ratings_matrix, axis=1))
        similarity = dot_product / (norm_custom * norm_users)
        similarity[np.isnan(similarity) | np.isinf(similarity)] = 0
        return similarity

    def get_recommendations_cosine(self, choices, n=5):
        custom_ratings = np.empty(self.data.data_matrix.shape[1])
        custom_ratings.fill(np.nan)
        for i in choices.keys():
            custom_ratings[i-1] = choices[i]
        similarities = self.calculate_similarity_biased_user(custom_ratings)
        valid_ratings_mask = ~np.isnan(self.data.data_matrix)
        weighted_ratings = self.data.data_matrix * similarities[:, np.newaxis]
        sum_weighted_ratings = np.nansum(weighted_ratings, axis=0)
        sum_similarities = np.sum(valid_ratings_mask*np.abs(similarities)[:, np.newaxis], axis=0)
        predicted_ratings = sum_weighted_ratings / sum_similarities
        predicted_ratings[np.isnan(predicted_ratings)] = 0
        top_n_indices = np.argsort(predicted_ratings)[::-1][:n]
        top_n_ratings = predicted_ratings[top_n_indices]
        return self.data.get_movies(top_n_indices,top_n_ratings)
    
    def get_recommendations_pearson(self, choices, n=5):
        custom_ratings = np.empty(self.data.unbiased_data_matrix.shape[1])
        custom_ratings.fill(np.nan)
        for i in choices.keys():
            custom_ratings[i-1] = choices[i]
        bias = np.nanmean(custom_ratings)
        custom_ratings -= bias
        similarities = self.calculate_similarity_user(custom_ratings)
        valid_ratings_mask = ~np.isnan(self.data.unbiased_data_matrix)
        weighted_ratings = self.data.unbiased_data_matrix * similarities[:, np.newaxis]

        sum_weighted_ratings = np.nansum(weighted_ratings, axis=0)
        sum_similarities = np.sum(valid_ratings_mask*np.abs(similarities)[:, np.newaxis], axis=0)

        predicted_ratings = sum_weighted_ratings / sum_similarities
        predicted_ratings += bias
        predicted_ratings[np.isnan(predicted_ratings)] = 0

        top_n_indices = np.argsort(predicted_ratings)[::-1][:n]
        top_n_ratings = predicted_ratings[top_n_indices]
        return self.data.get_movies(top_n_indices,top_n_ratings)
    
    def get_recommendations_weighted(self, choices, n=5):
        custom_ratings = np.empty(self.data.unbiased_data_matrix.shape[1])
        custom_ratings.fill(np.nan)
        for i in choices.keys():
            custom_ratings[i-1] = choices[i]
        bias = np.nanmean(custom_ratings)
        custom_ratings -= bias
        similarities = self.calculate_similarity_weighted_user(custom_ratings)
        valid_ratings_mask = ~np.isnan(self.data.unbiased_data_matrix)
        weighted_ratings = self.data.unbiased_data_matrix * similarities[:, np.newaxis]

        sum_weighted_ratings = np.nansum(weighted_ratings, axis=0)
        sum_similarities = np.sum(valid_ratings_mask*np.abs(similarities)[:, np.newaxis], axis=0)

        predicted_ratings = sum_weighted_ratings / sum_similarities
        predicted_ratings += bias
        predicted_ratings[np.isnan(predicted_ratings)] = 0

        top_n_indices = np.argsort(predicted_ratings)[::-1][:n]
        top_n_ratings = predicted_ratings[top_n_indices]
        return self.data.get_movies(top_n_indices,top_n_ratings)
    
    def print_choices(self,choices):
        print("Movies:")
        custom_ratings = np.empty(self.data.unbiased_data_matrix.shape[1])
        custom_ratings.fill(np.nan)
        for i in choices.keys():
            custom_ratings[i-1] = choices[i]
        movie_indexs = np.argsort(custom_ratings)[:len(choices)]
        return self.data.get_movies(movie_indexs,custom_ratings[movie_indexs])
    
   

In [42]:
%%time
recommender=Item_recommendation_system(d)
# Input on the basis of line numbers
choices = {48:5,236:5,242:5,311:5,361:5, 363:1, 388:1, 420:1} #  Animation Children's
choices = {48:1,236:1,242:1,70:5,271:5,311:1,361:1} #Horror
choices = {374:5, 375:5, 377:5, 379:1, 386:5} # Romance
print(recommender.print_choices(choices),sep='\n')
print()
print("Recommendations based on cosine similarity")
print(recommender.get_recommendations_cosine(choices, 5),sep='\n')
print()
print("Recommendations based on pearson similarity")
print(recommender.get_recommendations_pearson(choices, 5),sep='\n')
print()
print("Recommendations based on weighted pearson similarity")
print(recommender.get_recommendations_weighted(choices, 5),sep='\n')

Movies:
                          Title                                 genre  ratings
378                 Wolf (1994)                       [Drama, Horror]      1.0
374           Speechless (1994)                     [Comedy, Romance]      5.0
373                Speed (1994)           [Action, Romance, Thriller]      5.0
385  Colonel Chabert, Le (1994)                 [Drama, Romance, War]      5.0
376            True Lies (1994)  [Action, Adventure, Comedy, Romance]      5.0

Recommendations based on cosine similarity
                                Title            genre  ratings
1882                   Oliver! (1968)        [Musical]      5.0
1317        Body Snatcher, The (1945)         [Horror]      5.0
558   Welcome to the Dollhouse (1995)  [Comedy, Drama]      5.0
2725         European Vacation (1985)         [Comedy]      5.0
2722                 Airplane! (1980)         [Comedy]      5.0

Recommendations based on pearson similarity
                                             