__Recommendation system__
========

## Reading ratings

In [1]:
import pandas as pd
import numpy as np

class UserItemData:
    def __init__(self, path, start_date=None, end_date=None, min_ratings=0):
        #defining the variables in the constructor
        self.path = path
        self.start_date = start_date
        self.end_date = end_date
        self.min_ratings = min_ratings
        self.rating_num = 0
        
        #reading the data with pandas
        self.data = pd.read_csv(self.path, sep='\t')
        
        if self.start_date is not None :
            self.data = self.date_from()
        if self.end_date is not None:
            self.data = self.date_to()
        
        #making our user x movies matrix 
        self.data = self.data.pivot(index='userID', columns='movieID', values='rating')
        
        #delete the movies with lower ratings than the min_rating
        self.data = self.data.loc[:, (self.data.notnull().sum(axis=0) > self.min_ratings)]
        
        #counting the ratings number
        for i,j in self.data.iterrows():
            self.rating_num += j.count()
        

    #delete the ratings from the specified date(start_date)
    def date_from(self):
        day_s, month_s, year_s = self.start_date.split(".")
        
        filtered = self.data[(self.data['date_year'] >= int(year_s))]
        filtered = filtered.drop(filtered[(filtered['date_year'] <= int(year_s)) & (filtered['date_month'] <= int(month_s)) & (filtered['date_day'] < int(day_s))].index)
        filtered = filtered.drop(filtered[(filtered['date_year'] <= int(year_s)) & (filtered['date_month'] < int(month_s))].index)
        return filtered
    
    #delete the ratings to the specified date(end_date)
    def date_to(self):
        day_e, month_e, year_e = self.end_date.split(".")

        filtered = self.data[(self.data['date_year'] <= int(year_e))]
        filtered = filtered.drop(filtered[(filtered['date_year'] >= int(year_e)) & (filtered['date_month'] >= int(month_e)) & (filtered['date_day'] >= int(day_e))].index)
        filtered = filtered.drop(filtered[(filtered['date_year'] >= int(year_e)) & (filtered['date_month'] > int(month_e))].index)
        return filtered
       
    #return the number of ratings
    def nratings(self):
        return self.rating_num   
    
    #function to add a new user to the data
    def add_user(self, d, user_id):
        if user_id not in self.data.index.tolist():
            self.data = self.data.append(pd.Series(name=user_id, dtype = "float64"))
            for k in d.keys():
                self.data.at[user_id,k] = d[k]        

In [2]:
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
72784


## Reading movies

In [3]:
class MovieData:
    def __init__(self, path):
        #defining the variables and reading the movies data
        self.path = path
        self.data = self.data = pd.read_csv(self.path, sep='\t', encoding = "ISO-8859-1")
        
    #returning the name of the movie based on its index
    def get_title(self, n):
        mov = self.data[(self.data['id'] == n)]    
        mov = (mov['title'])
        seznam = mov.tolist()
        return seznam[0]
        

In [4]:
md = MovieData('data/movies.dat')
print(md.get_title(1))

Toy story


## Random predictor

In [5]:
import random

class RandomPredictor:
    def __init__(self, min_value, max_value):
        self.min_value = min_value
        self.max_value = max_value
    
    #load the class UserItemData
    def fit(self, x):
        self.x = x
        
    def predict(self, user_id):
        d = {}      
        #iterating over the user's movies and giving them a random rating
        for i,j in self.x.data.loc[user_id].iteritems():
            if j:
                j = random.randint(self.min_value, self.max_value)
            
            d[i] = j
            
        return d

In [6]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

Film: Toy story, ocena: 2
Film: Grumpy Old Men, ocena: 5
Film: Money Train, ocena: 5
Film: The Usual Suspects, ocena: 5
Film: City Hall, ocena: 5


## Recommendation

In [7]:
from math import sqrt

class Recommender:   
    #getting the predictor in the constructor
    def __init__(self, predikator):
        self.predikator = predikator
    
    #fitting the data in the Recommender and the predictor
    def fit(self, x):
        self.x = x
        self.predikator.fit(self.x)
        self.m = self.x.data.mean(axis=1)
            
    def recommend(self, userID, n=10, rec_seen=True):
        #getting a dictionary of predictions for the specified user
        d = (self.predikator.predict(userID))
        
        #here if we don't want to see already watched movies, I delete them from the predictions dictionary
        if rec_seen == False:
            seznam = [i for i in self.x.data.loc[userID][self.x.data.loc[userID].notnull()].index]
            for i in seznam:
                if i in d.keys():
                    del d[i]
        
        #sorting the dictionary by the predicted values
        d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
        
        #making a list of the top predictions and returning it
        s = []
        for i,j in enumerate(d.keys()):
            if i == n:
                break
            s.append((j,d[j]))
            
        return s
    
    #this is a method for evaluating the predictor, calculating(rmse,mae,precision,recall and f score)
    def evaluate(self, uim_test, n):
        train_data = self.x.data
        train_data = train_data.fillna(0) #replace the Nan values with zeros for comparing
        
        #get the interseciton for users between the test and train data
        users = list(set(self.x.data.index.tolist()) & set(uim_test.data.index.tolist()))
        
        #make a recommeder for the train data
        rec = Recommender(self.predikator)
        rec.fit(uim_test)
        
        #arrays for the different scores 
        mse_list = []
        mae_list = []
        precision_list = []
        recall_list = []
        
        #iterating over the users
        for i in users:
            views = 0
            pred = {}
            
            #getting the top (n) number of recommended movies
            for k,v in rec.recommend(i, n=n, rec_seen=False):
                pred[k] = v
            
            #going over the movies and checking if the user has rated them in the train data,
            #and calculating the mse/mae between the (rating and prediction_rating)
            #also counting the number of movies watched from our recommendations
            for m in pred.keys(): 
                if m in train_data.columns.tolist():
                    if train_data.loc[i][m] != 0:
                        mse_list.append((self.x.data.loc[i][m] - pred[m])**2)
                        mae_list.append(abs(self.x.data.loc[i][m] - pred[m]))
                        
                        #if train_data.loc[i][m] > self.m.loc[i]: 
                        views += 1
             
            precision_list.append(views / n)
            if self.x.data.loc[i].count() > 0:
                recall_list.append(views / self.x.data.loc[i].count())
 
        #the first value was 0.0 for both arrays(manually checked for the first itteration, wasn't 0.0) so i delete them
        del precision_list[0]
        del recall_list[0]
       
        #getting the final scores, by the formulas from vaje
        mse = sqrt(sum(mse_list) / len(mse_list))
        mae = sum(mae_list) / len(mae_list)
        precision = sum(precision_list) / len(precision_list)
        recall = sum(recall_list) / len(recall_list)
        f = (2 * precision * recall) / (precision + recall)
        
        return mse, mae, precision, recall, f

In [8]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Grumpy Old Men, ocena: 5
Film: Father of the Bride Part II, ocena: 5
Film: Copycat, ocena: 5
Film: Leaving Las Vegas, ocena: 5
Film: Dead Man Walking, ocena: 5


## Average Predictor

In [9]:
class AveragePredictor:
    def __init__(self, b=0):
        self.b = b
        if b < 0:
            self.b = 0
        
    def fit(self, x):
        self.x = x
        
        self.d = {}
        
        #the average for all movies
        g_avg = self.x.data.sum().sum() / self.x.rating_num 
        #iterating over the movies
        for i in self.x.data:
            vs=self.x.data[i].sum() #sum of ratings for a movie(not counting Nan values)
            n=self.x.data[i].count() #number of ratings for a movie(not counting Nan values)
            
            #calculating the average with the formula
            #avg = (vs + b * g_avg) / (n + b)
            avg = (vs + self.b * g_avg) / (n + self.b)
            
            self.d[i] = avg
    
    #returning the dictionary
    def predict(self, userID):
        return self.d

In [10]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor()
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Brother Minister: The Assassination of Malcolm X, ocena: 5.0
Film: Synthetic Pleasures, ocena: 5.0
Film: Gabbeh, ocena: 5.0
Film: Storefront Hitchcock, ocena: 5.0
Film: Ko to tamo peva, ocena: 5.0


In [11]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
ap = AveragePredictor(b=100)
rec = Recommender(ap)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


## Recommend the most watched movies

In [12]:
class ViewsPredictor:
    def fit(self, x):
        self.x = x
    
    #just counting the number of reviews and storing them in a dictionary
    def predict(self, userID):
        d = {}
        for i in self.x.data:
            d[i] = self.x.data[i].count()
        return d
    

In [13]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
vp = ViewsPredictor()
rec = Recommender(vp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404


## Recommendation of controversial films

In [14]:
class STDPredictor:
    def __init__(self, n):
        self.n = n
        
    def fit(self, x):
        self.x = x
    
    #first we get movies which have at least n ratings, than we calculate with the standard deviation value
    def predict(self, userID):
        d = {}
        for i in self.x.data:
            if self.x.data[i].count() >= self.n:
                d[i] = self.x.data[i].std()
                
        return d

In [15]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = STDPredictor(100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Plan 9 from Outer Space, ocena: 1.3449520951495715
Film: The Passion of the Christ, ocena: 1.2814934595257372
Film: The Texas Chainsaw Massacre, ocena: 1.2353493219088192
Film: Jackass Number Two, ocena: 1.2189769976366682
Film: White Chicks, ocena: 1.189958142429732


## Prediction of estimates with similarity between products

In [16]:
from math import sqrt
import itertools

class ItemBasedPredictor:
    def __init__(self, min_values = 0, threshold = 0):
        #setting the variables
        self.min_values = min_values
        self.threshold = threshold
        
    def fit(self, x):
        #fitting the data
        self.x = x
        
        #get all movies and make all possible permutations of them
        self.all_movies = self.x.data.columns.tolist()
        permutations = list(itertools.permutations(self.all_movies, 2))
        self.sim = {}
        
        #iterate trough the permutations and calculate the similarity between them
        #we get similarities between each of the movies
        for i,j in permutations:
            if (j,i) in self.sim.keys():
                pass
            else:
                self.sim[(i,j)] = self.similarity(i,j)    
        
    def predict(self, userID):
        d = dict()
        #get the movies which the user has not rated and predict the rating based on the adjusted cosine similarities
        movies = self.x.data.loc[userID][self.x.data.loc[userID].isnull()].index.tolist()
        
        #iterate over the movies array and calculate the predicted rating
        for i in movies:
            gore = 0
            dolu = 0
            for (j,_) in self.x.data.iteritems():
                if i != j and j not in movies:
                    avg = self.x.data.loc[userID][j]
                    x = tuple(sorted([i,j]))
                    gore += self.sim[x] * avg
                    dolu += self.sim[x]
                  
            if i not in d.keys() and dolu > 0:
                d[i] = gore / dolu
        #we get a dictionary with predicted ratings for the movies
        return d
    
    def similarity(self, p1, p2):
        #matrix of the intersection of p1 and p2 where both movies are rated
        matrix = self.x.data[(self.x.data[p1].notnull()) & (self.x.data[p2].notnull())]  
        
        #making arrays of the values in the matrix for calculating the similarity
        p1_array = np.array(matrix[p1])
        p2_array = np.array(matrix[p2])
        mean_array = np.array(matrix.mean(axis=1))
        
        #calculating the adjusted cosine similarity with vectorized operations on the arrays
        gore = sum((p1_array - mean_array)*(p2_array - mean_array))
        prv_koren = sum((p1_array - mean_array)**2)
        vtor_koren = sum((p2_array - mean_array)**2)
        
        #number of users that rated the movies
        num_users = len(matrix.index)
        
        #similarity by the adjusted cosine similarity formula
        sim = gore / (sqrt(prv_koren) * sqrt(vtor_koren))
        
        #if the sim is beneath the threshold or number of users is lower than min_values return 0.0 similarity
        if sim < self.threshold or num_users < self.min_values:
            return 0.0
        
        return sim

    def sim_movies(self, n):
        #sort the dictionary of similarities and return (n) most similar items 
        d = {k: v for k, v in sorted(self.sim.items(), key=lambda item: item[1], reverse=True)}
        s = []
        for i,j in enumerate(d.keys()):
            if i == n:
                break
            s.append((j,d[j]))
            
        return s
    
    def similarItems(self, item, n):
        d = {}
        #go through all the movies and make a tuple of movie(item) with all the rest of them
        for m in self.all_movies:
            if m != item:
                k = tuple(sorted([item,m]))
                d[m] = self.sim[k] #save the tuple of (item, other_movie) and the similarity between them
                
        #sort the dictionary and return (n) movies with the greatest similarity
        d = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
        
        s = []
        for i, k in enumerate(d.keys()):
            if i == n:
                break
            s.append((k,d[k]))
            
        return s 

In [17]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
#print(uim.movies)
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))

Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.2339552317675661
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0.0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.42466125844687547


In [18]:
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.35573479031016
Film: The Usual Suspects, ocena: 4.3546817280678365
Film: The Silence of the Lambs, ocena: 4.335305303472517
Film: Sin City, ocena: 4.278687166899101
Film: Monsters, Inc., ocena: 4.2175811369435205
Film: The Incredibles, ocena: 4.2070985832817485
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152792107348347
Film: Batman Begins, ocena: 4.146413806700199
Film: Die Hard, ocena: 4.125915602232819
Film: Rain Man, ocena: 4.07153524295855
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237449257013
Film: A Beautiful Mind, ocena: 4.0151424900648385
Film: Good Will Hunting, ocena: 4.0092808069228205
Film: The Lord of the Rings: The Two Towers, ocena: 3.9414763050955934
Film: Indiana Jones and the Last Crusade, ocena: 3.7969764963789236


## Most Similar Movies

In [19]:
most_sim_movies = rec.predikator.sim_movies(20)
for (i, j), val in most_sim_movies:
    print("Film1: {}, Film2: {}, podobnost: {}".format(md.get_title(i), md.get_title(j), val))

Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481411
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8231885401761887
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442487
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381033
Film1: Star Wars, Film2: Star Wars: Episode V - The Empire Strikes Back, podobnost: 0.7021321132220316
Film1: Ace Ventura: Pet Detective, Film2: The Mask, podobnost: 0.6616471778494041
Film1: Star Wars: Episode V - The Empire Strikes Back, Film2: Star Wars: Episode VI - Return of the Jedi, podobnost: 0.5992253753778951
Film1: Independence Day, Film2: Star Wars: Episode I - The Phantom Menace, podobnost: 0.5610426219249982
Film1: Ace Ventura: Pet Detective, Film2: Austin Powers: The Spy Who Shagged Me, podob

-----

## Recommendation based on currently viewed content

In [20]:
rec_items = rp.similarItems(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.8231885401761887
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374897442487
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.23961943073496453
Film: Star Wars, ocena: 0.21965586527074088
Film: The Matrix, ocena: 0.2151555270688026
Film: Raiders of the Lost Ark, ocena: 0.19944276706345052
Film: The Usual Suspects, ocena: 0.18321188451910767
Film: Blade Runner, ocena: 0.16399681315410303
Film: Schindler's List, ocena: 0.16105905138148724
Film: Monty Python and the Holy Grail, ocena: 0.15780453798519137


## Recommendation for myself

In [21]:
#my ratings for some hand picked movies
my_ratings = {296:5.0, 318:4.5, 344:2.5, 364:4.5, 593:4.5, 858:5.0, 1270:3.0, 1721:3.0, 2571:4.5, 2959:5.0, 3578:4.0, 4306:2.0, 4886:3.0,
              4963:3.5, 6365:4.0, 6377:3.5, 6539:3.5, 8961:3.0, 5418:3.0, 367:2.0, 1036:2.0, 1097:2.0, 1580:1.5, 5349:2.5}
for k in my_ratings.keys():
    print("Film: {}, ocena: {}".format(md.get_title(k), my_ratings[k]))

Film: Pulp Fiction, ocena: 5.0
Film: The Shawshank Redemption, ocena: 4.5
Film: Ace Ventura: Pet Detective, ocena: 2.5
Film: The Lion King, ocena: 4.5
Film: The Silence of the Lambs, ocena: 4.5
Film: The Godfather, ocena: 5.0
Film: Back to the Future, ocena: 3.0
Film: Titanic, ocena: 3.0
Film: The Matrix, ocena: 4.5
Film: Fight Club, ocena: 5.0
Film: Gladiator, ocena: 4.0
Film: Shrek, ocena: 2.0
Film: Monsters, Inc., ocena: 3.0
Film: Ocean's Eleven, ocena: 3.5
Film: The Matrix Reloaded, ocena: 4.0
Film: Finding Nemo, ocena: 3.5
Film: Pirates of the Caribbean: The Curse of the Black Pearl, ocena: 3.5
Film: The Incredibles, ocena: 3.0
Film: The Bourne Identity, ocena: 3.0
Film: The Mask, ocena: 2.0
Film: Die Hard, ocena: 2.0
Film: E.T.: The Extra-Terrestrial, ocena: 2.0
Film: Men in Black, ocena: 1.5
Film: Spider-Man, ocena: 2.5


In [22]:
#reading the data and adding my ratings as a new user
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=950)
uim.add_user(my_ratings, 1)

#using the ItemBasedPredictor for recommendation
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

In [23]:
#printing the top 10 recommended movies for myself
print("Predictions for myself: ")
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for myself: 
Film: Kill Bill: Vol. 2, ocena: 4.8823202264975665
Film: A Clockwork Orange, ocena: 4.850639605605614
Film: Kill Bill: Vol. 2, ocena: 4.850531530195364
Film: Lost in Translation, ocena: 4.83588560884867
Film: Being John Malkovich, ocena: 4.830060623160483
Film: Donnie Darko, ocena: 4.780433050578312
Film: Reservoir Dogs, ocena: 4.770285691044744
Film: Fargo, ocena: 4.765923017624664
Film: Sin City, ocena: 4.761626819589236
Film: One Flew Over the Cuckoo's Nest, ocena: 4.724342130206327


## Slope one prediction

In [24]:
class SlopeOnePredictor:    
    def fit(self, x):
        self.x = x
        
        #again making all the permutations between the movies
        self.all_movies = self.x.data.columns.tolist()
        permutations = list(itertools.permutations(self.all_movies, 2))
        self.slopeOne = {}
        
        #calculating average differences between all the movies and storing them in a dictionary
        for i,j in permutations:
            self.slopeOne[(i,j)] = self.slope_one(i,j)    
    
    def predict(self, userID):
        d = dict()
        #get the movies that are not rated and predict their ratings
        movies = self.x.data.loc[userID][self.x.data.loc[userID].isnull()].index.tolist()
        
        #for each movie make a prediction
        for i in movies:
            gore = []
            dolu = []
            for (j,_) in self.x.data.iteritems():
                if i != j and j not in movies:
                    x = tuple([i,j])
                    gore.append( (self.x.data.loc[userID][j] + self.slopeOne[x][0]) * self.slopeOne[x][1])
                    dolu.append(self.slopeOne[x][1])
            
            #store the predicted values in a dictionary and return it
            if i not in d.keys() and sum(dolu) > 0:
                d[i] = sum(gore) / sum(dolu)
            
        return d
    
    def slope_one(self, p1, p2):
        #get the movies that are rated and calculate average difference between them
        matrix = self.x.data[(self.x.data[p1].notnull()) & (self.x.data[p2].notnull())]  
        
        #arrays of the ratings to use vectorized operations and not iteratting over them
        p1_array = np.array(matrix[p1])
        p2_array = np.array(matrix[p2])
        n = len(p1_array) #number of ratings
        
        #calculationg average difference with the formula from vaje
        dev = (sum(p1_array - p2_array)) / n
        
        #returning the average difference value and number of ratings
        return [dev, n]

In [25]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: The Usual Suspects, ocena: 4.325079182263173
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.155293229840448
Film: The Lord of the Rings: The Return of the King, ocena: 4.153135076202185
Film: The Silence of the Lambs, ocena: 4.127978169643881
Film: Shichinin no samurai, ocena: 4.119790444913598
Film: The Lord of the Rings: The Two Towers, ocena: 4.083325894849594
Film: Indiana Jones and the Last Crusade, ocena: 3.9670398355464194
Film: The Incredibles, ocena: 3.9664496674557546
Film: Good Will Hunting, ocena: 3.963362387354114
Film: Sin City, ocena: 3.942619137615212
Film: Batman Begins, ocena: 3.9375326640077017
Film: A Beautiful Mind, ocena: 3.9140940935239508
Film: Rain Man, ocena: 3.9107819079644943
Film: Monsters, Inc., ocena: 3.8819375978658006
Film: Finding Nemo, ocena: 3.8807711131654794


-----

## Evaluation method

In [26]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print("Scores for RandomPredictor:")
print(mse, mae, precision, recall, f)

Scores for RandomPredictor:
1.248330677608933 0.9263368777931885 0.3849168646080761 0.3203418918789692 0.34967306819092886


In [27]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = AveragePredictor(b=100)
rec = Recommender(rp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print("Scores for AveragePredictor:")
print(mse, mae, precision, recall, f)

Scores for AveragePredictor:
0.8083112995322195 0.6141289398447879 0.02244655581947745 0.017963301381865325 0.01995623222131743


In [28]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print("Scores for ItemBasedPredictor:")
print(mse, mae, precision, recall, f)

Scores for ItemBasedPredictor:
0.9074528048825778 0.6642004553056103 0.37565320665083135 0.31097105270493275 0.34026549901921116


In [29]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print("Scores for SlopeOnePredictor:")
print(mse, mae, precision, recall, f)

Scores for SlopeOnePredictor:
0.8365220233178216 0.6293274373721766 0.3940023752969124 0.3298975131976127 0.35911154531248957


- From the scores we can see that the random predictor, just as expected, made the worst predictions for the ratings(based on the MSE and MAE), even though it had good precision, recall and f-score it is the worst predictor(that I wouldn't use). The Average predictor had the best MSE and MAE but it lacked in the precision, recall and f-score, that's because it gives the same recommendations to all the users. The ItemBasedPredictor and SlopeOnePredictor have good values overall but the SlopeOnePredicator is slightly better in all the scores. With the SlopeOnePredictor we get great MAE and MSE scores, also the best precision, recall and f-score values from our predictors.