Question 30
---

In [25]:
import os
import numpy as np
from surprise import AlgoBase
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise import accuracy
from surprise.model_selection import KFold
from surprise.prediction_algorithms.predictions import Prediction

from six import iteritems



class SymmetricAlgo(AlgoBase):
    """This is an abstract class aimed to ease the use of symmetric algorithms.
    A symmetric algorithm is an algorithm that can can be based on users or on
    items indifferently, e.g. all the algorithms in this module.
    When the algo is user-based x denotes a user and y an item. Else, it's
    reversed.
    """

    def __init__(self, sim_options={}, **kwargs):

        AlgoBase.__init__(self, sim_options=sim_options, **kwargs)

    def fit(self, trainset):

        AlgoBase.fit(self, trainset)

        ub = self.sim_options['user_based']
        self.n_x = self.trainset.n_users if ub else self.trainset.n_items
        self.n_y = self.trainset.n_items if ub else self.trainset.n_users
        self.xr = self.trainset.ur if ub else self.trainset.ir
        self.yr = self.trainset.ir if ub else self.trainset.ur

        return self

    def switch(self, u_stuff, i_stuff):
        """Return x_stuff and y_stuff depending on the user_based field."""

        if self.sim_options['user_based']:
            return u_stuff, i_stuff
        else:
            return i_stuff, u_stuff

class NaiveCollabratingFilter(SymmetricAlgo):
    """
    :math:`\hat{r}_{ui} = \mu_u`
    
    """

    def __init__(self, sim_options={}, **kwargs):

        SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs)

    def fit(self, trainset):

        SymmetricAlgo.fit(self, trainset)
#        self.sim = self.compute_similarities()

        self.means = np.zeros(self.n_x)
        for x, ratings in iteritems(self.xr):
            self.means[x] = np.mean([r for (_, r) in ratings])

        return self

    def estimate(self, u, i):

#         if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
#             raise PredictionImpossible('User and/or item is unkown.')
        x, y = self.switch(u, i)
        est = self.means[x]
        
        return est
    
file_path = os.path.expanduser('../ml-latest-small/ratings.csv')
reader = Reader(line_format = 'user item rating timestamp', sep = ',', skip_lines = 1)
data = Dataset.load_from_file(file_path, reader = reader)

algo = NaiveCollabratingFilter()
kf = KFold(n_splits=10)
sum = 0
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    sum += accuracy.rmse(predictions, verbose=True)
print 'avg: ' + str(sum/10)    


RMSE: 0.9623
RMSE: 0.9553
RMSE: 0.9642
RMSE: 0.9684
RMSE: 0.9610
RMSE: 0.9536
RMSE: 0.9767
RMSE: 0.9665
RMSE: 0.9539
RMSE: 0.9608
avg: 0.9622719245124223


Question 31
---

In [5]:
from surprise.model_selection import KFold
from surprise import accuracy

x = range(2, 101, 2)

# the trim function is used to process the testset
# by default the trim function don't do anything
def knn_filter_trim(trim = lambda x : x):
    n_splits = 10
    kf = KFold(n_splits = n_splits)
    rmse = 0
    algo = NaiveCollabratingFilter()
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        testset = trim(testset)
        prediction = algo.test(testset)
        rmse += accuracy.rmse(prediction)
    return rmse / n_splits

In [6]:
# the trim function for popular movie trimming
# delete the movie which has received less than or equal to 2 ratings
# each item in testset is represented by (userId, movieId, rating)
# this trim function can be used by Question 12 and 13
def trim(testset, trim_condition = lambda x : x <= 2):
    movie_map = dict()
    for i in testset:
        movie = i[1]
        movie_map[movie] = 1 if (movie not in movie_map) else movie_map[movie] + 1
    for movie_id in movie_map:
        # meaning that this item should be trimmed
        if trim_condition(movie_map[movie_id]):
            testset = filter(lambda x: x[1] != movie_id, testset)
    return testset
        
def trim_popular(testset):
    return trim(testset)

rmse_popular = knn_filter_trim(trim_popular)
print rmse_popular

RMSE: 0.9584
RMSE: 0.9363
RMSE: 0.9391
RMSE: 0.9411
RMSE: 0.9418
RMSE: 0.9409
RMSE: 0.9516
RMSE: 0.9426
RMSE: 0.9518
RMSE: 0.9611
0.9464811276282006


Question 32
---

In [7]:
def trim_unpopular(testset):
    return trim(testset, trim_condition = lambda x : x > 2)

rmse_unpopular = knn_filter_trim(trim_unpopular)
print rmse_unpopular

RMSE: 1.0140
RMSE: 0.9813
RMSE: 0.9884
RMSE: 1.0113
RMSE: 0.9985
RMSE: 0.9852
RMSE: 0.9858
RMSE: 0.9964
RMSE: 1.0005
RMSE: 1.0190
0.9980208386428572


Question 33
---

In [8]:
def trim_high_variance(testset):
    movie_map = dict()
    for (user, movie, rate) in movie_map:
        if (movie in movie_map):
            movie_map[movie].append(rate)
        else:
            movie_map[movie] = [rate]
    for movie_id in movie_map:
        rating = movie_map[movie_id]
        if len(rating) < 5 or np.var(np.array(rating)) < 2:
            testset = filter(lambda x: x[1] != movie_id, testset)
    return testset 

In [9]:
rmse_high_variance = knn_filter_trim(trim_high_variance)
print rmse_high_variance

RMSE: 0.9611
RMSE: 0.9683
RMSE: 0.9624
RMSE: 0.9617
RMSE: 0.9602
RMSE: 0.9691
RMSE: 0.9734
RMSE: 0.9552
RMSE: 0.9573
RMSE: 0.9557
0.9624426162629863
