In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.model_selection import KFold

In [2]:
# Directory holding the raw data files, relative to the notebook's working
# directory (presumably an unpacked MovieLens 1M archive — confirm).
data_dir = 'ml-1m'

# File names of the three tables; they are joined onto data_dir by the loader.
movies_filename = 'movies.dat'
users_filename = 'users.dat'
ratings_filename = 'ratings.dat'

# Column names for each table, in file order. They are handed to the loader,
# which passes them to read_csv as `names=`.
movies_columns = ['MovieID', 'Title', 'Genres']
users_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

def create_dataframe(data_dir, filename, columns):
    """Read one '::'-delimited data file into a DataFrame.

    Args:
        data_dir: Directory that contains the file.
        filename: Name of the file inside ``data_dir``.
        columns: Column names to assign, in file order.

    Returns:
        pandas.DataFrame with the given column names, one row per line of
        the file.
    """
    read_options = {
        'delimiter': '::',
        'names': columns,
        'encoding': 'latin-1',
        # The two-character separator needs the Python parsing engine.
        'engine': 'python',
    }
    return pd.read_csv(os.path.join(data_dir, filename), **read_options)

# Load the three tables in the order movies, users, ratings.
movies, users, ratings = (
    create_dataframe(data_dir, table_filename, table_columns)
    for table_filename, table_columns in (
        (movies_filename, movies_columns),
        (users_filename, users_columns),
        (ratings_filename, ratings_columns),
    )
)
# Bundle handed to the evaluation functions; note the (users, movies, ratings) order.
data = (users, movies, ratings)

In [3]:
def rating_error(test_set, model, subset:int=None):
    """Score ``model`` on ``test_set`` and return its mean absolute rating error.

    Args:
        test_set: DataFrame with a 'Rating' column; every evaluated row is
            passed to ``model``.
        model: Callable that takes one row and returns the predicted rating.
        subset: If given, only the first ``subset`` rows are evaluated;
            ``None`` evaluates the whole frame.

    Returns:
        The mean over the evaluated rows of sqrt((actual - predicted) ** 2),
        i.e. the mean absolute difference (not an RMSE).
    """
    evaluated = test_set if subset is None else test_set[:subset]

    actual = evaluated['Rating']
    predicted = evaluated.apply(model, axis=1)

    # Squaring and then taking the root gives |actual - predicted| per row;
    # the mean is taken afterwards.
    residuals = actual - predicted
    return ((residuals ** 2) ** (1 / 2)).mean()

In [4]:
def test_naive_model_1(data, subset:int=None):
    """Cross-validate the global-mean baseline.

    Every rating is predicted as the mean 'Rating' of the training fold.

    Args:
        data: Tuple ``(users, movies, ratings)``; only ``ratings`` is used.
        subset: Forwarded to ``rating_error``; limits scoring to the first
            ``subset`` rows of each test fold (``None`` scores all rows).

    Returns:
        numpy.ndarray with one error value per fold (5 folds). Each fold's
        error is also printed as it is computed.
    """
    _users, _movies, ratings = data

    # Shuffled 5-fold split with a fixed seed so reruns see the same folds.
    cv = KFold(n_splits=5, random_state=42, shuffle=True)

    fold_errors = []

    for train_index, test_index in cv.split(ratings):
        train_set = ratings.iloc[train_index]
        test_set = ratings.iloc[test_index]

        # The training mean does not depend on the row being predicted, so
        # compute it once per fold instead of once per test row.
        global_mean = train_set['Rating'].mean()

        def model(row):
            return global_mean

        rating_err = rating_error(test_set, model, subset)
        print(rating_err)

        fold_errors.append(rating_err)
    return np.array(fold_errors, dtype=float)

# Score the global-mean baseline on the first 100 rows of each test fold.
errors = test_naive_model_1(data, subset=100)
mean_error = errors.mean()
print('mean error', mean_error)

0.9934922085014753
0.8590062949359322
0.8333964284955516
0.9087583716899096
0.9551108017316362
mean error 0.909952821070901


In [5]:
def test_naive_model_2(data, subset:int=None):
    """Cross-validate the per-movie-mean baseline.

    Every rating is predicted as the mean training-fold 'Rating' of the same
    'MovieID'. A movie with no ratings in the training fold gets a NaN
    prediction.

    Args:
        data: Tuple ``(users, movies, ratings)``; only ``ratings`` is used.
        subset: Forwarded to ``rating_error``; limits scoring to the first
            ``subset`` rows of each test fold (``None`` scores all rows).

    Returns:
        numpy.ndarray with one error value per fold (5 folds). Each fold's
        error is also printed as it is computed.
    """
    _users, _movies, ratings = data

    # Shuffled 5-fold split with a fixed seed so reruns see the same folds.
    cv = KFold(n_splits=5, random_state=42, shuffle=True)

    fold_errors = []

    for train_index, test_index in cv.split(ratings):
        train_set = ratings.iloc[train_index]
        test_set = ratings.iloc[test_index]

        # Aggregate once per fold. Filtering the whole training frame for
        # every predicted row costs O(len(train)) per prediction.
        movie_means = train_set.groupby('MovieID')['Rating'].mean().to_dict()

        def model(row):
            # NaN for unseen movies mirrors the mean of an empty selection.
            return movie_means.get(row['MovieID'], np.nan)

        rating_err = rating_error(test_set, model, subset)
        print(rating_err)

        fold_errors.append(rating_err)
    return np.array(fold_errors, dtype=float)

# Score the per-movie-mean baseline on the first 100 rows of each test fold.
errors = test_naive_model_2(data, subset=100)
mean_error = errors.mean()
print('mean error', mean_error)

0.8877947904704699
0.7786278499104203
0.7582993988238058
0.8018631124666835
0.8336848964912545
mean error 0.8120540096325269


In [6]:
def test_naive_model_3(data, subset:int=None):
    """Cross-validate the per-user-mean baseline.

    Every rating is predicted as the mean training-fold 'Rating' given by the
    same 'UserID'. A user with no ratings in the training fold gets a NaN
    prediction.

    Args:
        data: Tuple ``(users, movies, ratings)``; only ``ratings`` is used.
        subset: Forwarded to ``rating_error``; limits scoring to the first
            ``subset`` rows of each test fold (``None`` scores all rows).

    Returns:
        numpy.ndarray with one error value per fold (5 folds). Each fold's
        error is also printed as it is computed.
    """
    _users, _movies, ratings = data

    # Shuffled 5-fold split with a fixed seed so reruns see the same folds.
    cv = KFold(n_splits=5, random_state=42, shuffle=True)

    fold_errors = []

    for train_index, test_index in cv.split(ratings):
        train_set = ratings.iloc[train_index]
        test_set = ratings.iloc[test_index]

        # Aggregate once per fold. Filtering the whole training frame for
        # every predicted row costs O(len(train)) per prediction.
        user_means = train_set.groupby('UserID')['Rating'].mean().to_dict()

        def model(row):
            # NaN for unseen users mirrors the mean of an empty selection.
            return user_means.get(row['UserID'], np.nan)

        rating_err = rating_error(test_set, model, subset)
        print(rating_err)

        fold_errors.append(rating_err)
    return np.array(fold_errors, dtype=float)

# Score the per-user-mean baseline on the first 100 rows of each test fold.
errors = test_naive_model_3(data, subset=100)
mean_error = errors.mean()
print('mean error', mean_error)

0.9240060893082526
0.8089525344561035
0.7485945911836677
0.788641954098307
0.8407291188096166
mean error 0.8221848575711895
