In [37]:
# 1. Load data from the file 'jester-data-1.csv'
# Note: 99 denotes that the user hasn't been labelled it. Therefore don't
# use then in training.

import pandas as pd
df = pd.read_csv("jester-data-1.csv", header=None)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [38]:
# 2. Label 10% of the dataset cells as 99, to denote they are part of the
# validation set.

# Make a copy of the data frame.
df_validation = df.copy()
df_validation.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [39]:
# Calculate the number of rows, columns and cells in the data set.
users = df_validation.shape[0]
jokes = df_validation.shape[1]
cells = users * jokes

print('Users:\t' + str(users))
print('Jokes:\t' + str(jokes))
print('Cells:\t' + str(cells))

Users:	24983
Jokes:	101
Cells:	2523283


In [40]:
# Get number cells that require changing.

# iloc: 'Purely integer-location based indexing for selection by position'.
# iloc: http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.iloc.html

cells_not_specified = (df_validation.iloc[:,:] == 99).sum().sum()
cells_not_specified_percent = (cells_not_specified / cells) * 100

print('Not specified count:\t' + str(cells_not_specified))
print('Not specified percent:\t' + str(cells_not_specified_percent))

cells_specified = cells - cells_not_specified
cells_to_change = round(cells_specified * 0.1).astype(int)

print('Cells specified:\t' + str(cells_specified))
print('Cells to change\t:\t' + str(cells_to_change))

Not specified count:	687926
Not specified percent:	27.26313298983903
Cells specified:	1835357
Cells to change	:	183536


In [41]:
# Change 10% of cells to '99'.
import random as rand

row = 0
col = 0
for x in range(0, cells_to_change):  
    while (df_validation.iloc[row, col]) == 99:
        row = rand.randint(0, users - 1)
        col = rand.randint(0, jokes - 1)
    df_validation.at[row, col] = 99    
    
print('Not specified count after: ' + str((df_validation.iloc[:,:] == 99).sum().sum()))

df_validation.head()

Not specified count after: 871462


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,99,-7.82,8.79,99.0,-8.16,-7.52,99.0,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,99.0,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,99.0,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,99.0,5.58,4.27,5.19,5.73,1.55,3.11,99.0,1.8,1.6


In [42]:
# 3. Use latent factor modeling to infer the hidden ratings of the users.
# Generate initial random data.
import numpy as np

factors = 3

latent_item_features = np.random.random((jokes, factors))
latent_user_preferences = np.random.random((users, factors))

print('Latent item features = ' + repr(latent_item_features))
print('Latent user preferences = ' + repr(latent_user_preferences))

Latent item features = array([[0.31553218, 0.35375843, 0.42128843],
       [0.83733677, 0.63102883, 0.4861513 ],
       [0.41308771, 0.63552616, 0.02460905],
       [0.56726126, 0.1933566 , 0.77156232],
       [0.97379705, 0.7133205 , 0.03880881],
       [0.05399391, 0.11352083, 0.95802275],
       [0.24578754, 0.31499537, 0.52186427],
       [0.33398775, 0.44690347, 0.98898567],
       [0.00136793, 0.23378431, 0.43043728],
       [0.00446899, 0.40568992, 0.42717097],
       [0.97985558, 0.49165449, 0.21749641],
       [0.14609495, 0.43437843, 0.9722517 ],
       [0.70326123, 0.14522801, 0.15986105],
       [0.27442175, 0.27011004, 0.80743314],
       [0.00183   , 0.86266868, 0.86927456],
       [0.20648025, 0.10005742, 0.90066547],
       [0.67983417, 0.9100762 , 0.86038761],
       [0.5955261 , 0.65983189, 0.44209573],
       [0.89968553, 0.29560511, 0.4100669 ],
       [0.39845782, 0.44568366, 0.78919017],
       [0.03554156, 0.74181618, 0.49765141],
       [0.69455884, 0.68587731, 

In [43]:
# Predict ratings function.
def predict_rating(user_row, item_row):
    """ Predict a rating given a user_row and an item_row. """
    
    user_values = latent_user_preferences[user_row]
    item_values = latent_item_features[item_row]
    return user_values.dot(item_values)

In [44]:
# Training function.
def train(user_row, item_row, rating, alpha = 0.0001):
    """ Adapt the values of user_preferences and item_factors to match
    the ones predicted by the users. """
    
    err = alpha * (rating - predict_rating(user_row, item_row))
    latent_user_preferences[user_row] += err * latent_item_features[item_row]
    latent_item_features[item_row] += err * latent_user_preferences[user_row]
    return err

In [None]:
# Train function. Ignore '99' values which indicates
# Either that the item has not been specified or it is part of the
# validation set.

def sgd_svd(iterations = 100):
    for i in range(0, iterations):
        training_errors = []
        for user_row in range(0, users):
            for joke_col in range(0, jokes):
                rating = df_validation.iloc[user_row][joke_col]
                if (not np.isnan(rating) and not rating == 99):
                    training_errors.append(train(user_row, joke_col, rating))
        if (i % 10 == 0):
            print("Training MSE, " + str(i) + ": " + str((np.array(training_errors) ** 2).mean()))

In [None]:
# Run the training function and print out updated tables.

sgd_svd()

print('Latent item features = ' + repr(latent_item_features))
print('Latent user preferences = ' + repr(latent_user_preferences))

In [None]:
# 4. Calculate the performance of the algorithm on the validation dataset.

validation_error = []
for user in range(0, 20):
    for joke in range(0, jokes):
        rating_training_data = df.iloc[user][joke]
        rating_validation_data = df_validation.iloc[user][joke]
        if (not rating_training_data == rating_validation_data):
            prediction = predict_rating(user, joke)
            validation_error.append(rating_training_data - predict_rating(user, joke))
print("Validation MSE: " + str((np.array(validation_error) ** 2).mean()))