In [242]:
# 1. Load data from the file 'jester-data-1.csv'
# Note: 99 denotes that the user hasn't been labelled it. Therefore don't
# use then in training.

import pandas as pd
df = pd.read_csv("jester-data-1.csv", header=None)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [243]:
# 2. Label 10% of the dataset cells as 99, to denote they are part of the
# validation set.

# Make a copy of the data frame.
df_validation = df.copy()
df_validation.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [244]:
# Calculate the number of rows, columns and cells in the data set.
users = df_validation.shape[0]
jokes = df_validation.shape[1]
cells = users * jokes

print('Users:\t\t' + str(users))
print('Jokes:\t\t' + str(jokes))
print('Cells:\t\t' + str(cells))

Users:		24983
Jokes:		101
Cells:		2523283


In [245]:
# Get number cells that require changing.

# iloc: 'Purely integer-location based indexing for selection by position'.
# iloc: http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.iloc.html

cells_not_specified = (df_validation.iloc[:,:] == 99).sum().sum()
cells_not_specified_percent = (cells_not_specified / cells) * 100

print('Not specified count: ' + str(cells_not_specified))
print('Not specified percent: ' + str(cells_not_specified_percent))

cells_specified = cells - cells_not_specified
cells_to_change = round(cells_specified * 0.1).astype(int)

print('Cells specified: ' + str(cells_specified))
print('Cells to change: ' + str(cells_to_change))

Not specified count: 687926
Not specified percent: 27.26313298983903
Cells specified: 1835357
Cells to change: 183536


In [246]:
# Change 10% of cells to '99'.
import random as rand

row = 0
col = 0
for x in range(0, cells_to_change):  
    while (df_validation.iloc[row, col]) == 99:
        row = rand.randint(0, rows - 1)
        col = rand.randint(0, cols - 1)
    df_validation.at[row, col] = 99    
    
print('Not specified count after: ' + str((df_validation.iloc[:,:] == 99).sum().sum()))

df_validation.head()

Not specified count after: 871462


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,99,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,99.0,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,99.0,99.0,-0.19,-2.14,3.06,0.34,99.0,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,99.0,8.16,-2.82,99.0,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,99.0,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,99.0,1.55,3.11,6.55,1.8,1.6


In [247]:
# 3. Use latent factor modeling to infer the hidden ratings of the users.
# Generate initial random data.
import numpy as np

factors = 3

latent_item_features = np.random.random((jokes, factors))
latent_user_preferences = np.random.random((users, factors))

print('Latent item features = ' + repr(latent_item_features))
print('Latent user preferences = ' + repr(latent_user_preferences))

Latent item features = array([[0.38966179, 0.25896835, 0.36331712],
       [0.93858856, 0.112169  , 0.78065416],
       [0.61494735, 0.9082222 , 0.03442773],
       [0.55086221, 0.54399707, 0.43392225],
       [0.04387386, 0.81094321, 0.14274978],
       [0.9432947 , 0.0030292 , 0.09721013],
       [0.4311313 , 0.4266163 , 0.77331112],
       [0.61351656, 0.6754211 , 0.01177734],
       [0.88316746, 0.01367604, 0.90638492],
       [0.92057223, 0.80871571, 0.66727211],
       [0.89039925, 0.62176974, 0.16201026],
       [0.75247842, 0.05527144, 0.18022854],
       [0.08265116, 0.26218721, 0.23003362],
       [0.85461693, 0.15899898, 0.88395143],
       [0.5295725 , 0.16122957, 0.49455029],
       [0.29277319, 0.42413283, 0.04696646],
       [0.69828635, 0.16916225, 0.82039137],
       [0.61623405, 0.20928764, 0.0440336 ],
       [0.04005576, 0.32902459, 0.02664288],
       [0.55417162, 0.06834335, 0.52897389],
       [0.79869689, 0.15108784, 0.37102256],
       [0.58826928, 0.02108019, 

In [248]:
# Predict ratings function.
def predict_rating(user_row, item_row):
    """ Predict a rating given a user_row and an item_row. """
    
    user_values = latent_user_preferences[user_row]
    item_values = latent_item_features[item_row]
    return user_values.dot(item_values)

In [249]:
# Training function.
def train(user_row, item_row, rating, alpha = 0.0001):
    """ Adapt the values of user_preferences and item_factors to match
    the ones predicted by the users. """
    
    err = alpha * (rating - predict_rating(user_row, item_row))
    latent_user_preferences[user_row] += err * latent_item_features[item_row]
    latent_item_features[item_row] += err * latent_user_preferences[user_row]
    return err

In [255]:
# Train function. Ignore '99' values which indicates
# Either that the item has not been specified or it is part of the
# validation set.

def sgd_svd(iterations = 100):
    for i in range(0, iterations):
        training_errors = []
        for user_row in range(0, 20):
            for item_row in range(0, jokes):
                rating = df_validation.iloc[user_row][item_row]
                if (not np.isnan(rating) and not rating == 99):
                    training_errors.append(train(user_row, item_row, rating))
        if (i % 10 == 0):
            print("Training MSE, " + str(i) + ": " + str((np.array(training_errors) ** 2).mean()))

In [256]:
# Run the training function and print out updated tables.

sgd_svd()

print('Latent item features = ' + repr(latent_item_features))
print('Latent user preferences = ' + repr(latent_user_preferences))

Training MSE, 0: 1.0413812753687217e-06
Training MSE, 10: 9.482602885145727e-07
Training MSE, 20: 8.197599549832963e-07
Training MSE, 30: 6.679932436289613e-07
Training MSE, 40: 5.239606782756986e-07
Training MSE, 50: 4.1635402591377743e-07
Training MSE, 60: 3.5028659759311416e-07
Training MSE, 70: 3.1352199311403816e-07
Training MSE, 80: 2.929316744595875e-07
Training MSE, 90: 2.806289305527061e-07
Latent item features = array([[ 1.00863056e+01,  1.20097312e+01,  1.00276110e+01],
       [ 8.19884313e-01, -4.21582277e-02,  8.49870254e-01],
       [ 4.75206265e-01,  7.62018831e-01, -1.15341296e-01],
       [ 4.62906091e-01,  4.46225625e-01,  4.60365568e-01],
       [-1.98338363e-01,  4.61642081e-01, -6.92402702e-02],
       [ 7.74584028e-01, -3.17390301e-01, -1.88948294e-01],
       [ 1.16624383e-01, -3.75613185e-02,  3.44473754e-01],
       [ 6.28986688e-01,  5.57885658e-01,  1.90556729e-01],
       [ 6.82427974e-01, -3.13170720e-01,  5.95540736e-01],
       [ 3.14728717e-01,  9.349161

In [261]:
# 4. Calculate the performance of the algorithm on the validation dataset.

validation_error = []
for user in range(0, 20):
    for joke in range(0, jokes):
        rating_training_data = df.iloc[user][joke]
        rating_validation_data = df_validation.iloc[user][joke]
        if (not rating_training_data == rating_validation_data):
            prediction = predict_rating(user, joke)
            validation_error.append(rating_training_data - predict_rating(user, joke))
print("Validation MSE: " + str((np.array(validation_error) ** 2).mean()))

74.0:-17.332927772505258
-9.85:-0.7380785536739037
-8.11:-0.6614727494611499
-9.08:-0.3225238797107228
4.13:-0.7495777079690997
-8.64:-0.7637852087782073
8.59:-0.44023302515251217
-1.36:-0.6004799708346045
8.3:-0.2749320831750379
-1.31:3.734636613814634
7.77:0.8217981147181315
-0.29:3.24890722215496
7.86:2.0344254087810243
-4.32:0.114148082428565
-6.36:-0.0576968375210688
-6.89:-2.3943357980021065
9.03:0.8324728948435912
7.28:3.5678726069270317
7.28:0.8911457000476245
7.52:2.5481759747017056
7.28:0.13864319757137172
1.8:0.06665905492592933
6.21:1.3469127638584104
2.91:1.9857268829991672
6.65:3.253424917963204
6.84:1.2833436940307856
6.94:1.6566063316797313
4.61:3.258405106085746
8.3:4.041397570781756
0.34:3.7812105056785104
5.24:2.1034321905343547
6.31:2.547767195932423
8.06:1.6965411989771004
4.51:3.7918482927194277
-8.06:1.0139529004653158
3.83:1.5009272404646532
5.24:3.01686891228731
7.28:2.084538798717379
3.93:2.0882674124764877
4.71:2.770346363603151
5.73:1.8699539753145027
-3.54: