In [None]:
import re
def parse(row):
    """Split one CSV record into the numbers of its ID and its integer value.

    row[0] is scanned for every run of decimal digits (for example
    'r44_c1' gives [44, 1]); row[1] is converted with int().

    Returns:
        tuple: (numbers, value) where numbers is the list of ints found in
        row[0], in order of appearance, and value is int(row[1]).
    """
    id_numbers = [int(digits) for digits in re.findall(r'\d+', row[0])]
    return (id_numbers, int(row[1]))
    

In [None]:
# Logistic sigmoid
def sigmoid(z):
    """Return 1 / (1 + e**(-z)).

    The expression is evaluated with NumPy's constant e and the ** operator,
    so z may be anything that supports negation and being used as an exponent
    of a float.
    """
    denominator = 1.0 + np.e ** (-z)
    return 1.0 / denominator

In [74]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    """Root-mean-square error over the non-zero entries of ground_truth.

    Only the positions where ground_truth is non-zero are compared; the
    values of prediction at every other position are ignored.

    Args:
        prediction: array indexed the same way as ground_truth.
        ground_truth: array whose non-zero entries select what is scored.

    Returns:
        float: sqrt of the mean squared difference over the selected entries.
    """
    observed = ground_truth.nonzero()
    predicted_values = prediction[observed].flatten()
    true_values = ground_truth[observed].flatten()
    return sqrt(mean_squared_error(predicted_values, true_values))

In [82]:
import csv
import numpy as np
import scipy as sp
# Dense rating matrix: rows index movies, columns index users. Entries that
# are never assigned below stay 0.
X = np.zeros((1000, 10000))
firstrow = True  # the first record of the file is skipped (header row)
with open('data_train.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        if firstrow:
            firstrow = False
            continue
        # parse() returns the numbers of the ID in order of appearance: the
        # second one picks the row of X (movie), the first one the column
        # (user). Both are 1-based in the file, hence the -1.
        numbers, value = parse(row)
        X[numbers[1] - 1, numbers[0] - 1] = value

In [75]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#do simple SVD

# Gram matrices kept for possible later analysis (the SVD below does not use
# them). X is laid out as (movies, users), so X.dot(X.T) compares movies and
# X.T.dot(X) compares users. .dot() is a real matrix product; `*` on a NumPy
# array is element-wise and cannot broadcast X against X.T at all.
# NOTE: the user-by-user matrix is dense with one row and column per user,
# which is large for 10000 users.
similarity_between_movies = X.dot(X.T) #(entry [i,j]: how similar is movie i to movie j)
similarity_between_users = X.T.dot(X) #(entry [i,j]: how similar is user i to user j)

# Truncated SVD with the 40 largest singular values. svds returns the right
# singular vectors already transposed, so V can be multiplied in directly.
U, D, V = svds(X, k=40)
# Low-rank reconstruction of X.
X_a = np.dot(np.dot(U, np.diag(D)), V)

# rmse() scores only the non-zero entries of X, while the STD is taken over
# every entry of the residual.
print ('SVD RMSE: ' + str(rmse(X_a, X)) + " STD: " + str(np.std(X_a - X)))

SVD MSE: 1.1574378043774212 STD: 1.15743575388


In [121]:
# Mean of the non-zero entries of each row of X: a 1-D array with one value
# per movie. A row without any rating divides 0 by 0 and yields nan.
averageRatingPerMovie = X.sum(1)/(X != 0).sum(1)

#simple Prediction, no need to train anything...

def predictRating_Baseline(movie,user):
    """Predict a rating as the movie's mean rating plus the user's bias.

    The bias is the mean, over all movies the user has rated, of
    (the user's rating - that movie's mean rating). A user who rates above
    the movie means therefore gets predictions above the movie mean.

    Args:
        movie: row index into X / averageRatingPerMovie.
        user: column index into X.

    Returns:
        The movie's mean rating shifted by the user's bias, or the plain
        movie mean when the user has no ratings to estimate a bias from.
    """
    averageRating = averageRatingPerMovie[movie]
    ratingsByUser_indices = X[:,user] > 0
    if not ratingsByUser_indices.any():
        # No ratings from this user: np.mean of an empty selection would be nan.
        return averageRating
    # Sign matters: user rating minus movie mean, so that adding the offset
    # moves the prediction in the direction of the user's habit.
    averageOffset = np.mean(X[ratingsByUser_indices, user] - averageRatingPerMovie[ratingsByUser_indices])
    return averageRating + averageOffset

In [184]:
#use this for Netflix
def predictRating(movie, user):
    """Predict a rating from the learned latent features.

    The raw score is the dot product of column `movie` of movieFeature and
    column `user` of userFeature. Scores above 5 are returned as 5 and scores
    below 1 as 1; anything in between is returned unchanged.
    """
    #maybe do some feature transformation sigmoid
    score = np.dot(movieFeature[:, movie], userFeature[:, user])
    if score > 5:
        return 5
    if score < 1:
        return 1
    return score

In [241]:
#for Netflix

#params
# Step size of each gradient update.
lrate = 0.001
# Weight of the penalty term that pulls feature values towards zero; meant to
# stay very small.
K = 0.02

#works well if k is large
def train_tikhonov(movie, user, feature, firstRound):
    """Run one regularised gradient step on a single latent feature.

    The prediction error for the (movie, user) entry of X is computed with
    predictRating_Baseline when firstRound is true and with predictRating
    otherwise. Row `feature` of userFeature and movieFeature is then updated
    in place for this user and movie; both updates use the values the two
    features had before this call.

    Returns:
        The signed error (true rating - predicted rating) measured before
        the update.
    """
    actual = X[movie, user]
    predictor = predictRating_Baseline if firstRound else predictRating
    err = (actual - predictor(movie, user))
    #print("Predicted: "+ str(predicted_rating) + " True: " + str(true_rating))
    user_value = userFeature[feature, user]
    movie_value = movieFeature[feature, movie]
    userFeature[feature, user] = user_value + lrate * (err * movie_value - K * user_value)
    movieFeature[feature, movie] = movie_value + lrate * (err * user_value - K * movie_value)
    return err

In [None]:
#NETFLIX

#parameters
# k is the number of latent features, n_epochs the maximum number of passes.
k = 20
n_epochs = 30
#for k=40 takes approx 3min per epoch
#for k=100 takes approx 11min per epoch
#for k=200 takes approx 22min per epoch

#remove/comment out the following four lines if you want to train your model even further
movieFeature = np.zeros((k,1000)) + 0.1
userFeature = np.zeros((k,10000)) + 0.1
movieFeature_cache = []
userFeature_cache = []

# Best RMSE seen so far for each feature; starts at 5 so the first epoch
# always counts as an improvement unless its RMSE exceeds 5.
error_cache = np.ones((k,)) * 5
stop = False
# Training never writes to X, so the coordinates of its non-zero entries are
# looked up once instead of once per feature and epoch.
movie_indices, user_indices = np.nonzero(X)
for epoch in range(n_epochs):
    print("Starting epoch: " + str(epoch))
    for feature in range(k):
        errors = []
        for movie, user in zip(movie_indices, user_indices):
            # epoch==-1 is never true inside range(n_epochs), so the baseline
            # predictor branch of train_tikhonov is effectively switched off.
            error_squared = train_tikhonov(movie, user, feature, epoch==-1) ** 2
            errors.append(error_squared)
        rmserror = np.sqrt(np.mean(errors))
        print("Errors of feature " + str(feature) + ": rmse " + str(rmserror))
        #termination criteria: leave if training on this feature was worse than last epoch
        if rmserror > error_cache[feature]:
            stop = True
            break
        error_cache[feature] = rmserror
    #cache the training data to recover fast from errors
    # Store copies: train_tikhonov updates the arrays in place, so appending
    # the arrays themselves would make every cache entry show the latest state.
    movieFeature_cache.append(movieFeature.copy())
    userFeature_cache.append(userFeature.copy())
    if stop:
        break

#inspect if trained features have very high numbers (overfitted)
print(movieFeature)
print(userFeature)

Starting epoch: 0
Errors of feature 0: rmse 3.05399887438
Errors of feature 1: rmse 2.92048493867
Errors of feature 2: rmse 2.76370380964
Errors of feature 3: rmse 2.63305652143
Errors of feature 4: rmse 2.52531246606
Errors of feature 5: rmse 2.43472295393
Errors of feature 6: rmse 2.35712102047
Errors of feature 7: rmse 2.28954508944
Errors of feature 8: rmse 2.22985650076
Errors of feature 9: rmse 2.17654462082
Errors of feature 10: rmse 2.12849065966
Errors of feature 11: rmse 2.08483350678
Errors of feature 12: rmse 2.0448956447
Errors of feature 13: rmse 2.00815435393
Errors of feature 14: rmse 1.97418129335
Errors of feature 15: rmse 1.94263109684
Errors of feature 16: rmse 1.91322009778
Errors of feature 17: rmse 1.8857124944
Errors of feature 18: rmse 1.85990138792
Errors of feature 19: rmse 1.83562284015
Starting epoch: 1
Errors of feature 0: rmse 1.68098797108
Errors of feature 1: rmse 1.48814844864
Errors of feature 2: rmse 1.40359079328
Errors of feature 3: rmse 1.35536244

In [210]:
#Output file

#make sure to have X_a or have
#movieFeature and userFeature trained

import csv
import numpy as np
import time
firstrow = True
requested_y = []
# newline='' is what the csv module asks for on every file object handed to
# csv.reader / csv.writer.
with open('sampleSubmission.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        if not firstrow:
            # Only the numbers of the ID are needed; the value column of the
            # sample file is not used.
            numbers = parse(row)[0]
            requested_y.append(numbers)
        else:
            # The first record is skipped (header row).
            firstrow = False


# The file name carries the time of writing and the last RMSE printed by the
# training cell (rmserror must exist in the kernel).
# Without newline='' the csv writer's '\r\n' line endings get translated again
# on Windows, which puts an empty row after every record.
with open('submission-'+(time.strftime('%Y-%m-%d-%a-%Hh%Mmin'))+'-rmse-'+str(rmserror)+'.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['ID','Prediction'])
    for coord in requested_y:
        r = coord[0]
        c = coord[1]
        #value = int(X_a[c-1, r-1]) #exchange this if desired
        # IDs are 1-based; predictRating takes (movie, user) as 0-based indices.
        value = predictRating(c-1, r-1)
        writer.writerow(['r%d_c%d'%(r, c) , str(value)])


In [219]:
#backup SVD values (movieFeature & userFeature)
import time
# Format the timestamp once so both backup files get the same suffix even if
# the minute changes between the two writes.
backup_stamp = time.strftime('%Y-%m-%d-%a-%Hh%Mmin')
np.savetxt('movieBackup' + backup_stamp + '.csv', movieFeature, delimiter=",")
np.savetxt("userBackup" + backup_stamp + ".csv", userFeature, delimiter=",")

In [213]:
#let the system voice tell you that training has finished
# Runs the shell command `say "Done"`; the value returned by os.system is the
# last expression of the cell and therefore shown as its output.
# Presumably written for macOS, which ships a `say` text-to-speech command --
# TODO confirm behaviour on other platforms.
import os
os.system('say "Done"')

0