In [48]:
%matplotlib inline
import csv
import pandas as pd
import scipy.sparse as sc
from helpers import *
from sgd import *
from als import *
from sklearn import decomposition
from scipy.optimize import minimize
from models import *
from plot import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
# Load the training ratings from CSV through the project helper `load_data`.
# The split summary printed further down reports this matrix as 10000 x 1000.
ratings = load_data('data_train.csv')

In [50]:
# Count the non-zero (i.e. present) ratings along each axis of the matrix.
# Judging by the variable names, columns index users and rows index items.
is_rated = (ratings != 0)
num_items_per_user = np.array(is_rated.sum(axis=0)).flatten()
num_users_per_item = np.array(is_rated.sum(axis=1).T).flatten()

In [51]:
# Split the ratings into a train and a test matrix; the first return value is discarded.
# NOTE(review): 10 and 0.1 are presumably the minimum-ratings threshold and the test
# fraction (the printed summary shows roughly 10% of the non-zeros landing in test) —
# confirm against helpers.split_data.
_,train, test = split_data(ratings, num_items_per_user, num_users_per_item, 10, 0.1)

the shape of original ratings. (# of row, # of col): (10000, 1000)
the shape of valid ratings. (# of row, # of col): (9990, 999)
Total number of nonzero elements in origial data:1176952
Total number of nonzero elements in train data:1065253
Total number of nonzero elements in test data:111620


In [52]:
# Coordinates of every held-out rating, as a list of (row, col) pairs.
# `sum_ratings` hands this list to `compute_mix_error` on every optimiser step.
nnz_row, nnz_col = test.nonzero()
nnz_test = [(row_idx, col_idx) for row_idx, col_idx in zip(nnz_row, nnz_col)]

In [53]:
# Item-mean baseline on the train/test split. The helper prints its test RMSE;
# its return value is wrapped in an ndarray so it can be reshaped and blended later.
predict_item_test = np.array(baseline_item_mean(train,test))

test RMSE of the baseline using the item mean: [[ 1.09633198]].


In [54]:
# User-mean baseline on the train/test split. The helper prints its test RMSE;
# its return value is wrapped in an ndarray so it can be blended later.
predict_user_test = np.array(baseline_user_mean(train, test))

test RMSE of the baseline using the user mean: [[ 1.03317038]].


In [55]:
# Global-mean baseline on the train/test split. The helper prints its test RMSE.
predict_global_test = baseline_global_mean(train, test)
# Display the returned value (a single number in the recorded output).
predict_global_test

test RMSE of baseline using the global mean: [[ 1.12152228]].


3.8581107023810133

In [93]:


def svd_surprise_test(train, test, target, factors=20):
    """Fit a Surprise ``SVD`` model on ``train`` and return its predictions.

    Parameters
    ----------
    train
        Rating data passed to the project helper ``create_data``; the result
        must expose the columns ``'row'``, ``'col'`` and ``'rate'``.
    test
        Forwarded unchanged to ``get_prediction_surprise``.
    target
        Forwarded unchanged to ``get_prediction_surprise``.
    factors : int, default 20
        Passed to ``SVD`` as ``n_factors``.

    Returns
    -------
    Whatever ``get_prediction_surprise`` returns. The caller in this notebook
    unpacks it as ``(predictions, rmse)`` when ``test`` is given.
    """
    train_df = create_data(train)
    reader = surprise.dataset.Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(train_df[['row', 'col', 'rate']], reader)

    # The model is fit on the complete training set, so no fold split is needed.
    # The former `data.split(2)` had no effect on `build_full_trainset()` and
    # `Dataset.split` no longer exists in current Surprise releases.
    full_trainset = data.build_full_trainset()

    algo = SVD(n_factors=factors)
    # Surprise renamed `train()` to `fit()` and later dropped `train()`;
    # prefer the new name and fall back for old installations.
    fit_model = getattr(algo, 'fit', None) or algo.train
    fit_model(full_trainset)

    return get_prediction_surprise(algo, train_df, test, target)


In [98]:
# Experimental run of the locally defined SVD wrapper with 10 factors.
# NOTE(review): the next cell assigns `predict_SVD_test` and `rmse` again, so in a
# top-to-bottom run this result is overwritten before the blend is built.
predict_SVD_test, rmse = svd_surprise_test(train, test, None, 10)

1.00044285488


In [83]:
# Surprise SVD on the train/test split, using the project wrapper `svd_surprise`.
# NOTE(review): the trailing 10 is presumably the number of latent factors — confirm in models.
predict_SVD_test, rmse = svd_surprise(train, test, None, 10)
# Clamp the predictions in place to the 1-5 range.
predict_SVD_test[predict_SVD_test < 1] = 1
predict_SVD_test[predict_SVD_test > 5] = 5

1.00345952803


In [57]:
# Surprise KNN on the train/test split; the log below shows it uses a
# pearson_baseline similarity matrix.
# NOTE(review): the meaning of the `False` and `100` arguments is defined in the
# project wrapper `knn_surprise` — confirm there.
predict_KNNitem_test, rmse = knn_surprise(train,test,None,False, 100)
# Clamp the predictions in place to the 1-5 range.
predict_KNNitem_test[predict_KNNitem_test < 1] = 1
predict_KNNitem_test[predict_KNNitem_test > 5] = 5

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
0.988904728304


In [58]:
# Surprise baseline model on the train/test split (the log below reports ALS bias estimation).
predict_baseline_test = baseline_surprise(train,test,None)
# Clamp the predictions in place to the 1-5 range.
predict_baseline_test[predict_baseline_test > 5] = 5
predict_baseline_test[predict_baseline_test < 1] = 1

Estimating biases using als...
1.00127314809


In [59]:
# Predictions from the project wrapper `slope_surprise` on the train/test split.
predict_slope_test = slope_surprise(train, test, None)
# Clamp the predictions in place to the 1-5 range.
predict_slope_test[predict_slope_test > 5] = 5
predict_slope_test[predict_slope_test < 1] = 1

1.00185650691


In [60]:
%%time
# Matrix factorisation trained with SGD on the train/test split (about 9 minutes in the recorded run).
# NOTE(review): the four numeric hyperparameters are positional; their meaning is
# defined in the project's `matrix_factorization_SGD` — confirm there.
predict_SGD_test, rmse = matrix_factorization_SGD(train, test, 0.025, 20, 0.1, 0.016)
# Clamp the predictions in place to the 1-5 range.
predict_SGD_test[predict_SGD_test > 5] = 5
predict_SGD_test[predict_SGD_test < 1] = 1

RMSE on test data: 0.9837803281828068.
Wall time: 8min 53s


In [61]:
%%time
# Matrix factorisation trained with ALS on the train/test split (about 4 minutes in the recorded run).
# NOTE(review): the three numeric hyperparameters are positional; their meaning is
# defined in the project's `ALS` — confirm there.
predict_ALS_test, rmse = ALS(train,test,3, 0.2, 0.9)
# Clamp the predictions in place to the 1-5 range.
predict_ALS_test[predict_ALS_test > 5] = 5
predict_ALS_test[predict_ALS_test < 1] = 1

number of iterations:  1
number of iterations:  2
number of iterations:  3
number of iterations:  4
number of iterations:  5
number of iterations:  6
number of iterations:  7
number of iterations:  8
number of iterations:  9
number of iterations:  10
number of iterations:  11
test RMSE after running ALS: 0.9940086593042198.
Wall time: 3min 48s


In [84]:
# Validation-split predictions of every model, in the fixed order the blend
# weights are indexed by (`weights[i]` in `sum_ratings`, `result.x[i]` later on).
all_predictions_test = [
    predict_SGD_test,
    predict_ALS_test,
    predict_global_test,
    predict_SVD_test,
    predict_user_test,
    # Column vector; -1 infers the length instead of hard-coding the row
    # count produced by the split.
    predict_item_test.reshape(-1, 1),
    predict_KNNitem_test,
    predict_baseline_test,
    predict_slope_test,
]

In [63]:
def const(x):
    """Return how far the entries of ``x`` are from summing to one.

    The value is zero exactly when ``x.sum() == 1``, which is the form an
    equality constraint function takes for ``scipy.optimize.minimize``.
    """
    total = x.sum()
    return total - 1


# Equality-constraint description built around `const`.
constraint = dict(type='eq', fun=const)

def sum_ratings(weights):
    """Objective for the blend optimiser: error of a weighted model mix.

    Forms ``sum_i weights[i] * all_predictions_test[i]`` from the module-level
    list ``all_predictions_test``, clamps the mix to the 1-5 range, and returns
    ``compute_mix_error`` evaluated against the module-level ``test`` matrix
    at the positions in ``nnz_test``.
    """
    blended = 0
    for idx in range(len(all_predictions_test)):
        blended += weights[idx] * all_predictions_test[idx]

    # Clamp the blended predictions to the 1-5 range.
    blended[blended < 1] = 1
    blended[blended > 5] = 5

    return compute_mix_error(test, nz=nnz_test, prediction=blended)


In [85]:
%%time
# Start the search from a uniform blend: every model gets the same weight.
n_models = len(all_predictions_test)
x0 = [1 / n_models] * n_models
# NOTE(review): the `constraint` dict defined in this notebook is not passed here,
# so the optimiser is free to return weights that do not sum to one.
result = minimize(fun=sum_ratings, x0=x0)

Wall time: 8min 58s


In [86]:
# Show the optimiser result; `result.x` holds the blend weights that are applied
# to the full-data predictions further down.
result

      fun: 0.9782640927574934
 hess_inv: array([[  36.08833014,  -12.71034799,   -7.37971193,   -5.2022471 ,
           7.48960465,    6.89882479,  -11.61393268,   -5.22088393,
          -7.71584501],
       [ -12.71034799,   23.59700701,    1.47438163,   -4.5661592 ,
          -2.6099633 ,   -1.39209188,   -6.04379766,    7.94479478,
          -5.94815812],
       [  -7.37971193,    1.47438163,   84.99445698,    4.25813016,
         -86.80326384,  -83.12691086,    5.93459048,   85.42525424,
          -4.892919  ],
       [  -5.2022471 ,   -4.5661592 ,    4.25813016,   29.48962273,
          -0.61958235,   -4.93757946,   -2.39151744,  -27.12832912,
          11.25399388],
       [   7.48960465,   -2.6099633 ,  -86.80326384,   -0.61958235,
         109.23756507,   87.80924676,   -4.53603861, -159.42551919,
          49.71462127],
       [   6.89882479,   -1.39209188,  -83.12691086,   -4.93757946,
          87.80924676,   86.80247496,   -6.29750026, -108.53925669,
          22.96957302],

In [66]:
%%time
# Refit the baselines and both matrix-factorisation models on the complete
# rating matrix (no held-out split), with the same hyperparameters as the
# validation run above.
predict_global = baseline_global_mean(ratings, None)
predict_user = np.array(baseline_user_mean(ratings, None))
# Column vector; -1 infers the length instead of hard-coding the row count.
predict_item = np.array(baseline_item_mean(ratings, None)).reshape(-1, 1)

predict_SGD = matrix_factorization_SGD(ratings, None, 0.025, 20, 0.1, 0.016)
# Clamp the predictions in place to the 1-5 range.
predict_SGD[predict_SGD > 5] = 5
predict_SGD[predict_SGD < 1] = 1

predict_ALS = ALS(ratings, None, 3, 0.2, 0.9)
# Clamp the predictions in place to the 1-5 range.
predict_ALS[predict_ALS > 5] = 5
predict_ALS[predict_ALS < 1] = 1

number of iterations:  1
number of iterations:  2
number of iterations:  3
number of iterations:  4
number of iterations:  5
number of iterations:  6
number of iterations:  7
number of iterations:  8
number of iterations:  9
number of iterations:  10
Wall time: 14min 33s


In [67]:
# Fetch the prediction targets from the project helper `get_to_predict`.
to_predict = get_to_predict()
# Surprise SVD fit on the complete ratings, with the same trailing argument (10)
# as the validation run.
predict_SVD = svd_surprise(ratings, None, to_predict, 10 )
# Clamp the predictions in place to the 1-5 range.
predict_SVD[predict_SVD > 5] = 5
predict_SVD[predict_SVD < 1] = 1

In [68]:
# Surprise KNN fit on the complete ratings. The fifth argument (100) is passed
# explicitly so this model is configured exactly like the validation run whose
# predictions were used to fit the blend weights.
predict_KNNitem = knn_surprise(ratings, None, to_predict, False, 100)
# Clamp the predictions in place to the 1-5 range.
predict_KNNitem[predict_KNNitem > 5] = 5
predict_KNNitem[predict_KNNitem < 1] = 1

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [69]:
# Surprise baseline model fit on the complete ratings, predicting the `to_predict` targets.
predict_baseline = baseline_surprise(ratings, None, to_predict)
# Clamp the predictions in place to the 1-5 range.
predict_baseline[predict_baseline > 5] = 5
predict_baseline[predict_baseline < 1] = 1

Estimating biases using als...


In [70]:
# Predictions from the project wrapper `slope_surprise`, fit on the complete ratings.
predict_slope = slope_surprise(ratings, None, to_predict)
# Clamp the predictions in place to the 1-5 range.
predict_slope[predict_slope > 5] = 5
predict_slope[predict_slope < 1] = 1

In [71]:
# Full-data predictions of every model. The order must match
# `all_predictions_test`, because the fitted weights in `result.x` are applied
# by position.
all_predictions = [
    predict_SGD,
    predict_ALS,
    predict_global,
    predict_SVD,
    predict_user,
    # Column vector; -1 infers the length instead of hard-coding the row count.
    predict_item.reshape(-1, 1),
    predict_KNNitem,
    predict_baseline,
    predict_slope,
]

In [72]:
# Blend the full-data predictions with the weights found by the optimiser;
# weight i belongs to model i of `all_predictions`.
blend_weights = result.x
final_predict = 0
for i, pred in enumerate(all_predictions):
    weighted_pred = blend_weights[i] * pred
    final_predict += weighted_pred

In [73]:
# Clamp the blended predictions in place to the 1-5 range.
final_predict[final_predict<1] = 1
final_predict[final_predict>5] = 5

In [74]:
# Write the blended predictions out through the project helper.
# NOTE(review): the output path and format are decided inside `create_csv_submissions`.
create_csv_submissions(final_predict)