In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, vstack
from sklearn.utils import shuffle

from tqdm import tqdm

In [2]:
def get_data():
    data_file_path = "./netflix-prize-data/processed_data.csv"

    df = pd.read_csv(data_file_path, header = None, names = ['User_Id','Rating','Movie_Id'])
    print(df.iloc[::5000000, :])
    
    encoder = OneHotEncoder(categories='auto') 

    # (number_of_ratings x number_of_users)
    one_hot_user_matrix = encoder.fit_transform(np.asarray(df['User_Id']).reshape(-1,1)) 
    print("One-hot user matrix shape: " + str(one_hot_user_matrix.shape))
    
    # (number_of_ratings x number_of_movie_ids)
    one_hot_movie_matrix = encoder.fit_transform(np.asarray(df['Movie_Id']).reshape(-1,1))
    print("One-hot movie matrix shape: " + str(one_hot_movie_matrix.shape))
    
    # train data in CSR format
    X = hstack([one_hot_user_matrix, one_hot_movie_matrix]).tocsr()
    # data to predict
    ratings=np.asarray(df['Rating']).reshape(-1,1)
    
    return X,ratings

In [3]:
X,ratings = get_data()

# do shuffling so records will be evenly distributed over the matrix
X,ratings = shuffle(X,ratings)

print(X.shape)
print(ratings.shape)

           User_Id  Rating  Movie_Id
0          1488844       3         1
5000000     501954       2       996
10000000    404654       5      1962
15000000    886608       2      2876
20000000   1193835       2      3825
25000000   1899206       3      4661
30000000    154804       4      5496
35000000   2078749       5      6274
40000000    450763       5      7057
45000000    102092       3      7991
50000000    220298       5      9023
55000000    550530       5     10042
60000000    222570       3     11038
65000000   1273080       5     11875
70000000   2026970       5     12676
75000000    506044       4     13582
80000000    353605       2     14453
85000000    664606       3     15116
90000000   2213715       3     16008
95000000   1589401       5     16879
100000000  2314006       4     17627
One-hot user matrix shape: (100480507, 480189)
One-hot movie matrix shape: (100480507, 17770)
(100480507, 497959)
(100480507, 1)


In [4]:
def predict_ratings(X, w, w0, V):
    linear_part = w0 + X.dot(w)
    
    V_part =((X.dot(V))**2) - (X.power(2)).dot((V)**2) 
    sqr_part = (0.5 * np.sum(V_part, axis=1)).reshape(-1,1)
    
    return linear_part + sqr_part


def mse(y, y_pred):
    return np.sum((y - y_pred)**2)/len(y_pred)


def rmse(y, y_pred):
    return np.sqrt(mse(y, y_pred))


def r2(y, y_pred):
    nom=np.sum((y - y_pred)**2)
    denom=np.sum((y - np.mean(y, axis=0))**2)
    return 1 - nom/denom

In [8]:
def mini_batch_grad_desc(X_train, Y_train, learning_rate=0.01, num_epoches=5, batch_size=1024, factors_numb=3):
    w = np.full((X_train.shape[1], 1), 0.5)
    w0 = 0.5
    V = np.full((X_train.shape[1], factors_numb), 0.5)
    
    for e in range(num_epoches):
        print("Epoch: " + str(e+1) + "/" + str(num_epoches))
        
        X_train, Y_train = shuffle(X_train, Y_train)
        minibatches_numb = X_train.shape[0] // batch_size
        if (minibatches_numb * batch_size < X_train.shape[0]):
            # batch with a smaller size
            minibatches_numb += 1
        
        for i in tqdm(range(minibatches_numb)):
            X_batch = X_train[i * batch_size : (i+1) * batch_size, :]
            Y_batch = Y_train[i * batch_size : (i+1) * batch_size, :]
            
            actual_batch_size = X_batch.shape[0]
            
            ratings_prediction = predict_ratings(X_batch, w, w0, V)
            error = Y_batch - ratings_prediction
            
            # upd model parameters
            w = w + (learning_rate*2*((X_batch.T).dot(error)))/actual_batch_size
            w0 = w0 + (learning_rate*2*(np.sum(error)))/actual_batch_size
            V = V + (learning_rate*2*((X_batch.T).dot(np.multiply(error, X_batch.dot(V))) - np.multiply(V, (X_batch.T).power(2).dot(error))))/actual_batch_size
        
    return w, w0, V

## Model training

In [9]:
folds_numb = 5
fold_size = X.shape[0] // folds_numb

rmse_train = []
r2_train = []
rmse_test = []
r2_test = []

# split data to folds for cross-validation purposes
for fold_idx in range(folds_numb):
    print("Fold: " + str(fold_idx+1) + "/" + str(folds_numb))
    
    # prepare train data for fold i
    X_train = vstack(
        [
            X[0 : fold_idx * fold_size, :],
            X[(fold_idx+1) * fold_size : X.shape[0], :]
        ],
        'csr'
    )
    Y_train = np.vstack(
        (
            ratings[0 : fold_idx * fold_size],
            ratings[(fold_idx+1) * fold_size : ratings.shape[0]]
        )
    )
    
    # train model
    w, w0, V = mini_batch_grad_desc(X_train, Y_train, batch_size=65536)
    
    # save train metrics
    train_prediction = predict_ratings(X_train, w, w0, V)
    rmse_train.append(rmse(Y_train, train_prediction))
    r2_train.append(r2(Y_train, train_prediction))
    
    # prepare test data for fold i
    X_test = X[fold_idx * fold_size : (fold_idx+1) * fold_size, :]
    Y_test = ratings[fold_idx * fold_size : (fold_idx+1) * fold_size]
    
    # save test metrics
    test_prediction = predict_ratings(X_test, w, w0, V)
    rmse_test.append(rmse(Y_test, test_prediction))
    r2_test.append(r2(Y_test, test_prediction))

Fold: 1/5
Epoch: 1/5


100%|██████████| 1227/1227 [00:35<00:00, 34.09it/s]


Epoch: 2/5


100%|██████████| 1227/1227 [00:35<00:00, 34.80it/s]


Epoch: 3/5


100%|██████████| 1227/1227 [00:35<00:00, 35.01it/s]


Epoch: 4/5


100%|██████████| 1227/1227 [00:35<00:00, 34.55it/s]


Epoch: 5/5


100%|██████████| 1227/1227 [00:35<00:00, 34.75it/s]


Fold: 2/5
Epoch: 1/5


100%|██████████| 1227/1227 [00:34<00:00, 35.56it/s]


Epoch: 2/5


100%|██████████| 1227/1227 [00:34<00:00, 35.25it/s]


Epoch: 3/5


100%|██████████| 1227/1227 [00:34<00:00, 35.43it/s]


Epoch: 4/5


100%|██████████| 1227/1227 [00:36<00:00, 33.46it/s]


Epoch: 5/5


100%|██████████| 1227/1227 [00:34<00:00, 35.89it/s]


Fold: 3/5
Epoch: 1/5


100%|██████████| 1227/1227 [00:33<00:00, 36.38it/s]


Epoch: 2/5


100%|██████████| 1227/1227 [00:33<00:00, 36.22it/s]


Epoch: 3/5


100%|██████████| 1227/1227 [00:33<00:00, 36.34it/s]


Epoch: 4/5


100%|██████████| 1227/1227 [00:33<00:00, 36.38it/s]


Epoch: 5/5


100%|██████████| 1227/1227 [00:33<00:00, 36.29it/s]


Fold: 4/5
Epoch: 1/5


100%|██████████| 1227/1227 [00:36<00:00, 33.19it/s]


Epoch: 2/5


100%|██████████| 1227/1227 [00:35<00:00, 34.19it/s]


Epoch: 3/5


100%|██████████| 1227/1227 [00:36<00:00, 33.95it/s]


Epoch: 4/5


100%|██████████| 1227/1227 [00:36<00:00, 33.63it/s]


Epoch: 5/5


100%|██████████| 1227/1227 [00:36<00:00, 33.33it/s]


Fold: 5/5
Epoch: 1/5


100%|██████████| 1227/1227 [00:33<00:00, 36.59it/s]


Epoch: 2/5


100%|██████████| 1227/1227 [00:32<00:00, 37.42it/s]


Epoch: 3/5


100%|██████████| 1227/1227 [00:35<00:00, 34.49it/s]


Epoch: 4/5


100%|██████████| 1227/1227 [00:32<00:00, 37.25it/s]


Epoch: 5/5


100%|██████████| 1227/1227 [00:34<00:00, 36.02it/s]


## Metrics

In [10]:
data = {'1':[rmse_test[0],r2_test[0],rmse_train[0],r2_train[0]], 
        '2':[rmse_test[1],r2_test[1],rmse_train[1],r2_train[1]],
        '3':[rmse_test[2],r2_test[2],rmse_train[2],r2_train[2]],
        '4':[rmse_test[3],r2_test[3],rmse_train[3],r2_train[3]],
        '5':[rmse_test[4],r2_test[4],rmse_train[4],r2_train[4]],
        'E':[np.mean(rmse_test),np.mean(r2_test),np.mean(rmse_train),np.mean(r2_train)],
        'SD':[np.std(rmse_test),np.std(r2_test),np.std(rmse_train),np.std(r2_train)]} 
 
df_res = pd.DataFrame(data, index =['RMSE test', 'R^2 test','RMSE train','R^2 train']) 

df_res

Unnamed: 0,1,2,3,4,5,E,SD
RMSE test,1.072098,1.072118,1.07208,1.072204,1.072216,1.072143,5.6e-05
R^2 test,0.023977,0.023964,0.023947,0.023944,0.023926,0.023952,1.7e-05
RMSE train,1.072153,1.072146,1.072152,1.072122,1.072116,1.072138,1.6e-05
R^2 train,0.023948,0.023956,0.023966,0.023965,0.023974,0.023962,9e-06
