# Collaborative Filtering Model

In [21]:
import pandas as pd
import numpy as np
import pickle as pkl
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# enable offline plotting in plotly
init_notebook_mode(connected=True)

In [22]:
# load our 3 datasets
users = pd.read_csv('data/user_features.csv')
problems =  pd.read_csv('data/problem_features.csv')
submissions = pd.read_csv('data/train_submissions.csv')

In [23]:
train, R_test = train_test_split(submissions, test_size=0.25, random_state=42)

R_train, R_cv = train_test_split(train, test_size=0.25, random_state=42)

In [24]:
R_train = R_train.set_index(['user_id','problem_id']).unstack(level=-1)
R_cv = R_cv.set_index(['user_id','problem_id']).unstack(level=-1)

R_train.columns = R_train.columns.droplevel()
R_cv.columns = R_cv.columns.droplevel()

empty_sub = pd.DataFrame(np.nan, index=users.user_id.unique(), 
                         columns=problems.problem_id.unique())

R_train = empty_sub.fillna(R_train)
R_cv = empty_sub.fillna(R_cv)

In [25]:
def save_obj(obj, name ):
    with open('results/'+ name + '.pkl', 'wb') as f:
        pkl.dump(obj, f, pkl.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('results/' + name + '.pkl', 'rb') as f:
        return pkl.load(f)

In [26]:
def unroll(M_users, M_items):
    
    """Reshape 2 matrices into a single 1D array. 
    Inverse function of `roll`.
    
    Parameters
    ----------
    M_users : 2D numpy array
        Matrix of user latent features. Has
        dimensions (n_users, n_features).
    M_items : 2D numpy array
        Matrix of item latent features. Has
        dimensions (n_items, n_features).
        
    Returns
    -------
    x_users_items : 1D numpy array
        User and item latent features.
    """
    
    # convert matrices to np arrays
    M_users = np.array(M_users)
    M_items = np.array(M_items)

    # flatten 2D arrays into 1D arrays
    x_users = M_users.flatten(order='C')
    x_items = M_items.flatten(order='C')
    
    # concatenate user and item 1D arrays
    x_users_items = np.concatenate((x_users, x_items), axis=0)

    return x_users_items

In [27]:
def roll(x_users_items, n_users, n_items, n_features):
    
    """Reshape a 1D array of user and item latent
    features into their original 2D array format.
    Inverse function of `unroll`.
    
    Parameters
    ----------
    x_users_items : 1D numpy array
        User and item latent features.
    n_users : int
        Number of users.
    n_items : int
        Number of items.
    n_features: int
        Number of latent features to learn. 
        Determines the overall size of M_users 
        and M_items.
        
    Returns
    -------
    M_users : 2D numpy array
        Matrix of user latent features. Has
        dimensions (n_users, n_features).
    M_items : 2D numpy array
        Matrix of item latent features. Has
        dimensions (n_items, n_features).
    """
    
    # retrieve user and item 1D arrays
    x_users = x_users_items[0:n_users*n_features]
    x_items = x_users_items[n_users*n_features:]
    
    # reshape 1D arrays into original matrices
    M_users = np.reshape(x_users, (n_users, n_features))
    M_items = np.reshape(x_items, (n_items, n_features))

    return M_users, M_items

In [28]:
def cost(x_users_items, Y_true, Lambda, n_users, n_items, n_features):
    
    """Compute cost (error) J from predictions on 
    Y_true using learned features `x_users_items`. J 
    is defined as the sum of squared errors plus
    regularization penatlies on user and item 
    latent features.
    
    Parameters
    ----------
    x_users_items : 1D numpy array
        User and item latent features.
    Y_true : 2D numpy array
        Matrix containing true ratings.
    Lambda : int
        Regularization coefficient.
    n_users : int
        Number of users.
    n_items : int
        Number of items.
    n_features: int
        Number of latent features to learn. 
        Determines the overall size of M_users 
        and M_items.
        
    Returns
    -------
    J : float
        Cost associated with prediction on `Y_true` 
        using learned latent features `x_users_items`.
    """
    
    # recover 2D user and item feature matrices
    M_users, M_items = roll(x_users_items, n_users, n_items, n_features)

    # compute the prediction
    Y_predicted = np.dot(M_users, M_items.T)
    
    # compute the error in the prediction
    error = Y_true - Y_predicted
    # replace all NaN values with 0
    error[np.isnan(error)] = 0

    # compute the regularization penalties
    User_regularization = (Lambda/2) * np.nansum(M_users * M_users)
    Item_regularization = (Lambda/2) * np.nansum(M_items * M_items)

    # compute the cost J with regularization
    J = (1/2) * np.nansum(error*error) + User_regularization + Item_regularization

    return J

In [29]:
def gradient(x_users_items, Y_true, Lambda, n_users, n_items, n_features):
    
    """Compute gradient function on `x_users_items`.
    
    Parameters
    ----------
    x_users_items : 1D numpy array
        User and item latent features.
    Y_true : 2D numpy array
        Matrix containing true ratings.
    Lambda : int
        Regularization coefficient.
    n_users : int
        Number of users.
    n_items : int
        Number of items.
    n_features: int
        Number of latent features to learn. 
        Determines the overall size of M_users 
        and M_items.
        
    Returns
    -------
    gradient : 1D numpy array
        Gradient of cost J w.r.t user and item
        latent features.
    """
    
    # recover 2D user and item feature matrices
    M_users, M_items = roll(x_users_items, n_users, n_items, n_features)

    # compute the prediction
    Y_predicted = np.dot(M_users, M_items.T)
    
    # compute the error in the prediction
    error = Y_true - Y_predicted
    # replace all NaN values with 0
    error[np.isnan(error)] = 0 

    # the gradients of user & item features
    M_user_gradient = np.dot(error, M_items) + Lambda*M_users
    M_item_gradient = np.dot(error.T, M_users) + Lambda*M_items

    # reshape gradients into 1D array
    gradient = unroll(M_user_gradient, M_item_gradient)

    return gradient

In [30]:
def predict(x_users_items, n_users, n_items, n_features):
    
    """Compute prediction on ratings. Predictions
    are computed from learned user and item 
    latent features in `x_users_items`.
    
    Parameters
    ----------
    x_users_items : 1D numpy array
        User and item latent features.
    n_users : int
        Number of users.
    n_items : int
        Number of items.
    n_features: int
        Number of latent features to learn. 
        Determines the overall size of M_users 
        and M_items.
    
    Return
    ------
    Y_predicted : pandas DataFrame
        Predictions.
    """
    
    # recover 2D user and item feature matrices
    M_users, M_items = roll(x_users_items, n_users, n_items, n_features)

    # compute predictions from P & Q
    Y_predicted = np.dot(M_users, M_items.T) 
    
    # set all negative predictions to 1 (bottom limit)
    Y_predicted[Y_predicted < 1] = 1
    
    Y_predicted = Y_predicted.astype(int)
    
    return Y_predicted

In [31]:
def f1_matrix(Y_true, Y_predicted, average='weighted', labels=[1.0,2.0,3.0,4.0,5.0,6.0]):
    
    """Compute the f1_score between an actual values
    matrix and predicted values matrix.
    
    Parameters
    ----------
    Y_true : 2D numpy array
        Matrix of true values.
    Y_predicted : 2D numpy array
        Matrix of predictions.
    average : str
        Method for weighting f1-score which is computed
        for each label.
    labels : list
        List of labels to compute f1-scores over.
        
    Returns
    -------
    f1 : float
        f1-score computed using `average` method 
        across specified `labels`.
    """
    
    # get indices of non-NaN values
    mask = ~np.isnan(np.array(Y_true))
    
    # flatten mask into 1D array
    mask = mask.flatten(order='C')
    
    # flatten matrices into 1D arrays
    y_true = Y_true.flatten(order='C')
    y_predicted = Y_predicted.flatten(order='C')
    
    # filter the arrays using the mask
    y_true = y_true[mask]
    y_predicted = y_predicted[mask]
    
    # compute f1-score
    f1 = f1_score(y_true, y_predicted, average=average, labels=labels)
    
    return f1

In [32]:
def SGD_e(R_train, n_users, n_items, n_features, Lambda, 
          epochs, alpha, compute_f1=False, seed=42, **kwargs):
    
    """Stochastic gradient descent algorithm. Searches
    for the optimal values of user and item latent
    features in x_users_items, that minimize the cost J.
    Updates are calculated for `epochs` iterations using a 
    learning rate `alpha`.
    
    Parameters
    ----------
    R_train : 2D numpy array
        Ratings matrix for training dataset.
    R_cv : 2D numpy array
        Ratings matrix for cross-validation dataset.
    n_users : int
        Number of users.
    n_items : int
        Number of items.
    n_features: int
        Number of latent features to learn. 
        Determines the overall size of M_users 
        and M_items.
    Lambda : int
        Regularization coefficient.
    epochs : int
        Number of iterations to run.
    alpha : float
        Learning rate.
    compute_f1 : bool (default False)
        If true, computes the f1-scores for predictions
        on R_train and R_cv at each epoch.
    seed : int (default 42)
        Seed for numpy's pseudo-random number
        generator.
    
    Returns
    -------
    results : dict
        Nested dictionary containing epoch as keys. Values
        associated with each `epochs` are cost J, f1-scores 
        for training and CV datasets, and optimized 
        parameters `x_users_items`.
        
    """
    
    # get cross-validation data if given
    R_cv = kwargs.get('R_cv')
    
    # set random seed
    np.random.seed(seed)
    
    # intial random guess of user and item
    # latent features
    M_users = np.random.rand(n_users, n_features) - 0.5
    M_items = np.random.rand(n_items, n_features) - 0.5
    
    # reshape matrices into 1D array of
    # user and item latent features
    x_users_items = unroll(M_users, M_items)
    
    # initialize empty dict to store training results
    results = {}
    
    # loop through `epochs` iterations
    for e in range(1,epochs+1):
        
        # compute the cost
        J = cost(x_users_items, R_train, Lambda, 
                   n_users, n_items, n_features)
        
        # compute the gradient function
        gradient_ = gradient(x_users_items, R_train, Lambda, 
                      n_users, n_items, n_features)
        
        # update `x_users_items`
        x_users_items = x_users_items + alpha * gradient_
        
        if compute_f1:
            # make prediction
            Y_predicted = predict(x_users_items, n_users, n_items, n_features)

            # store cost J, f1-scores on training and CV data,
            # and the optimized parameters `x_users_items`.
            results[e] = {'J': J, 'f1-train': f1_matrix(R_train, Y_predicted), 
                          'f1-cv': f1_matrix(R_cv, Y_predicted), 'x_users_items': x_users_items}
            
        else:
            # store cost J and optimized parameters `x_users_items`
            results[e] = {'J': J, 'x_users_items': x_users_items}
        
        # print current epoch and cost
        print('Epoch %s' % e + ' | ' + 'J : %s' % round(J))
        
        # logic for stop condition
        if e > 2:
            # compute delta in previous iteration
            delta0 = (results[e-2]['J'] - results[e-1]['J'])
            
            # compute delta for current iteration
            delta = (results[e-1]['J'] - results[e]['J'])
            
            # if delta for current iteration is larger than
            # delta for previous iteration, end updates
            if (delta < 0) | (delta > delta0 + 2):
                print('Gradient diverging! Ending training...')
                break
        else:
            pass
    
    # indicate completion and print final J
    print('Training complete, final J: %s' % round(results[epochs]['J']))
    
    return results

In [34]:
%%time

# convert dataframes into numpy arrays
R_train_ = np.array(R_train)
R_cv_ = np.array(R_cv)

# define model parameters
n_users = R_train_.shape[0]
n_items = R_train_.shape[1]
n_features=10
Lambda=0.1
alpha=0.001
epochs=50

results = SGD_e(R_train_, n_users, n_items, n_features, Lambda, 
                epochs, alpha, compute_f1=True, R_cv=R_cv_)

Epoch 1 | J : 188180.0
Epoch 2 | J : 187498.0
Epoch 3 | J : 186838.0
Epoch 4 | J : 186194.0
Epoch 5 | J : 185564.0
Epoch 6 | J : 184944.0
Epoch 7 | J : 184330.0
Epoch 8 | J : 183719.0
Epoch 9 | J : 183108.0
Epoch 10 | J : 182492.0
Gradient diverging! Ending training...


KeyError: 50

In [None]:
x = np.array(list(results.keys()))
y = np.array([results[i]['J'] for i in results.keys()])

trace0=go.Scattergl(x=x, y=y, mode='lines+markers')

layout=go.Layout(title='Cost Function vs epoch',
                yaxis=dict(title='Cost Function'),
                xaxis=dict(title='epoch'))

fig = go.Figure([trace0], layout)

iplot(fig, filename='training.html')

In [None]:
y_train = np.array([i['F1-train'] for i in results.values()])
y_cv = np.array([i['F1-cv'] for i in results.values()])

In [None]:
trace0=go.Scattergl(x=x, y=y_train, mode='lines+markers',
                   name='Train F1-score')
trace1=go.Scattergl(x=x, y=y_cv, mode='lines+markers',
                   name='CV F1-score')

layout=go.Layout(title='F1-scores for Training vs CV datasets',
                xaxis=dict(title='epoch'),
                yaxis=dict(title='F1-score'))

fig = go.Figure([trace0, trace1], layout)

iplot(fig, filename='training.html')

In [None]:
# save results
save_obj(results, 'initial_100epochs')

In [16]:
def SGD_t(R_train, n_users, n_items, n_features, Lambda, 
          epsilon, alpha, compute_f1=False, seed=42, **kwargs):
    
    """Stochastic gradient descent algorithm. Searches
    for the optimal values of user and item latent
    features in x_users_items, that minimize the cost J.
    Updates are calculated until the change (delta) in J, 
    between iterations, is less than epsilon.
    
    Parameters
    ----------
    R_train : 2D numpy array
        Ratings matrix for training dataset.
    R_cv : 2D numpy array (optional, must be given if
    `compute_f1` is True)
        Ratings matrix for cross-validation dataset.
    n_users : int
        Number of users.
    n_items : int
        Number of items.
    n_features: int
        Number of latent features to learn. 
        Determines the overall size of M_users 
        and M_items.
    Lambda : int
        Regularization coefficient.
    epsilon : float
        Training threshold. Training stops when the cost
        J is less than epsilon.
    alpha : float
        Learning rate.
    compute_f1 : bool (default False)
        If true, computes the f1-scores for predictions
        on R_train and R_cv at each epoch.
    seed : int (default 42)
        Seed for numpy's pseudo-random number
        generator.
    
    Returns
    -------
    results : dict
        Nested dictionary containing epoch as keys. Values
        associated with each `epochs` are cost J, f1-scores 
        for training and CV datasets, and optimized 
        parameters `x_users_items`.
        
    """
    
    # get cross-validation data if given
    R_cv = kwargs.get('R_cv')
    
    # set random seed
    np.random.seed(seed)
    
    # intial random guess of user and item
    # latent features
    M_users = np.random.rand(n_users, n_features) - 0.5
    M_items = np.random.rand(n_items, n_features) - 0.5
    
    # reshape matrices into 1D array of
    # user and item latent features
    x_users_items = unroll(M_users, M_items)
    
    # initialize empty dict to store training results
    results = {}
    
    e = 1 # counter for training iteration
    
    # large, arbitrary initial value for delta in J
    delta = 1000
    
    # iterate until the delta in J is less than epsilon
    while delta > epsilon:
        
        # compute the cost
        J = cost(x_users_items, R_train, Lambda, 
                   n_users, n_items, n_features)
        
        # compute the gradient function
        gradient_ = gradient(x_users_items, R_train, Lambda, 
                      n_users, n_items, n_features)
        
        # update `x_users_items`
        x_users_items = x_users_items + alpha * gradient_
        
        if compute_f1:
            # make prediction
            Y_predicted = predict(x_users_items, n_users, n_items, n_features)

            # store cost J, f1-scores on training and CV data,
            # and the optimized parameters `x_users_items`.
            results[e] = {'J': J, 'f1-train': f1_matrix(R_train, Y_predicted), 
                          'f1-cv': f1_matrix(R_cv, Y_predicted), 'x_users_items': x_users_items}
            
        else:
            # store cost J and optimized parameters `x_users_items`
            results[e] = {'J': J, 'x_users_items': x_users_items}
        
        # print current epoch and cost
        print('Epoch %s' % e + ' | ' + 'J : %s' % round(J))
        
        # logic for stop condition
        if e > 1:
            # compute delta for current iteration
            delta = (results[e-1]['J'] - results[e]['J'])
            
            # if J increases from last iteration (delta < 0)
            # end updates and return results
            if delta < 0:
                print('Gradient diverging! Ending training...')
                return results
        else:
            pass
        
        print('Cost delta: %s' % round(delta))
        print()
        
        e += 1
    
    # indicate completion and print final J
    print('Stopping criteria met: delta < epsilon.')
    print('Final J: %s' % round(results[epochs]['J']))
        
    return results

In [17]:
%%time

R_train_ = np.array(R_train)
R_cv_ = np.array(R_cv)

grid = {'f': [50, 100],
       'L':[0.01, 0.1, 1],
       'alpha':[0.0005, 0.001]}

epsilon=100

evil_master_plan = {}

i=1
for n_features in grid['f']:
    for Lambda in grid['L']:
        for alpha in grid['alpha']:
            print('current values of (n_features, Lambda, alpha): %s' % \
                  str((n_features, Lambda, alpha)))
            
            results = SGD_t(R_train=R_train_, n_users=n_users, n_items=n_items, 
                            n_features=n_features, Lambda=Lambda, epsilon=epsilon, 
                            alpha=alpha, compute_f1=True, R_cv=R_cv_)
            
            evil_master_plan[i] = {'f':n_features, 'L':Lambda, 'alpha':alpha, 'results':results}
            
            i+=1

current values of (n_features, Lambda, alpha): (50, 0.01, 0.0005)
Epoch 1 | J : 199505.0
Cost delta: 1000

Epoch 2 | J : 197664.0
Cost delta: 1000

Epoch 3 | J : 195869.0
Cost delta: 1794.0



KeyboardInterrupt: 

In [None]:
evil_master_plan.keys()

In [None]:
# save results
for i in evil_master_plan.keys():
    save_obj(evil_master_plan[i], 'fullsearch_eps50_run_%s' % i)

In [None]:
initial = min(results.keys())
final = max(results.keys())

print('F1-score @ epoch %s:' % ini tial + ' %s' % round(results[initial]['F1-train'], 4))
print('F1-score @ epoch %s:' % final + ' %s' % round(results[final]['F1-cv'], 4))

In [None]:
initial = min(results.keys())
final = max(results.keys())

R_pred_init = predict(results[initial]['xopt'], n_u, n_i, f)
R_pred_fin = predict(results[final]['xopt'], n_u, n_i, f)

print('F1-score @ epoch %s:' % initial + ' %s' % round(f1(R_train, R_pred_init), 4))
print('F1-score @ epoch %s:' % final + ' %s' % round(f1(R_train, R_pred_fin), 4))

In [None]:
f1(R_cv, R_pred_fin)