# Model

In [33]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
import matplotlib.pyplot as plt
import pandas as pd
import math

## Data Preprocessing

In [2]:
recs = pd.read_csv('data/pruned.csv')

In [3]:
recs.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,534380,0,0,2022-10-08,True,40.6,10531,22
1,42700,6,2,2019-10-19,False,5.9,185293,27
2,602960,0,0,2022-01-05,True,41.9,319249,58
3,976730,0,0,2021-11-25,False,21.1,747545,67
4,1091500,2,0,2022-10-30,True,18.1,2113544,127


In [4]:
USED_COLS = ['app_id', 'is_recommended', 'user_id']

recs = recs[USED_COLS]
recs.head()

Unnamed: 0,app_id,is_recommended,user_id
0,534380,True,10531
1,42700,False,185293
2,602960,True,319249
3,976730,False,747545
4,1091500,True,2113544


## Data Class

In [5]:
from RecData import RecData

In [6]:
class Metrics:
    def rmse(self, predictions):
        return math.sqrt(sum((prediction - true_rating)**2 for _, _, prediction, true_rating in predictions)/len(predictions))

In [7]:
rec_data = RecData()
rec_data.create_from_dataframe(recs)

In [8]:
train_data, test = rec_data.leave_k_out_split()

## Simple Model

In [9]:
def item_averages(M):
    # Division by 0 -> no data for item -> predict average of all items
    with np.errstate(invalid='ignore'):
        # Take item averages
        averages = (np.sum(M.toarray(), axis=0)/np.count_nonzero(M.toarray(), axis=0))

        # Fill na with averages over all items
        averages = np.nan_to_num(averages, nan=np.nanmean(averages))
    return averages

In [10]:
class AveragePredictor:
    """Simple model which always predicts the average value of an item."""
    def fit(self, M):
        self._averages = item_averages(M)
        
    def predict(self, pairs):
        """Parameters:
        pairs - user-item pairs to predict"""
        predictions = []
        for user, item in pairs:
            predictions.append((user, item, self._averages[item]))

        return predictions

In [11]:
predictor = AveragePredictor()
predictor.fit(train_data.get_matrix())

In [12]:
predictions = predictor.predict([(user, item) for user, item, _ in test])
predictions = [prediction + (test[i][2],) for i, prediction in enumerate(predictions)]
metrics = Metrics()
metrics.rmse(predictions)

0.37214961220300863

## SVD

In [77]:
from numpy.linalg import svd
class SVDPredictor:
    """SVD for collaborative filtering"""
    def __init__(self, num_users, num_items, k=100, learning_rate=0.01, epochs=5):
        self._num_users = num_users
        self._num_items = num_items
        
        self._k = k
        self._learning_rate = learning_rate
        self._epochs = epochs
        
        self._user_features = np.random.normal(size=(self._num_users, self._k))
        self._item_features = np.random.normal(size=(self._num_items, self._k))
    
    def fit(self, M):
        for epoch in range(self._epochs):
            print("Epoch", epoch)
            diff = M - self._user_features @ np.transpose(self._item_features)
            print(diff)
            rows, cols = M.nonzero()
            for i in range(len(rows)):
                if i  % 1e5 == 0:
                    print("Computing for review", i)
                
                    
                row = rows[i]
                col = cols[i]
                
                if np.max(self._learning_rate*self._item_features[col, :]*diff[row, col]) > 10:
                    print(self._learning_rate*self._item_features[col, :]*diff[row, col])
                    print(i)

                
                self._user_features -= self._learning_rate*self._item_features[col, :]*diff[row, col]
                self._item_features -= self._learning_rate*self._user_features[row, :]*diff[row, col]
        
    def predict(self, pairs):
        predictions = []
        for user, item in pairs:
            prediction = (self._user_features[user, :] @ np.transpose(self._item_features)[:, item])
            predictions.append((user, item, prediction))
        
        return predictions

In [78]:
train_data.get_matrix().nonzero()[1].shape

(1419357,)

In [79]:
svd_predictor = SVDPredictor(train_data.get_num_users(), train_data.get_num_items(), k=5, epochs=1)
svd_predictor.fit(train_data.get_matrix())
predictions = svd_predictor.predict([(user, item) for user, item, _ in test])
predictions = [prediction + (test[i][2],) for i, prediction in enumerate(predictions)]
metrics = Metrics()
metrics.rmse(predictions)

Epoch 0
[[ 0.72221385 -2.40235995 -1.63520117 ... -0.00780982 -3.92798198
  -0.66222607]
 [ 0.91115955 -3.35474529 -0.80769425 ...  0.52048384 -3.59242437
  -0.76669433]
 [ 1.86349418  0.16598667  1.04306675 ...  2.08064906  2.72608635
  -0.9850728 ]
 ...
 [-1.04779117  0.91744914  0.35892592 ... -1.38711194  0.33541355
  -0.77489846]
 [ 0.19289093  0.94136496 -0.55191899 ... -0.56455865 -0.83250045
   1.23098368]
 [ 2.14884934 -3.47216012  0.82493077 ...  3.34050468  0.65374497
  -1.38191493]]
Computing for review 0
[-0.00262716 -0.02157288 -0.00733874  0.01962276  0.00321283]


  self._user_features -= self._learning_rate*self._item_features[col, :]*diff[row, col]
  self._user_features -= self._learning_rate*self._item_features[col, :]*diff[row, col]
  self._item_features -= self._learning_rate*self._user_features[row, :]*diff[row, col]


KeyboardInterrupt: 