# Model

In [33]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
import matplotlib.pyplot as plt
import pandas as pd
import math

## Data Preprocessing

In [2]:
recs = pd.read_csv('data/pruned.csv')

In [3]:
recs.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,534380,0,0,2022-10-08,True,40.6,10531,22
1,42700,6,2,2019-10-19,False,5.9,185293,27
2,602960,0,0,2022-01-05,True,41.9,319249,58
3,976730,0,0,2021-11-25,False,21.1,747545,67
4,1091500,2,0,2022-10-30,True,18.1,2113544,127


In [4]:
USED_COLS = ['app_id', 'is_recommended', 'user_id']

recs = recs[USED_COLS]
recs.head()

Unnamed: 0,app_id,is_recommended,user_id
0,534380,True,10531
1,42700,False,185293
2,602960,True,319249
3,976730,False,747545
4,1091500,True,2113544


## Data Class

In [5]:
from RecData import RecData

In [6]:
class Metrics:
    def rmse(self, predictions):
        return math.sqrt(sum((prediction - true_rating)**2 for _, _, prediction, true_rating in predictions)/len(predictions))

In [7]:
rec_data = RecData()
rec_data.create_from_dataframe(recs)

In [8]:
train_data, test = rec_data.leave_k_out_split()

## Simple Model

In [9]:
def item_averages(M):
    # Division by 0 -> no data for item -> predict average of all items
    with np.errstate(invalid='ignore'):
        # Take item averages
        averages = (np.sum(M.toarray(), axis=0)/np.count_nonzero(M.toarray(), axis=0))

        # Fill na with averages over all items
        averages = np.nan_to_num(averages, nan=np.nanmean(averages))
    return averages

In [10]:
class AveragePredictor:
    """Simple model which always predicts the average value of an item."""
    def fit(self, M):
        self._averages = item_averages(M)
        
    def predict(self, pairs):
        """Parameters:
        pairs - user-item pairs to predict"""
        predictions = []
        for user, item in pairs:
            predictions.append((user, item, self._averages[item]))

        return predictions

In [11]:
predictor = AveragePredictor()
predictor.fit(train_data.get_matrix())

In [12]:
predictions = predictor.predict([(user, item) for user, item, _ in test])
predictions = [prediction + (test[i][2],) for i, prediction in enumerate(predictions)]
metrics = Metrics()
metrics.rmse(predictions)

0.37214961220300863

## SVD

In [104]:
from numpy.linalg import svd
class SVDPredictor:
    """SVD for collaborative filtering"""
    def __init__(self, num_users, num_items, k=100, learning_rate=0.000001, epochs=5, C=0.02):
        self._num_users = num_users
        self._num_items = num_items
        
        self._k = k
        self._learning_rate = learning_rate
        self._epochs = epochs
        self._C = C
        
        self._user_features = np.random.normal(size=(self._num_users, self._k), scale=0.01)
        self._item_features = np.random.normal(size=(self._num_items, self._k), scale=0.01)
    
    def fit(self, M):
        for epoch in range(self._epochs):
            print("Epoch", epoch)
            diff = M - self._user_features @ np.transpose(self._item_features)
            print(diff)
            rows, cols = M.nonzero()
            for i in range(len(rows)):
                if i  % 1e5 == 0:
                    print("Computing for review", i)
                
                    
                row = rows[i]
                col = cols[i]
                
                if np.max(self._learning_rate*self._item_features[col, :]*diff[row, col]) > 1000:
                    print(self._learning_rate*self._item_features[col, :]*diff[row, col])
                    print(i)
                    return

                
                new_user_features = self._user_features - self._learning_rate*(self._item_features[col, :]*diff[row, col] - self._C*self._user_features[row, :])
                self._item_features -= self._learning_rate*(self._user_features[row, :]*diff[row, col] - self._C*self._item_features[col, :])
                self._user_features = new_user_features
        
    def predict(self, pairs):
        predictions = []
        for user, item in pairs:
            prediction = (self._user_features[user, :] @ np.transpose(self._item_features)[:, item])
            predictions.append((user, item, prediction))
        
        return predictions

In [105]:
train_data.get_matrix().nonzero()[1].shape

(1419357,)

In [None]:
svd_predictor = SVDPredictor(train_data.get_num_users(), train_data.get_num_items(), k=5, epochs=1)
svd_predictor.fit(train_data.get_matrix())
predictions = svd_predictor.predict([(user, item) for user, item, _ in test])
predictions = [prediction + (test[i][2],) for i, prediction in enumerate(predictions)]
metrics = Metrics()
metrics.rmse(predictions)

Epoch 0
[[ 2.15569318e-04 -1.50344049e-04  3.79036846e-04 ...  5.69094478e-04
  -3.76797719e-04 -8.62434777e-05]
 [-1.17638551e-04 -4.92797256e-04  4.84320497e-06 ...  2.28943942e-04
   2.88625389e-05 -1.50515060e-04]
 [-1.34986988e-04  1.82775880e-04  3.86171385e-04 ... -3.72611482e-04
   1.98926286e-04  3.32431605e-04]
 ...
 [ 2.24311352e-04  2.97287320e-04 -1.36734722e-04 ...  5.18137886e-04
  -2.91890191e-04 -1.90995476e-04]
 [-1.12936764e-04  4.93280923e-04 -2.42551946e-04 ... -7.69185009e-05
   1.11706738e-04  5.92751700e-05]
 [-2.58625754e-04 -6.89142063e-05  1.88946386e-04 ... -1.90310496e-04
   3.90081480e-05  2.05689205e-04]]
Computing for review 0
