# Model

In [33]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [57]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
import matplotlib.pyplot as plt
import pandas as pd

## Data Preprocessing

In [35]:
# Load the pruned dataset; the path is relative to the notebook's working directory.
recs = pd.read_csv('data/pruned.csv')

In [36]:
# Preview the first rows to see which columns are available.
recs.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,534380,0,0,2022-10-08,True,40.6,10531,22
1,42700,6,2,2019-10-19,False,5.9,185293,27
2,602960,0,0,2022-01-05,True,41.9,319249,58
3,976730,0,0,2021-11-25,False,21.1,747545,67
4,1091500,2,0,2022-10-30,True,18.1,2113544,127


In [37]:
# Keep only the columns RecData.create_from_dataframe reads. The order matters:
# that method addresses columns by position as [item id, rating, user id].
USED_COLS = ['app_id', 'is_recommended', 'user_id']

recs = recs[USED_COLS]
recs.head()

Unnamed: 0,app_id,is_recommended,user_id
0,534380,True,10531
1,42700,False,185293
2,602960,True,319249
3,976730,False,747545
4,1091500,True,2113544


## Data Class

In [174]:
import copy

class RecData:
    """User-item rating data held as a sparse matrix plus id <-> index lookups.

    Rows of the matrix are users and columns are items, both ordered by sorted
    id. Stored values are the mean rating shifted by +1, so a value of 0 in the
    matrix always means "this user has no rating for this item".
    """

    def __init__(self, data=None):
        """Create an empty container, or build it from ``data`` when given.

        Parameters:
        data - optional DataFrame, forwarded to ``create_from_dataframe``
        """
        if data is not None:
            self.create_from_dataframe(data)

    def create_from_dataframe(self, data):
        """Create rec data from a Pandas dataframe. Columns must be in the form [item-id, rating, user_id]

        Duplicate (user, item) rows are averaged. The sparse matrix is built
        directly from the grouped ratings, so no dense users x items table is
        ever materialised.
        """
        item_col, rating_col, user_col = data.columns[:3]

        # One mean rating per (user, item) pair; pairs without a usable rating are dropped.
        mean_ratings = data.groupby([user_col, item_col])[rating_col].mean().dropna()

        # sort=True orders ids ascending, so index i is the i-th smallest id.
        user_codes, user_ids = pd.factorize(mean_ratings.index.get_level_values(0), sort=True)
        item_codes, item_ids = pd.factorize(mean_ratings.index.get_level_values(1), sort=True)

        self._userid_to_index = {user_id: i for i, user_id in enumerate(user_ids)}
        self._itemid_to_index = {item_id: i for i, item_id in enumerate(item_ids)}
        self._index_to_userid = {i: user_id for user_id, i in self._userid_to_index.items()}
        self._index_to_itemid = {i: item_id for item_id, i in self._itemid_to_index.items()}

        self._users = self._userid_to_index.keys()
        self._items = self._itemid_to_index.keys()
        self._num_users = len(self._users)
        self._num_items = len(self._items)

        # Shift by +1 so that a genuine rating of 0 stays distinguishable from "missing".
        shifted = mean_ratings.to_numpy(dtype=float) + 1
        self._M = csr_matrix(
            (shifted, (user_codes, item_codes)),
            shape=(self._num_users, self._num_items),
        )
        # A shifted value of exactly 0 counts as missing; do not keep it as a stored entry.
        self._M.eliminate_zeros()

    def _canonical_matrix(self):
        """Return a private CSR copy with duplicates summed, indices sorted and zeros removed."""
        matrix = csr_matrix(self._M, copy=True)
        matrix.sum_duplicates()
        matrix.eliminate_zeros()
        return matrix

    def leave_k_out_split(self, k=1, seed=None):
        """Hold out ``k`` randomly chosen ratings per user.

        Parameters:
        k - number of ratings to hold out for each user
        seed - optional int; when given, a private RandomState makes the split
               reproducible. When None, NumPy's global random state is used.

        Returns ``(train_data, test)``: ``train_data`` is a RecData sharing this
        object's lookups but with the held-out ratings removed, and ``test`` is
        a list of ``(user_index, item_index, rating)`` triples.

        Users with fewer than ``k`` ratings are skipped (nothing is held out
        for them) instead of aborting the whole split.

        Raises ValueError if ``k`` is negative.
        """
        if k < 0:
            raise ValueError("k must be non-negative")

        rng = np.random if seed is None else np.random.RandomState(seed)

        train_matrix = self._canonical_matrix()
        indptr, indices, values = train_matrix.indptr, train_matrix.indices, train_matrix.data

        test = []
        if k > 0:
            for user in range(train_matrix.shape[0]):
                start, stop = int(indptr[user]), int(indptr[user + 1])
                num_rated = stop - start
                if num_rated < k:
                    continue
                for offset in rng.choice(num_rated, k, replace=False):
                    position = start + offset
                    test.append((user, indices[position], values[position]))
                    values[position] = 0
            # Remove the zeroed entries so they are not counted as stored ratings.
            train_matrix.eliminate_zeros()

        train_data = RecData()
        train_data.__dict__.update(self.__dict__)
        train_data._M = train_matrix

        return train_data, test

    def create_anti_set(self, verbose=True):
        """Return all user-item pairs not in the data

        Parameters:
        verbose - when True, print the matrix shape and a progress line every
                  1000 users

        Returns a list of ``(user_index, item_index)`` pairs ordered by user,
        then by item.
        """
        matrix = self._canonical_matrix()
        num_users, num_items = matrix.shape
        if verbose:
            print(self._M.shape)

        anti_set = []
        for user in range(num_users):
            if verbose and user % 1000 == 0:
                print(user)

            # Mask out the rated columns of this row in one step instead of
            # probing the sparse matrix one element at a time.
            unrated = np.ones(num_items, dtype=bool)
            unrated[matrix.indices[matrix.indptr[user]:matrix.indptr[user + 1]]] = False
            anti_set.extend((user, item) for item in np.flatnonzero(unrated).tolist())

        return anti_set

    def get_matrix(self):
        """Return the sparse user-item rating matrix."""
        return self._M

    def get_num_users(self):
        """Return the number of distinct users in the data."""
        return self._num_users

In [175]:
# Build the user-item rating matrix from the trimmed dataframe.
rec_data = RecData()
rec_data.create_from_dataframe(recs)

In [176]:
# Seed NumPy's global RNG so the random hold-out is the same on every run of this cell.
np.random.seed(0)
train_data, test = rec_data.leave_k_out_split()

_M


SyntaxError: invalid syntax (<string>, line 1)

## Simple Model

In [None]:
class AveragePredictor:
    """Simple model which always predicts the average value of an item."""

    def fit(self, M):
        """Compute the mean non-zero rating of every item (column).

        Parameters:
        M - sparse user-item matrix in which 0 means "no rating"

        Items without any rating get an average of nan. The sums and counts are
        computed on the sparse matrix itself, so it is never expanded into a
        dense array.
        """
        column_sums = np.asarray(M.sum(axis=0)).ravel()
        # Count values that are actually non-zero; explicitly stored zeros must not count as ratings.
        rating_counts = np.asarray((M != 0).sum(axis=0)).ravel()

        # Division by 0 -> nan -> not able to make prediction
        with np.errstate(invalid='ignore'):
            self._averages = column_sums / rating_counts

    def predict(self, pairs):
        """Parameters:
        pairs - user-item pairs to predict

        Returns a list of (user, item, predicted_rating) triples in input order."""
        return [(user, item, self._averages[item]) for user, item in pairs]

In [None]:
# Inspect the training RecData object returned by the split.
train_data

In [None]:
# Fit the per-item average baseline on the training matrix.
predictor = AveragePredictor()
predictor.fit(train_data.get_matrix())

In [146]:
# Predict for every held-out (user, item) pair; the true rating in each test triple is dropped here.
predictions = predictor.predict([(user, item) for user, item, _ in test])

In [117]:
def cross_val(model, data, n_folds=10):
    """Split ``data`` into contiguous folds and build train/validation RecData for each.

    Parameters:
    model - model to evaluate (not used yet, see TODO below)
    data - DataFrame
    n_folds - number of folds; the last fold also takes the rows left over
              when the row count is not divisible by n_folds

    Raises ValueError if n_folds is smaller than 1 or larger than the number of rows.
    """
    num_rows = data.shape[0]
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")
    if n_folds > num_rows:
        raise ValueError("n_folds cannot exceed the number of rows in data")

    fold_size = num_rows // n_folds
    fold_recalls = []
    for fold_num in range(n_folds):
        print("Fold", fold_num)
        # Get fold slices; the final fold runs to the end of the frame.
        start = fold_num * fold_size
        stop = num_rows if fold_num == n_folds - 1 else start + fold_size

        train_frame = pd.concat([data.iloc[:start], data.iloc[stop:]])
        val_frame = data.iloc[start:stop]

        # RecData is populated through create_from_dataframe rather than its constructor.
        train_data = RecData()
        train_data.create_from_dataframe(train_frame)
        val_data = RecData()
        val_data.create_from_dataframe(val_frame)

        # TODO: fit `model` on train_data, score it on val_data and append the
        # result to fold_recalls. Note that the two RecData objects are built
        # independently, so their user/item indices are not aligned.
        
        

In [118]:
# Run the fold loop on the full dataframe; None is passed for `model`.
cross_val(None, recs)

Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
