# Model

In [106]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Preprocessing

In [108]:
# Load the pruned review table from disk (path is relative to the notebook).
recs = pd.read_csv("data/pruned.csv")

In [109]:
# Preview the first five rows of the raw table.
recs.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,534380,0,0,2022-10-08,True,40.6,10531,22
1,42700,6,2,2019-10-19,False,5.9,185293,27
2,602960,0,0,2022-01-05,True,41.9,319249,58
3,976730,0,0,2021-11-25,False,21.1,747545,67
4,1091500,2,0,2022-10-30,True,18.1,2113544,127


In [110]:
# Columns kept for modelling. Order matters: RecData reads them by position
# (column 0 = item id, column 1 = rating, column 2 = user id).
USED_COLS = ['app_id', 'is_recommended', 'user_id']

# Drop every other column, then preview the result.
recs = recs.loc[:, USED_COLS]
recs.head(n=5)

Unnamed: 0,app_id,is_recommended,user_id
0,534380,True,10531
1,42700,False,185293
2,602960,True,319249
3,976730,False,747545
4,1091500,True,2113544


In [104]:
# User x item matrix: one row per user_id, one column per app_id. Each cell is
# the mean of is_recommended for that pair; pairs with no review are NaN.
M = pd.pivot_table(
    recs,
    index='user_id',
    columns='app_id',
    values='is_recommended',
    aggfunc='mean',
)

In [105]:
# Preview the first five users of the pivoted matrix.
M.head(n=5)

app_id,10,30,60,70,220,240,400,440,500,550,...,2111850,2116850,2141690,2153330,2154230,2167580,2179380,2208920,2211280,2220810
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
240,,,,,,,,,,,...,,,,,,,,,,
527,,,,,1.0,,,,,,...,,,,,,,,,,
654,,,,1.0,,,,,,,...,,,,,,,,,,
705,,,,,,,,,,,...,,,,,,,,,,
864,,,,,,,,,,,...,,,,,,,,,,


In [97]:
# Lookup tables from raw ids to their row / column position in M.
# Must run while M is still a DataFrame (it needs .index and .columns).
index_user_ids = dict(zip(M.index, range(len(M.index))))
index_item_ids = dict(zip(M.columns, range(len(M.columns))))

In [98]:
# Convert the pivoted DataFrame to a plain ndarray. np.asarray is used instead
# of M.to_numpy() so that re-running this cell (when M is already an ndarray)
# is a no-op rather than an AttributeError.
M = np.asarray(M)

## Data Class

In [134]:
class RecData:
    """User-item rating matrix built from a long-format interactions table."""

    def __init__(self, data):
        """Pivot `data` into a user x item matrix and index its labels.

        Parameters:
        data - DataFrame whose columns are, by position, the item id
               (column 0), the rating (column 1) and the user id (column 2).
               Several rows for the same (user, item) pair are averaged.
        """
        item_col = data.columns[0]
        rating_col = data.columns[1]
        user_col = data.columns[2]

        # Create user-item rating matrix from the frame that was passed in
        # (not from a notebook-level variable), so that different subsets of
        # the data each get their own matrix. Pairs without a row are NaN.
        self._M = data.pivot_table(index=user_col, columns=item_col, values=rating_col, aggfunc='mean')

        # Map raw ids to their row / column position in the matrix.
        self._index_user_ids = {user_id: i for i, user_id in enumerate(self._M.index)}
        self._index_item_ids = {app_id: i for i, app_id in enumerate(self._M.columns)}
        self._users = self._index_user_ids.keys()
        self._items = self._index_item_ids.keys()
        self._num_users = len(self._users)
        self._num_items = len(self._items)

    def create_anti_set(self):
        """Return all user-item pairs not in the data.

        A pair is "not in the data" when its matrix cell is NaN. A cell equal
        to 0 is an observed (negative) rating and is therefore excluded.

        Returns:
        list of (user_position, item_position) tuples, where the positions are
        row / column indices into the matrix, in row-major order. The list can
        be as large as num_users * num_items.
        """
        missing = self._M.isna().to_numpy()
        user_pos, item_pos = missing.nonzero()
        return list(zip(user_pos.tolist(), item_pos.tolist()))

    def get_matrix(self):
        """Return the user x item rating matrix (a DataFrame, NaN = unobserved)."""
        return self._M

In [135]:
# Wrap the trimmed review table in the user-item data container.
rec_data = RecData(data=recs)

In [133]:
# Collect the user-item pairs that are not present in `rec_data`.
anti_set = rec_data.create_anti_set()

2246

## Simple Model

In [77]:
class AveragePredictor:
    """Baseline model: the prediction for an item is that item's mean rating,
    regardless of which user is asking."""

    def fit(self, M):
        """Store the rating matrix and precompute one mean per column.

        Parameters:
        M - 2-D array of ratings; NaN entries are skipped when averaging
            along axis 0.
        """
        self._M = M
        column_means = np.nanmean(M, axis=0)
        self._averages = column_means

    def predict(self, user_id, item_id):
        """Return the stored mean for column `item_id`; `user_id` is ignored."""
        item_mean = self._averages[item_id]
        return item_mean

In [117]:
def cross_val(model, data, n_folds=10):
    """Split `data` into contiguous folds and build train / validation sets.

    Rows are split in their existing order (no shuffling). Every fold has
    data.shape[0] // n_folds rows except the last, which also takes the
    remainder.

    Parameters:
    model - currently unused; no fitting or scoring is performed yet.
    data - DataFrame
    n_folds - number of folds; must be between 1 and the number of rows.

    Raises:
    ValueError - if n_folds is smaller than 1 or larger than the number of
                 rows (which would produce empty validation folds).
    """
    n_rows = data.shape[0]
    if n_folds < 1 or n_folds > n_rows:
        raise ValueError(
            "n_folds must be between 1 and the number of rows "
            "({}), got {}".format(n_rows, n_folds)
        )

    fold_size = n_rows // n_folds
    fold_recalls = []  # TODO: not populated yet; no metric is computed per fold
    for fold_num in range(n_folds):
        print("Fold", fold_num)
        # Get fold slices; the last fold runs to the end of the data so the
        # remainder rows are not dropped.
        start = fold_num * fold_size
        stop = n_rows if fold_num == n_folds - 1 else start + fold_size
        train_data = pd.concat([data.iloc[:start], data.iloc[stop:]])
        val_data = data.iloc[start:stop]

        train_data = RecData(train_data)
        val_data = RecData(val_data)
        
        

In [118]:
# Smoke-run the fold splitter on the full table; no model is passed in.
cross_val(model=None, data=recs)

Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
