In [1]:
# Execute helper.ipynb in this kernel so the names it defines become available to the cells below.
# NOTE(review): this notebook uses load_train, load_test, print_score, save_preds, subset_name,
# user_ids and get_user_pred_data without defining them — presumably they all come from the
# helper notebook; confirm.
%run helper.ipynb

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
class UserCollabKnnModel():
    """User-based collaborative-filtering rating predictor (k nearest neighbours).

    A user x product rating matrix is built from the training data and a
    user-user similarity matrix is derived from it (missing ratings are
    treated as 0 for the similarity computation). The rating of a
    (user, product) pair is predicted as the similarity-weighted mean of the
    ratings given to that product by the ``k`` most similar users who rated it.
    """

    def __init__(self, X_train, y_train, similarity='cosine', k=10, default_rating=2.5):
        """Build the rating matrix and the user-user similarity matrix.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Must contain the columns ``reviewerID`` and ``asin``.
            It is NOT modified.
        y_train : array-like or pandas.Series
            One rating per row of ``X_train``.
        similarity : {'cosine', 'pearson'}
            Similarity measure between user rating vectors.
        k : int
            Maximum number of neighbours used for one prediction.
        default_rating : float
            Value returned when no prediction can be made (unknown user,
            unknown product, or no usable neighbour).

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the supported names.
        """
        if similarity not in ('cosine', 'pearson'):
            raise ValueError(
                "Unsupported similarity %r; expected 'cosine' or 'pearson'" % (similarity,))

        # assign() returns a new frame, so the caller's X_train stays untouched.
        ratings = X_train.assign(overall=y_train)
        # Duplicate (user, product) pairs are collapsed to their highest rating.
        self._user_product_matrix = pd.crosstab(
            ratings.reviewerID, ratings.asin, values=ratings.overall, aggfunc='max')

        self._k = k
        self._default_rating = default_rating

        # fillna() already returns a new object; unrated cells count as 0.
        dense = self._user_product_matrix.fillna(0).to_numpy(dtype=float)
        if similarity == 'cosine':
            sim = cosine_similarity(dense)
        else:
            # Zero-variance rows produce NaN correlations (0/0). The warning is
            # silenced here because NaN neighbours are discarded in _knn_filtered.
            with np.errstate(divide='ignore', invalid='ignore'):
                sim = np.atleast_2d(np.corrcoef(dense))

        users = self._user_product_matrix.index
        self._similarity = pd.DataFrame(sim, index=users, columns=users)

    def predict(self, X):
        """Predict a rating for every row of ``X``.

        ``X`` is a DataFrame with ``reviewerID`` and ``asin`` columns.
        Returns a 1-D float ``numpy.ndarray`` with one prediction per row.
        """
        # Collect into a list first: np.append in a loop reallocates every time.
        return np.array([self._predict(row) for _, row in X.iterrows()], dtype=float)

    def _knn_filtered(self, user_id, product_id, k):
        """Return up to ``k`` most similar users that rated ``product_id``.

        The result is a Series (index: neighbour ids, values: similarity to
        ``user_id``) sorted by decreasing similarity. Only strictly positive
        similarities are kept: NaN or negative weights would corrupt the
        weighted mean, and zero weights contribute nothing to it.
        """
        rated_mask = self._user_product_matrix[product_id].notna()
        candidates = self._similarity.loc[rated_mask, user_id]
        candidates = candidates[candidates > 0]  # also drops NaN (NaN > 0 is False)
        return candidates.sort_values(ascending=False).head(k)

    def _predict(self, X):
        """Predict the rating for one row ``X`` (needs ``reviewerID`` and ``asin``).

        Falls back to the configured default rating when the user or product
        was not seen during training or when no usable neighbour exists.
        """
        user_id, product_id = X['reviewerID'], X['asin']
        is_known = (product_id in self._user_product_matrix.columns
                    and user_id in self._user_product_matrix.index)
        if not is_known:
            return self._default_rating

        neighbours = self._knn_filtered(user_id, product_id, self._k)
        if neighbours.empty:
            return self._default_rating

        ratings = self._user_product_matrix.loc[neighbours.index, product_id].to_numpy(dtype=float)
        weights = neighbours.to_numpy(dtype=float)
        # All weights are > 0 here, so the denominator cannot be zero.
        return float(np.dot(ratings, weights) / weights.sum())

In [4]:
# Load the train and test splits for the subset identified by `subset_name`;
# each loader result is unpacked into an (X, y) pair.
# NOTE(review): load_train, load_test and subset_name are not defined in this
# notebook — presumably provided by helper.ipynb; confirm.
X_train, y_train = load_train(subset_name)
X_test, y_test = load_test(subset_name)

In [5]:
# Fit the user-based KNN model with its default settings (cosine similarity, k=10),
# predict a rating for every test row, then report the error metrics
# (print_score emits the RMSE / MAE lines shown in the cell output).
model = UserCollabKnnModel(X_train, y_train)
y_pred = model.predict(X_test)
print_score(y_test, y_pred)

RMSE: 1.07327468998795
MAE: 0.7464510929652136



In [6]:
# Hand the fitted model, the subset name and the test-set predictions to save_preds.
# NOTE(review): save_preds is not defined in this notebook — presumably it comes from
# helper.ipynb and persists the predictions; confirm where and in what format.
save_preds(model, subset_name, y_pred)

In [None]:
%run helper.ipynb
for u in user_ids:
    save_preds(model, subset_name, model.predict(get_user_pred_data(u, subset_name)), f=u)