# Create the ratings matrix

In [3]:
import pandas as pd
import numpy as np
from glob import glob

# Directories holding the pre-built utility data; both paths are relative to
# the notebook's working directory.
users_dir, items_dir = (
    "../dataset/utility/users",
    "../dataset/utility/items",
)

In [4]:
# Read the item set and index it by ASIN. The CSV's first column arrives
# under pandas' placeholder name "Unnamed: 0", so it is renamed before being
# promoted to the index. Built as one chain so the cell does not mutate a
# frame in place.
items_df = (
    pd.read_csv(f"{items_dir}/itemset.csv", low_memory=False)
    .rename(columns={"Unnamed: 0": "ASIN"})
    .set_index("ASIN")
)
display(items_df)

Unnamed: 0_level_0,Home & Kitchen,Bedding,Comforters & Sets,Comforter Sets,Kids' Bedding,Baby Products,Nursery,Toddler Bedding,Bedding Sets,Blankets & Throws,...,Torches,Pest Control,Bug Zappers,Bistro Sets,Outdoor Curtains,Patio Furniture Covers,Furniture Set Covers,Figurine Lights,Storage Benches,Boot & Shoe Boxes
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0CTM6P5TW,,,,,,,,,,,...,,,,,,,,,,
B0CTH3XT3D,1.0,,,,,,,,,,...,,,,,,,,,,
B0CT2CC1YY,1.0,,,,,,,,,,...,,,,,,,,,,
B0CSSRBG48,1.0,,,,,,,,,,...,,,,,,,,,,
B0CS4CP75C,1.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B07QXK9WG5,1.0,,,,,,,,,,...,,,,,,,,,,
B0CHXTR17D,1.0,,,,,,,,,,...,,,,,,,,,,
B0C9MZWQ1D,1.0,,,,,,,,,,...,,,,,,,,,,
B0CLRHN99R,1.0,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Collect every per-user utility CSV. glob() returns matches in arbitrary,
# filesystem-dependent order, so the paths are sorted to make the order of
# `users_df_list` reproducible across machines and runs.
utility_csvs = sorted(glob(f"{users_dir}/utility_*"))

# One DataFrame per file, in the same order as `utility_csvs`.
users_df_list = [pd.read_csv(util) for util in utility_csvs]

# Try all algorithms

## Content-based

### Unary Aggregation

In [None]:
def compute_user_profile_agg_unary(df_utility, df_item_profiles, user):
    """
    Return user profile with unarized ratings.

    The user's ratings are binarised against the user's own mean rating
    (1 when the rating is not below the mean, 0 otherwise). The profile is
    the sum of the item-profile rows of the 1-rated items divided by the
    number of 1-rated items. Unrated (NaN) items are ignored.

    Parameters
    ----------
    df_utility : pandas.DataFrame
        Utility matrix; ``df_utility.loc[user]`` must yield that user's
        ratings, with NaN marking unrated items.
    df_item_profiles : pandas.DataFrame
        Item profiles, one row per item. Rows are matched to the user's
        ratings by *position*, not by label, so the row order here must
        match the column order of ``df_utility``.
        NOTE(review): NaN entries in the rows of rated items propagate
        through the dot product into the result; fill them upstream if NaN
        is meant as "feature absent" -- confirm against the caller.
    user : hashable
        Index label of the user in ``df_utility``.

    Returns
    -------
    numpy.ndarray
        One value per item-profile column. All-NaN when the user has no
        ratings, or when no rating counts as 1, because no profile can be
        formed in that case.
    """
    ratings = df_utility.loc[user].to_numpy()
    item_matrix = df_item_profiles.values

    # Positions of the items this user actually rated.
    rated_idx = np.flatnonzero(~np.isnan(ratings))
    if rated_idx.size == 0:
        # Nothing rated: return the undefined profile explicitly instead of
        # reaching it through nanmean-of-empty and 0/0 RuntimeWarnings.
        return np.full(item_matrix.shape[1:], np.nan)

    user_mean = np.nanmean(ratings)
    # 1.0 where the rating is not below the user's mean, 0.0 otherwise.
    liked = (~(ratings[rated_idx] < user_mean)).astype(float)

    n_liked = np.count_nonzero(liked)
    if n_liked == 0:
        # Only reachable through floating-point rounding of the mean; keep
        # the result NaN rather than dividing by zero.
        return np.full(item_matrix.shape[1:], np.nan)

    # Sum of the liked items' profiles, averaged over the liked items.
    return np.dot(liked, item_matrix[rated_idx]) / n_liked

## Collaborative and Matrix Factorization

Algorithms from the Surprise library.

In [None]:
from surprise import AlgoBase, Dataset, accuracy, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import PredictionImpossible
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp, NMF
from surprise.prediction_algorithms.co_clustering import CoClustering
import numpy as np

In [None]:
# Ratings are declared to lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))

# Reshape the utility matrix into long (reviewerID, ASIN, rating) rows and
# drop the unrated pairs before handing the frame to Surprise.
# NOTE(review): `users_df` is not assigned in the cells shown here --
# presumably it is built from `users_df_list` with reviewerID as the index;
# confirm it exists before this cell runs.
melted_user_df = (
    users_df
    .reset_index()
    .melt(id_vars='reviewerID', var_name='ASIN', value_name='rating')
    .dropna()
)
dataset = Dataset.load_from_df(melted_user_df, reader)

# Cross-validate each algorithm with its default settings; verbose=True
# prints the per-fold results.
algos = [
    KNNBasic(),
    KNNWithMeans(),
    KNNWithZScore(),
    SVD(),
    SVDpp(),
    NMF(),
    CoClustering(),
]
for algo in algos:
    cross_validate(algo, dataset, verbose=True)

In [None]:
# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin("ml-100k")

# Sample a random trainset and testset;
# the test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25)

# Instantiate the algorithm to evaluate.
# NOTE(review): `MyOwnAlgorithm` is not defined in the cells shown here --
# confirm it is defined before this cell runs (or swap in e.g. `SVD()`).
algo = MyOwnAlgorithm()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)