# Notebook used to create the integration for implicit matrix factorization

In [6]:
'''
Uses Alternating Least Squares (ALS) from the `implicit` library to create a recommender system from implicit-feedback ratings (KGRec music dataset, or optionally the Spotify dataset).
'''
import argparse
import os
from random import shuffle

import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ray
from more_itertools import chunked
from scipy.sparse import csc_matrix, csr_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [15]:
# Active dataset configuration: ratings file, output directory for the learned
# features, and the number of latent features.
file = '../datasets/kgrec/music_ratings.csv.gz'
out_dir = '../datasets/kgrec/mf/'
n_features = 50
# file, out_dir, n_features = '../datasets/spotify/ratings.csv.gz', '../datasets/spotify/mf/', 300

original_data = pd.read_csv(file)

print(original_data.head())
# check ids: the number of distinct ids must equal the largest id plus one
assert original_data['user_id'].max() + 1 == original_data['user_id'].nunique()
assert original_data['item_id'].max() + 1 == original_data['item_id'].nunique()

print('Number of users: ', original_data['user_id'].nunique())
print('Number of items: ', original_data['item_id'].nunique())

   user_id  item_id
0        0        0
1        0        1
2        0        2
3        0        3
4        0        4
Number of users:  5199
Number of items:  8640


In [8]:
# Partition users by how many ratings they have: users with at least n ratings
# go to original_data_ok, all remaining users go to original_data_low.
n = 10
num_ratings = original_data.groupby('user_id').size()
index_size_ok = num_ratings.loc[lambda counts: counts >= n].index
index_size_low = num_ratings.loc[lambda counts: counts < n].index
original_data_ok = original_data.loc[original_data['user_id'].isin(index_size_ok)]
original_data_low = original_data.loc[original_data['user_id'].isin(index_size_low)]

In [9]:
def load_and_process_df(random_state=42):
    '''
    Builds sparse user x item training and testing matrices from the filtered ratings.

    Nothing is read from disk here: the function copies the module-level frames
    ``original_data_ok`` and ``original_data_low``. If there is no ``rating``
    column, every interaction gets the rating 1. The rows of ``original_data_ok``
    are split 80:20 into training and testing data, stratified by ``user_id``.

    Args:
        random_state: Seed forwarded to ``train_test_split`` so that the split is
            reproducible between runs. Pass ``None`` to get a different split
            on every call.

    Returns:
        Tuple ``(training_data_csc, testing_data_csc, num_of_users, num_of_items)``.
        Both matrices are CSC matrices of shape ``(num_of_users, num_of_items)``.

    Raises:
        ValueError: If the ``user_id`` or ``item_id`` column is missing.
        AssertionError: If the last row is not in exactly one of the two splits.
    '''
    df = original_data_ok.copy()
    df_low = original_data_low.copy()

    if 'user_id' not in df.columns or 'item_id' not in df.columns:
        raise ValueError('Dataframe does not have user_id and item_id columns')
    if 'rating' not in df.columns:
        df['rating'] = 1
        df_low['rating'] = 1

    df = df[['user_id', 'item_id', 'rating']]
    df_low = df_low[['user_id', 'item_id', 'rating']]

    # split to training and testing data 80:20 for each user
    training, testing = train_test_split(
        df, test_size=0.2, stratify=df['user_id'], random_state=random_state
    )
    # training = pd.concat([training, df_low])

    # Size the matrices by the ids of BOTH frames. Using only the filtered frame
    # would shrink the matrices whenever the highest ids belong to users (or to
    # items rated only by users) that were filtered out, so row/column indices
    # would no longer line up with the original ids.
    all_ids = pd.concat([df[['user_id', 'item_id']], df_low[['user_id', 'item_id']]])
    num_of_users = int(all_ids['user_id'].max()) + 1
    num_of_items = int(all_ids['item_id'].max()) + 1

    matrix_shape = (num_of_users, num_of_items)
    training_data_csc = csc_matrix(
        (training['rating'].to_numpy(),
         (training['user_id'].to_numpy(), training['item_id'].to_numpy())),
        shape=matrix_shape,
    )
    testing_data_csc = csc_matrix(
        (testing['rating'].to_numpy(),
         (testing['user_id'].to_numpy(), testing['item_id'].to_numpy())),
        shape=matrix_shape,
    )

    # check if the last value is only in training data xor testing data
    last_row = df.iloc[-1]
    # A row taken with iloc is upcast to float when the rating column is float,
    # so the ids are cast back to int before they are used as matrix indices.
    uid = int(last_row['user_id'])
    iid = int(last_row['item_id'])
    rating = last_row['rating']
    in_train = bool(training_data_csc[uid, iid] == rating)
    in_test = bool(testing_data_csc[uid, iid] == rating)
    assert in_train != in_test, 'Dataframe is not split correctly'

    return training_data_csc, testing_data_csc, num_of_users, num_of_items


# Build the train/test matrices once and keep the matrix dimensions around
# for the cells below.
(training_data_csc,
 testing_data_csc,
 number_of_users,
 number_of_items) = load_and_process_df()

# Both matrices are expected to report the same (users, items) shape.
print(training_data_csc.shape, testing_data_csc.shape, sep='\n')

(5199, 8640)
(5199, 8640)


In [5]:
# The number of latent factors comes from the dataset configuration cell
# (n_features), so switching the dataset configuration also switches the
# model size instead of silently staying at a hard-coded value.
factors = n_features

# A fixed random_state makes the learned factors reproducible between runs.
model = implicit.als.AlternatingLeastSquares(factors=factors, num_threads=10, random_state=42)
# train the model on a sparse matrix of user/item/confidence weights

# we need a CSR matrix for the implicit library, as specified in its documentation
# user_items: csr_matrix
#   Matrix of confidences for the liked items. This matrix should be a csr_matrix where
#   the rows of the matrix are the user, the columns are the items liked by that user,
#   and the value is the confidence that the user liked the item.

user_item_data_csr = training_data_csc.tocsr()
model.fit(user_item_data_csr, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [11]:
# Latent factor matrices learned by the fitted model.
U_features = model.user_factors
I_features = model.item_factors

# np.save does not create missing directories, so make sure out_dir exists
# before writing; os.path.join also works when out_dir has no trailing slash.
os.makedirs(out_dir, exist_ok=True)
np.save(os.path.join(out_dir, 'U_features.npy'), U_features)
np.save(os.path.join(out_dir, 'I_features.npy'), I_features)

print('features saved to', out_dir)

# print statistics about features
print('U_features shape: {0}'.format(U_features.shape))
print('I_features shape: {0}'.format(I_features.shape))
print(U_features)
print(I_features)

features saved to ../datasets/kgrec/mf/
U_features shape: (5199, 50)
I_features shape: (8640, 50)
[[ 0.32297722  0.14018077 -0.18361667 ...  0.32090026  0.14438957
   0.41769338]
 [-0.47638202 -0.17443427 -0.04962534 ... -0.32596985  0.23652582
  -0.79392034]
 [ 0.336984    0.3637985  -0.26695216 ... -0.02917717  0.15864246
   0.60157067]
 ...
 [-0.27085918 -0.12923804 -0.26580304 ... -0.17377138  0.7439807
   0.16454127]
 [ 0.325364   -0.16725653  0.31061834 ... -0.8237092  -0.20028614
  -0.5407357 ]
 [ 0.13770361 -0.5618404   0.37424856 ... -0.46694812 -0.35027254
  -0.6047742 ]]
[[ 1.6076101e-02  3.0562010e-02 -5.7015836e-02 ...  2.5372272e-02
  -1.6897468e-02 -9.4786088e-04]
 [ 4.4864896e-03 -7.1100001e-03  7.9561817e-03 ...  9.2834402e-03
   2.3293081e-03  1.8198498e-03]
 [ 1.4858595e-02  2.0564320e-02 -5.5074822e-02 ...  2.0950746e-02
  -1.1320139e-02  3.4459867e-03]
 ...
 [ 1.2216844e-03 -2.5652989e-03 -1.7900961e-04 ... -1.6647113e-04
   1.8422551e-03  1.0186286e-03]
 [ 2.52679

# Perform a quick evaluation

In [24]:
# Row-oriented (CSR) copy of the test matrix, used below to read the test
# items of one user at a time.
testing_data_csr = csr_matrix(testing_data_csc)

In [49]:
# For every user, score each of the user's test items as the dot product of
# the item's feature vector with the user's feature vector.
user_predictions = []
for user_i in range(number_of_users):
    # column indices of the entries stored in this user's row of the test matrix
    test_item_ids = testing_data_csr[user_i, :].indices
    user_predictions.append(I_features[test_item_ids] @ U_features[user_i])


In [50]:
# Mean predicted score over the test items of each user.
averages = [np.array(scores).mean() for scores in user_predictions]

In [51]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-user
# average predicted score; left as the last expression so the notebook displays it.
pd.Series(averages).describe()

count    5199.000000
mean        0.206553
std         0.079218
min         0.021580
25%         0.147811
50%         0.199834
75%         0.258635
max         0.523911
dtype: float64