# Cornac recommendation system experiment
A simple experiment to compare the following recommendation models:
- Maximum margin matrix factorization (MMMF), [paper](https://papers.nips.cc/paper_files/paper/2004/file/e0688d13958a19e087e123148555e4b4-Paper.pdf)
- Embarrassingly shallow autoencoder (EASE), [paper](https://arxiv.org/pdf/1905.03375.pdf)
- Bayesian Personalised Ranking (BPR), [paper](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf)

## Used experiment

In [1]:
# Installing required modules

# Colab does not cache external modules
# This needs to be run on every colab session
import sys
required_mods = ['cornac']
reinstall = False

for mod in required_mods:
  if not (mod in sys.modules):
    reinstall = True
    break;
    
if (reinstall):
  %pip install cornac scikit-surprise recommenders

Collecting cornac
  Downloading cornac-1.15.1-cp39-cp39-manylinux1_x86_64.whl (18.8 MB)
[K     |████████████████████████████████| 18.8 MB 9.6 MB/s eta 0:00:01
[?25hCollecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 8.2 MB/s eta 0:00:01
[?25hCollecting recommenders
  Using cached recommenders-1.1.1-py3-none-any.whl (339 kB)
Collecting scipy
  Downloading scipy-1.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 25 kB/s  eta 0:00:01
[?25hCollecting tqdm>=4.19
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 2.3 MB/s eta 0:00:01
[?25hCollecting powerlaw
  Using cached powerlaw-1.5-py3-none-any.whl (24 kB)
Collecting numpy
  Using cached numpy-1.24.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Collecting joblib>=1.0.0
  Using cached joblib-1.2.0-py3-none-any.whl (

In [4]:
import cornac
from cornac.data import Reader
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit

# Seed shared by the data split and the stochastic models so reruns are comparable
EXPERIMENT_SEED = 42

# Import dataset and convert to implicit feedback
feedback = movielens.load_feedback(variant="1M", reader=Reader(bin_threshold=1.0))

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.2,
    rating_threshold=1.0,
    exclude_unknowns=True,
    seed=EXPERIMENT_SEED,
    verbose=True,
)

# Define models and hyperparameters
most_pop = cornac.models.MostPop()  # baseline, takes no hyperparameters here
mmmf = cornac.models.MMMF(
    k=10, max_iter=200, learning_rate=0.01, seed=EXPERIMENT_SEED, verbose=True
)
ease = cornac.models.EASE(
    lamb=500,
    name="EASEᴿ (B>0)",
    posB=True
)
bpr = cornac.models.BPR(
    k=50, max_iter=200, learning_rate=0.001, lambda_reg=0.001,
    seed=EXPERIMENT_SEED, verbose=True
)


# Ranking metrics: Recall@10 and NDCG@10
recall = cornac.metrics.Recall(k=10)
ndcg = cornac.metrics.NDCG(k=10)

# Perform the experiment
cornac.Experiment(
    eval_method=ratio_split,
    models=[most_pop, mmmf, ease, bpr],
    metrics=[recall, ndcg],
    user_based=True,
).run()

ImportError: libpython3.7m.so.1.0: cannot open shared object file: No such file or directory

## Other (failed) experiments

In [None]:
# Installing required modules

# Colab does not cache external modules
# This needs to be run on every colab session
import sys
required_mods = ['cornac', 'recommenders', 'surprise']
reinstall = False

for mod in required_mods:
  if not (mod in sys.modules):
    reinstall = True
    break;
    
if (reinstall):
  %pip install cornac scikit-surprise recommenders

In [None]:
import cornac
import surprise
from recommenders.datasets import movielens
from recommenders.utils.constants import SEED
from recommenders.utils.timer import Timer
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.models.surprise.surprise_utils import surprise_trainset_to_df, predict, compute_ranking_predictions
from recommenders.evaluation.python_evaluation import rmse, ndcg_at_k
from sklearn.model_selection import train_test_split

In [None]:
# Configuration

# When True, the explicit ratings (1-5) are converted to implicit binary ratings
implicit_ratings = False
# Fraction of the dataset handed to the training set when splitting
training_split = 0.75

# Number of recommendations for each user (evaluation)
k = 10

In [None]:
# Import movielens dataset (100k variant) with explicit column names
data = movielens.load_pandas_df(
    size='100k',
    header=['userID', 'itemID', 'rating'],
)
data.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 10.8kKB/s]


Unnamed: 0,userID,itemID,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


In [None]:
# Convert explicit rating to implicit ratings: 1 when the rating is above 3, otherwise 0
if implicit_ratings:
  data.rating = (data.rating > 3).astype('int64')

data.head()

Unnamed: 0,userID,itemID,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


In [None]:
# Split dataset to training and testing set. 75/25
# random_state pins the shuffle so the same split is produced on every run
train, test = train_test_split(data, train_size=training_split, random_state=SEED)

train.shape

(75000, 3)

### Surprise Recommenders

In [None]:
# Surprise reader: binary (0, 1) rating scale for implicit ratings, (1, 5) otherwise
reader = surprise.Reader(rating_scale=(0, 1) if implicit_ratings else (1, 5))

In [None]:
# Convert train dataset to surprise dataset, then build the full trainset from it
train_dataset = surprise.Dataset.load_from_df(train, reader=reader)
trainset = train_dataset.build_full_trainset()

trainset

<surprise.trainset.Trainset at 0x7f11286918b0>

#### SVD
Works on explicit feedback only, so it cannot be used for the project. It was designed for, and performs extremely well on, single-item recommendations (apparent from the RMSE), but it is bad at top-n recommendation.

In [None]:
# Hyperparameters: number of factors and number of training epochs
n_factors, n_epochs = 20, 30

In [None]:
# Train the SVD model and time the fit
svd = surprise.SVD(
    n_factors=n_factors,
    n_epochs=n_epochs,
    random_state=0,
    verbose=True,
)

with Timer() as train_time:
    svd.fit(trainset)

print(f"Training took {train_time.interval} seconds")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Training took 2.361976502999994 seconds


In [None]:
# View results: predictions for the user/item pairs of the test set
predictions = predict(
    svd,
    test,
    usercol='userID',
    itemcol='itemID',
)

predictions.head()

Unnamed: 0,userID,itemID,prediction
0,535,699,3.915372
1,732,332,3.846788
2,559,385,3.470695
3,378,473,3.070922
4,23,28,3.800569


In [None]:
# Test: compute ranking predictions (seen items removed) and time the call
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svd,
        train,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )

print(f"Testing took {test_time.interval} seconds")

Testing took 34.02148172100004 seconds


In [None]:
# Evaluate: RMSE on the test predictions, NDCG at k on the ranking predictions
eval_rmse = rmse(test, predictions)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)

# One print call, newline-separated, emits the same four lines as separate prints
print(
    "SVD (Surprise) Top@10 results:",
    '----',
    "RMSE:\t%f" % eval_rmse,
    "NDCG:\t%f" % eval_ndcg,
    sep="\n",
)

SVD (Surprise) Top@10 results:
----
RMSE:	0.933221
NDCG:	0.109762


#### SVD++
An improved version of SVD that uses both implicit and explicit feedback. It requires explicit feedback to work, so it could not be chosen.

In [None]:
# Hyperparameters: number of factors and number of training epochs
n_factors, n_epochs = 20, 30

In [None]:
# Train the SVD++ model and time the fit
svdpp = surprise.SVDpp(
    n_factors=n_factors,
    n_epochs=n_epochs,
    random_state=0,
    verbose=True,
)

with Timer() as train_time:
    svdpp.fit(trainset)

print(f"Training took {train_time.interval} seconds")

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
 processing epoch 25
 processing epoch 26
 processing epoch 27
 processing epoch 28
 processing epoch 29
Training took 37.12319648700003 seconds


In [None]:
# View results: predictions for the user/item pairs of the test set
predictions = predict(
    svdpp,
    test,
    usercol='userID',
    itemcol='itemID',
)

predictions.head()

Unnamed: 0,userID,itemID,prediction
0,535,699,4.386404
1,732,332,4.09119
2,559,385,3.454677
3,378,473,2.952356
4,23,28,3.97229


In [None]:
# Test: compute ranking predictions (seen items removed) and time the call
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svdpp,
        train,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )

print(f"Testing took {test_time.interval} seconds")

Testing took 225.36368552300002 seconds


In [None]:
# Evaluate: RMSE on the test predictions, NDCG at k on the ranking predictions
eval_rmse = rmse(test, predictions)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)

# One print call, newline-separated, emits the same four lines as separate prints
print(
    "SVD++ (Surprise) Top@10 results:",
    '----',
    "RMSE:\t%f" % eval_rmse,
    "NDCG:\t%f" % eval_ndcg,
    sep="\n",
)

SVD++ (Surprise) Top@10 results:
----
RMSE:	0.939488
NDCG:	0.120513


#### NMF
Similar to SVD, works with implicit or explicit feedback. One of the early candidates, but it was one of the lower-performing models.

In [None]:
# Hyperparameters: number of factors and number of training epochs
n_factors, n_epochs = 20, 30

In [None]:
# Train the NMF model and time the fit
nmf = surprise.NMF(
    n_factors=n_factors,
    n_epochs=n_epochs,
    random_state=0,
    verbose=True,
)

with Timer() as train_time:
    nmf.fit(trainset)

print(f"Training took {train_time.interval} seconds")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Training took 2.376322160999962 seconds


In [None]:
# View results: predictions for the user/item pairs of the test set
predictions = predict(
    nmf,
    test,
    usercol='userID',
    itemcol='itemID',
)

predictions.head()

Unnamed: 0,userID,itemID,prediction
0,535,699,3.828133
1,732,332,4.750383
2,559,385,4.036553
3,378,473,3.459776
4,23,28,4.354365


In [None]:
# Test: compute ranking predictions (seen items removed) and time the call
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        nmf,
        train,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )

print(f"Testing took {test_time.interval} seconds")

Testing took 24.18148979199998 seconds


In [None]:
# Evaluate: RMSE on the test predictions, NDCG at k on the ranking predictions
eval_rmse = rmse(test, predictions)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)

# One print call, newline-separated, emits the same four lines as separate prints
print(
    "NMF (Surprise) Top@10 results:",
    '----',
    "RMSE:\t%f" % eval_rmse,
    "NDCG:\t%f" % eval_ndcg,
    sep="\n",
)

NMF (Surprise) Top@10 results:
----
RMSE:	1.025525
NDCG:	0.049988


100%|██████████| 4.81k/4.81k [00:02<00:00, 1.99kKB/s]


In [3]:
# Display the ratings DataFrame.
# NOTE(review): the recorded output has user_id / item_id columns, which matches
# the `data` assigned in the next cell (In [2]) rather than any cell above this
# one (In [3]) — this cell appears to have been run out of order; verify on a
# fresh top-to-bottom run.
data

Unnamed: 0,user_id,item_id,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0
...,...,...,...
99995,880,476,3.0
99996,716,204,5.0
99997,276,1090,1.0
99998,13,225,2.0


In [2]:
from model import EASE
from recommenders.datasets import movielens

# Load the MovieLens 100k ratings with user_id / item_id / rating column names
data = movielens.load_pandas_df(size='100k', header=['user_id', 'item_id', 'rating'])

# EASE here is the class imported from the `model` module in this cell
ease_model = EASE()

ease_model.fit(data)

# NOTE(review): the recorded run of this cell ended with
# "IndexError: too many indices for array"; which call raised it is not visible
# from the notebook — check the input format `fit` / `predict_single` expect.
# presumably (2, 10) is (user id, number of items) — TODO confirm against model.EASE
predictions = ease_model.predict_single(2, 10)
predictions

100%|██████████| 4.81k/4.81k [00:01<00:00, 3.00kKB/s]


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed