In [1]:
!pip install papermill
!pip install scrapbook
!pip install cornac
!pip install retrying
!pip install pandera

Collecting papermill
  Using cached papermill-2.4.0-py3-none-any.whl (38 kB)
Collecting ansiwrap
  Using cached ansiwrap-0.8.4-py2.py3-none-any.whl (8.5 kB)
Collecting tenacity
  Using cached tenacity-8.2.2-py3-none-any.whl (24 kB)
Collecting textwrap3>=0.9.2
  Using cached textwrap3-0.9.2-py2.py3-none-any.whl (12 kB)
Installing collected packages: textwrap3, tenacity, ansiwrap, papermill
Successfully installed ansiwrap-0.8.4 papermill-2.4.0 tenacity-8.2.2 textwrap3-0.9.2
Collecting scrapbook
  Using cached scrapbook-0.5.0-py3-none-any.whl (34 kB)
Collecting pyarrow
  Downloading pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.1/39.1 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyarrow, scrapbook
Successfully installed pyarrow-12.0.1 scrapbook-0.5.0
Collecting pandera
  Using cached pandera-0.16.1-py3-none-any.whl (201 kB)
Collecting t

In [36]:
!git clone https://github.com/microsoft/recommenders.git

Cloning into 'recommenders'...
remote: Enumerating objects: 37376, done.[K
remote: Counting objects: 100% (908/908), done.[K
remote: Compressing objects: 100% (311/311), done.[K
remote: Total 37376 (delta 621), reused 759 (delta 581), pack-reused 36468[K
Receiving objects: 100% (37376/37376), 205.18 MiB | 21.64 MiB/s, done.
Resolving deltas: 100% (25265/25265), done.
Checking connectivity... done.


In [37]:
!mv recommenders recommender
!cp -r recommender/recommenders recommenders/

In [38]:
# Standard library
import os
import sys

# Third-party
import cornac
import pandas as pd
import papermill as pm
import scrapbook as sb

# The `recommenders` package lives in the working directory (copied there by an
# earlier cell), so the working directory has to be on sys.path before the
# imports below.
sys.path.append(os.getcwd())

from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
)
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.constants import SEED
from recommenders.utils.timer import Timer

# Record the interpreter and cornac versions alongside the results.
print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")

System version: 3.7.10 (default, Jun  4 2021, 14:48:32) 
[GCC 7.5.0]
Cornac version: 1.14.2


In [39]:
from sklearn.preprocessing import LabelEncoder

In [42]:
# Load the held-out and training interaction tables from the shared Dataset folder.
test_csv_path = '../Dataset/test.csv'
train_csv_path = '../Dataset/train.csv'

test = pd.read_csv(test_csv_path)
train = pd.read_csv(train_csv_path)

In [43]:
# Remove the 'xd' column from both interaction tables.
# `errors='ignore'` keeps the cell re-runnable: because the result is assigned
# back to the same names, a second execution would otherwise raise KeyError
# for the already-removed column.
test = test.drop(columns='xd', errors='ignore')
train = train.drop(columns='xd', errors='ignore')

In [44]:
# Map the dataset's column names onto the userID / itemID / rating names used
# by the rest of the notebook; one shared mapping keeps both tables consistent.
interaction_column_names = {'qid': 'userID', 'article_id': 'itemID', 'response': 'rating'}

test = test.rename(columns=interaction_column_names)
train = train.rename(columns=interaction_column_names)

In [45]:
# Keep only the (user, item, rating) columns, in that order.
# An explicit .copy() is taken because these frames are modified later
# (their itemID column is re-encoded); assigning into a bare column subset can
# raise pandas' SettingWithCopyWarning.
uir_columns = ["userID", "itemID", "rating"]
test = test[uir_columns].copy()
train = train[uir_columns].copy()

In [46]:
# Preview the first rows of the training interactions.
train.head(n=5)

Unnamed: 0,userID,itemID,rating
0,85837,1073,0
1,85837,4807,0
2,85837,6379,0
3,85837,4909,0
4,85837,9953,0


In [47]:
# Preview the first rows of the held-out interactions.
test.head(n=5)

Unnamed: 0,userID,itemID,rating
0,95354290,4234,0
1,95354290,1226,0
2,95354290,8952,0
3,95354290,8952,0
4,95354290,6200,0


In [48]:
# Load the article table from the shared Dataset folder.
articles_csv_path = '../Dataset/Published_online_articles.csv'
df_articles = pd.read_csv(articles_csv_path)

In [49]:
# Concatenate headline, teaser and body into a single space-separated text field.
# Missing values become empty strings and every cell is cast to str before
# joining, so a non-string cell cannot turn the whole row into NaN. The join is
# done row-wise on the frame itself, so the result keeps df_articles' own index
# instead of relying on it being a default 0..n-1 range.
text_columns = ['headline', 'teaser', 'text']
df_articles['all_text'] = (
    df_articles[text_columns]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

# Keep only the identifier and the combined text, using the same `itemID`
# column name as the interaction tables.
df_articles = (
    df_articles[['article_id', 'all_text']]
    .rename(columns={'article_id': 'itemID'})
)

In [50]:
# Fit the encoder on the article table's item IDs, then apply the same fitted
# mapping to both interaction tables so all three frames share the same
# encoded item IDs.
tcm_id_le = LabelEncoder()
df_articles['itemID'] = tcm_id_le.fit_transform(df_articles['itemID'])

for interactions in (test, train):
    interactions['itemID'] = tcm_id_le.transform(interactions['itemID'])

In [51]:
# Build the cornac dataset from the training rows, passed as plain
# (userID, itemID, rating) tuples without the DataFrame index.
uir_triples = train.itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(uir_triples, seed=SEED)

print(f'Number of users: {train_set.num_users}')
print(f'Number of items: {train_set.num_items}')

Number of users: 463
Number of items: 66




In [52]:
# UserKNN methods: two models that differ only in the similarity measure.
K = 30  # value passed as `k` to both models

user_knn_cosine = cornac.models.UserKNN(
    name="UserKNN-Cosine",
    similarity="cosine",
    k=K,
)
user_knn_pearson = cornac.models.UserKNN(
    name="UserKNN-Pearson",
    similarity="pearson",
    k=K,
)

In [61]:
# Fit the cosine UserKNN model on the training set and report the time
# measured by the Timer context manager.
with Timer() as t:
    user_knn_cosine.fit(train_set)

print(f"Took {t} seconds for training.")

  0%|          | 0/463 [00:00<?, ?it/s]

Took 0.1010 seconds for training.


In [62]:
# Score items for the users in `train` with the fitted cosine model.
# remove_seen=True is passed so items already present in `train` are left out
# of the returned predictions.
with Timer() as t:
    all_predictions = predict_ranking(
        user_knn_cosine,
        train,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )

print(f"Took {t} seconds for prediction.")

Took 6.4313 seconds for prediction.


In [56]:
# Preview the first rows of the generated predictions.
all_predictions.head(n=5)

Unnamed: 0,userID,itemID,prediction
26172,85837,43,0.129842
26173,85837,50,0.191415
26174,85837,20,0.127504
26175,85837,32,0.219859
26176,85837,61,0.454536


In [63]:
# Evaluate the top-k predictions against the held-out interactions.
# NOTE(review): test.head() shows rows with rating == 0; confirm whether the
# ranking metrics should be given only positive interactions as ground truth.
k = 10
ranking_kwargs = dict(col_prediction='prediction', k=k)

eval_map = map_at_k(test, all_predictions, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions, **ranking_kwargs)
eval_precision = precision_at_k(test, all_predictions, **ranking_kwargs)
eval_recall = recall_at_k(test, all_predictions, **ranking_kwargs)

# One metric per line, in the same order and format as before.
metric_report = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print("\n".join(metric_report))

MAP:	0.203942
NDCG:	0.748457
Precision@K:	0.659179
Recall@K:	0.270598
