In [1]:
# Mount Google Drive into the Colab filesystem; the paths used in later cells live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the recommenders folder on Drive.
# NOTE(review): presumably this is what makes `from recommenders...` importable in the import cell — confirm.
%cd /content/drive/MyDrive/Colab_me/DS300/recommenders/

/content/drive/MyDrive/Colab_me/DS300/recommenders


## Global Settings and Import

In [None]:
!pip install scrapbook
!pip install papermill
!pip install cornac
!pip install retrying
!pip install pandera

In [4]:
import sys  # NOTE(review): not referenced in the cells visible here

import pandas as pd
import numpy as np
import scrapbook as sb  # NOTE(review): not referenced in the cells visible here
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

# Pieces of the `recommenders` package used below: the RBM model, the
# stratified train/test splitter and the user/item affinity-matrix builder.
from recommenders.models.rbm.rbm import RBM
from recommenders.datasets.python_splitters import numpy_stratified_split
from recommenders.datasets.sparse import AffinityMatrix

# Load Data

Here we load the hotel rating history used in this example. The data are imported from `data_history.csv` into a pandas dataframe containing the user ID, the hotel ID, the rating and a timestamp derived from the `Date` column. Only users with more than four ratings are kept for the analysis.

In [5]:
import time
import datetime

In [6]:
# Switch to the project data folder; relative paths used later (e.g. the CSV export) resolve here.
%cd /content/drive/MyDrive/DS300_DoAn/data

/content/drive/MyDrive/DS300_DoAn/data


In [31]:
# Users with fewer ratings than this are dropped before building the model input.
MIN_RATINGS_PER_USER = 5


def convert_timestamp(date_str):
    """Convert a 'YYYY-MM-DD' date string to POSIX seconds (float) at 00:00 UTC.

    The date is interpreted as UTC rather than in the machine's local time
    zone, so the resulting timestamps are the same wherever the notebook runs.

    Args:
        date_str: date formatted as '%Y-%m-%d'.

    Returns:
        Seconds since the Unix epoch, as a float.

    Raises:
        ValueError: if `date_str` does not match the '%Y-%m-%d' format.
    """
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.replace(tzinfo=datetime.timezone.utc).timestamp()


# Raw rating history exported from the project data folder.
df = pd.read_csv('/content/drive/MyDrive/DS300_DoAn/data/data_history.csv')

# Derive a numeric timestamp from the textual `Date` column.
df['timestamp'] = df['Date'].apply(convert_timestamp)

# Rename to the column names used in the rest of the notebook and keep only those columns.
df = df.rename(columns={'IDuser': "userID", 'IDhotel': "hotelID", 'Rating': "rating"})
df = df[['userID','hotelID','rating','timestamp']]

# Keep only the rows of users who have at least MIN_RATINGS_PER_USER ratings.
ratings_per_user = df.userID.map(df.userID.value_counts())
data = df[ratings_per_user >= MIN_RATINGS_PER_USER]

In [8]:
# Tell AffinityMatrix which dataframe columns hold the users, items and ratings.
header = dict(
    col_user="userID",
    col_item="hotelID",
    col_rating="rating",
)

# Build the affinity-matrix generator from the filtered ratings table.
am = AffinityMatrix(df=data, **header)

# Only the first returned value (the matrix) is used; the other two are discarded.
# NOTE(review): if a user has several rows for the same hotel, confirm how
# AffinityMatrix combines them before trusting the matrix values.
X, _, _ = am.gen_affinity_matrix()

In [9]:
# Split the affinity matrix into train and test matrices with a fixed seed.
# NOTE(review): ratio=1 presumably sends every rating to Xtr and leaves Xtst
# without ratings — confirm this is intended, since Xtst is passed to
# recommend_k_items in a later cell.
Xtr, Xtst = numpy_stratified_split(
    X,
    ratio=1,
    seed=42,
)

# Initializing and training the model

In [10]:
# Distinct values found in the training matrix, with 0 excluded
# (presumably 0 is the "no rating" marker — confirm).
rating_levels = np.unique(Xtr)
rating_levels = rating_levels[rating_levels != 0]

# Configure the RBM: one visible unit per column of the training matrix.
model = RBM(
    possible_ratings=rating_levels,
    visible_units=Xtr.shape[1],
    hidden_units=100,
    training_epoch=200,
    minibatch_size=30,
    keep_prob=0.9,
    with_metrics=True,
)

In [11]:
# Number of top-scoring items to recommend per user; read by the recommendation cells below.
K = 25
# Train the RBM on the training matrix.
model.fit(Xtr)

In [27]:
# Ask the trained model for the K highest-scoring items per user, using the
# test-split matrix as input, then map the result back to a dataframe.
# NOTE(review): the following cell assigns the same names (`top_k`, `top_k_df`)
# from Xtr, so whichever of the two cells ran last decides what is displayed
# and saved. The recorded execution counts (In [25] for the Xtr cell, In [27]
# for this one) show this cell ran last; a top-to-bottom run ends with the Xtr result.
# NOTE(review): the split was made with ratio=1 — confirm Xtst actually contains ratings.
top_k = model.recommend_k_items(Xtst, top_k=K)
top_k_df = am.map_back_sparse(top_k, kind = 'prediction')

In [25]:
# Ask the trained model for the K highest-scoring items per user, using the
# training matrix as input, then map the result back to a dataframe.
# NOTE(review): this overwrites the `top_k` / `top_k_df` produced from Xtst in
# the preceding cell; in a top-to-bottom run this is the result that gets
# displayed and written to CSV. Confirm which of the two inputs is intended.
top_k = model.recommend_k_items(Xtr, top_k=K)
top_k_df = am.map_back_sparse(top_k, kind = 'prediction')

In [28]:
# Display the recommendation table (userID, hotelID and the predicted score per row).
top_k_df

Unnamed: 0,userID,hotelID,prediction
0,70,482,0.653762
1,70,441,0.623125
2,70,537,0.652402
3,70,120,0.626617
4,70,530,0.670371
...,...,...,...
12120,10664,102,5.529964
12121,10664,144,4.364117
12122,10664,339,10.346272
12123,10664,282,2.686874


In [21]:
# Print the working directory: the CSV export below uses a relative path and is written here.
!pwd

/content/drive/MyDrive/DS300_DoAn/data


In [30]:
# Save the recommendation table (without the index) to the current working
# directory. The file name is derived from K so it always matches the number
# of recommendations per user; for K = 25 this is 'rbm_TopK25.csv'.
top_k_df.to_csv(f'rbm_TopK{K}.csv', index=False)

In [32]:
# Display the filtered ratings table (rows of users with more than four ratings).
data

Unnamed: 0,userID,hotelID,rating,timestamp
8,3827,378,6.0,1.325981e+09
10,5272,182,6.0,1.328573e+09
11,5822,308,6.0,1.328659e+09
14,5754,141,6.7,1.331165e+09
15,5961,334,6.7,1.331856e+09
...,...,...,...,...
18247,5668,103,8.0,1.701907e+09
18255,8139,47,10.0,1.701907e+09
18256,4080,140,8.0,1.701994e+09
18263,5857,571,4.0,1.701994e+09


In [36]:
# Number of ratings per user in the unfiltered table, limited to the 40 most active users.
df["userID"].value_counts().head(40)

5954    135
5857    122
5961     80
5972     71
9325     60
5572     52
5914     51
5700     45
9387     41
5892     39
4076     38
5532     37
5754     36
5846     34
6648     32
6599     32
5722     31
9399     29
5932     25
5559     24
3857     23
6028     23
4017     23
5912     23
4080     22
5872     22
9392     22
5822     20
8106     20
5966     20
5547     19
9362     19
6006     19
5682     19
5823     19
4039     18
5963     18
7910     18
5521     18
5502     18
Name: userID, dtype: int64