<a href="https://colab.research.google.com/github/LinaDanilina/recommender-system/blob/master/Neural_Collaborative_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    roc_auc_score,
    log_loss,
)
import numpy as np

In [20]:
# Attach Google Drive to the Colab VM so files stored there are reachable by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [21]:
! ls
%cd drive/My Drive/recommenders

AUTHORS.md	 data.csv  ml-100k     scripts	    test.csv
benchmarks	 docker    notebooks   SECURITY.md  tests
contrib		 docs	   README.md   SETUP.md     train.csv
CONTRIBUTING.md  LICENSE   reco_utils  setup.py     Untitled.ipynb
[Errno 2] No such file or directory: 'drive/My Drive/recommenders'
/content/drive/My Drive/recommenders


In [22]:
# Select the TensorFlow 1.x runtime (Colab magic) ahead of the import below.
%tensorflow_version 1.x
import tensorflow as tf
# Last expression of the cell: display the TensorFlow version that was loaded.
tf.__version__

'1.15.2'

In [0]:
import time
import pandas as pd
#import tensorflow as tf

from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
from reco_utils.dataset import movielens
from reco_utils.common.notebook_utils import is_jupyter
from reco_utils.dataset.python_splitters import python_chrono_split
from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)

In [0]:
# --- Recommendation settings ---
# How many items to recommend per user
TOP_K = 10

# MovieLens variant to use: one of 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# --- Training hyper-parameters ---
EPOCHS, BATCH_SIZE = 50, 256

# Random seed handed to the dataset wrapper and the model
SEED = 42

In [25]:
# Read the full ratings-plus-metadata table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genre,user_id,rating,unix_timestamp,age,sex,occupation,zip_code
0,0,1,Toy Story (1995),b'01-Jan-1995',,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",308,4,887736532,60,M,retired,95076
1,1,4,Get Shorty (1995),b'01-Jan-1995',,http://us.imdb.com/M/title-exact?Get%20Shorty%...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",308,5,887737890,60,M,retired,95076
2,2,5,Copycat (1995),b'01-Jan-1995',,http://us.imdb.com/M/title-exact?Copycat%20(1995),"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",308,4,887739608,60,M,retired,95076
3,3,7,Twelve Monkeys (1995),b'01-Jan-1995',,http://us.imdb.com/M/title-exact?Twelve%20Monk...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",308,4,887738847,60,M,retired,95076
4,4,8,Babe (1995),b'01-Jan-1995',,http://us.imdb.com/M/title-exact?Babe%20(1995),"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",308,5,887736696,60,M,retired,95076


In [0]:
def _load_interactions(csv_path):
    """Read a ratings CSV and return it with the column names used downstream.

    Keeps user_id, movie_id, rating and unix_timestamp (in that order) and
    renames them to userID, itemID, rating and timestamp.
    """
    frame = pd.read_csv(csv_path, delimiter=',')
    frame = frame[['user_id', 'movie_id', 'rating', 'unix_timestamp']]
    return frame.rename(
        columns={
            'user_id': 'userID',
            'movie_id': 'itemID',
            'unix_timestamp': 'timestamp',
        }
    )


# Both splits go through the same loader so their schemas cannot drift apart.
train = _load_interactions('train.csv')
test = _load_interactions('test.csv')

In [27]:
# Display the prepared training interactions (userID, itemID, rating, timestamp).
train

Unnamed: 0,userID,itemID,rating,timestamp
0,46,267,4,879439040
1,845,584,2,883949643
2,22,214,2,874787116
3,757,87,4,881979942
4,674,1627,5,889489837
...,...,...,...,...
74995,278,661,2,875310631
74996,293,263,2,877819090
74997,523,683,4,884636236
74998,4,166,2,875636281


In [0]:
# Wrap the train/test frames in the NCF dataset helper, seeded with SEED.
# NOTE(review): this rebinds `data`, which earlier held the DataFrame read from
# data.csv; from here on `data` is the NCFDataset object.
data = NCFDataset(train=train, test=test, seed=SEED)

In [0]:
# Settings for the NeuMF model type; user and item counts are taken from the dataset.
ncf_settings = dict(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED,
)
model = NCF(**ncf_settings)

In [30]:
# Wall-clock timestamp taken immediately before training starts.
start_time = time.time()

# Train the model on the NCF dataset built above.
model.fit(data)

# Elapsed training time in seconds.
train_time = time.time() - start_time

print("Took {} seconds for training.".format(train_time))

Took 460.02435183525085 seconds for training.


In [31]:

start_time = time.time()

# Score every training user against every item that appears in the training set.
item_ids = list(train.itemID.unique())
users, items, preds = [], [], []
for user_id in train.userID.unique():
    # predict() is called with parallel lists, so repeat the user once per item.
    repeated_user = [user_id] * len(item_ids)
    users.extend(repeated_user)
    items.extend(item_ids)
    preds.extend(list(model.predict(repeated_user, item_ids, is_list=True)))

all_predictions = pd.DataFrame(
    data={"userID": users, "itemID": items, "prediction": preds}
)

# Keep only the (user, item) pairs that have no rating in train, i.e. drop the
# pairs already present in the training data.
merged = train.merge(all_predictions, on=["userID", "itemID"], how="outer")
all_predictions = merged.loc[merged["rating"].isnull()].drop(columns="rating")

test_time = time.time() - start_time
print("Took {} seconds for prediction.".format(test_time))

Took 4.066263675689697 seconds for prediction.


In [0]:
def metrics(data_true, data_pred):
    """Compute rating-prediction metrics for a set of predictions.

    Args:
        data_true: ground-truth frame, passed as the first argument to the
            reco_utils evaluators (assumed to carry userID / itemID / rating
            columns -- confirm against the evaluators' defaults).
        data_pred: predictions frame, passed as the second argument (assumed to
            carry userID / itemID / prediction columns -- confirm likewise).

    Returns:
        pd.DataFrame: a single row (index 0) with the columns "MSE", "MAE",
        "R2_score" and "explained variance".
    """
    # Evaluate the arguments that were passed in. The previous version ignored
    # them and always read the notebook-level `test` / `all_predictions`, so a
    # call with any other frames silently reported the same numbers.
    root_mse = rmse(data_true, data_pred)
    mean_abs_err = mae(data_true, data_pred)
    r2 = rsquared(data_true, data_pred)
    ex_var = exp_var(data_true, data_pred)
    # NOTE(review): the "MSE" column holds the value returned by rmse(), i.e. a
    # root mean squared error. The label is kept so existing consumers of this
    # column keep working; consider renaming it to "RMSE".
    return pd.DataFrame(
        {
            "MSE": root_mse,
            "MAE": mean_abs_err,
            "R2_score": r2,
            "explained variance": ex_var,
        },
        index=[0],
    )

In [45]:
# Score the predictions for pairs not rated in train against the test ratings.
metrics(test,all_predictions)

Unnamed: 0,MSE,MAE,R2_score,explained variance
0,3.226091,3.031819,-7.167729,0.045915
