### Extension 1 - Comparison to single-machine implementations (LightFM)


In [1]:
import pandas as pd
import os   
import sys 
import numpy as np
from lightfm import LightFM as LF
from scipy import sparse
from lightfm.evaluation import precision_at_k
import time



In [2]:
start=time.time()
print('Loading training data')

# `train.csv/` is a directory holding several CSV part files, not a single file.
# Column names are supplied explicitly, so every line of each file is read as data.
path = r'train.csv/'
columns = ['user_id', 'book_id', 'is_read', 'rating','is_reviewed']

# os.listdir returns entries in arbitrary order; sorting the names makes the
# row order of train_df identical on every run.
flist = [
    pd.read_csv(os.path.join(path, name), index_col=None, names=columns)
    for name in sorted(os.listdir(path))
    if name.endswith('.csv')
]

# ignore_index gives the combined frame a single unique 0..n-1 index instead of
# repeating each part file's own row numbers.
train_df = pd.concat(flist, axis=0, ignore_index=True)

print('Time taken:{}'.format(time.time()-start))

Loading training data
Time taken:0.16811084747314453


In [3]:
start = time.time()
print('Loading test data')

# `test.csv/` is a directory holding several CSV part files, not a single file.
# Column names are supplied explicitly, so every line of each file is read as data.
path_test = r'test.csv/'
columns = ['user_id', 'book_id', 'is_read', 'rating','is_reviewed']

# os.listdir returns entries in arbitrary order; sorting the names makes the
# row order of test_df identical on every run.
flist_test = [
    pd.read_csv(os.path.join(path_test, name), index_col=None, names=columns)
    for name in sorted(os.listdir(path_test))
    if name.endswith('.csv')
]

# ignore_index gives the combined frame a single unique 0..n-1 index instead of
# repeating each part file's own row numbers.
test_df = pd.concat(flist_test, axis=0, ignore_index=True)

print('Time taken:{}'.format(time.time()-start))

Loading test data
Time taken:0.07405304908752441


In [None]:
# Number of training rows that belong to users who also appear in the test set.
n1 = len(train_df[train_df['user_id'].isin(test_df['user_id'])])
# Total row counts of the two splits.
n_train, n_test = len(train_df), len(test_df)

In [None]:
def fit_model(samples, train_df, test_df, k=500, epochs=10, no_components=100, random_state=None):
    """Fit a LightFM WARP model on a subsample of the training data and report precision@k.

    The training subset always keeps every row of ``train_df`` whose user also
    appears in ``test_df`` and adds ``samples`` randomly drawn rows from the
    remaining users. Timings for preprocessing, fitting and evaluation are printed.

    Parameters
    ----------
    samples : int
        Number of rows to draw from users that are not in the test set. Capped
        at the number of such rows that actually exist.
    train_df, test_df : pandas.DataFrame
        Interaction tables with ``user_id``, ``book_id`` and ``rating`` columns.
        ``user_id`` and ``book_id`` are used directly as row/column indices of
        the interaction matrices, so they must be non-negative integers.
    k : int, default 500
        Cut-off passed to ``precision_at_k``.
    epochs : int, default 10
        Number of training epochs.
    no_components : int, default 100
        Dimensionality of the latent embeddings.
    random_state : int, numpy.random.RandomState or None, default None
        Seed used for both the row sampling and the LightFM model. ``None``
        keeps the run unseeded.

    Returns
    -------
    float
        Mean of the per-user precision@k values returned by ``precision_at_k``.
    """
    in_test_users = train_df.user_id.isin(test_df.user_id)
    test_user_rows = train_df[in_test_users]
    other_rows = train_df[~in_test_users]

    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    n_extra = min(samples, len(other_rows))
    extra_rows = other_rows.sample(n=n_extra, random_state=random_state)
    train_df_new = pd.concat([test_user_rows, extra_rows], axis=0)

    start = time.time()
    print('Data preprocessing')

    # Train and test matrices must have the same (n_users, n_items) shape.
    # Letting coo_matrix infer each shape from its own largest id produces
    # mismatched matrices as soon as an id occurs in only one of the two sets.
    all_ids = pd.concat([train_df_new[['user_id', 'book_id']], test_df[['user_id', 'book_id']]], axis=0)
    shape = (int(all_ids['user_id'].max()) + 1, int(all_ids['book_id'].max()) + 1)

    mat = sparse.coo_matrix(
        (train_df_new['rating'].values,
         (train_df_new['user_id'].values, train_df_new['book_id'].values)),
        shape=shape)
    # Evaluate on the held-out interactions only; including the training rows
    # here would credit the model for interactions it was fitted on.
    test_mat = sparse.coo_matrix(
        (test_df['rating'].values,
         (test_df['user_id'].values, test_df['book_id'].values)),
        shape=shape)

    print('Time taken:{}'.format(time.time()-start))

    start = time.time()

    model = LF(no_components=no_components, user_alpha=0.001, item_alpha=0.001,
               loss='warp', random_state=random_state)
    model.fit(mat, epochs=epochs)

    print('Done with model fitting')
    print('Time taken: {}'.format(time.time()-start))

    start = time.time()

    test_precision = precision_at_k(model, test_mat, k=k).mean()
    print('Precision@k: {}'.format(test_precision))

    print('Time taken: {}'.format(time.time()-start))

    return test_precision

In [None]:
# Number of extra training rows (drawn from users outside the test set) to add in each run.
l = [1, 10000, 25000, 50000, 75000, 100000]

for n in l:
    # Share of the full training set used in this run: the n1 rows of
    # test-set users plus the n sampled rows.
    f = (n1 + n) / n_train
    print(f'Fraction:{f}')
    fit_model(n, train_df, test_df)