In [None]:
 # autosave every 900 seconds (15 minutes)
%autosave 900

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras import regularizers
from keras.optimizers import Adam, Adadelta, Nadam, SGD
from keras.layers import dot
import keras.backend as K
from keras.callbacks import LearningRateScheduler
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import gzip
warnings.filterwarnings('ignore')

%matplotlib inline

<b>Data Preparation class :</b>
<br/>
The class will perform data pre-processing on the selected dataset and output a K-core trimmed dataset.<br/>
<b>K_core_item : </b> only select items that have at least K ratings.<br/>
<b>K_core_user : </b> only select users that have rated at least K items.<br/>

In [2]:
class data_prep(object):
    """Load a header-less ratings CSV and trim it by rating counts.

    Parameters
    ----------
    fileName : str
        Path of the ratings CSV (read without a header row).
    K_core_item : int
        Minimum number of ratings an item needs in order to be kept.
    K_core_user : int
        Minimum number of ratings a user needs in order to be kept
        (counted after the item filter has been applied).
    conv : bool, default True
        When true, integer ``userID`` / ``itemID`` columns are added that
        encode ``reviewerID`` / ``asin`` as category codes.
    """

    # file names whose four unnamed columns are reviewer, item, rating, time
    _ELECTRONICS_FILES = ('ratings_Electronics.csv', 'data/ratings_Electronics.csv')

    def __init__(self, fileName, K_core_item, K_core_user, conv=True):
        self.fileName    = fileName
        self.K_core_item = K_core_item
        self.K_core_user = K_core_user
        self.conv        = conv

    def filter_data(self):
        """Read the CSV and return the filtered ratings as a DataFrame.

        The filter is a single pass: items are filtered first, then users,
        so an item may end up below ``K_core_item`` once users are removed.

        Raises
        ------
        KeyError
            If the loaded file does not provide the ``reviewerID``, ``asin``
            and ``overall`` columns.
        """
        data = pd.read_csv(self.fileName, header=None)

        # use the instance's own file name, not a notebook-level global
        if self.fileName in self._ELECTRONICS_FILES:
            data = data.rename(columns={0: "reviewerID", 1: "asin", 2: "overall", 3:"reviewTime"})

        missing = [c for c in ('reviewerID', 'asin', 'overall') if c not in data.columns]
        if missing:
            raise KeyError('columns {} not found after loading {!r}'.format(missing, self.fileName))

        # cast to float32 to reduce memory
        data['overall'] = data['overall'].astype('float32')

        # keep only items that received at least K_core_item ratings
        item_counts = data['asin'].value_counts()
        kept_items = item_counts[item_counts >= self.K_core_item].index
        data = data.loc[data['asin'].isin(kept_items)]

        # keep only users that gave at least K_core_user ratings;
        # copy so the ID columns below are added to an independent frame
        user_counts = data['reviewerID'].value_counts()
        kept_users = user_counts[user_counts >= self.K_core_user].index
        data = data.loc[data['reviewerID'].isin(kept_users)].copy()

        # if conversion of user and item ID is required
        if self.conv:
            data['userID'] = data['reviewerID'].astype('category').cat.codes.values
            data['itemID'] = data['asin'].astype('category').cat.codes.values

        return data
    
# stratified train / test / validation split
def split(data, col_name, size, seed):
    """Split ``data`` into train, test and validation frames.

    ``data`` is first divided into a remainder and a test part, then the
    remainder is divided again into train and validation. Both steps use the
    same ``size`` as ``test_size``, the same ``seed`` as ``random_state`` and
    stratify on ``col_name``.

    Returns the frames in the order ``(train, test, val)``.
    """
    remainder, test = train_test_split(
        data,
        test_size=size,
        random_state=seed,
        stratify=data[col_name],
    )
    train, val = train_test_split(
        remainder,
        test_size=size,
        random_state=seed,
        stratify=remainder[col_name],
    )
    return train, test, val

# lazily read a gzip-compressed file that holds one record per line
def parse(path):
    """Yield the evaluated content of each line of the gzip file at ``path``.

    The file is opened in binary mode and closed once the generator is
    exhausted or closed.
    """
    # NOTE(security): eval() executes whatever Python expression a line
    # contains, so this must only be used on trusted files. If every line is
    # a plain literal, ast.literal_eval is a safer drop-in - confirm against
    # the data before switching.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield eval(l)

# load a gzip-compressed record file into a DataFrame
def getDF(path):
    """Return a DataFrame with one row per record produced by ``parse(path)``.

    Rows are labelled 0, 1, 2, ... in the order the records are read.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# load the metadata file and drop the columns that are not needed for now
def getMeta(file):
    """Return the metadata frame from ``getDF(file)`` without the unused columns."""
    unused_columns = ['imUrl', 'categories','related', 'salesRank', 'brand', 'price']
    return getDF(file).drop(unused_columns, axis=1)

In [20]:
def lr_schedule(epoch):
    """Return the learning rate for the given epoch index.

    1.0 up to epoch 3, 0.01 for epochs 4-8, 0.001 for epochs 9-15 and
    0.0001 afterwards.
    """
    # check the latest threshold first so each branch can return directly
    if epoch > 15:
        return 0.0001
    if epoch > 8:
        return 0.001
    if epoch > 3:
        return 0.01
    return 1.0

def root_mean_squared_error(y_true, y_pred):
    """RMSE between ``y_true`` and ``y_pred`` after clipping predictions to [1, 5]."""
    clipped_pred = K.clip(y_pred, 1, 5)
    squared_error = K.square(clipped_pred - y_true)
    return K.sqrt(K.mean(squared_error))

class Deep_SparseAutoEncoder(object):
    """Rating model: user/item embeddings fed through mirrored dense stacks.

    Parameters
    ----------
    num_user, num_item : int
        Number of distinct users / items; each embedding table gets one
        extra row (``num + 1``).
    latent_factor : int
        Width of both embedding vectors.
    act : str
        Activation used by every encoder and decoder Dense layer.
    rr : float
        Rate of the L2 activity regulariser on those Dense layers.
    pre_feed : str, default 'concat'
        ``'concat'`` concatenates the two embeddings; any other value feeds
        their dot product instead.
    layer : sequence of int, optional
        Encoder layer widths (the decoder mirrors them in reverse order).
        Defaults to ``[512, 256]``.
    """

    def __init__(self, num_user, num_item, latent_factor, act, rr, pre_feed='concat', layer=None):
        self.n_users    = num_user
        self.n_items    = num_item
        self.l_factor   = latent_factor
        self.activation = act
        self.reg_rate   = rr
        self.pre_feed   = pre_feed
        # build a fresh list per instance instead of sharing one mutable default
        self.layer      = [512, 256] if layer is None else list(layer)
        self.model      = None

    def create_model(self, optimiser = 'Adadelta'):
        """Build and compile the model, store it in ``self.model`` and return it.

        ``optimiser='Adadelta'`` (the default) uses the tuned Adadelta
        configuration below; any other value is handed to ``compile`` as is.
        """
        # input
        user_input = keras.layers.Input(shape=[1],name='User')
        item_input = keras.layers.Input(shape=[1],name='Item')

        # decompose users with embedding to get first level latent factor
        user_embedding = keras.layers.Embedding(self.n_users + 1, self.l_factor ,name='User-Embedding')(user_input)
        user_vec = keras.layers.Flatten(name='FlattenUsers')(user_embedding)
        user_vec = keras.layers.Dropout(0.2)(user_vec)

        # decompose items with embedding to get first level latent factor
        item_embedding = keras.layers.Embedding(self.n_items + 1, self.l_factor, name='item-Embedding')(item_input)
        item_vec = keras.layers.Flatten(name='FlattenItems')(item_embedding)
        item_vec = keras.layers.Dropout(0.2)(item_vec)

        # combine the two embeddings and feed them into the dense stacks
        if self.pre_feed == 'concat':
            x_inp = keras.layers.Concatenate()([user_vec, item_vec])
        else:
            x_inp = dot([user_vec, item_vec], axes=1, normalize=False, name='DotProduct')

        # encoder: one Dense layer per configured width
        for depth, units in enumerate(self.layer, start=1):
            x_inp = Dense(units, activation=self.activation, name='EncLayer' + str(depth),
                        activity_regularizer=regularizers.l2(self.reg_rate))(x_inp)

        # bottleneck
        x_inp = Dropout(0.5, name='Dropout')(x_inp)

        # decoder: the same widths in reverse order
        for depth, units in enumerate(reversed(self.layer), start=1):
            x_inp = Dense(units, activation=self.activation, name='DecLayer' + str(depth),
                        activity_regularizer=regularizers.l2(self.reg_rate))(x_inp)

        # output
        output_layer = keras.layers.Dense(1, activation='relu', name='Activation')(x_inp)

        # honour the requested optimiser instead of always using Adadelta
        if optimiser == 'Adadelta':
            chosen_optimizer = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=1e-6)
        else:
            chosen_optimizer = optimiser

        # define the final model
        self.model = keras.Model([user_input, item_input], output_layer)

        # compile the model and return it
        self.model.compile(optimizer=chosen_optimizer, loss='mean_squared_error', metrics=[root_mean_squared_error])
        return self.model
        
def evaluateModel(model, test_data):
    """Return ``(MAE, RMSE)`` of ``model`` on ``test_data``.

    Predictions are rounded to the nearest whole rating and clipped to the
    1-5 range before being compared with ``test_data.overall``.
    """
    raw_pred = model.predict([test_data.userID, test_data.itemID])
    y_hat = np.clip(np.round(raw_pred, 0), 1, 5)
    y_true = test_data.overall

    MAE = mean_absolute_error(y_true, y_hat)
    RMSE = np.sqrt(mean_squared_error(y_true, y_hat))
    return MAE, RMSE

Get data prepared with K-core Analysis

In [9]:
# ratings file and the minimum rating counts for items and users
fileName = 'data/ratings_Electronics.csv'
K_core_item = 20
K_core_user = 20

# last argument (conv=True) also adds the integer userID / itemID columns
DP = data_prep(fileName, K_core_item, K_core_user, True)
data = DP.filter_data()

In [10]:
# preview the first five rows of the filtered ratings
data.head(5)

Unnamed: 0,reviewerID,asin,overall,reviewTime,userID,itemID
17,A1H8PY3QHMQQA0,528881469,2.0,1290556800,1065,0
118,AT09WGFUM934H,594481813,3.0,1377907200,7964,1
189,A2IDCSC6NVONIZ,972683275,5.0,1367280000,3318,2
200,A3BMUBUC1N77U8,972683275,4.0,1385164800,5182,2
274,AQBLWW13U66XD,972683275,5.0,1375574400,7800,2


In [11]:
# stratify on reviewerID: 20% held out as test, then 20% of the remainder as validation (seed 225)
train, test, val = split(data, 'reviewerID', 0.2, 225)

Now we are ready to train the Deep Sparse Autoencoder.

In [12]:
# number of distinct encoded users / items (sizes of the embedding tables)
n_users, n_items = len(data.userID.unique()), len(data.itemID.unique())
latent_factor = 40           # width of each embedding vector
activation = 'linear'        # activation of the encoder / decoder Dense layers
reg_rate = 0.001             # L2 activity-regulariser rate of those layers
pre_feed = 'concat'          # concatenate the user and item embeddings
layers = [128, 64, 32, 16]   # encoder widths; the decoder mirrors them in reverse

DSAE = Deep_SparseAutoEncoder(n_users, n_items, latent_factor, activation, reg_rate, pre_feed, layers)
DSAE.create_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [13]:
# print the layer-by-layer structure of the compiled model
DSAE.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 40)        334320      User[0][0]                       
__________________________________________________________________________________________________
item-Embedding (Embedding)      (None, 1, 40)        1710880     Item[0][0]                       
__________________________________________________________________________________________________
FlattenUse

In [14]:
# train for 20 epochs and validate on the separate validation split;
# the LearningRateScheduler callback applies lr_schedule to each epoch
history = DSAE.model.fit(
    [train.userID, train.itemID],
    train.overall,
    epochs=20,
    # Keras documents validation_data as an (inputs, targets) tuple
    validation_data=([val.userID, val.itemID], val.overall),
    verbose=1,
    callbacks=[LearningRateScheduler(lr_schedule)],
)

Instructions for updating:
Use tf.cast instead.
Train on 178540 samples, validate on 44636 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


In [23]:
# score the trained model on the held-out test split
MAE, RMSE =  evaluateModel(DSAE.model, test)

In [24]:
# test-set errors computed on rounded, clipped predictions
print(MAE)
print(RMSE)

0.7358318
1.076966


In [None]:
def getRecList(model, ratings, meta, user, top = 50):
    """Return the ``top`` highest-scored items for ``user``.

    Parameters
    ----------
    model : object with ``predict([user_ids, item_ids])``
        Trained rating model.
    ratings : DataFrame
        Must provide the ``asin`` and ``itemID`` columns.
    meta : DataFrame
        Item metadata; must provide the ``asin`` and ``title`` columns.
    user : int
        Encoded user ID to score every item for.
    top : int, default 50
        Number of rows to return.

    Returns
    -------
    DataFrame with the columns ``itemID``, ``userID``, ``pred``, ``asin`` and
    ``title``, best prediction first. ``pred`` is the prediction rounded to a
    whole rating and clipped to 1-5.
    """
    # one (asin, itemID) pair per item, so the merge does not fan out over
    # every single rating row
    item_ids = ratings[['asin', 'itemID']].drop_duplicates()

    # attach the encoded itemID to the metadata passed in (not a global)
    catalog = pd.merge(meta, item_ids, on='asin', how='inner')
    # remove any duplicates created by repeated metadata rows
    catalog = catalog.drop_duplicates(subset='itemID', keep='first')
    # not mandatory but just for better visualization
    catalog = catalog.sort_values(by=['itemID'])
    # remove any unwanted na rows
    catalog = catalog.dropna(subset=['asin'])

    if catalog.empty:
        return pd.DataFrame(columns=['itemID', 'userID', 'pred', 'asin', 'title'])

    # all the available items, paired with the requested user
    items = catalog[['itemID']].reset_index(drop=True)
    items['userID'] = user

    # raw prediction for every available item, flattened to one value per row
    raw = np.asarray(model.predict([items.userID, items.itemID])).ravel()

    # round, then clip the prediction back to 1 - 5
    items['pred'] = np.clip(np.round(raw, 0), 1, 5)
    # take asin / title positionally from the same rows the IDs came from,
    # so they cannot be misaligned by differing index labels
    items['asin'] = catalog['asin'].values
    items['title'] = catalog['title'].values

    # rounding produces many equal scores; rank on the unrounded (clipped)
    # prediction with a stable sort so ties are broken deterministically
    ranking = np.argsort(-np.clip(raw, 1, 5), kind='mergesort')
    Top_n = items.iloc[ranking].head(top)

    return Top_n

def getPurcHist(userID, data, meta_data):
    """Return the ratings given by ``userID`` with item titles attached.

    Rows of ``data`` whose ``userID`` matches are left-joined with the
    ``asin`` / ``title`` columns of ``meta_data``; the ``reviewTime`` column
    is removed from the result.
    """
    titles = meta_data[['asin','title']]
    user_rows = data.loc[data.userID == userID]

    # left join keeps every rating, even when no title is known for the item
    with_titles = pd.merge(user_rows, titles, on='asin', how='left')
    return with_titles.drop(columns=['reviewTime'])

In [None]:
# load the product metadata; getMeta drops the columns that are not needed
meta_data = getMeta('meta_Electronics.json.gz')

Now we will try to get a recommendation for a user.

In [None]:
# encoded userID to recommend for; 1065 appears in the data preview above.
# The previous value (15518) lies outside the user embedding, which has
# 334320 / 40 = 8358 rows according to the model summary.
user = 1065
top = 20

# fail early with a clear message if the ID is not one of the encoded users
assert (data.userID == user).any(), 'user {} is not in the filtered ratings'.format(user)

pHist = getPurcHist(user, data, meta_data)

In [None]:
# top-N recommendations for the selected user
recList = getRecList(DSAE.model, data, meta_data, user, top)
recList

In [None]:
# ratings the selected user has already given, with item titles
pHist

In [25]:
# record the Keras version used for this run
import keras; 
print(keras.__version__)

2.2.4


In [26]:
# record the NumPy version used for this run
print(np.__version__)

1.15.4


In [27]:
# record the TensorFlow version used for this run
import tensorflow;
print(tensorflow.__version__)

1.13.1


In [30]:
# record the scikit-learn version used for this run
import sklearn
print(sklearn.__version__)

0.21.2
