In [None]:
import pandas as pd
import numpy as np
import os
from IPython.display import Image
import matplotlib.pyplot as plt
%matplotlib inline
from surprise import SVD
from surprise.dataset import Reader, Dataset
from surprise.model_selection import GridSearchCV

In [None]:
pwd

## Recommender systems 

#### Collaborative filtering 

Collaborative recommenders rely on data generated by users as they interact with items. 

Benefits 

- it is always “self-generating” — users create the data for you naturally as they interact with items. This can be a valuable data source, especially in cases where high-quality item features are not available or difficult to obtain. 
- Another benefit of collaborative filters is that it helps users discover new items that are outside the subspace defined by their historical profile.

Drawbacks 

- the well-known cold start problem. 
- It is also difficult for collaborative filters to accurately recommend novel or niche items because these items typically do not have enough user-item interaction data.

**Item-item** (http://www.diva-portal.org/smash/get/diva2:1111865/FULLTEXT01.pdf):

Item based collaborative filtering was introduced 1998 by Amazon[6]. Unlike user based collaborative filtering, item based filtering looks at the similarity between different items, and does this by taking note of how many users that bought item X also bought item Y. If the correlation is high enough, a similarity can be presumed to exist between the two items, and they can be assumed to be similar to one another. Item Y will from there on be recommended to users who bought item X and vice versa.

In item-item collaborative filtering, we provide a recommendation based on other items similar to ours. The **benefit** of it, compared to user-user collaborative filtering, is that we usually need far fewer similarity computations (in most cases, there are many more users than items in a system). The most **common pitfall** is that the system can provide very obvious recommendations.

**User-user** (http://www.diva-portal.org/smash/get/diva2:1111865/FULLTEXT01.pdf):

The report is focusing on the “nearest neighbour” approach for recommendations, which looks at the users rating patterns and finds the “nearest neighbours”, i.e users with ratings similar to yours. The algorithm then proceeds to give you recommendations based on the ratings of these neighbours.

In user-user collaborative filtering, we provide a recommendation based on tastes of other users similar to us. **The problem** with that algorithm is that we need a lot of information about other people to provide correct recommendations, but the main benefits are effectiveness and ability to provide new, unexpected, and, yet, good recommendations.

### Content recommenders

Content recommenders rely on item features to make recommendations. 

Benefits

- Content filters tend to be more robust against popularity bias and the cold start problem. 
- They can easily recommend new or novel items based on niche tastes. 

Drawbacks

- However, in an item-to-item recommender, content filters can only recommend items with features similar to the original item. 
- This limits the scope of recommendations, and can also result in surfacing items with low ratings.

##### In this project

Given our dataset and the features we have, we are not able to create a content-based filtering algorithm, as the algorithm would need more information. We would essentially need to know some specific attributes about every product, e.g. with movies we know whether they include themes such as: Baseball, Economics, etc. 
Furthermore, companies such as Amazon use collaborative item-item based recommender systems (https://www.quora.com/What-algorithm-s-does-Amazon-use-in-their-recommendation-system), and this is also the approach that makes the most sense in our case, since we do not have a lot of information about each individual (we do not have many purchases from each person). 



In [None]:
def _load_olist_csv(path):
    """Read one comma-separated Olist export into a DataFrame."""
    return pd.read_csv(path, sep=",")


# Raw tables used to build the customer / product / review mapping
df_reviews = _load_olist_csv("../data/olist_order_reviews_dataset.csv")
df_orders = _load_olist_csv("../data/olist_orders_dataset.csv")
df_items = _load_olist_csv("../data/olist_order_items_dataset.csv")
df_products = _load_olist_csv("../data/olist_products_dataset.csv")
df_customer = _load_olist_csv("../data/olist_customers_dataset.csv")

In [None]:
Image("data_tables.png")
#Overview of the different data tables:

### Data mapping: 

The information essentially needed is a table with customer IDs in the rows and products in the columns, with the review scores as values. This means that the three tables (customers, reviews and products) are matched using the orders and order items tables respectively. 

Short overview of the relevant tables:

In [None]:
#df_orders.head()

In [None]:
#df_reviews.head()

In [None]:
#df_customer.head()

In [None]:
#df_products.head()

In [None]:
#df_items.head()

##### 1. Matching product ID's and product categories on the order items

In [None]:
df_items.describe(include="O")

In [None]:
print(len(df_items))

In [None]:
df_products.describe(include="O")

As seen, some order IDs occur more than once, since a customer who orders several products in one order gets a different `order_item_id` for each of them. Thus, at first, the table df_items is enriched with the product information (matched on `product_id`), including the `product_category_name`, for every item. 

In [None]:
# Attach the product attributes to every order item; the left join keeps all
# order items even when a product_id has no match in the product table.
ordered_products = pd.merge(df_items, df_products, how="left", on="product_id")

In [None]:
# Keep the first eight columns of the merged table, then discard the
# shipping and price columns that the recommender does not use.
cols = ordered_products.columns
ordered_products = (
    ordered_products
    .loc[:, cols[:8]]
    .drop(columns=["shipping_limit_date", "price", "freight_value"])
)

In [None]:
ordered_products.head()

In [None]:
ordered_products.describe(include="O")

##### 2. Matching the unique customer ID's with the orders

In [None]:
df_customer.describe(include="O")

In [None]:
df_orders.describe(include="O")

In [None]:
print("Lenth of orders: ", len(df_orders), "Length of customers: ", len(df_customer))

As seen from the tables, there are fewer `customer_unique_id` values than `customer_id` values, which indicates that some customers have several `customer_id`s. The unique customer IDs are now matched with all the `order_id`s, such that every unique customer's orders can be reviewed.

In [None]:
# Attach the customer record (including customer_unique_id) to every order via customer_id
unique_customer_orders = pd.merge(df_orders, df_customer, how="left", on="customer_id")

In [None]:
# Keep the first two columns of the merged table plus the unique customer id
cols = [*unique_customer_orders.columns[:2], "customer_unique_id"]
unique_customer_orders = unique_customer_orders.loc[:, cols]

In [None]:
unique_customer_orders.head()

In [None]:
unique_customer_orders.describe(include="O")

##### 3. The task is now to match the reviews on to every order. 

Short description of the review data:

In [None]:
df_reviews.describe(include="O")

As seen above, some order_id's have multiple reviews. Looking at one of the order_id's with three reviews, we see the following data-points:

In [None]:
df_reviews[df_reviews.order_id=='8e17072ec97ce29f0e1f111e598b0c85']

We have to filter these duplicate values out, such that we do not have several reviews for each order. The reason why this might be, is that one order_id can have several items. However, it is not possible to link the different review_id's to the order_item_id, which means that we have to use assumptions. 

The approach is to use the aggregate mean for each order_id to calculate the score for that particular order. This might give the best result for the entire order and the different items in that order.

The review table looks like the following

In [None]:
df_reviews.head()

In [None]:
len(df_reviews)

In [None]:
# One score per order: average the numeric review columns over all reviews of the order.
# numeric_only=True is required on pandas >= 2.0, where averaging the text/date
# columns raises instead of silently dropping them.
reviews = df_reviews.groupby("order_id").mean(numeric_only=True).reset_index()

In [None]:
reviews.head()

And we can now match these reviews to every order. 

In [None]:
# Attach the per-order mean review to every order (orders without a review keep NaN)
unique_orders = pd.merge(unique_customer_orders, reviews, how="left", on="order_id")

In [None]:
unique_orders.head()

##### 4. Lastly, the two datasets with orders and items can now be combined. 

It is then assumed that each item in an order is rated with the mean score of that particular order, as mentioned in step 3. All the reviews are therefore merged onto the items, such that we get a per-item review score for each unique customer.

In [None]:
len(ordered_products)

In [None]:
len(unique_orders)

In [None]:
unique_orders.head()

In [None]:
ordered_products.head()

In [None]:
# Give every ordered item the customer and review information of its order
item_review = pd.merge(ordered_products, unique_orders, how="left", on="order_id")

In [None]:
item_review.head()

### Creating the final data-table

From the above item_review table, it is possible to make a table including per-customer and per-item reviews. However, a problem arises when trying to make a customer-item-review table, since there are 32,951 unique products and 95,420 unique customers, which is too large to hold in memory. Therefore, the products are sorted by their number of purchases, and the products which have very few purchases are excluded. Arguably, these products would also be uncertain to recommend to others, as they have not received a lot of reviews. 

In [None]:
len(item_review.product_id.unique())

In [None]:
len(item_review.customer_unique_id.unique())

In [None]:
len(item_review)

In [None]:
# Number of non-null review scores per product, ordered from least to most reviewed
pp = (
    item_review
    .groupby("product_id")
    .review_score
    .count()
    .sort_values()
)
ppdf = pp.to_frame()
ppdf.head()

In [None]:
# Keep only the products that have more than 10 reviews
ppdf_small = ppdf.loc[ppdf["review_score"] > 10]
len(ppdf_small)

In [None]:
# Restrict the main dataframe to the products kept in ppdf_small
products = ppdf_small.index.tolist()
item_review_s = item_review.loc[item_review["product_id"].isin(products)]
len(item_review_s)

In [None]:
item_review_s.head()

Since some of the customers have bought the same product several times, these reviews have to be combined into one metric. Therefore, identical purchases from the same customer are averaged into one review score.

In [None]:
# Average repeated purchases of the same product by the same customer into one row.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# columns raises instead of silently dropping them.
item_review_ss = (
    item_review_s
    .groupby(["customer_unique_id", "product_id"])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Customer x product rating matrix: one row per unique customer, one column per
# product id, and the review score as the cell value.
dfr = item_review_ss.pivot(
    values="review_score",
    index="customer_unique_id",
    columns="product_id",
)

In [None]:
dfr.head()

### Item-item based collaborative filtering

At first, we start out simple using an item-item system, where we can recommend different users products, that they have not consumed yet. Lets look at the top 10 products to recommend the first user in our dataset. 

In [None]:
# Target customer (first row of the rating matrix) and number of products to recommend
customer = dfr.index[0]
products = 10

In [None]:
# Pairwise Pearson correlation between the product columns of the rating matrix
correlations = dfr.corr(method="pearson")

In [None]:
# Products without a score from this customer, i.e. the candidates to predict
customer_row = dfr.loc[customer]
_purchased = customer_row[customer_row.isna()]
_purchased.head()

In [None]:
# Predict a score for every candidate product as the product's own mean plus a
# correlation-weighted average of the customer's mean-centred ratings of its neighbours.
# Iterate over a snapshot of the index: Series.iteritems() no longer exists in
# pandas >= 2.0, and the series is written to inside the loop.
for product in list(_purchased.index):
    rating = 0
    weights_sum = 0
    # The `products` most correlated items; position 0 is skipped because it is
    # presumably the product itself (self-correlation) — verify when ties at 1.0 exist.
    neighbours_corr = correlations[product].sort_values(ascending=False)[1: products+1]
    item_mean = dfr[product].mean()
    neighbours_ratings = dfr[neighbours_corr.index].transpose()
    neighbours_means = neighbours_ratings.mean(axis=1)
    for neighbour_id, row in neighbours_ratings.iterrows():
        # Only neighbours the customer has actually rated contribute
        if np.isnan(row[customer]):
            continue
        rating += neighbours_corr[neighbour_id] * (row[customer] - neighbours_means[neighbour_id])
        weights_sum += abs(neighbours_corr[neighbour_id])
    if weights_sum > 0:
        rating /= weights_sum
    # With no rated neighbour the prediction falls back to the product mean
    rating += item_mean
    _purchased.at[product] = rating

In [None]:
#Top 10 recommended products for the customer 
_purchased.sort_values(ascending=False)[0:products]

It is also possible to look at which product categories we are recommending to the user, which gives some more context to the prediction. 

In [None]:
# Product categories of the `products` highest-scored recommendations
top_product_ids = list(_purchased.sort_values(ascending=False).iloc[:products].index)
rec_cat = item_review_s.loc[
    item_review_s["product_id"].isin(top_product_ids), "product_category_name"
].unique()

In [None]:
#Displaying recommended categories:
list(rec_cat)

Apparently, some of the products are within the same product category, since we only get eight different categories. 

### SVD 

Since an item-item based collaborative filtering approach is relatively simple we are building a more complex model using an SVD

In [None]:
# Keep only the (customer, product, rating) triplets needed by the models.
# The review scores arrive through left merges, so rows whose order has no review
# carry NaN; drop them so NaN cannot reach the global mean, the RMSE or the SVD.
# NOTE(review): this reads item_review_s, not the de-duplicated item_review_ss, so the
# same customer/product pair can occur on several rows — confirm this is intended.
data = (
    item_review_s[["customer_unique_id", "product_id", "review_score"]]
    .dropna(subset=["review_score"])
)

In [None]:
data.head()

In [None]:
print("The shape of the dataframe is: ", data.shape, " and the data includes", \
      len(data.groupby(["customer_unique_id"]).count()), " unique customers")

As we have so many unique customers, we will take a small subset and only look at the customers who have more than three purchased items in the data. This will dramatically reduce the size of the dataset, but it will also make the predictions of higher quality, as we will have more data on each customer. 

In [None]:
# Number of rows (purchased items) per customer, ordered from fewest to most
pp = (
    data
    .groupby("customer_unique_id")
    .product_id
    .count()
    .sort_values()
)
ppdf = pp.to_frame()
ppdf.head()

In [None]:
# Keep only the customers with more than 3 purchased items
ppdf_small = ppdf.loc[ppdf["product_id"] > 3]
len(ppdf_small)

In [None]:
# Restrict the rating data to the customers kept in ppdf_small
products = ppdf_small.index.tolist()
data = data.loc[data["customer_unique_id"].isin(products)]
len(data)

In [None]:
data.shape

In [None]:
# Renumber the rows 0..n-1 and discard the old index
data = data.reset_index(drop=True)

Then we can split the data into test and training data:

In [None]:
#Splitting the data into test/train by a 70% split
train_ind, test_ind = [], []

# Force the first (up to) two rows of every product into the training set so each
# product is represented there. One groupby pass replaces a full-frame scan per
# product; sort=False visits the products in first-appearance order.
for _, rows in data.groupby('product_id', sort=False):
    train_ind += rows.index[:2].tolist()

# c: target training fraction
# b: fraction of rows already forced into training
# a: share of the remaining rows that must be added to reach c overall
c = 0.7
b = len(train_ind) / len(data)
# Clamp at 0: when the forced rows already exceed the target, (c - b) is negative and a
# negative slice bound would add almost all remaining rows; b == 1 would divide by zero.
a = max(0.0, (c - b) / (1 - b)) if b < 1 else 0.0
print(a)

In [None]:
# Candidate pool for the random part of the split: every positional row number of
# `data` that has not already been forced into the training set.
# NOTE(review): the list order comes from iterating a set and feeds the seeded shuffle
# that follows, so the exact split presumably depends on it — confirm before changing.
all_ind = set(range(len(data)))
not_used = list(all_ind - set(train_ind))

In [None]:
# Re-draw the random part of the training set until every product and every
# customer of `data` occurs at least once among the training rows.
not_done = True
np.random.seed(42)
# Loop-invariant targets, computed once instead of several times per attempt
n_products_total = data['product_id'].nunique()
n_customers_total = data['customer_unique_id'].nunique()
while not_done:
    np.random.shuffle(not_used)
    train_ind_ = train_ind + not_used[:int(a * len(not_used))]
    df_train = data.loc[train_ind_]
    n_products_train = df_train['product_id'].nunique()
    n_customers_train = df_train['customer_unique_id'].nunique()
    print(n_products_total, n_products_train)
    print(n_customers_total, n_customers_train)
    if n_products_total == n_products_train and n_customers_total == n_customers_train:
        not_done = False
        train_ind = train_ind_

# Everything that did not end up in training is the test set
test_ind = list(all_ind - set(train_ind))
train_ind = sorted(train_ind)
test_ind = sorted(test_ind)
print(len(train_ind)/len(all_ind))
print(len(test_ind)/len(all_ind))

In [None]:
#Since the code above takes quite a while, the resulting split is saved for future runs
# np.savetxt does not create missing directories, so make sure the target exists.
# NOTE(review): the raw tables are read from "../data" while this writes to "data/"
# relative to the working directory — confirm which location is intended.
os.makedirs('data', exist_ok=True)
np.savetxt('data/recommender_data_train.csv', train_ind, fmt="%d")
np.savetxt('data/recommender_data_test.csv', test_ind, fmt="%d")

#if we want to load the data later (same file names as written above)
#train_ind = np.loadtxt('data/recommender_data_train.csv', dtype=int)
#test_ind = np.loadtxt('data/recommender_data_test.csv', dtype=int)

In [None]:
# Materialise the two splits from the positional row numbers chosen above
df_train = data.take(train_ind)
df_test = data.take(test_ind)

In [None]:
#Defining helper functions to calculate RMSE and create a boxplot for later exploraiton of the precision of the model
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between true and predicted ratings.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ratings of equal length; lists and tuples are accepted as well as arrays.

    Returns
    -------
    float
        Euclidean norm of the difference divided by sqrt(len(y_true)).
    """
    # Convert first so that plain Python sequences support element-wise subtraction
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.linalg.norm(y_true - y_pred) / np.sqrt(len(y_true))

def MakeBoxplot(y_true, y_pred, title):
    """Draw one box of predicted ratings for each true rating 0.5, 1.0, ..., 5.0.

    Parameters
    ----------
    y_true : array-like
        True ratings; only values exactly equal to a multiple of 0.5 are plotted.
    y_pred : array-like
        Predicted ratings, aligned element-wise with ``y_true``.
    title : str
        Title of the figure.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    data = [y_pred[y_true == (x*0.5+0.5)] for x in range(10)]
    plt.figure(figsize=(5, 5))
    # The ten boxes are drawn at x positions 1..10, i.e. position = 2 * rating
    plt.boxplot(data)
    min_a, max_a = 0., 5.5
    # The x axis is in box positions, so it has to span twice the rating range;
    # limiting it to max_a could hide the boxes for true ratings above 2.5.
    plt.xlim((min_a, max_a * 2))
    plt.ylim((min_a, max_a))
    # Diagonal of perfect predictions, expressed in the same position units
    plt.plot([min_a, max_a * 2], [min_a, max_a], ls='--', color='gray', linewidth=1.0)
    # Label every position with the rating it stands for
    plt.xticks(range(12), [x*0.5 for x in range(12)])
    plt.xlabel('True Rating')
    plt.ylabel('Predicted Rating')
    plt.title(title)
    plt.show()

In [None]:
# Ground-truth scores of the held-out rows
y_true = df_test['review_score'].to_numpy()

##### Dummy baseline 

As a start, we create a dummy baseline, which predicts the mean of all the training reviews for every review unknown to the model.

In [None]:
# Baseline prediction: the mean of all review scores in the training rows
global_mean = df_train.review_score.mean()
print("global_mean =", global_mean)

In [None]:
# Predict the global mean for every row of the test set
y_pred = np.full(len(df_test), global_mean)

In [None]:
# Score the baseline on the test set and plot predicted against true ratings
error = RMSE(y_true=y_true, y_pred=y_pred)
print("RMSE =", error)
MakeBoxplot(y_true, y_pred, title='Test Set')

##### Beating the baseline model

As the average review for all products might be a bad estimate for how a given customer might rate some product, we will try to beat this baseline. For this purpose, an SVD model is implemented to predict ratings of unrated products for customers. This will hopefully beat the baseline model but also the item-item based collaborative filtering model. However, results from the SVD can not be directly compared to the predictions of the item-item based recommendation system, as we did not split values into test/train values. 

In [None]:
# Take the rating scale from the observed training scores instead of hard-coding
# 0.5-5.0, so the range given to Surprise always matches the data.
rating_scale = (float(df_train["review_score"].min()), float(df_train["review_score"].max()))
reader = Reader(rating_scale=rating_scale)
# load_from_df expects exactly the user, item and rating columns, in that order;
# selecting them explicitly protects against extra or reordered columns.
data_surprise = Dataset.load_from_df(
    df_train[["customer_unique_id", "product_id", "review_score"]], reader
)

In [None]:
# Hyperparameter grid for the SVD model: 3 * 3 * 3 * 4 * 2 = 216 combinations,
# each evaluated with 5-fold cross-validation (cv=5 below).
# NOTE(review): no random_state is passed to SVD, so the scores presumably vary
# between runs — confirm whether reproducibility is needed here.
param_grid = {
    'n_epochs': [5, 10, 20], 
    'lr_all': [0.005, 0.05, 0.01],
    'reg_all': [0.001, 0.01, 0.1], 
    'n_factors': [10, 25, 50, 100],
    'biased': [True, False]
}

# Run the grid search on the training data, tracking both RMSE and MAE
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data_surprise)

# Best cross-validated RMSE and the parameter combination that produced it
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

In [None]:
# Take the estimator with the best cross-validated RMSE and refit it on all training ratings
full_trainset = data_surprise.build_full_trainset()
model = gs.best_estimator['rmse']
model.fit(full_trainset);

In [None]:
# Customer and product ids of the held-out rows, aligned with y_true
customer_ids_true = df_test['customer_unique_id'].to_numpy()
product_ids_true = df_test['product_id'].to_numpy()

In [None]:
# Estimated rating for every (customer, product) pair of the test set
y_pred = np.array([
    model.predict(customer_id, product_id, verbose=False).est
    for customer_id, product_id in zip(customer_ids_true, product_ids_true)
])


In [None]:
y_true

In [None]:
len(y_pred)

In [None]:
# Score the SVD predictions on the test set and plot predicted against true ratings
error = RMSE(y_true=y_true, y_pred=y_pred)
print("RMSE =", error)
MakeBoxplot(y_true, y_pred, title='Test Set')

### Trying to beat the SVD with a deep learning model

Source: https://medium.com/@jdwittenauer/deep-learning-with-keras-recommender-systems-e7b99cb29929

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
#data = item_review_s[["customer_unique_id","product_id", "review_score"]]

In [None]:
data.shape

In [None]:
# Switch to the generic user / item / rating column names used by the embedding models.
# rename() returns a new frame and ignores labels that are absent, so re-running this
# cell stays safe even after further columns have been added to the frame.
data = data.rename(columns={
    "customer_unique_id": "userId",
    "product_id": "movieId",
    "review_score": "rating",
})

In [None]:
data.head()

In [None]:
# Work on an independent copy so that columns assigned to `ratings` later on
# do not silently appear on `data` as well.
ratings = data.copy()

In [None]:
# Cross-tabulate the summed ratings of the 15 most active users against the 15 most rated items
g = ratings.groupby('userId').rating.count()
top_users = g.sort_values(ascending=False).iloc[:15]
g = ratings.groupby('movieId').rating.count()
top_movies = g.sort_values(ascending=False).iloc[:15]
top_r = ratings.join(top_users, on='userId', how='inner', rsuffix='_r')
top_r = top_r.join(top_movies, on='movieId', how='inner', rsuffix='_r')
pd.crosstab(index=top_r.userId, columns=top_r.movieId, values=top_r.rating, aggfunc=np.sum)

In [None]:
# Map the raw user and item ids to consecutive integer codes for the embedding layers
user_enc = LabelEncoder()
ratings['user'] = user_enc.fit_transform(ratings['userId'].values)
n_users = ratings['user'].nunique()
item_enc = LabelEncoder()
ratings['movie'] = item_enc.fit_transform(ratings['movieId'].values)
n_movies = ratings['movie'].nunique()
ratings['rating'] = ratings['rating'].values.astype(np.float32)
# Series.min()/max() are vectorised and skip NaN, whereas the built-in min()/max()
# iterate in Python and give order-dependent results when NaN is present.
min_rating = ratings['rating'].min()
max_rating = ratings['rating'].max()
print("Number of customers, products, their min rating and max rating: ", n_users, n_movies, min_rating, max_rating)

In [None]:
# Features are the (user, item) integer codes, the target is the rating; 30% is held out
X = ratings[['user', 'movie']].to_numpy()
y = ratings['rating'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Size of the latent vectors, and the inputs split into one array per model input
# (column 0 holds the user codes, column 1 the item codes)
n_factors = 50
X_train_array = [X_train[:, col] for col in (0, 1)]
X_test_array = [X_test[:, col] for col in (0, 1)]

In [None]:
from keras.models import Model
# Embedding is imported from keras.layers: the keras.layers.embeddings submodule
# is not available in current Keras releases, while keras.layers exports the class.
from keras.layers import Input, Reshape, Dot, Embedding
from keras.optimizers import Adam, RMSprop
from keras.regularizers import l2

In [None]:
def RecommenderV1(n_users, n_movies, n_factors):
    """Build a matrix-factorisation model: the rating is the dot product of two embeddings.

    Parameters
    ----------
    n_users, n_movies : int
        Sizes of the user and item embedding tables.
    n_factors : int
        Length of each embedding vector.

    Returns
    -------
    A compiled Keras model taking [user, movie] inputs, trained with
    mean squared error and RMSprop (lr=0.001, rho=0.9).
    """
    def latent_vector(n_items):
        """Create a one-id input and its flattened, L2-regularised embedding."""
        ids = Input(shape=(1,))
        embedded = Embedding(n_items, n_factors, embeddings_initializer='he_normal',
                             embeddings_regularizer=l2(1e-6))(ids)
        return ids, Reshape((n_factors,))(embedded)

    user, u = latent_vector(n_users)
    movie, m = latent_vector(n_movies)

    prediction = Dot(axes=1)([u, m])
    model = Model(inputs=[user, movie], outputs=prediction)
    model.compile(loss='mean_squared_error', optimizer=RMSprop(lr=0.001, rho= 0.9))
    return model

In [None]:
# Instantiate the dot-product recommender and show its layers
model = RecommenderV1(n_users=n_users, n_movies=n_movies, n_factors=n_factors)
model.summary()

In [None]:
# Train for 15 epochs in batches of 64, tracking the loss on the held-out rows after each epoch
fit_options = dict(batch_size=64, epochs=15, verbose=1)
history = model.fit(
    x=X_train_array,
    y=y_train,
    validation_data=(X_test_array, y_test),
    **fit_options,
)

In [None]:
history.history["loss"][-1]

##### Trying an even more advanced model:

In [None]:
from keras import backend as K

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Keras loss: square root of the mean squared difference between y_pred and y_true."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
from keras.layers import Add, Activation, Lambda
class EmbeddingLayer:
    """Callable that embeds integer ids and reshapes the result to (n_factors,)."""

    def __init__(self, n_items, n_factors):
        """Store the embedding table size and the embedding vector length."""
        self.n_items, self.n_factors = n_items, n_factors

    def __call__(self, x):
        """Apply a new he_normal-initialised, L2-regularised embedding to x."""
        embedded = Embedding(
            self.n_items,
            self.n_factors,
            embeddings_initializer='he_normal',
            embeddings_regularizer=l2(1e-6),
        )(x)
        return Reshape((self.n_factors,))(embedded)
def RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating):
    """Build a biased matrix-factorisation model with a bounded output.

    The dot product of the user and item embeddings plus a per-user and a per-item
    bias is passed through a sigmoid and rescaled to [min_rating, max_rating].

    Returns
    -------
    A compiled Keras model taking [user, movie] inputs, trained with
    root_mean_squared_error as loss and Adam (lr=0.001).
    """
    user = Input(shape=(1,))
    u = EmbeddingLayer(n_users, n_factors)(user)
    ub = EmbeddingLayer(n_users, 1)(user)

    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    mb = EmbeddingLayer(n_movies, 1)(movie)

    interaction = Dot(axes=1)([u, m])
    score = Add()([interaction, ub, mb])
    squashed = Activation('sigmoid')(score)
    # Stretch the (0, 1) sigmoid output onto the observed rating range
    rating_span = max_rating - min_rating
    rescaled = Lambda(lambda t: t * rating_span + min_rating)(squashed)

    model = Model(inputs=[user, movie], outputs=rescaled)
    model.compile(loss=root_mean_squared_error, optimizer=Adam(lr=0.001))
    return model

In [None]:
# Instantiate the biased, sigmoid-bounded recommender and show its layers
model = RecommenderV2(
    n_users=n_users,
    n_movies=n_movies,
    n_factors=n_factors,
    min_rating=min_rating,
    max_rating=max_rating,
)
model.summary()

In [None]:
# Train for 15 epochs in batches of 64, tracking the loss on the held-out rows after each epoch
fit_options = dict(batch_size=64, epochs=15, verbose=1)
history = model.fit(
    x=X_train_array,
    y=y_train,
    validation_data=(X_test_array, y_test),
    **fit_options,
)

In [None]:
# Report the last-epoch loss on the held-out rows (the validation_data of the fit call);
# "loss" is measured on the rows the model was trained on and overstates the quality.
# NOTE(review): Keras averages the loss over batches and presumably adds the embedding
# regularisation terms, so this approximates rather than equals the test RMSE — confirm.
print("The RMSE of the deep learning model is therefore: ", history.history["val_loss"][-1])