In [7]:
import pandas as pd
import numpy as np
import math
import json
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from sklearn.preprocessing import MultiLabelBinarizer
from utils import *



from datetime import date
# Select seaborn's "darkgrid" style for the figures drawn later in the notebook.
sns.set(style="darkgrid")

In [8]:
# Load the three raw datasets. Each file is read as line-delimited JSON
# records (lines=True, orient='records').
json_read_options = {'orient': 'records', 'lines': True}

portfolio = pd.read_json('././Data/portfolio.json', **json_read_options)
profile = pd.read_json('././Data/profile.json', **json_read_options)
transcript = pd.read_json('././Data/transcript.json', **json_read_options)

In [9]:
# One-hot encode the multi-label 'channels' column and the single-label
# 'offer_type' column, then append the indicator columns to portfolio.
mlb = MultiLabelBinarizer()

# 'channels' column dummies: one 0/1 column per distinct label found by the binarizer
channel_dummies = pd.DataFrame(
    mlb.fit_transform(portfolio['channels']),
    columns=mlb.classes_,
    index=portfolio.index,
)

# 'offer_type' column dummies: one 0/1 column per distinct value
offer_type_dummies = portfolio['offer_type'].str.get_dummies()

# Remove indicator columns left behind by an earlier run of this cell before
# appending the fresh ones; without this, re-running the cell would add the
# same column names a second time. On a first run nothing is dropped.
indicator_columns = list(channel_dummies.columns) + list(offer_type_dummies.columns)
portfolio = pd.concat(
    [
        portfolio.drop(columns=indicator_columns, errors='ignore'),
        channel_dummies,
        offer_type_dummies,
    ],
    axis=1,
)

# The source columns 'channels' and 'offer_type' are kept: a later cell merges
# portfolio[['id', 'offer_type']] onto the offer reactions table.


In [10]:
# 'event' column dummies: one 0/1 column per distinct event name
event_dummies = transcript['event'].str.get_dummies()

# event names containing spaces are renamed to snake_case column names
event_column_names = {
    'offer completed': 'offer_completed',
    'offer received': 'offer_received',
    'offer viewed': 'offer_viewed',
}

# Remove indicator columns (raw or already renamed) left behind by an earlier
# run of this cell, so re-running it does not create duplicate column names.
# On a first run nothing is dropped.
stale_columns = list(event_dummies.columns) + list(event_column_names.values())
transcript = pd.concat(
    [transcript.drop(columns=stale_columns, errors='ignore'), event_dummies],
    axis=1,
).rename(columns=event_column_names)

In [11]:
# Derive flat 'offer_id' and 'amount' columns from the 'value' column.
# create_offer_id_col / create_amount_col are not defined in this notebook;
# presumably they come from `from utils import *` -- verify there.
transcript['offer_id'] = transcript['value'].apply(create_offer_id_col)

# extract the amount and cast the resulting column to float in one step
transcript['amount'] = transcript['value'].apply(create_amount_col).astype('float')

# The 'value' column is currently kept; the drop below is commented out.
# transcript = transcript.drop('value', axis=1)

In [12]:
# profile table clean-up, expressed as a single column-wise assignment
profile = profile.assign(
    # parse the YYYYMMDD membership date into a datetime column
    became_member_on=lambda frame: pd.to_datetime(frame['became_member_on'], format='%Y%m%d'),
    # missing gender becomes the literal string 'NA'
    gender=lambda frame: frame['gender'].fillna('NA'),
    # missing income is replaced with the mean of the column
    income=lambda frame: frame['income'].fillna(frame['income'].mean()),
)

In [13]:

# Load previously pickled objects from disk.
# NOTE(security): unpickling can execute arbitrary code, so PICKLE_DIR must only
# point at files produced by this project.
# NOTE(review): this is an absolute, machine-specific directory; it lives in one
# constant so it can be changed in a single place.
PICKLE_DIR = '/workspace/DataScience-StarbucksCapstone'

all_df = pd.read_pickle(PICKLE_DIR + '/user_item_matrix.p')
train_matrix = pd.read_pickle(PICKLE_DIR + '/train_df.p')
test_matrix = pd.read_pickle(PICKLE_DIR + '/test_df.p')


In [22]:
import math
import numpy as np

In [27]:
# Run the train/test split helper on transcript and unpack its four results.
# NOTE(review): this rebinds train_matrix and test_matrix, which the previous
# cell loaded from pickle files. The recorded output of this cell ends in a
# KeyboardInterrupt, so in that run the rebinding never took place.
(
    training_df,
    test_df,
    train_matrix,
    test_matrix,
) = user_item_train_test_split(transcript)

Preparing Training matrix
Processing Offer:  0b1e1539f2cc45b7b9fa7c272da2e1d7
finished upto #: 5000 persons
finished upto #: 10000 persons
finished upto #: 15000 persons
Processing Offer:  2298d6c36e964ae4a3e7e9706d1fb8c2
finished upto #: 5000 persons
finished upto #: 10000 persons
finished upto #: 15000 persons
Processing Offer:  2906b810c7d4411798c6938adc9daaa5
finished upto #: 5000 persons
finished upto #: 10000 persons
finished upto #: 15000 persons
Processing Offer:  4d5c57ea9a6940dd891ad53e9dbe8da0
finished upto #: 5000 persons
finished upto #: 10000 persons
finished upto #: 15000 persons
Processing Offer:  9b98b8c7a33c4b65b9aebfe6a799e6d9
finished upto #: 5000 persons


KeyboardInterrupt: 

In [None]:
# Fit FunkSVD to the training data with 20 latent features.
# NOTE(review): NumPy's documentation discourages np.matrix in favour of regular
# arrays; confirm FunkSVD accepts an ndarray before switching.
np_train = np.matrix(train_matrix)

user_mat_20, offer_mat_20 = FunkSVD(
    np_train,
    latent_features=20,
    learning_rate=0.005,
    iters=250,
)

In [None]:
# Search for the best number of latent features: fit with 15 latent features.
user_mat_15, offer_mat_15 = FunkSVD(
    np_train,
    latent_features=15,
    learning_rate=0.005,
    iters=250,
)

In [None]:
# Search for the best number of latent features: fit with 10 latent features.
user_mat_10, offer_mat_10 = FunkSVD(
    np_train,
    latent_features=10,
    learning_rate=0.005,
    iters=250,
)

In [None]:
# Search for the best number of latent features: fit with 5 latent features.
user_mat_5, offer_mat_5 = FunkSVD(
    np_train,
    latent_features=5,
    learning_rate=0.005,
    iters=250,
)

*Check the performance of the FunkSVD models with various numbers of latent features against the test dataset*

In [None]:
# Score the model fitted with 20 latent features against test_matrix.
validation_score(
    test_matrix,
    user_mat_20,
    offer_mat_20,
)

In [None]:
# Score the model fitted with 15 latent features against test_matrix.
validation_score(
    test_matrix,
    user_mat_15,
    offer_mat_15,
)

In [None]:
# Score the model fitted with 10 latent features against test_matrix.
validation_score(
    test_matrix,
    user_mat_10,
    offer_mat_10,
)

In [None]:
# Score the model fitted with 5 latent features against test_matrix.
validation_score(
    test_matrix,
    user_mat_5,
    offer_mat_5,
)

Based on the validation scores, the model using 5 latent features seems to perform best.

## Make Recommendations

Since our training dataset covers only some users, we need a recommendation engine that can also handle a new user. The functions below make default offer recommendations for a new customer by recommending the offers that generated the most reactions from existing customers.

In [None]:
# Build the offer reactions table from all_df and attach each offer's type
# through a left join on the offer id.
# NOTE(review): .head() is part of the assignment, so offer_reactions keeps only
# the first five merged rows for every later cell -- confirm this is intended.
offer_reactions = (
    offer_max_reactions(all_df)
    .merge(
        portfolio[['id', 'offer_type']],
        left_on='offer_id',
        right_on='id',
        how='left',
    )
    .head()
)

In [None]:
# Preview up to five random rows of the offer reactions table.
# The sample size is capped at the row count because DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement); the fixed random_state keeps the preview identical across runs.
offer_reactions.sample(n=min(5, len(offer_reactions)), random_state=42)

In [None]:
# Bar chart of total reactions per offer, coloured by offer type.

# Imported here so the cell does not rely on Patch leaking in through
# `from utils import *`; none of the visible cells import it explicitly.
from matplotlib.patches import Patch

# bar / legend colour for each offer type
colours = {"bogo": "#273c75", "discount": "#44bd32"}

# Series.plot returns the Axes it drew on; keep it so labels are set explicitly
ax = offer_reactions['Total_reactions'].plot(
    kind="bar", color=offer_reactions['offer_type'].replace(colours)
)

# bars are coloured one by one, so the legend is built from stand-in patches
ax.legend(
    [
        Patch(facecolor=colours['bogo']),
        Patch(facecolor=colours['discount'])
    ], ["bogo", "discount"]
)
ax.set_title('Offer count of total reactions')
ax.set_xlabel('Offer index number')
ax.set_ylabel('Total Reactions');  # trailing ';' hides the Text repr in the cell output

From the above chart, it is clear that discount is the best-performing offer type in the dataset.

6. Next steps and improvements
In order to improve the above recommendation engine, I would suggest the following approaches.

The default recommendation for new users can be improved by accounting for demographic information such as gender, age, etc., assuming such information is available to us.
Alternatively, approaches other than Funk SVD (for example, neural networks) can be explored.
7. Credits and references
Starbucks and Udacity for dataset
https://stackoverflow.com/questions
Udacity's 'Matrix Factorization for Recommendations' lesson, for function implementations such as FunkSVD, user-item matrix creation, prediction and validation