In [None]:
# Package installation if needed
!pip install sklearn
!pip install turicreate

In [12]:
# Import dependencies
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
import turicreate as tc

**Input Data**

In [13]:
# Upload csv file from a local drive
from google.colab import files
uploaded = files.upload()

Saving uk_data.csv to uk_data.csv


In [15]:
# Import csv file into a dataframe
import io
df = pd.read_csv(io.BytesIO(uploaded['uk_data.csv']), encoding='unicode_escape')

In [16]:
print(df.shape)
df.head()

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


**Data Preparation**

In [17]:
# Get data for product/customer matrix
data = df[['CustomerID', 'StockCode', 'Quantity']]
print(data.shape)
data.head()

(541909, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
0,17850.0,85123A,6
1,17850.0,71053,6
2,17850.0,84406B,8
3,17850.0,84029G,6
4,17850.0,84029E,6


**Create Data with User, Product and Purchase Count**

In [58]:
# Group by to get purchase quantity by customer ID and stock code
data = data.groupby(['CustomerID','StockCode']) \
    .agg({'Quantity':'sum'}) \
    .rename(columns={'StockCode':'ProductID'}) \ # How come rename function didn't work?
    .reset_index()
print(data.shape)
data.head()

(265220, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
0,12347,16008,24
1,12347,17021,36
2,12347,20665,6
3,12347,20719,40
4,12347,20780,12


In [19]:
# Question - get rid of negative values? - Decision: Get rid!
data[data['CustomerID']==17850]

Unnamed: 0,CustomerID,StockCode,Quantity
251481,17850.0,15056BL,24
251482,17850.0,20679,42
251483,17850.0,21068,96
251484,17850.0,21071,90
251485,17850.0,21169,-2
251486,17850.0,21730,102
251487,17850.0,21871,83
251488,17850.0,21874,-1
251489,17850.0,22411,60
251490,17850.0,22632,102


In [57]:
# Filter out quantity with negative values
data = data[data['Quantity']>0]
print(data.shape)
data.head()

(265220, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
1,12347,16008,24
2,12347,17021,36
3,12347,20665,6
4,12347,20719,40
5,12347,20780,12


In [40]:
# Format CustomerID from integer to string
data['CustomerID'] = data['CustomerID'].astype(int).astype(str)
data.dtypes

CustomerID    object
StockCode     object
Quantity       int64
dtype: object

**Create Dummy Dataset**

In [41]:
# Create dummy
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)
data_dummy

Unnamed: 0,CustomerID,StockCode,Quantity,purchase_dummy
1,12347,16008,24,1
2,12347,17021,36,1
3,12347,20665,6,1
4,12347,20719,40,1
5,12347,20780,12,1
...,...,...,...,...
267610,18287,84920,4,1
267611,18287,85039A,96,1
267612,18287,85039B,120,1
267613,18287,85040A,48,1


**Normalize Item Values Across Users**

In [22]:
# Function to normalize item values across users
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='Quantity', index='CustomerID', columns='StockCode')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['CustomerID'], value_name='scaled_purchase_freq').dropna()

In [42]:
# Execute function
norm_data = normalize_data(data)

In [43]:
print(norm_data.shape)
norm_data.head()

(264838, 3)


Unnamed: 0,CustomerID,StockCode,scaled_purchase_freq
85,12451,10002,0.04
128,12510,10002,0.083636
186,12583,10002,0.170909
231,12637,10002,0.04
262,12673,10002,0.0


In [None]:
# Import drive first before exporting as csv file
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
# Save pandas dataframe as csv file
norm_data.to_csv('uk_data_scaled_freq.csv')
!cp uk_data_scaled_freq.csv "drive/My Drive/PREWORK_JT/"

**Split Train and Test Datasets**

In [25]:
# Declare function to split train and test data
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [44]:
# Split normalised scaled frequency data
train_data_norm, test_data_norm = split_data(norm_data)

In [45]:
# Split dummy data
train_data_dummy, test_data_dummy = split_data(data_dummy)

In [46]:
# Split purchase count data
train_data, test_data = split_data(data)

**Define Models Using Turicreate Library**

In [30]:
# Define variables for field names
user_id = 'CustomerID'
item_id = 'StockCode'
users_to_recommend = list(data['CustomerID'])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [32]:
# Declare function for all models
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

**Popularity Model as Baseline**

In [47]:
# Using Purchase Count
name = 'popularity'
target = 'Quantity'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|   12347    |   84568   | 849.2307692307693  |  1   |
|   12347    |   84826   | 522.2692307692307  |  2   |
|   12347    |   16014   | 439.93333333333334 |  3   |
|   12347    |   17084R  | 276.9230769230769  |  4   |
|   12347    |   17096   | 260.8888888888889  |  5   |
|   12347    |   84691   |       240.0        |  6   |
|   12347    |   84598   |       240.0        |  7   |
|   12347    |   22053   |       209.9        |  8   |
|   12347    |   84077   | 187.92857142857142 |  9   |
|   12347    |   17003   | 174.7314814814815  |  10  |
|   12347    |   84568   | 849.2307692307693  |  1   |
|   12347    |   84826   | 522.2692307692307  |  2   |
|   12347    |   16014   | 439.93333333333334 |  3   |
|   12347    |   17084R  | 276.9230769230769  |  4   |
|   12347    |   17096   | 260.8888888888889  |  5   |
|   12347 

In [48]:
# Using purchase dummy
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   21192   |  1.0  |  1   |
|   12347    |   22111   |  1.0  |  2   |
|   12347    |   22865   |  1.0  |  3   |
|   12347    |   21914   |  1.0  |  4   |
|   12347    |   22189   |  1.0  |  5   |
|   12347    |   84997C  |  1.0  |  6   |
|   12347    |   23240   |  1.0  |  7   |
|   12347    |   16049   |  1.0  |  8   |
|   12347    |   22753   |  1.0  |  9   |
|   12347    |   46775D  |  1.0  |  10  |
|   12347    |   21192   |  1.0  |  1   |
|   12347    |   22111   |  1.0  |  2   |
|   12347    |   22865   |  1.0  |  3   |
|   12347    |   21914   |  1.0  |  4   |
|   12347    |   22189   |  1.0  |  5   |
|   12347    |   84997C  |  1.0  |  6   |
|   12347    |   23240   |  1.0  |  7   |
|   12347    |   16049   |  1.0  |  8   |
|   12347    |   22753   |  1.0  |  9   |
|   12347    |   46775D  |  1.0  |  10  |
|   12347    |   21192   |  1.0  |

In [49]:
# Using scaled purchase count
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   35647   |  1.0  |  1   |
|   12347    |   62094B  |  1.0  |  2   |
|   12347    |   35817P  |  1.0  |  3   |
|   12347    |   90036B  |  1.0  |  4   |
|   12347    |   45013   |  1.0  |  5   |
|   12347    |   90174   |  1.0  |  6   |
|   12347    |   90177C  |  1.0  |  7   |
|   12347    |   90123D  |  1.0  |  8   |
|   12347    |   21769   |  1.0  |  9   |
|   12347    |   82095   |  1.0  |  10  |
|   12347    |   35647   |  1.0  |  1   |
|   12347    |   62094B  |  1.0  |  2   |
|   12347    |   35817P  |  1.0  |  3   |
|   12347    |   90036B  |  1.0  |  4   |
|   12347    |   45013   |  1.0  |  5   |
|   12347    |   90174   |  1.0  |  6   |
|   12347    |   90177C  |  1.0  |  7   |
|   12347    |   90123D  |  1.0  |  8   |
|   12347    |   21769   |  1.0  |  9   |
|   12347    |   82095   |  1.0  |  10  |
|   12347    |   35647   |  1.0  |

In [68]:
# Baseline summary - Not sure why, explore tomorrow
train.groupby(by=item_id)['Quantity'].mean().sort_values(ascending=False).head(20)

NameError: ignored

**Collaborative Filtering Model**

Cosine similarity

In [59]:
# Using purchase count
name = 'cosine'
target = 'Quantity'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|   12347    |   22551   |  4.54032360514005  |  1   |
|   12347    |   23345   | 4.373026153870991  |  2   |
|   12347    |   22629   | 4.3532323723747615 |  3   |
|   12347    |   22856   | 4.344635916607721  |  4   |
|   12347    |   23346   | 4.342059684651239  |  5   |
|   12347    |   22595   | 4.340843625011898  |  6   |
|   12347    |   23281   | 4.327331219400678  |  7   |
|   12347    |   22243   | 4.323169976472855  |  8   |
|   12347    |   22399   | 4.319746955519631  |  9   |
|   12347    |   22244   | 4.3072595454397655 |  10  |
|   12347    |   22551   |  4.54032360514005  |  1   |
|   12347    |   23345   | 4.373026153870991  |  2   |
|   12347    |   22629   | 4.3532323723747615 |  3   |
|   12347    |   22856   | 4.344635916607721  |  4   |
|   12347    |   23346   | 4.342059684651239  |  5   |
|   12347 

In [60]:
# Using purchase dummy
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| CustomerID | StockCode |        score         | rank |
+------------+-----------+----------------------+------+
|   12347    |   21212   | 0.05843215161248257  |  1   |
|   12347    |   22423   | 0.057853370904922485 |  2   |
|   12347    |   23173   | 0.05579692674310584  |  3   |
|   12347    |   23245   | 0.05063931251827039  |  4   |
|   12347    |   21977   |  0.0492162171163057  |  5   |
|   12347    |   23307   | 0.048265778704693445 |  6   |
|   12347    |   22382   | 0.047969822036592585 |  7   |
|   12347    |   22630   | 0.04728647282249049  |  8   |
|   12347    |   22993   | 0.04556221083590859  |  9   |
|   12347    |   22629   | 0.040193057373950354 |  10  |
|   12347    |   21212   | 0.05843215161248257  |  1   |
|   12347    |   22423   | 0.057853370904922485 |  2   |
|   12347    |   23173   | 0.05579692674310584  |  3   |
|   12347    |   23245   | 0.05063931251827039  |  4   |
|   12347    |   21977   |  0.0

In [61]:
# Using scaled purchase count
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| CustomerID | StockCode |        score         | rank |
+------------+-----------+----------------------+------+
|   12347    |   22769   | 0.026481623509350946 |  1   |
|   12347    |   90141A  | 0.026481623509350946 |  2   |
|   12347    |   90205C  | 0.026481623509350946 |  3   |
|   12347    |   90141C  | 0.02638743414598353  |  4   |
|   12347    |   90072   | 0.02638743414598353  |  5   |
|   12347    |   16206B  | 0.026197289018069995 |  6   |
|   12347    |   84226   | 0.025820734220392563 |  7   |
|   12347    |   90177D  | 0.02549883618074305  |  8   |
|   12347    |   84707B  | 0.024936350654153262 |  9   |
|   12347    |   90101   | 0.024931075993706198 |  10  |
|   12347    |   22769   | 0.026481623509350946 |  1   |
|   12347    |   90141A  | 0.026481623509350946 |  2   |
|   12347    |   90205C  | 0.026481623509350946 |  3   |
|   12347    |   90141C  | 0.02638743414598353  |  4   |
|   12347    |   90072   | 0.02

Pearson similarity

In [62]:
# Using purchase count
name = 'pearson'
target = 'Quantity'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|   12347    |   84568   |  849.21352769036   |  1   |
|   12347    |   84826   | 522.2692307692307  |  2   |
|   12347    |   16014   | 439.93333333333334 |  3   |
|   12347    |   17084R  | 277.09725923590605 |  4   |
|   12347    |   17096   | 260.8888888888889  |  5   |
|   12347    |   84598   |  240.058377883264  |  6   |
|   12347    |   84691   |       240.0        |  7   |
|   12347    |   22053   | 209.89999999999998 |  8   |
|   12347    |   84077   | 187.98843734463057 |  9   |
|   12347    |   17003   | 174.71930398930007 |  10  |
|   12347    |   84568   |  849.21352769036   |  1   |
|   12347    |   84826   | 522.2692307692307  |  2   |
|   12347    |   16014   | 439.93333333333334 |  3   |
|   12347    |   17084R  | 277.09725923590605 |  4   |
|   12347    |   17096   | 260.8888888888889  |  5   |
|   12347 

In [63]:
# Using purchase dummy
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   21192   |  0.0  |  1   |
|   12347    |   22111   |  0.0  |  2   |
|   12347    |   22865   |  0.0  |  3   |
|   12347    |   21914   |  0.0  |  4   |
|   12347    |   22189   |  0.0  |  5   |
|   12347    |   84997C  |  0.0  |  6   |
|   12347    |   23240   |  0.0  |  7   |
|   12347    |   16049   |  0.0  |  8   |
|   12347    |   22753   |  0.0  |  9   |
|   12347    |   46775D  |  0.0  |  10  |
|   12347    |   21192   |  0.0  |  1   |
|   12347    |   22111   |  0.0  |  2   |
|   12347    |   22865   |  0.0  |  3   |
|   12347    |   21914   |  0.0  |  4   |
|   12347    |   22189   |  0.0  |  5   |
|   12347    |   84997C  |  0.0  |  6   |
|   12347    |   23240   |  0.0  |  7   |
|   12347    |   16049   |  0.0  |  8   |
|   12347    |   22753   |  0.0  |  9   |
|   12347    |   46775D  |  0.0  |  10  |
|   12347    |   21192   |  0.0  |

In [64]:
# Using scaled purchase count
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   62094B  |  1.0  |  1   |
|   12347    |   35817P  |  1.0  |  2   |
|   12347    |   35647   |  1.0  |  3   |
|   12347    |   90036B  |  1.0  |  4   |
|   12347    |   45013   |  1.0  |  5   |
|   12347    |   90174   |  1.0  |  6   |
|   12347    |   90177C  |  1.0  |  7   |
|   12347    |   21769   |  1.0  |  8   |
|   12347    |   90123D  |  1.0  |  9   |
|   12347    |   82095   |  1.0  |  10  |
|   12347    |   62094B  |  1.0  |  1   |
|   12347    |   35817P  |  1.0  |  2   |
|   12347    |   35647   |  1.0  |  3   |
|   12347    |   90036B  |  1.0  |  4   |
|   12347    |   45013   |  1.0  |  5   |
|   12347    |   90174   |  1.0  |  6   |
|   12347    |   90177C  |  1.0  |  7   |
|   12347    |   21769   |  1.0  |  8   |
|   12347    |   90123D  |  1.0  |  9   |
|   12347    |   82095   |  1.0  |  10  |
|   12347    |   62094B  |  1.0  |

**Model Evaluation**

In [65]:
# Declare initial callable variables for model evaluation
models_w_counts = [popularity_model, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

NameError: ignored

In [None]:
# Compare all the models based on RMSE and precision-recall characteristics
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

**Evaluation Summary**

**Final Output**