In [0]:
# Finding how often a certain product has been sold.

In [0]:
# Let's be rebels and ignore warnings for now
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [0]:
import numpy as np
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
! pip install turicreate



In [0]:
import time
from sklearn.model_selection import train_test_split
import turicreate as tc

In [0]:
#Loading dataset

In [0]:
url = 'https://raw.githubusercontent.com/HassanSherwani/Price_frequency/master/20190207_transactions.json'

In [0]:
transactions = pd.read_json(url, lines= True)

In [9]:
transactions.head()

Unnamed: 0,id,products
0,0,"[185, 30, 77, 188, 78, 125, 45, 155, 241, 229,..."
1,1,"[119, 148, 108, 34, 157, 82, 113, 45, 165]"
2,2,"[173, 103, 229, 240]"
3,3,[91]
4,4,"[175, 192, 54, 172]"


In [10]:
transactions.shape

(2500, 2)

In [11]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2500 entries, 0 to 2499
Data columns (total 2 columns):
id          2500 non-null int64
products    2500 non-null object
dtypes: int64(1), object(1)
memory usage: 58.6+ KB


In [12]:
transactions.describe()

Unnamed: 0,id
count,2500.0
mean,1249.5
std,721.83216
min,0.0
25%,624.75
50%,1249.5
75%,1874.25
max,2499.0


In [13]:
# Get the row names
transactions.index.values

array([   0,    1,    2, ..., 2497, 2498, 2499])

In [14]:
# Get the column names
transactions.products.values

array([list([185, 30, 77, 188, 78, 125, 45, 155, 241, 229, 133, 161, 210, 89, 89, 238]),
       list([119, 148, 108, 34, 157, 82, 113, 45, 165]),
       list([173, 103, 229, 240]), ..., list([195, 97, 226, 4, 29]),
       list([29, 83, 143, 249, 123, 244, 39, 243, 167, 184, 123, 210]),
       list([118, 198, 183, 216, 98, 170, 163, 93, 240, 28, 223, 161, 41, 99, 151, 47, 60, 14, 90])],
      dtype=object)

In [15]:
# Order the rows of transactions by TransactionID descending
transactions.sort_values('products', ascending=False).head()

Unnamed: 0,id,products
1415,1415,"[250, 236, 242, 229, 92, 2, 71, 172, 109, 247,..."
1316,1316,"[250, 224, 232, 211, 25]"
1093,1093,"[250, 221, 155, 24, 188, 179, 43, 36, 183, 152..."
1203,1203,"[250, 183, 90, 172, 27, 67, 232, 124, 190, 42,..."
594,594,"[250, 144, 245, 8, 74, 17, 112, 121, 90, 242, ..."


In [16]:
print(transactions['products'][1415])

[250, 236, 242, 229, 92, 2, 71, 172, 109, 247, 171, 209, 90, 139, 188, 191, 145, 214, 216, 237]


In [0]:
# Adding features

In [0]:
## Creating a dataframe for customers' recommendation.Will use this later in our model
customers=transactions['id']

In [19]:
customers.head()

0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64

In [0]:
import random
random.shuffle(customers)

In [21]:
customers.head()

0    2045
1     570
2    1127
3    1055
4    2388
Name: id, dtype: int64

In [0]:
customers=customers[:1000]

In [23]:
customers.head()

0    2045
1     570
2    1127
3    1055
4    2388
Name: id, dtype: int64

In [0]:
## Creating purchase count
data = pd.melt(transactions.set_index('id')['products'].apply(pd.Series).reset_index(), 
             id_vars=['id'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['id', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)

In [25]:
data.head()

Unnamed: 0,id,productId,purchase_count
0,0,65,1
1,0,118,1
2,0,131,1
3,0,174,1
4,0,213,1


In [0]:
## Create dummy
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)
"""
Dummy for marking whether a customer bought that item or not.
If one buys an item, then purchase_dummy are marked as 1.
"""

In [27]:
data_dummy.head()

Unnamed: 0,id,productId,purchase_count,purchase_dummy
0,0,65,1,1
1,0,118,1,1
2,0,131,1,1
3,0,174,1,1
4,0,213,1,1


In [0]:
# Normalize:normalize items by purchase frequency across all users

In [0]:
df_matrix = pd.pivot_table(data, values='purchase_count', index='id', columns='productId')

In [30]:
df_matrix.head()

productId,1,2,3,4,5,6,7,8,9,10,...,241,242,243,244,245,246,247,248,249,250
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,1.0,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [0]:
"""

The NaN tells us that the item represented by the column was not purchased in that specific transaction.

"""

In [0]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [32]:
df_matrix_norm.head()

productId,1,2,3,4,5,6,7,8,9,10,...,241,242,243,244,245,246,247,248,249,250
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,0.0,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [0]:
# create a table for input to the modeling  
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['id'], value_name='scaled_purchase_freq').dropna()

In [34]:
data_norm.shape

(22530, 3)

In [74]:
data_norm.head()

Unnamed: 0,id,productId,scaled_purchase_freq
11,11,1,0.0
27,28,1,0.0
68,71,1,0.0
92,96,1,0.0
146,153,1,0.0


In [0]:
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='purchase_count', index='id', columns='productId')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['id'], value_name='scaled_purchase_freq').dropna()
"""
we have normalized the items accoreding to their purchase history, from 0–1 (with 1 being the most number of purchase for an item and 0 being 0 purchase count for that item

"""

In [0]:
# Splitting dataset into train and test
### we have three datasets with purchase counts(data), purchase dummy(data_dummy), and scaled purchase counts(data_norm).

In [0]:
# Split train and test set
def split_data(data):
  train, test = train_test_split(data, test_size = .2)
  train_data = tc.SFrame(train)
  test_data = tc.SFrame(test)
  return train_data, test_data

In [0]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [0]:
# Defining key field parameters
user_id = 'id'
item_id = 'productId'
users_to_recommend = list(customers)
n_rec = 10 # number of items to recommend
n_display = 30 # first  30 rows of dataset

In [0]:
# Build the model

In [0]:
def model(train_data, name, user_id, item_id, target,users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model
        

In [0]:
#1-Popularity Model 

In [43]:
## 1.a-purchase count as target
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target,users_to_recommend, n_rec, n_display)


+------+-----------+--------------------+------+
|  id  | productId |       score        | rank |
+------+-----------+--------------------+------+
| 2045 |    201    | 1.088235294117647  |  1   |
| 2045 |    207    | 1.0777777777777777 |  2   |
| 2045 |     96    | 1.0769230769230769 |  3   |
| 2045 |    153    | 1.0769230769230769 |  4   |
| 2045 |     35    | 1.0714285714285714 |  5   |
| 2045 |     44    | 1.069767441860465  |  6   |
| 2045 |    117    | 1.0694444444444444 |  7   |
| 2045 |     85    | 1.0686274509803921 |  8   |
| 2045 |     71    | 1.064516129032258  |  9   |
| 2045 |     63    | 1.0638297872340425 |  10  |
| 570  |    201    | 1.088235294117647  |  1   |
| 570  |    207    | 1.0777777777777777 |  2   |
| 570  |     96    | 1.0769230769230769 |  3   |
| 570  |    153    | 1.0769230769230769 |  4   |
| 570  |     35    | 1.0714285714285714 |  5   |
| 570  |     44    | 1.069767441860465  |  6   |
| 570  |    117    | 1.0694444444444444 |  7   |
| 570  |     85    |

In [44]:
## 1.b-Using purchase dummy
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target,users_to_recommend, n_rec, n_display)

+------+-----------+-------+------+
|  id  | productId | score | rank |
+------+-----------+-------+------+
| 2045 |    180    |  1.0  |  1   |
| 2045 |    186    |  1.0  |  2   |
| 2045 |    121    |  1.0  |  3   |
| 2045 |     96    |  1.0  |  4   |
| 2045 |    126    |  1.0  |  5   |
| 2045 |     90    |  1.0  |  6   |
| 2045 |     69    |  1.0  |  7   |
| 2045 |     92    |  1.0  |  8   |
| 2045 |     50    |  1.0  |  9   |
| 2045 |    104    |  1.0  |  10  |
| 570  |    180    |  1.0  |  1   |
| 570  |    186    |  1.0  |  2   |
| 570  |    121    |  1.0  |  3   |
| 570  |     96    |  1.0  |  4   |
| 570  |    126    |  1.0  |  5   |
| 570  |     90    |  1.0  |  6   |
| 570  |     69    |  1.0  |  7   |
| 570  |     92    |  1.0  |  8   |
| 570  |     50    |  1.0  |  9   |
| 570  |    104    |  1.0  |  10  |
| 1127 |    180    |  1.0  |  1   |
| 1127 |    186    |  1.0  |  2   |
| 1127 |    121    |  1.0  |  3   |
| 1127 |     96    |  1.0  |  4   |
| 1127 |    126    |  1.0  |

In [45]:
## 1.c-Using scaled purchase count
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target,users_to_recommend, n_rec, n_display)

+------+-----------+----------------------+------+
|  id  | productId |        score         | rank |
+------+-----------+----------------------+------+
| 2045 |    207    | 0.08974358974358974  |  1   |
| 2045 |     96    | 0.07142857142857142  |  2   |
| 2045 |     26    |  0.0684931506849315  |  3   |
| 2045 |    153    | 0.06493506493506493  |  4   |
| 2045 |    120    | 0.06329113924050633  |  5   |
| 2045 |    201    | 0.058823529411764705 |  6   |
| 2045 |     35    | 0.05555555555555555  |  7   |
| 2045 |    232    | 0.05405405405405406  |  8   |
| 2045 |    181    | 0.05405405405405406  |  9   |
| 2045 |     66    | 0.05333333333333334  |  10  |
| 570  |    207    | 0.08974358974358974  |  1   |
| 570  |     96    | 0.07142857142857142  |  2   |
| 570  |     26    |  0.0684931506849315  |  3   |
| 570  |    153    | 0.06493506493506493  |  4   |
| 570  |    125    | 0.06329113924050633  |  5   |
| 570  |    120    | 0.06329113924050633  |  6   |
| 570  |    201    | 0.05882352

In [0]:
### checking frequency of product purchase
#train.data.groupby(by='item_id')['purchase_count'].mean().sort_value(ascending=False).head()
# train_data.groupby('item_id').mean().sort_value(by='purchase_count').head()

In [0]:
#2-Cosine similarity

In [48]:
## 2.a)- Using purchase count
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------+-----------+----------------------+------+
|  id  | productId |        score         | rank |
+------+-----------+----------------------+------+
| 2045 |    139    | 0.04708235080425556  |  1   |
| 2045 |     61    | 0.044628964020655706 |  2   |
| 2045 |    181    | 0.041394637181208685 |  3   |
| 2045 |     51    | 0.04069681809498714  |  4   |
| 2045 |    157    | 0.039747380293332614 |  5   |
| 2045 |    214    |  0.0394859634912931  |  6   |
| 2045 |     17    | 0.03785779384466318  |  7   |
| 2045 |    130    | 0.03731348422857431  |  8   |
| 2045 |     83    | 0.03631529899743887  |  9   |
| 2045 |     11    |  0.0359420868066641  |  10  |
| 570  |    122    | 0.055515170097351074 |  1   |
| 570  |    170    | 0.05258390638563368  |  2   |
| 570  |     63    |  0.0496698882844713  |  3   |
| 570  |    149    | 0.04754097594155206  |  4   |
| 570  |     80    |  0.0447641478644477  |  5   |
| 570  |     79    | 0.044544875621795654 |  6   |
| 570  |    131    | 0.04280611

In [49]:
## 2.b)-Using purchase dummy
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------+-----------+----------------------+------+
|  id  | productId |        score         | rank |
+------+-----------+----------------------+------+
| 2045 |    243    | 0.04784289995829264  |  1   |
| 2045 |    206    | 0.039752443631490074 |  2   |
| 2045 |    107    | 0.03778278827667236  |  3   |
| 2045 |    151    | 0.03660514950752258  |  4   |
| 2045 |    221    | 0.03576985001564026  |  5   |
| 2045 |     17    | 0.035399372378985085 |  6   |
| 2045 |    175    | 0.03513320287068685  |  7   |
| 2045 |    162    | 0.03462957342465719  |  8   |
| 2045 |     52    | 0.034606486558914185 |  9   |
| 2045 |    171    | 0.03374261657396952  |  10  |
| 570  |    112    | 0.06572159131368001  |  1   |
| 570  |    130    |  0.050746222337087   |  2   |
| 570  |    150    | 0.05041569471359253  |  3   |
| 570  |     63    | 0.04976290464401245  |  4   |
| 570  |    140    | 0.04907870292663574  |  5   |
| 570  |     5     | 0.04892701903978983  |  6   |
| 570  |    160    |  0.0444582

In [50]:
## 2.c)-Using scaled purchase count
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------+-----------+-------+------+
|  id  | productId | score | rank |
+------+-----------+-------+------+
| 2045 |     83    |  0.0  |  1   |
| 2045 |     9     |  0.0  |  2   |
| 2045 |     98    |  0.0  |  3   |
| 2045 |    219    |  0.0  |  4   |
| 2045 |     88    |  0.0  |  5   |
| 2045 |    247    |  0.0  |  6   |
| 2045 |    171    |  0.0  |  7   |
| 2045 |    179    |  0.0  |  8   |
| 2045 |     87    |  0.0  |  9   |
| 2045 |    178    |  0.0  |  10  |
| 570  |     83    |  0.0  |  1   |
| 570  |     9     |  0.0  |  2   |
| 570  |     98    |  0.0  |  3   |
| 570  |    219    |  0.0  |  4   |
| 570  |     88    |  0.0  |  5   |
| 570  |    247    |  0.0  |  6   |
| 570  |    171    |  0.0  |  7   |
| 570  |    179    |  0.0  |  8   |
| 570  |     87    |  0.0  |  9   |
| 570  |    178    |  0.0  |  10  |
| 1127 |     83    |  0.0  |  1   |
| 1127 |     9     |  0.0  |  2   |
| 1127 |     98    |  0.0  |  3   |
| 1127 |    219    |  0.0  |  4   |
| 1127 |     88    |  0.0  |

In [0]:
# 3)-Pearson similarity

In [52]:
## 3.a)-Using purchase count
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------+-----------+--------------------+------+
|  id  | productId |       score        | rank |
+------+-----------+--------------------+------+
| 2045 |    201    | 1.088360775649817  |  1   |
| 2045 |    207    | 1.0777505476250608 |  2   |
| 2045 |     96    | 1.077081208045666  |  3   |
| 2045 |    153    | 1.0761534846746004 |  4   |
| 2045 |     35    | 1.0714074943091842 |  5   |
| 2045 |     44    | 1.069910690161751  |  6   |
| 2045 |    117    | 1.0694220101731455 |  7   |
| 2045 |     85    | 1.0688250633026857 |  8   |
| 2045 |     71    | 1.0646787717679593 |  9   |
| 2045 |    131    | 1.0639533568326076 |  10  |
| 570  |    201    | 1.0882263401754537 |  1   |
| 570  |    207    | 1.0777681681844924 |  2   |
| 570  |    153    | 1.0768975131532066 |  3   |
| 570  |     96    | 1.0768883274151728 |  4   |
| 570  |     35    | 1.0714208691839187 |  5   |
| 570  |     44    | 1.0697402025378027 |  6   |
| 570  |    117    | 1.0694210529327393 |  7   |
| 570  |     85    |

In [53]:
## 3b)-Using purchase dummy
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------+-----------+-------+------+
|  id  | productId | score | rank |
+------+-----------+-------+------+
| 2045 |    180    |  0.0  |  1   |
| 2045 |    186    |  0.0  |  2   |
| 2045 |    121    |  0.0  |  3   |
| 2045 |     96    |  0.0  |  4   |
| 2045 |    126    |  0.0  |  5   |
| 2045 |     90    |  0.0  |  6   |
| 2045 |     69    |  0.0  |  7   |
| 2045 |     92    |  0.0  |  8   |
| 2045 |     50    |  0.0  |  9   |
| 2045 |    104    |  0.0  |  10  |
| 570  |    180    |  0.0  |  1   |
| 570  |    186    |  0.0  |  2   |
| 570  |    121    |  0.0  |  3   |
| 570  |     96    |  0.0  |  4   |
| 570  |    126    |  0.0  |  5   |
| 570  |     90    |  0.0  |  6   |
| 570  |     69    |  0.0  |  7   |
| 570  |     92    |  0.0  |  8   |
| 570  |     50    |  0.0  |  9   |
| 570  |    104    |  0.0  |  10  |
| 1127 |    180    |  0.0  |  1   |
| 1127 |    186    |  0.0  |  2   |
| 1127 |    121    |  0.0  |  3   |
| 1127 |     96    |  0.0  |  4   |
| 1127 |    126    |  0.0  |

In [54]:
## 3c)-Using scaled purchase count
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------+-----------+----------------------+------+
|  id  | productId |        score         | rank |
+------+-----------+----------------------+------+
| 2045 |    207    | 0.08970907269380032  |  1   |
| 2045 |     96    | 0.07142407723835534  |  2   |
| 2045 |     26    | 0.06848716041813158  |  3   |
| 2045 |    153    | 0.06373434128699362  |  4   |
| 2045 |    120    | 0.06328715765023532  |  5   |
| 2045 |    201    |  0.0588155185475069  |  6   |
| 2045 |     35    | 0.05553464028570389  |  7   |
| 2045 |    181    | 0.05403781178835277  |  8   |
| 2045 |    232    | 0.05402055624369029  |  9   |
| 2045 |     66    | 0.053329459031422936 |  10  |
| 570  |    207    | 0.08972784418326159  |  1   |
| 570  |     96    | 0.07141078937621341  |  2   |
| 570  |     26    | 0.06848047476380927  |  3   |
| 570  |    153    | 0.06490944487191896  |  4   |
| 570  |    125    |  0.0632837383304467  |  5   |
| 570  |    120    | 0.06328004284247063  |  6   |
| 570  |    201    | 0.05881899

In [0]:
# Model Evaluation

In [0]:
models_w_counts = [popularity, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [57]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.009921671018276776 | 0.003994778067885115 |
|   2    | 0.008616187989556127 | 0.006947241908077432 |
|   3    | 0.008006962576153166 | 0.010012018732645374 |
|   4    | 0.009530026109660576 | 0.015564673214803756 |
|   5    | 0.009817232375979113 | 0.019242405404285283 |
|   6    | 0.010879025239338565 | 0.02485225247627334  |
|   7    | 0.01089145841104064  | 0.029392225123295618 |
|   8    | 0.010509138381201042 | 0.032138712752289744 |
|   9    | 0.01050188569770814  |  0.0352202743586556  |
|   10   | 0.010234986945169704 | 0.03860147540304204  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.1390434089970408

Per User RMSE (best)
+------+------+-------+
|  id  | rmse | count |
+------+------+-------+



Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.010966057441253264 | 0.0033855526544821577 |
|   2    | 0.010704960835509146 |  0.007472336192962816 |
|   3    | 0.010617928633594427 |  0.011353972398358803 |
|   4    | 0.010966057441253278 |  0.015117079033528125 |
|   5    | 0.010757180156657947 |  0.018441709146670003 |
|   6    | 0.010530896431679723 |  0.02224998963902366  |
|   7    | 0.009921671018276786 |  0.024808736375315995 |
|   8    |  0.0101827676240209  |  0.029404036636412632 |
|   9    | 0.010501885697708165 |  0.03399063367731776  |
|   10   | 0.010391644908616172 |   0.0369584317626092  |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0121247967410023

Per User RMSE (best)
+-----+-------------------+-------+
|  id |        rmse   


Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.010443864229765034 |  0.004255874673629238 |
|   2    | 0.008877284595300293 | 0.0072083385138215425 |
|   3    | 0.009225413402959098 |  0.010616270877367481 |
|   4    | 0.009530026109660562 |   0.0155646732148038  |
|   5    | 0.009817232375979113 |  0.019242405404285293 |
|   6    | 0.010704960835509127 |  0.024895768577230727 |
|   7    | 0.010816859380828069 |  0.029131128517551487 |
|   8    | 0.010574412532637077 |  0.03220398690372573  |
|   9    | 0.010559907165651281 |   0.0362211446806748  |
|   10   | 0.01023498694516972  |  0.03948050064238053  |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.1390451561869032

Per User RMSE (best)
+------+------+-------+
|  id  | rmse | count |
+------+--


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.012454592631032695 | 0.003930980799169703 |
|   2    | 0.012195121951219497 | 0.008895519806261886 |
|   3    | 0.010032866286109672 | 0.011230755924580534 |
|   4    | 0.010768033212247015 | 0.01674018335928039  |
|   5    | 0.011416709911779973 | 0.022100724046754124 |
|   6    | 0.01184916104480192  | 0.027300845956787834 |
|   7    | 0.011268440951886732 | 0.02970527425639002  |
|   8    | 0.011416709911779977 | 0.03376166588413608  |
|   9    | 0.011589690364988749 | 0.03866689730727104  |
|   10   | 0.011468604047742609 | 0.042524361413826865 |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+-----+------+-------+
|  id | rmse | count |
+-----+------+-------+
| 940 | 0.0  |   3


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.011416709911779978 | 0.004364049719524544 |
|   2    | 0.010638297872340437 | 0.007125765026647234 |
|   3    | 0.011070749005362387 | 0.011502170492829523 |
|   4    | 0.009989621172807486 | 0.013522952858707922 |
|   5    | 0.009444732745199786 | 0.01563537367896475  |
|   6    | 0.010032866286109674 | 0.019959885009184428 |
|   7    | 0.01037882719252724  | 0.024496915181917777 |
|   8    | 0.01050856253243384  | 0.028670686402912655 |
|   9    | 0.010436487343596831 | 0.03329420680225036  |
|   10   | 0.010015568240788779 | 0.03674146011976826  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9836765638356803

Per User RMSE (best)
+------+--------------------+-------+
|  id  |        rmse        | coun


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.012454592631032692 | 0.00393098079916969  |
|   2    | 0.012195121951219481 | 0.008895519806261899 |
|   3    | 0.010032866286109676 | 0.011230755924580506 |
|   4    | 0.010768033212247017 | 0.016740183359280392 |
|   5    | 0.01141670991177997  | 0.022100724046754127 |
|   6    | 0.011849161044801937 | 0.027300845956787816 |
|   7    | 0.01126844095188672  | 0.029705274256389975 |
|   8    | 0.011416709911779956 | 0.03376166588413594  |
|   9    | 0.011589690364988735 | 0.03866689730727104  |
|   10   | 0.01146860404774262  | 0.04252436141382691  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+-----+------+-------+
|  id | rmse | count |
+-----+------+-------+
| 940 | 1.0  |   3


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.013896312132549432 | 0.005723957140312025 |
|   2    | 0.011491181186531295 | 0.010672927642462623 |
|   3    | 0.011580260110457858 | 0.016235906441678778 |
|   4    | 0.011223944414751483 | 0.020605227660278422 |
|   5    | 0.01122394441475146  | 0.02518388435010564  |
|   6    | 0.011758417958311042 | 0.03104145987630753  |
|   7    | 0.01160571123157976  | 0.035402509480542625 |
|   8    | 0.011557990379476215 | 0.03909673971138428  |
|   9    | 0.011520874161173478 |  0.0430251202565473  |
|   10   | 0.011437733832175324 | 0.04808925708177449  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.16667196047761426

Per User RMSE (best)
+-----+------+-------+
|  id | rmse | count |
+-----+------+-------+
| 


Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.007482629609834312 | 0.0032424728309282044 |
|   2    |  0.0088188134687333  |  0.007509353287012293 |
|   3    | 0.009976839479779098 |  0.013099692041434437 |
|   4    | 0.010288615713522188 |  0.017384388282303882 |
|   5    | 0.011223944414751474 |  0.023842610266982284 |
|   6    | 0.010778549795118452 |  0.027620829197526137 |
|   7    | 0.010536764144460562 |  0.03095238095238099  |
|   8    | 0.010489043292357014 |  0.034164312437962856 |
|   9    | 0.010630084921907477 |  0.03796861876765673  |
|   10   | 0.010636023516835921 |  0.042600722811839895 |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.16808840641498554

Per User RMSE (best)
+-----+------+-------+
|  id | rmse | count |
+-----+----


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.01443078567610905  | 0.006258430683871621 |
|   2    | 0.01095670764297167  | 0.010316611946756257 |
|   3    | 0.01175841795831104  | 0.016770379985238372 |
|   4    | 0.011357562800641376 | 0.02113970120383809  |
|   5    | 0.011865312667022981 | 0.027594105520348192 |
|   6    | 0.011847496882237681 | 0.031575933419867135 |
|   7    | 0.01168206459494539  | 0.03593698302410229  |
|   8    | 0.011624799572421142 | 0.03973301773943143  |
|   9    | 0.011283330364035858 | 0.04311419918047397  |
|   10   | 0.011491181186531271 | 0.04862373062533407  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.16671349660960721

Per User RMSE (best)
+-----+------+-------+
|  id | rmse | count |
+-----+------+-------+
| 

In [0]:
# Final step

In [59]:
users_to_recommend = list(customers)

final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', 
                                            similarity_type='cosine')

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)


#pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------+-----------+----------------------+------+
|  id  | productId |        score         | rank |
+------+-----------+----------------------+------+
| 2045 |    248    | 0.05350113312403361  |  1   |
| 2045 |    168    | 0.05334077676137288  |  2   |
| 2045 |    214    | 0.04833198388417562  |  3   |
| 2045 |    126    | 0.042779699961344404 |  4   |
| 2045 |    243    | 0.039056877295176186 |  5   |
| 2045 |    139    | 0.038295801480611166 |  6   |
| 2045 |    169    | 0.03688371976216634  |  7   |
| 2045 |     32    |  0.0363245685895284  |  8   |
| 2045 |     91    |  0.0356879194577535  |  9   |
| 2045 |     11    | 0.03529319365819295  |  10  |
| 570  |     17    | 0.05245302783118354  |  1   |
| 570  |    177    | 0.05185988876554701  |  2   |
| 570  |    214    | 0.04956638813018799  |  3   |
| 570  |     80    | 0.04877706368764242  |  4   |
| 570  |    112    | 0.04800879955291748  |  5   |
| 570  |     2     | 0.04694501558939616  |  6   |
| 570  |     40    | 0.04673304

In [60]:
### .csv file
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()




(10000, 4)


Unnamed: 0,id,productId,score,rank
0,2045,248,0.053501,1
1,2045,168,0.053341,2
2,2045,214,0.048332,3
3,2045,126,0.04278,4
4,2045,243,0.039057,5


In [0]:
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id].transform(lambda x: '|'.join(x.astype(str)))
df_output = df_rec[['id', 'recommendedProducts']].drop_duplicates().sort_values('id').set_index('id')

In [62]:
df_output.head()

Unnamed: 0_level_0,recommendedProducts
id,Unnamed: 1_level_1
0,170|152|37|171|158|211|15|69|75|167
4,166|19|68|90|190|248|244|200|168|32
5,234|235|13|5|112|2|75|48|153|140
6,28|155|4|33|136|175|85|69|35|212
11,55|209|87|49|21|120|129|243|206|198


In [0]:
#Define a function to create a desired output(Bonus part)
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['id', 'recommendedProducts']].drop_duplicates() \
        .sort_values('id').set_index('id')
    if print_csv:
        df_output.to_csv('recommendation.csv')
        print("An output file can be found in 'output' folder with name 'recommendation.csv'")
    return df_output


In [73]:
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'recommendation.csv'
(1000, 1)


Unnamed: 0_level_0,recommendedProducts
id,Unnamed: 1_level_1
0,207|96|26|153|120|125|201|35|181|232
4,207|96|26|153|120|125|201|35|232|181
5,207|96|26|153|120|125|201|35|232|181
6,207|96|26|153|125|120|201|35|232|66
11,96|26|153|120|125|201|35|232|181|152


In [0]:
# define recommendation function
def customer_recomendation(id):
    if id not in df_output.index:
        print('Customer not found.')
        return id
    return df_output.loc[id]

In [69]:
customer_recomendation(48)

recommendedProducts    207|96|26|153|120|125|201|35|232|181
Name: 48, dtype: object

In [72]:
customer_recomendation(2438)

recommendedProducts    54|96|26|153|120|125|201|35|232|181
Name: 2438, dtype: object