In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import turicreate as tc
from sklearn.model_selection import train_test_split
import os
import time
import seaborn as sns
# We can override the default matplotlib styles with those of Seaborn
sns.set()## Importing the relevant libraries

In [34]:
# Load the data from a .csv
customer_data = pd.read_csv(os.path.join(os.path.pardir,'data','raw','recommend_1.csv'))
transactions_data = pd.read_csv(os.path.join(os.path.pardir,'data','raw','trx_data.csv'))

In [3]:
customer_data.head()

Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [27]:
transactions_data

Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2
5,6,144|144|55|266
6,7,135|206|259
7,8,79|8|8|48
8,9,102|2|2|297
9,10,84|77|290|260


In [4]:
transactions_data.head()

Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2


In [36]:
transactions_df = transactions_data.copy()

In [37]:
transactions_df['products'] = transactions_df['products'].apply(lambda x: [int(i) for i in x.split('|')])
transactions_df.head(10).set_index('customerId')['products'].apply(pd.Series).reset_index()

Unnamed: 0,customerId,0,1,2,3,4,5,6,7,8,9
0,0,20.0,,,,,,,,,
1,1,2.0,2.0,23.0,68.0,68.0,111.0,29.0,86.0,107.0,152.0
2,2,111.0,107.0,29.0,11.0,11.0,11.0,33.0,23.0,,
3,3,164.0,227.0,,,,,,,,
4,5,2.0,2.0,,,,,,,,
5,6,144.0,144.0,55.0,266.0,,,,,,
6,7,135.0,206.0,259.0,,,,,,,
7,8,79.0,8.0,8.0,48.0,,,,,,
8,9,102.0,2.0,2.0,297.0,,,,,,
9,10,84.0,77.0,290.0,260.0,,,,,,


In [38]:
pd.melt(transactions_df.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})

Unnamed: 0,customerId,productId,purchase_count
0,0,20.0,1
1,1,2.0,2
2,1,23.0,1
3,1,29.0,1
4,1,68.0,2
5,1,86.0,1
6,1,107.0,1
7,1,111.0,1
8,1,152.0,1


In [40]:
s=time.time()

df = pd.melt(transactions_df.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
df['productId'] = df['productId'].astype(np.int64)
print("Execution time:", round((time.time()-s)/60,2), "minutes")

Execution time: 0.35 minutes


In [42]:
df

Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,19,3
3,0,20,1
4,0,31,2
5,0,52,1
6,0,69,2
7,0,93,3
8,0,136,2
9,0,157,1


In [44]:
df['purchase_count'].unique()

array([ 2,  1,  3,  5,  6,  9,  4, 11,  7,  8, 30, 10, 13, 23, 14, 18, 21,
       15, 17, 20, 12, 42, 28, 16, 31, 19, 24, 25, 64, 75, 46, 27, 57, 22])

In [47]:
def create_dummies(df, feature_label):
    dummies = pd.get_dummies(df[feature_label], prefix=feature_label, drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop([feature_label], axis=1)
    return df

In [50]:
data_dummy = create_dummies(df, 'purchase_count')

In [51]:
data_dummy

Unnamed: 0,customerId,productId,purchase_count_2,purchase_count_3,purchase_count_4,purchase_count_5,purchase_count_6,purchase_count_7,purchase_count_8,purchase_count_9,...,purchase_count_25,purchase_count_27,purchase_count_28,purchase_count_30,purchase_count_31,purchase_count_42,purchase_count_46,purchase_count_57,purchase_count_64,purchase_count_75
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,19,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,31,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,52,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,69,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,93,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,136,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,157,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
train_data_dummy

customerId,productId,purchase_count_2,purchase_count_3,purchase_count_4,purchase_count_5,purchase_count_6
19324,1,1,0,0,0,0
22065,33,0,0,0,0,0
25023,213,0,0,0,0,0
12911,95,1,0,0,0,0
6443,258,1,0,0,0,0
1350,262,1,0,0,0,0
11015,102,0,0,0,0,0
17638,45,0,0,0,0,0
1109,84,1,0,0,0,0
17424,150,0,0,0,0,0

purchase_count_7,purchase_count_8,purchase_count_9,purchase_count_10,purchase_count_11,purchase_count_12,purchase_count_13
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0

purchase_count_14,purchase_count_15,purchase_count_16,purchase_count_17,purchase_count_18,purchase_count_19
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0

purchase_count_20,purchase_count_21,purchase_count_22,purchase_count_23,purchase_count_24,purchase_count_25
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0

purchase_count_27,purchase_count_28,purchase_count_30,purchase_count_31,purchase_count_42,purchase_count_46
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0

purchase_count_57,purchase_count_64,purchase_count_75
0,0,0
0,0,0
0,0,0
0,0,0
0,0,0
0,0,0
0,0,0
0,0,0
0,0,0
0,0,0


In [58]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScalar(BaseEstimator, TransformerMixin):
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scalar = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scalar.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scalar.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
        

In [62]:
unscalled_inputs = df.copy()

In [59]:
columns_to_scale = ['purchase_count']

In [60]:
scalar = CustomScalar(columns_to_scale)

In [65]:
scaled_df = scalar.fit_transform(unscalled_inputs)

In [67]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [69]:
train_data_scalled, test_data_scalled = split_data(scaled_df)

In [70]:
user_id = 'customerId'
item_id = 'productId'
target = 'purchase_count'
users_to_recommend = list(scaled_df[user_id])
n_rec = 10 # number of items to recommend
n_display = 30

In [74]:
# Since turicreate is very accessible library, we can define a model selection function as below

def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [75]:
name = 'cosine'
cos = model(train_data_scalled, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-----------------------+------+
| customerId | productId |         score         | rank |
+------------+-----------+-----------------------+------+
|     0      |    248    |  0.01345472534497579  |  1   |
|     0      |    137    |  0.009817779064178467 |  2   |
|     0      |     73    |  0.008232211073239645 |  3   |
|     0      |    186    |  0.00813913345336914  |  4   |
|     0      |     43    | 0.0073691656192143755 |  5   |
|     0      |     2     |  0.00702752669652303  |  6   |
|     0      |    230    | 0.0070043206214904785 |  7   |
|     0      |     47    | 0.0068231721719106036 |  8   |
|     0      |     89    |  0.006513054172197978 |  9   |
|     0      |    225    |  0.006312519311904907 |  10  |
|     0      |    248    |  0.01345472534497579  |  1   |
|     0      |    137    |  0.009817779064178467 |  2   |
|     0      |     73    |  0.008232211073239645 |  3   |
|     0      |    186    |  0.00813913345336914  |  4   |
|     0      |

In [73]:
popularity_model

Class                            : PopularityRecommender

Schema
------
User ID                          : customerId
Item ID                          : productId
Target                           : purchase_count
Additional observation features  : 0
User side features               : []
Item side features               : []

Statistics
----------
Number of observations           : 106868
Number of users                  : 23256
Number of items                  : 300

Training summary
----------------
Training time                    : 0.01

Model Parameters
----------------
Model class                      : PopularityRecommender

In [79]:
users_to_recommend = list(customer_data[user_id])

final_model = tc.item_similarity_recommender.create(tc.SFrame(scaled_df), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_count', 
                                            similarity_type='cosine')

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+-----------------------+------+
| customerId | productId |         score         | rank |
+------------+-----------+-----------------------+------+
|    1553    |     73    |          0.0          |  1   |
|    1553    |     48    |          0.0          |  2   |
|    1553    |     21    |          0.0          |  3   |
|    1553    |    259    |          0.0          |  4   |
|    1553    |    149    |          0.0          |  5   |
|    1553    |    296    |          0.0          |  6   |
|    1553    |    274    |          0.0          |  7   |
|    1553    |    255    |          0.0          |  8   |
|    1553    |     52    |          0.0          |  9   |
|    1553    |    136    |          0.0          |  10  |
|   20400    |     23    |          0.0          |  1   |
|   20400    |     2     |          0.0          |  2   |
|   20400    |    260    |          0.0          |  3   |
|   20400    |    256    |          0.0          |  4   |
|   20400    |