## Simple RFM model for Instacart market basket competition on Kaggle.com

### This model works on the hypothesis that more frequently and recently an item is purchased, higher is the probability of the customer re-purchasing it in the future. Also, higher the probability of repurchase, higher will it be in sequence in the cart. I was not able to make a submission to the competition as I started working on the problem a little late but it was a good excercise.  

In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
import pickle
%matplotlib inline

In [2]:
os.listdir()

['.ipynb_checkpoints',
 'aisles.csv',
 'departments.csv',
 'df.pickle',
 'explore.qvw',
 'instaR.R',
 'orders.csv',
 'order_products__prior.csv',
 'order_products__train.csv',
 'products.csv',
 'qlik.csv',
 'sample_submission.csv',
 'Untitled.ipynb']

In [3]:
#aisles = pd.read_csv("aisles.csv")
#departments = pd.read_csv("departments.csv")
orders = pd.read_csv("orders.csv")
order_prior = pd.read_csv("order_products__prior.csv")
order_train = pd.read_csv("order_products__train.csv")
#products = pd.read_csv("products.csv")

In [97]:
#print(aisles.head())
#print(departments.head())
print(orders.head())
print(order_prior.head())
print(order_train.head())
#print(products.head())

   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   2539329        1    prior             1          2                  8   
1   2398795        1    prior             2          3                  7   
2    473747        1    prior             3          3                 12   
3   2254736        1    prior             4          4                  7   
4    431534        1    prior             5          4                 15   

   days_since_prior_order  
0                     NaN  
1                    15.0  
2                    21.0  
3                    29.0  
4                    28.0  
   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0
   order_id  product_id  add_to_cart_order  r

Separate training data and data for submission (predictions)

In [5]:
users_train = orders.user_id[orders.eval_set=='train']
data_train = orders[orders['user_id'].isin(users_train)]
data_train.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [6]:
users_predict = orders.user_id[orders.eval_set=='test']
data_predict = orders[orders['user_id'].isin(users_predict)]
data_predict.head(20)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
26,1374495,3,prior,1,1,14,
27,444309,3,prior,2,3,19,9.0
28,3002854,3,prior,3,3,16,21.0
29,2037211,3,prior,4,2,18,20.0
30,2710558,3,prior,5,0,17,12.0
31,1972919,3,prior,6,0,16,7.0
32,1839752,3,prior,7,0,15,7.0
33,3225766,3,prior,8,0,17,7.0
34,3160850,3,prior,9,0,16,7.0
35,676467,3,prior,10,3,16,17.0


To better train the model, select users with higher number of orders created. We are taking median as threshold

In [7]:
data_train.groupby('user_id').order_number.max().median() 

10.0

In [8]:
users_10 = data_train.user_id[data_train.order_number > 9]
data_train = data_train[data_train['user_id'].isin(users_10)]
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1814094 entries, 0 to 3421082
Data columns (total 7 columns):
order_id                  int64
user_id                   int64
eval_set                  object
order_number              int64
order_dow                 int64
order_hour_of_day         int64
days_since_prior_order    float64
dtypes: float64(1), int64(5), object(1)
memory usage: 110.7+ MB


In [186]:
orders_train = data_train.loc[(data_train.user_id==2) & (data_train.eval_set=='prior'),:]
orders_train = pd.merge(orders_train,order_prior,'inner',on='order_id')
orders_test = data_train.loc[(data_train.user_id==2) & (data_train.eval_set=='train'),:]
orders_test = pd.merge(orders_test,order_train,'inner',on='order_id')
print(orders_train.head())
print(orders_test.head())

   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   2168274        2    prior             1          2                 11   
1   2168274        2    prior             1          2                 11   
2   2168274        2    prior             1          2                 11   
3   2168274        2    prior             1          2                 11   
4   2168274        2    prior             1          2                 11   

   days_since_prior_order  product_id  add_to_cart_order  reordered  
0                     NaN       32792                  1          0  
1                     NaN       47766                  2          0  
2                     NaN       20574                  3          0  
3                     NaN       12000                  4          0  
4                     NaN       48110                  5          0  
   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   1492625        2    train           

In [187]:
order_days = orders_train[['order_number','days_since_prior_order']]
order_days = order_days.drop_duplicates()
order_days.update(order_days[['days_since_prior_order']].fillna(0))
order_days['cum_days'] = order_days['days_since_prior_order'].cumsum()
order_days

Unnamed: 0,order_number,days_since_prior_order,cum_days
0,1,0.0,0.0
13,2,10.0,10.0
19,3,3.0,13.0
24,4,8.0,21.0
37,5,8.0,29.0
50,6,13.0,42.0
71,7,14.0,56.0
85,8,27.0,83.0
101,9,8.0,91.0
127,10,6.0,97.0


In [188]:
order_map = order_days.groupby('order_number').cum_days.mean().to_dict()
orders_train['cum_days'] = orders_train.order_number.map(order_map)
orders_train.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,cum_days
0,2168274,2,prior,1,2,11,,32792,1,0,0.0
1,2168274,2,prior,1,2,11,,47766,2,0,0.0
2,2168274,2,prior,1,2,11,,20574,3,0,0.0
3,2168274,2,prior,1,2,11,,12000,4,0,0.0
4,2168274,2,prior,1,2,11,,48110,5,0,0.0


Calculate recency of each product by first finding out the last time it was ordered and then subracting it from the latest order days. COnvert recency days into recency weeks

In [189]:
recency_map = orders_train.groupby('product_id').cum_days.max().to_dict()
orders_train['recency'] = (orders_train.cum_days.max() - orders_train.product_id.map(recency_map)) / 7

Calculate frequencies of all products and then normalize by dividing frquency by total number of orders for the user

In [190]:
frequency_map = orders_train['product_id'].value_counts().to_dict()
orders_train['frequency'] = orders_train.product_id.map(frequency_map) / orders_train.order_number.max()

Find average cart order for each product and normalize by dividing by average number of products in 

In [191]:
cart_order_map = orders_train.groupby('product_id').add_to_cart_order.mean().to_dict()
orders_train['cart_order'] = orders_train.product_id.map(cart_order_map) / \
                             orders_train.groupby('order_number').order_number.value_counts().mean()

In [192]:
orders_train = orders_train[['product_id','recency','frequency','cart_order']].copy()
orders_train = orders_train.drop_duplicates()

In [193]:
orders_test['buy'] = 1
orders_test = orders_test[['product_id','buy']]
df = pd.merge(orders_train,orders_test,'left',on='product_id')
df.update(df[['buy']].fillna(0))
df = df.drop(['product_id'], axis = 1)
df.head()

Unnamed: 0,recency,frequency,cart_order,buy
0,6.142857,0.642857,0.087749,1.0
1,24.142857,0.285714,0.358974,0.0
2,26.428571,0.142857,0.215385,0.0
3,14.428571,0.357143,0.186667,0.0
4,16.428571,0.142857,0.358974,0.0


In [194]:
df.corr()

Unnamed: 0,recency,frequency,cart_order,buy
recency,1.0,-0.337522,-0.037655,-0.146146
frequency,-0.337522,1.0,-0.220448,0.222986
cart_order,-0.037655,-0.220448,1.0,0.221162
buy,-0.146146,0.222986,0.221162,1.0


Check the class distribution

In [195]:
df.buy.value_counts()

0.0    90
1.0    12
Name: buy, dtype: int64

Simple NaiveBayes

In [196]:
features = ['recency','frequency','cart_order']
X = df[features].copy()
y = df['buy'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=324)
clf = GaussianNB()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(accuracy_score(y_true = y_test, y_pred = predictions))
print(roc_auc_score(y_test,predictions))

0.838709677419
0.464285714286


Not very good. Also, data is biased as we have more instance of not buy than buy. Let us check logistic regression

In [199]:
def run_logistic(df):
    features = ['recency','frequency','cart_order']
    X = df[features].copy()
    y = df['buy'].copy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=324)
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    predictions = logreg.predict(X_test)
    y_proba = logreg.predict_proba(X_test)
    print(accuracy_score(y_true = y_test, y_pred = predictions))
    print(roc_auc_score(y_test,predictions))

In [200]:
run_logistic(df)

0.903225806452
0.5


As suspected, because of the biased class distribution, algorithm is predicting buys as no buys and still getting high accuracy score. Let us try logistic after standardizing table

In [201]:
def run_logistic_std(df):
    features = ['recency','frequency','cart_order']
    X = df[features].copy()
    y = df['buy'].copy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=324)
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    logreg = LogisticRegression()
    logreg.fit(X_train_std, y_train)
    predictions = logreg.predict(X_test_std)
    print(accuracy_score(y_true = y_test, y_pred = predictions))
    print(roc_auc_score(y_test,predictions))

In [202]:
run_logistic_std(df)

0.838709677419
0.464285714286


Not any better. Let us check XGBoost 

In [205]:
def run_xgb(df):
    features = ['recency','frequency','cart_order']
    X = df[features].copy()
    y = df['buy'].copy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=324)
    gbm = xgb.XGBClassifier(max_depth=6, n_estimators=400, learning_rate=0.001).fit(X_train, y_train)
    predictions = gbm.predict(X_test)
    y_proba = gbm.predict_proba(X_test)
    print(accuracy_score(y_true = y_test, y_pred = predictions))
    print(roc_auc_score(y_test,predictions))

In [206]:
run_xgb(df)

0.903225806452
0.5


Create a function to create the rfm table as above for user ids passed as list

In [207]:
def prepare_rfm(user_ids):
    frames = []
    for user_id in user_ids:
#    from IPython.core.debugger import Tracer; Tracer()()
        orders_train = data_train.loc[(data_train.user_id==user_id) & (data_train.eval_set=='prior'),:]
        orders_train = pd.merge(orders_train,order_prior,'inner',on='order_id')
        orders_test = data_train.loc[(data_train.user_id==user_id) & (data_train.eval_set=='train'),:]
        orders_test = pd.merge(orders_test,order_train,'inner',on='order_id')
        frequency_map = orders_train['product_id'].value_counts().to_dict()
        order_days = orders_train[['order_number','days_since_prior_order']]
        order_days = order_days.drop_duplicates()
        order_days.update(order_days[['days_since_prior_order']].fillna(0))
        order_days['cum_days'] = order_days['days_since_prior_order'].cumsum()
        order_map = order_days.groupby('order_number').cum_days.mean().to_dict()
        orders_train['cum_days'] = orders_train.order_number.map(order_map)
        recency_map = orders_train.groupby('product_id').cum_days.max().to_dict()
        orders_train['recency'] = (orders_train.cum_days.max() - orders_train.product_id.map(recency_map)) / 7
        orders_train['frequency'] = orders_train.product_id.map(frequency_map) / orders_train.order_number.max()
        cart_order_map = orders_train.groupby('product_id').add_to_cart_order.mean().to_dict()
        orders_train['cart_order'] = orders_train.product_id.map(cart_order_map) / \
                                     orders_train.groupby('order_number').order_number.value_counts().mean()
        orders_train = orders_train[['product_id','recency','frequency','cart_order']].copy()
        orders_train = orders_train.drop_duplicates()
        orders_test['buy'] = 1
        orders_test = orders_test[['product_id','buy']]
        orders_comb = pd.merge(orders_train,orders_test,'left',on='product_id')
        orders_comb.update(orders_comb[['buy']].fillna(0))
        orders_comb = orders_comb.drop(['product_id'], axis = 1)
        frames.append(orders_comb)
    df = pd.concat(frames)    
    return df

Select some random users and re-try 

In [157]:
users_train = data_train['user_id'].unique()
user_ids = list(np.random.choice(users_train, 1000))
df = prepare_rfm(user_ids)
df.buy.value_counts()

0.0    83353
1.0     7565
Name: buy, dtype: int64

Used 1000 random users and saved on the disk for later retrieval

In [208]:
#with open('df.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
#    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)

with open('df.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    df = pickle.load(f)

Build the final model to make the predictions on the test set. Since the dataset is biassed on class, we will use class weight parameter to force it to predict the rare class. Using class weight of 0.12 for class 0 give accuracy of 83% and ROC area 0.75 which is a fair model.

In [265]:
features = ['recency','frequency','cart_order']
X = df[features].copy()
y = df['buy'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)
logreg = LogisticRegression(class_weight={0:0.12})
#logreg = LogisticRegression()
logreg.fit(X_train, y_train)
predictions = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)
#predictions = (y_proba[:,1]  > 0.125).astype(int)
print(accuracy_score(y_true = y_test, y_pred = predictions))
print(roc_auc_score(y_test,predictions))

0.82916849978
0.747646906609


Using the logic in prepare rfm function above, I built a df_predict dataframe for user id 3. 

In [275]:
df_predict

Unnamed: 0,product_id,recency,frequency,cart_order
0,9387,8.142857,0.416667,0.490909
1,17668,2.142857,0.416667,0.490909
2,15143,19.0,0.083333,0.409091
3,16797,6.142857,0.25,0.545455
4,39190,0.0,0.833333,0.245455
5,47766,0.0,0.75,0.515152
6,21903,0.0,0.666667,0.579545
7,39922,19.0,0.083333,1.090909
8,24810,0.0,0.25,0.954545
9,32402,3.714286,0.25,1.136364


In [305]:
df_pred = pd.DataFrame({"product_id":df_predict['product_id'],"probability":y_proba[:,1]})
df_pred = df_pred[df_pred.probability >= 0.5]
df_pred = df_pred.sort_values(["probability"], ascending = False)
df_pred

Unnamed: 0,probability,product_id
4,0.972146,39190
5,0.958292,47766
6,0.939242,21903
1,0.805858,17668
56,0.766322,18599
52,0.737246,43961
0,0.728426,9387
8,0.679432,24810
9,0.61338,32402
17,0.594702,23650


In [348]:
text = orders_test['order_id'].values
ord = " ".join(str(x) for x in text)
ord

'2774568'

In [342]:
prods = " ".join(str(x) for x in df_pred['product_id'].values)
prods

'39190 47766 21903 17668 18599 43961 9387 24810 32402 23650 16797 22035 1819'

In [349]:
final = ord + ", " + prods
final

'2774568, 39190 47766 21903 17668 18599 43961 9387 24810 32402 23650 16797 22035 1819'

We have code ready to create submission file for all the users in the test set.

In [389]:
orders_test = data_predict.loc[data_predict.eval_set=='test',:]
features = ['recency','frequency','cart_order']
submission = []
for index, row in orders_test.iterrows():
    orders_predict = data_predict.loc[(data_predict.user_id==row.user_id) & (data_predict.eval_set=='prior'),:]
    orders_predict = pd.merge(orders_predict,order_prior,'inner',on='order_id')
    frequency_map = orders_predict['product_id'].value_counts().to_dict()
    order_days = orders_predict[['order_number','days_since_prior_order']]
    order_days = order_days.drop_duplicates()
    order_days.update(order_days[['days_since_prior_order']].fillna(0))
    order_days['cum_days'] = order_days['days_since_prior_order'].cumsum()
    order_map = order_days.groupby('order_number').cum_days.mean().to_dict()
    orders_predict['cum_days'] = orders_predict.order_number.map(order_map)
    recency_map = orders_predict.groupby('product_id').cum_days.max().to_dict()
    orders_predict['recency'] = (orders_predict.cum_days.max() - orders_predict.product_id.map(recency_map)) / 7
    orders_predict['frequency'] = orders_predict.product_id.map(frequency_map) / orders_predict.order_number.max()
    cart_order_map = orders_predict.groupby('product_id').add_to_cart_order.mean().to_dict()
    orders_predict['cart_order'] = orders_predict.product_id.map(cart_order_map) / \
                               orders_predict.groupby('order_number').order_number.value_counts().mean()
    orders_predict = orders_predict[['product_id','recency','frequency','cart_order']].copy()
    orders_predict = orders_predict.drop_duplicates()
    df_predict = orders_predict.copy()
    X = df_predict[features].copy()
    y_proba = logreg.predict_proba(X)
    df_pred = pd.DataFrame({"product_id":df_predict['product_id'],"probability":y_proba[:,1]})
    df_pred = df_pred[df_pred.probability >= 0.5]
    df_pred = df_pred.sort_values(["probability"], ascending = False)
    ord_text = str(row['order_id'])
    prods_text = " ".join(str(x) for x in df_pred['product_id'].values)
    submit_text = ord_text + ", " + prods_text
    submission.append([submit_text])   

KeyboardInterrupt: 

In [391]:
len(submission)

14129

In [392]:
submission[0:10]

[['2774568, 39190 47766 21903 17668 18599 43961 9387 24810 32402 23650 16797 22035 1819'],
 ['329954, 35469 22199 26576 25146 25623 21573 1200 17769 43704 37646 11865 19057 42329 7160 36606'],
 ['1528013, 38293 21903 49401 25659 8424 20323 40992 27521 45007 11068 48679 10644'],
 ['1376945, 8309 27959 14947 35948 28465 34658 33572 8670 44632 20383 42585 35640 10644 24799 17706 48697 49374 30563 5989 41909 17794 13176 4799 33021 21040 8230 30480 22950 34551'],
 ['1356845, 13176 14992 10863 7076 28134 21616 8239 5746 20350 44422 11520 31506 22959 7120 37687 19006 39667 17794 30489 49683 24390 47509 34243 48364 16185 248 42736 19895 18761 45056 37646 38693 47144 44661 22935 35345 12797 10978 33443 28342 20144 47672 38164 42450'],
 ['2161313, 14715 12427 11266 10441 37710 27839 196 48142 1747'],
 ['1416320, 21903 5134 41950 17948 24852 21137 48745 651 4086 43014 28985 24561 15872 21405 14197 21616 28289 48283 32691 44359 17794 7948 27104'],
 ['1735923, 17008 35123 2192 12108 15131 24629 181

In [393]:
with open('submit.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(submission, f, pickle.HIGHEST_PROTOCOL)