# Instacart Product recommendations
Kaggle Competition: https://www.kaggle.com/c/instacart-market-basket-analysis

In [1]:
# ! pip install shap


In [2]:
import pandas as pd
pd.set_option('display.width', 1000)

# Load Data

In [3]:
products = pd.read_csv("datasets/instacart-market-basket-analysis/products.csv")
print(products.head(2))
print(f"{products.shape}")

departments = pd.read_csv("datasets/instacart-market-basket-analysis/departments.csv")
print(departments.head(2))
print(f"{departments.shape}")

aisles = pd.read_csv("datasets/instacart-market-basket-analysis/aisles.csv")
print(aisles.head(2))
print(f"{aisles.shape}")


   product_id                product_name  aisle_id  department_id
0           1  Chocolate Sandwich Cookies        61             19
1           2            All-Seasons Salt       104             13
(49688, 4)
   department_id department
0              1     frozen
1              2      other
(21, 2)
   aisle_id                  aisle
0         1  prepared soups salads
1         2      specialty cheeses
(134, 2)


In [4]:
orders = pd.read_csv("datasets/instacart-market-basket-analysis/orders.csv")
print(orders.head(2))
print(f"{orders.shape}")

orders_products_prior = pd.read_csv("datasets/instacart-market-basket-analysis/order_products__prior.csv")
print(orders_products_prior.head(2))
print(f"{orders_products_prior.shape}")

orders_products_train = pd.read_csv("datasets/instacart-market-basket-analysis/order_products__train.csv")
print(orders_products_train.head(2))
print(f"{orders_products_train.shape}")


   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  days_since_prior_order
0   2539329        1    prior             1          2                  8                     NaN
1   2398795        1    prior             2          3                  7                    15.0
(3421083, 7)
   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
(32434489, 4)
   order_id  product_id  add_to_cart_order  reordered
0         1       49302                  1          1
1         1       11109                  2          1
(1384617, 4)


In [5]:
# Goal is to predict which previously ordered items will be in next user order
# Extract a small sample set and perform EDA
# Featues
# Model Design:
# Input : [User , Product] -> [Probability of ordering again]
# 
# Features:
# User: TotalOrders, 
# Product: Reorder Frequency, add_to_cart_orderFrequency
# UserProduct: UserProductOrders, UserProductOrdersFrequency, TimeFromPastProductOrder
# Order: DOW, order_hour_of_day, days_since_prior_order

# User Features

In [6]:
orders_prior = orders[orders['eval_set']=='prior']
user_features = orders_prior.groupby(['user_id'])['order_number'].max().reset_index(name='u_total_orders')
user_history = orders_prior.groupby(['user_id'])['days_since_prior_order'].sum().reset_index(name='u_days')

user_features = user_features.merge(user_history, on='user_id', how="left")
user_features

Unnamed: 0,user_id,u_total_orders,u_days
0,1,10,176.0
1,2,14,198.0
2,3,12,133.0
3,4,5,55.0
4,5,4,40.0
...,...,...,...
206204,206205,3,40.0
206205,206206,67,249.0
206206,206207,16,215.0
206207,206208,49,357.0


# Product Features

In [7]:
product_features = orders_products_prior.groupby(['product_id'])['reordered'].mean().reset_index(name='p_reorder_frequency')
# add_to_cart_frequency= orders_products_prior.groupby(['product_id'])['reordered'].mean().reset_index(name='reorder_frequency')
# orders_products_prior[['add_to_cart_order']].describe()

# UXP Features

In [8]:
order_products_prior = orders_prior.merge(orders_products_prior, on='order_id', how='left')
up_features = order_products_prior.groupby(['user_id','product_id'])['reordered'].mean().reset_index(name='up_reordered_ratio')
up_reorder_days_total = order_products_prior.groupby(['user_id','product_id'])['days_since_prior_order'].sum().reset_index(name='up_reordered_days_total')
up_features = up_features.merge(up_reorder_days_total, on=['user_id','product_id'], how='left')
up_features['up_reorder_days'] = up_features['up_reordered_ratio']*up_features['up_reordered_days_total']

In [9]:
orders_train =orders[orders['eval_set']=='train']
order_products_train = orders_train.merge(orders_products_train, on='order_id', how='left')

train_data = order_products_train.merge(user_features, on='user_id', how='left')
train_data = train_data.merge(product_features, on='product_id', how='left')
train_data = train_data.merge(up_features, on=['user_id', 'product_id'], how='left')
train_data

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,u_total_orders,u_days,p_reorder_frequency,up_reordered_ratio,up_reordered_days_total,up_reorder_days
0,1187899,1,train,11,4,8,14.0,196,1,1,10,176.0,0.776480,0.900,176.0,158.400
1,1187899,1,train,11,4,8,14.0,25133,2,1,10,176.0,0.740155,0.875,161.0,140.875
2,1187899,1,train,11,4,8,14.0,38928,3,1,10,176.0,0.827769,0.000,30.0,0.000
3,1187899,1,train,11,4,8,14.0,26405,4,1,10,176.0,0.441516,0.500,29.0,14.500
4,1187899,1,train,11,4,8,14.0,39657,5,1,10,176.0,0.766288,0.000,30.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384612,272231,206209,train,14,6,14,30.0,40603,4,0,13,210.0,0.213158,,,
1384613,272231,206209,train,14,6,14,30.0,15655,5,0,13,210.0,0.533417,,,
1384614,272231,206209,train,14,6,14,30.0,42606,6,0,13,210.0,0.467503,,,
1384615,272231,206209,train,14,6,14,30.0,37966,7,0,13,210.0,0.493496,,,


In [10]:
# drop unwanted columns
ignore_columns  = [
    'user_id',
    # 'order_id',
    'eval_set',
    'product_id',
    'add_to_cart_order',    
]

train_data = train_data.drop(ignore_columns, axis=1)
train_data = train_data.fillna(0)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 12 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   order_id                 1384617 non-null  int64  
 1   order_number             1384617 non-null  int64  
 2   order_dow                1384617 non-null  int64  
 3   order_hour_of_day        1384617 non-null  int64  
 4   days_since_prior_order   1384617 non-null  float64
 5   reordered                1384617 non-null  int64  
 6   u_total_orders           1384617 non-null  int64  
 7   u_days                   1384617 non-null  float64
 8   p_reorder_frequency      1384617 non-null  float64
 9   up_reordered_ratio       1384617 non-null  float64
 10  up_reordered_days_total  1384617 non-null  float64
 11  up_reorder_days          1384617 non-null  float64
dtypes: float64(6), int64(6)
memory usage: 126.8 MB


In [11]:
from sklearn.model_selection import GroupKFold

X = train_data.drop(columns=['reordered', 'order_number'])
Y = train_data['reordered']

groups = train_data['order_number']
gkf = GroupKFold(n_splits=5)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_recall_curve, auc

In [13]:
def pr_auc_score(y_true, y_proba):
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    return  auc(recall, precision)

In [14]:
# for train_idx, test_idx in gkf.split(X, Y, groups):
#     X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#     Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

#     dtc = DecisionTreeClassifier(        
#         random_state=42
#     )
#     dtc.fit(X_train, Y_train)
#     y_proba = dtc.predict_proba(X_test)
#     print(y_proba[:, 1].shape)
#     pr_auc = pr_auc_score(Y_test, y_proba[:, 1])

#     print(f"PR-AUC: {pr_auc}")


In [15]:
pr_auc_scorer = make_scorer(pr_auc_score, response_method="predict_proba")

param_grid = {
    # DecisionTree Hyperparameters
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20],
    
    # GradientBoostedTrees Hyperparameters
    # 'n_estimators': [100, 200],
    # 'criterion': ["friedman_mse"],
    # "learning_rate": [0.01, 0.05],
    # 'max_depth': [3, 5],

    # Common Hyperparameters
   
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'random_state': [42]
}


dtc = DecisionTreeClassifier()
# dtc = GradientBoostingClassifier()

grid_search = GridSearchCV(
    estimator=dtc, 
    param_grid=param_grid, 
    cv=gkf, # Cross validation with 5 splits
    scoring=pr_auc_scorer, # PR AUC
    n_jobs = 10 # Parallel Execution
)


# grid_search.fit(X, Y, groups=groups)
# print("Best Hyperparameters:", grid_search.best_params_)
# print("Best PR-AUC Score:", grid_search.best_score_)

# Temporal Train Test Split

In [16]:


order_ids=train_data['order_id'].unique()
order_ids = sorted(order_ids)
train_size = int(len(order_ids)*0.8)

train_orders = set(order_ids[:train_size])
test_orders  = set(order_ids[train_size:])

max(train_orders), min(test_orders)

(2732730, 2732738)

In [17]:
train = train_data[train_data['order_id'].isin(train_orders)]
test =  train_data[train_data['order_id'].isin(test_orders)]

X_train = train.drop(columns=['reordered', 'order_number','order_id'])
Y_train = train['reordered']

X_test = test.drop(columns=['reordered', 'order_number','order_id'])
Y_test = test['reordered']
train.shape, test.shape

((1105737, 12), (278880, 12))

In [18]:
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, auc, confusion_matrix, f1_score, classification_report, average_precision_score

def metrics(model):

    Y_train_pred = model.predict(X_train)
    precision = precision_score(Y_train, Y_train_pred)
    recall = recall_score(Y_train, Y_train_pred)
    pred_proba = model.predict_proba(X_train)[:, 1]
    p,r,_ = precision_recall_curve(Y_train, pred_proba)
    pr_auc = auc(r, p)
    report = classification_report(Y_train, Y_train_pred)
    print(f"\n\n----- Training Metrics -------")
    print(f"PR-AUC: {pr_auc}")
    # print(f"Precision: {precision}")
    # print(f"Recall: {recall}")
    # print(f"{report}")


    Y_pred = model.predict(X_test)
    precision = precision_score(Y_test, Y_pred)
    recall = recall_score(Y_test, Y_pred)
    pred_proba = model.predict_proba(X_test)[:, 1]
    p,r,_ = precision_recall_curve(Y_test, pred_proba)
    pr_auc = auc(r, p)
    report = classification_report(Y_test, Y_pred)
    aps = average_precision_score(Y_test, Y_pred)

    print(f"\n----- Testing Metrics -------")
    print(f"PR-AUC: {pr_auc}")
    print(f"AveragePrecision: {aps}")
    # print(f"Precision: {precision}")
    # print(f"Recall: {recall}")
    # print(report)
    
    if isinstance(model, DecisionTreeClassifier):
        featureImportances = model.feature_importances_
        depth = model.get_depth()
        leaves = model.get_n_leaves()
        print(f"Tree:  Max Depth: {depth}    Leaves: {leaves}")
    elif isinstance(model, RandomForestClassifier) or isinstance(model, GradientBoostingClassifier):
        featureImportances = model.feature_importances_
    elif isinstance(model, LogisticRegression):
        featureImportances = model.coef_[0]

    features = X_train.columns
    features = list(zip(features, featureImportances))
    features.sort(key=lambda x: x[1], reverse=True)

    print(f"\n --- Top 5 Features ------ ")
    for i in range(len(X_train.columns)):
        print(f"{features[i][0]}:\t{round(features[i][1], 4)}")



In [19]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(
    random_state=42
)

dtc.fit(X_train, Y_train)

metrics(dtc)



----- Training Metrics -------
PR-AUC: 0.9999999840317113

----- Testing Metrics -------
PR-AUC: 0.975600480116442
AveragePrecision: 0.9522556620399714
Tree:  Max Depth: 51    Leaves: 41393

 --- Top 5 Features ------ 
up_reordered_days_total:	0.9256
p_reorder_frequency:	0.0314
u_days:	0.0134
order_hour_of_day:	0.0102
days_since_prior_order:	0.0073
order_dow:	0.0072
u_total_orders:	0.0039
up_reordered_ratio:	0.001
up_reorder_days:	0.0


# Ordinal Encoding

In [20]:
from sklearn.preprocessing import OrdinalEncoder


products_info = products.merge(departments, on='department_id', how='left')

oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

print(products_info[['department']].head(10))

print(oe.fit_transform(products_info[['department']]))

      department
0         snacks
1         pantry
2      beverages
3         frozen
4         pantry
5  personal care
6      beverages
7         frozen
8     dairy eggs
9      beverages
[[20.]
 [16.]
 [ 3.]
 ...
 [ 2.]
 [18.]
 [17.]]


# OneHotEncoder

In [21]:
from sklearn.preprocessing import OneHotEncoder

oneHotEncoder = OneHotEncoder(handle_unknown='ignore')
day_encoded = oneHotEncoder.fit_transform(orders[['order_dow']]).toarray()  
orders[oneHotEncoder.get_feature_names_out(['order_dow'])] = day_encoded

orders[oneHotEncoder.get_feature_names_out(['order_dow'])].head(10)

Unnamed: 0,order_dow_0,order_dow_1,order_dow_2,order_dow_3,order_dow_4,order_dow_5,order_dow_6
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Standard Scaler Encoding

In [27]:
from sklearn.preprocessing import StandardScaler


standardScaler =  StandardScaler()
scaled_train_data = standardScaler.fit_transform(train_data)

X = pd.DataFrame(scaled_train_data, columns=standardScaler.get_feature_names_out())


array([1.70629762e+06, 1.70914101e+01, 2.70139179e+00, 1.35775922e+01,
       1.70661259e+01, 5.98594413e-01, 1.60914101e+01, 1.62043962e+02,
       5.78957504e-01, 3.12093312e-01, 3.31392883e+01, 2.52021071e+01])