# Instacart Product recommendations
Kaggle Competition: https://www.kaggle.com/c/instacart-market-basket-analysis

In [None]:
# ! pip install shap


In [None]:
import pandas as pd
# Raise the display width (in characters) so printed DataFrame previews
# are less likely to wrap across several lines.
pd.set_option('display.width', 1000)

# Load Data

In [None]:
# Read the three lookup tables shipped with the dataset.
products = pd.read_csv("datasets/instacart-market-basket-analysis/products.csv")
departments = pd.read_csv("datasets/instacart-market-basket-analysis/departments.csv")
aisles = pd.read_csv("datasets/instacart-market-basket-analysis/aisles.csv")

# Preview each table: first two rows, then its (rows, columns) shape.
for lookup_table in (products, departments, aisles):
    print(lookup_table.head(2))
    print(lookup_table.shape)


In [None]:
# Read the order header table and the two order-line tables.
orders = pd.read_csv("datasets/instacart-market-basket-analysis/orders.csv")
orders_products_prior = pd.read_csv("datasets/instacart-market-basket-analysis/order_products__prior.csv")
orders_products_train = pd.read_csv("datasets/instacart-market-basket-analysis/order_products__train.csv")

# Preview each table: first two rows, then its (rows, columns) shape.
for order_table in (orders, orders_products_prior, orders_products_train):
    print(order_table.head(2))
    print(order_table.shape)


In [None]:
# Goal: predict which previously ordered items will be in the user's next order.
# Extract a small sample set and perform EDA.
# Features
# Model Design:
# Input : [User , Product] -> [Probability of ordering again]
# 
# Features:
# User: Counts: Total Orders, Order frequency, Avg unique products, Avg total items
# Product: Avg order items, Order frequency
# User-product: Order frequency per order, Order frequency per days
# Label: Reordered

# User Features

In [None]:
# Summarise the orders table (columns, dtypes, non-null counts) before filtering it.
orders.info()

In [None]:
# Extract the 'prior' orders; the feature cells below are computed from them.
prior_orders = orders[orders['eval_set'].eq('prior')]

In [None]:
# Per-user aggregates over the prior orders.
user_groups = prior_orders.groupby('user_id')

# Number of prior orders placed by each user.
u_total_orders = user_groups['order_id'].count().rename('u_total_orders').reset_index()
print(u_total_orders.head(2))

# Total of the recorded gaps (in days) between a user's consecutive orders.
u_history = user_groups['days_since_prior_order'].sum().rename('u_history_days').reset_index()
print(u_history.head(2))

# NOTE(review): presumably each user's first order has no
# days_since_prior_order, so the sum covers one gap fewer than
# u_total_orders -- confirm that dividing by the order count is intended.
u_total_orders = u_total_orders.merge(u_history, on='user_id', how='left').assign(
    u_order_frequency_days=lambda frame: frame['u_history_days'] / frame['u_total_orders']
)
u_features = u_total_orders

print(u_features.head(2))
print(u_features.shape)

# Product Features

In [None]:
# Attach the purchased products to every prior order (one row per order line).
prior_order_products = prior_orders.merge(orders_products_prior, on="order_id", how="left")
print(prior_order_products.head(2))
print(prior_order_products.shape)

product_groups = prior_order_products.groupby('product_id')

# How many prior order lines contain each product.
p_total_orders = product_groups['order_id'].count().rename('p_total_orders').reset_index()
print(p_total_orders.head(2))

# Share of a product's order lines that were flagged as reorders.
p_order_frequency_per_order = (
    product_groups['reordered'].mean().rename('p_reorder_rate_per_order').reset_index()
)
print(p_order_frequency_per_order.head(2))

# One row per product with both aggregates.
p_features = pd.merge(p_total_orders, p_order_frequency_per_order, on='product_id', how='left')
print(p_features.head(2))
print(p_features.shape)

# User x Product Features

In [None]:
# Mean of the `reordered` flag for every (user, product) pair seen in the
# prior orders.
uxp_reorder_rate = (
    prior_order_products
    .groupby(['user_id', 'product_id'])['reordered']
    .mean()
    .rename('uxp_reorder_rate_per_order')
    .reset_index()
)

# A per-pair row count (`uxp_total_orders`, from groupby(...).size() merged
# back on ['user_id', 'product_id']) was prototyped here and is currently
# disabled.

print(uxp_reorder_rate.head(2))
print(uxp_reorder_rate.shape)


# Build Train Data

In [None]:
# Build the modelling table:
#   1. take the orders flagged as the 'train' evaluation set,
#   2. attach their order lines,
#   3. left-join the user, product and user x product features,
#   4. remove unnecessary identifier columns.
# The split into train and test sets happens in a later cell.
orders_train = orders[orders['eval_set'].eq('train')]
print(orders_train.shape)

order_products_train = orders_train.merge(orders_products_train, on='order_id', how='left')
print(order_products_train.head(2))
print(order_products_train.shape)

# Join each feature table in turn, previewing the result after every join.
features_dataset = order_products_train
for feature_table, join_keys in (
    (u_features, 'user_id'),                        # user features
    (p_features, 'product_id'),                     # product features
    (uxp_reorder_rate, ['user_id', 'product_id']),  # user x product features
):
    features_dataset = features_dataset.merge(feature_table, on=join_keys, how='left')
    print(features_dataset.head(2))

# Remove unwanted columns (identifiers and the eval-set flag).
# NOTE(review): any other column carried over from the train order lines
# (e.g. add_to_cart_order, if present) stays in the table -- confirm it is
# available at prediction time, otherwise it leaks the target basket.
features_dataset = features_dataset.drop(columns=['eval_set', 'order_id', 'product_id', 'user_id'])

# NOTE(review): this multiplies a per-order rate by days-per-order; the
# result is named "per day" -- confirm the intended formula.
features_dataset["p_reorder_rate_per_day"] = (
    features_dataset["p_reorder_rate_per_order"] * features_dataset["u_order_frequency_days"]
)

print(features_dataset.head(10))

In [None]:
# Report the number of missing values per column, then replace every
# missing value with 0 so the estimators below receive a fully numeric table.
print(features_dataset.isna().sum())
features_dataset = features_dataset.fillna(value=0)
features_dataset.describe()

In [None]:
# Class balance: number of rows for each value of the `reordered` label.
features_dataset.groupby('reordered').size()

In [None]:
# Split the feature table into train and test partitions.
from sklearn.model_selection import train_test_split

RANDOM_STATE = 7  # shared seed for the split and for every estimator below
TEST_SIZE = 0.2   # fraction of rows held out for evaluation

train, test = train_test_split(features_dataset, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Separate the `reordered` label from the predictor columns.
X_train, Y_train = train.drop(columns='reordered'), train['reordered']
X_test, Y_test = test.drop(columns='reordered'), test['reordered']


In [None]:
def metrics(Y_test, Y_pred):
    """Print a confusion matrix and summary scores for a set of predictions.

    Args:
        Y_test: True labels.
        Y_pred: Predicted labels, aligned with ``Y_test``.

    Returns:
        None. All results are written to stdout.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

    print(f"Confusion Matrix: \n{confusion_matrix(Y_test, Y_pred)}")

    # Scalar scores share the same (y_true, y_pred) call shape, so print
    # them from a single table in the original order.
    scalar_scores = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for title, score_fn in scalar_scores:
        print(f"{title}: {score_fn(Y_test, Y_pred)}")

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression on the training split and score it on the
# held-out split.
model = LogisticRegression(max_iter=200, random_state=RANDOM_STATE).fit(X_train, Y_train)
Y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)



# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree; depth and leaf size are capped.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=10,
    random_state=RANDOM_STATE,
).fit(X_train, Y_train)
Y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)



# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Ten-tree random forest with the same depth and leaf-size caps as the
# single decision tree above.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=10,
    min_samples_leaf=10,
    random_state=RANDOM_STATE,
).fit(X_train, Y_train)
Y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)


# Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with log-loss; ten boosting stages of depth <= 10.
gbc_model = GradientBoostingClassifier(
    loss='log_loss',
    n_estimators=10,
    max_depth=10,
    min_samples_leaf=10,
    random_state=RANDOM_STATE,
).fit(X_train, Y_train)
Y_pred = gbc_model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)



# Plot a Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree under its own name so the following cells can plot
# and inspect it.
decisionTreeClassifier = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=10,
    random_state=RANDOM_STATE,
).fit(X_train, Y_train)
Y_pred = decisionTreeClassifier.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt

# Plot a large image of the tree, limited to its top levels (max_depth=3)
# so the nodes stay readable.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    decisionTreeClassifier,
    max_depth=3,
    # Pass a plain list: some scikit-learn releases (e.g. 1.3.0) validate
    # `feature_names` as a list and reject a pandas Index.
    feature_names=list(X_train.columns),
    class_names=['0', '1'],
    filled=True,
)
plt.show()

In [None]:
# Feature importances of the fitted decision tree.
importances = decisionTreeClassifier.feature_importances_

# Positions that order the importances from smallest to largest.
sorted_index = importances.argsort()

# Column names arranged in that same ascending order.
labels = X_train.columns[sorted_index]

# Print "<feature> <importance>" from the most to the least important.
for feature_pos in sorted_index[::-1]:
    print(f"{X_train.columns[feature_pos]} \t {importances[feature_pos].round(4)}")

In [None]:
# One-hot encode the day of week.
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
dow_encoded = encoder.fit_transform(features_dataset[['order_dow']]).toarray()

# Name each column after the category it encodes (not its position) so the
# names stay correct even if a weekday is absent, and reuse the source index
# so the concat below aligns row-for-row whatever index features_dataset has.
dow_encoded = pd.DataFrame(
    dow_encoded,
    columns=[f"dow_{int(day)}" for day in encoder.categories_[0]],
    index=features_dataset.index,
)

features_dataset = pd.concat([features_dataset, dow_encoded], axis=1)

# The raw column is now redundant with the indicator columns.
features_dataset = features_dataset.drop('order_dow', axis=1)

def get_train_test_split(features_dataset):
    """Split a feature table into train/test predictors and labels.

    The split uses the module-level ``TEST_SIZE`` and ``RANDOM_STATE``.

    Args:
        features_dataset: DataFrame holding the predictor columns plus the
            ``reordered`` label column.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)``.
    """
    label_values = features_dataset['reordered']
    predictor_columns = features_dataset.drop(columns='reordered')

    # Splitting predictors and labels in one call applies the same row
    # partition to both.
    X_train, X_test, Y_train, Y_test = train_test_split(
        predictor_columns,
        label_values,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )
    return X_train, Y_train, X_test, Y_test

# Re-split now that `order_dow` has been replaced by its one-hot columns.
X_train, Y_train, X_test, Y_test = get_train_test_split(features_dataset)

In [None]:
# Refit the logistic regression on the current feature set and report its
# test-split metrics.
lr = LogisticRegression(max_iter=200, random_state=RANDOM_STATE).fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)

In [None]:
# Encode hour of day to early morning, morning, afternoon, evening, night

def encode_hour_of_day(hour):
    """Map an hour of the day to a coarse day-part label.

    Args:
        hour: Hour value; compared with ``<`` against 6, 12, 18 and 21.

    Returns:
        ``'early_morning'`` below 6, ``'morning'`` below 12, ``'afternoon'``
        below 18, ``'evening'`` below 21, otherwise ``'night'``.
    """
    # (exclusive upper bound, label) pairs in ascending order.
    day_parts = (
        (6, 'early_morning'),
        (12, 'morning'),
        (18, 'afternoon'),
        (21, 'evening'),
    )
    for upper_bound, part_label in day_parts:
        if hour < upper_bound:
            return part_label
    return 'night'

# Replace the numeric hour with its day-part label.
features_dataset['order_hour_of_day'] = features_dataset['order_hour_of_day'].apply(encode_hour_of_day)

# One-hot encode the day-part labels.
encoder = OneHotEncoder()
hour_encoded = encoder.fit_transform(features_dataset[['order_hour_of_day']]).toarray()

# Name each column after the label it encodes (e.g. `hour_morning`) rather
# than its position: the encoder orders categories by sorted value, so
# positional names such as `hour_0` hide which day part a column stands for.
# Reusing the source index keeps the concat below aligned row-for-row.
hour_encoded = pd.DataFrame(
    hour_encoded,
    columns=[f"hour_{part}" for part in encoder.categories_[0]],
    index=features_dataset.index,
)

features_dataset = pd.concat([features_dataset, hour_encoded], axis=1)

# The label column is now redundant with the indicator columns.
features_dataset = features_dataset.drop('order_hour_of_day', axis=1)

X_train, Y_train, X_test, Y_test = get_train_test_split(features_dataset)

In [None]:
# Refit the logistic regression on the current feature set and report its
# test-split metrics.
lr = LogisticRegression(max_iter=200, random_state=RANDOM_STATE).fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)

In [None]:
# Refit the decision tree on the current feature set and report its
# test-split metrics.
decisionTreeClassifier = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=10,
    random_state=RANDOM_STATE,
).fit(X_train, Y_train)
Y_pred = decisionTreeClassifier.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
metrics(Y_test, Y_pred)

In [None]:
# Feature importances of the refitted decision tree.
importances = decisionTreeClassifier.feature_importances_

# Positions that order the importances from smallest to largest.
sorted_index = importances.argsort()

# Column names arranged in that same ascending order.
labels = X_train.columns[sorted_index]

# Print "<feature> <importance>" from the most to the least important.
for feature_pos in sorted_index[::-1]:
    print(f"{X_train.columns[feature_pos]} \t {importances[feature_pos].round(4)}")