In [None]:
# ! pip install shap

In [None]:
import pandas as pd

# Widen the console representation so wide frames are not wrapped when printed.
pd.options.display.width = 1000

# Load Data

In [None]:
# Read the three lookup tables first, then preview them in the same order.
products = pd.read_csv("datasets/instacart-market-basket-analysis/products.csv")
departments = pd.read_csv("datasets/instacart-market-basket-analysis/departments.csv")
aisles = pd.read_csv("datasets/instacart-market-basket-analysis/aisles.csv")

for lookup_table in (products, departments, aisles):
    # Two sample rows followed by the (rows, columns) tuple.
    print(lookup_table.head(2))
    print(f"{lookup_table.shape}")


In [None]:
# Read the order header table and the two order-line tables, then preview each.
orders = pd.read_csv("datasets/instacart-market-basket-analysis/orders.csv")
orders_products_prior = pd.read_csv("datasets/instacart-market-basket-analysis/order_products__prior.csv")
orders_products_train = pd.read_csv("datasets/instacart-market-basket-analysis/order_products__train.csv")

for order_table in (orders, orders_products_prior, orders_products_train):
    # Two sample rows followed by the (rows, columns) tuple.
    print(order_table.head(2))
    print(f"{order_table.shape}")


In [None]:
# Goal is to predict which previously ordered items will be in next user order
# Extract a small sample set and perform EDA
# Features
# Model Design:
# Input : [User , Product] -> [Probability of ordering again]
# 
# Features:
# User: Counts: Total Orders, Order frequency, Avg unique products, Avg total items,       
# Product: Avg order items, Order frequency
# Label: Reordered

# User Features

In [None]:
# Summarise the orders table (columns, dtypes, non-null counts) before deriving features.
orders.info()

In [None]:
# Extract the orders flagged as "prior"; the user, product and user x product
# features in the following cells are all derived from this subset.
prior_orders = orders.loc[orders["eval_set"] == "prior"]

In [None]:
# Per-user aggregates, computed from prior orders only.
orders_by_user = prior_orders.groupby('user_id')

# Number of prior orders placed by each user.
u_total_orders = orders_by_user['order_id'].count().reset_index(name='u_total_orders')
print(u_total_orders.head(2))

# Total days spanned by each user's recorded order-to-order gaps (NaN gaps are skipped).
u_history = orders_by_user['days_since_prior_order'].sum().reset_index(name='u_history_days')
print(u_history.head(2))

# Average gap between consecutive orders. `mean` ignores missing gaps, so the
# denominator is the number of recorded gaps rather than the number of orders;
# dividing by the order count understates the gap whenever an order has no
# `days_since_prior_order` (presumably each user's first order - verify).
u_order_frequency = (
    orders_by_user['days_since_prior_order']
    .mean()
    .reset_index(name='u_order_frequency_days')
)

# Assemble the feature table from the three aggregates without mutating any of
# them, so the cell can be re-run safely. Column order is unchanged:
# user_id, u_total_orders, u_history_days, u_order_frequency_days.
u_features = (
    u_total_orders
    .merge(u_history, on='user_id', how='left')
    .merge(u_order_frequency, on='user_id', how='left')
)

print(u_features.head(2))
print(u_features.shape)

# Product Features

In [None]:
# One row per product line of every prior order, carrying the order-level columns.
prior_order_products = prior_orders.merge(orders_products_prior, on="order_id", how="left")
print(prior_order_products.head(2))
print(f"{prior_order_products.shape}")

# Group once and derive both product aggregates from the same grouping.
lines_by_product = prior_order_products.groupby('product_id')

# Number of prior order lines that contain the product.
p_total_orders = lines_by_product['order_id'].count().reset_index(name='p_total_orders')
print(p_total_orders.head(2))

# Mean of the `reordered` flag over those lines.
p_order_frequency_per_order = lines_by_product['reordered'].mean().reset_index(name='p_reorder_rate_per_order')
print(p_order_frequency_per_order.head(2))

# Product feature table keyed by product_id.
p_features = pd.merge(p_total_orders, p_order_frequency_per_order, on='product_id', how='left')
print(p_features.head(2))
print(p_features.shape)

# User x Product Features

In [None]:
# Group the prior order lines by (user, product) once and reuse the grouping.
lines_by_user_product = prior_order_products.groupby(['user_id', 'product_id'])

# Mean of the `reordered` flag for each (user, product) pair.
uxp_reorder_rate = lines_by_user_product['reordered'].mean().reset_index(name='uxp_reorder_rate_per_order')

# Number of prior order lines for each pair. This frame is not referenced by
# any later cell visible in this notebook.
uxp_reorders = lines_by_user_product.size().reset_index(name='uxp_total_orders')

print(uxp_reorder_rate.head(2))
print(uxp_reorder_rate.shape)


# Build Train Data

In [None]:
# Assemble the modelling table:
#   1. take the orders flagged as "train" and expand them to one row per product,
#   2. left-join the user, product and user x product features,
#   3. drop the identifier columns that are not model inputs,
#   4. add one derived interaction column.
# The train/test split happens in a later cell.
#
# NOTE(review): rows exist only for products present in a train order, so the
# user x product feature is missing exactly for pairs that never occur in the
# prior orders. If `reordered` encodes "this user bought the product before",
# that missingness leaks the label - confirm against the dataset definition.

orders_train = orders.loc[orders['eval_set'] == 'train']
print(orders_train.shape)

order_products_train = orders_train.merge(orders_products_train, on='order_id', how='left')
print(order_products_train.head(2))
print(order_products_train.shape)

# Merge user features
with_user_features = order_products_train.merge(u_features, on='user_id', how='left')
print(with_user_features.head(2))

# Merge product features
with_product_features = with_user_features.merge(p_features, on='product_id', how='left')
print(with_product_features.head(2))

# Merge user x product features
with_uxp_features = with_product_features.merge(
    uxp_reorder_rate, on=['user_id', 'product_id'], how='left'
)
print(with_uxp_features.head(2))

# Remove identifier columns, then append the product reorder rate multiplied by
# the user's order-frequency feature.
features_dataset = (
    with_uxp_features
    .drop(columns=['eval_set', 'order_id', 'product_id', 'user_id'])
    .assign(
        p_reorder_rate_per_day=lambda frame: (
            frame["p_reorder_rate_per_order"] * frame["u_order_frequency_days"]
        )
    )
)

print(features_dataset.head(10))

In [None]:
# Report the number of missing values per column before imputing.
print(features_dataset.isna().sum())

# Replace every missing value with 0, then show summary statistics of the
# imputed table as the cell output.
features_dataset = features_dataset.fillna(value=0)
features_dataset.describe()

In [None]:
# Class balance of the label: number of rows for each value of `reordered`.
features_dataset.groupby('reordered').size()

In [None]:
# Split into train and hold-out sets
from sklearn.model_selection import train_test_split

RANDOM_STATE = 7   # seed shared by the split and by every model below
TEST_SIZE = 0.2    # fraction of rows held out for evaluation

# Separate the label from the inputs, then split both in a single call.
# For a fixed seed and row count this selects the same rows as splitting the
# combined frame and dropping the label afterwards. It also avoids binding
# intermediate frames to the names `train` / `test`, which a later cell reuses
# for a function called `train`.
model_inputs = features_dataset.drop(columns='reordered')
model_labels = features_dataset['reordered']

X_train, X_test, Y_train, Y_test = train_test_split(
    model_inputs,
    model_labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def train(iterations=100):
    """Fit a logistic-regression baseline and print its hold-out accuracy.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test``, ``Y_test``
    and ``RANDOM_STATE``.

    Args:
        iterations: Upper bound on solver iterations (passed as ``max_iter``).
    """
    # Imported locally so this cell does not rely on edits to any other cell.
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # The inputs mix raw counts with rates; standardising them first lets the
    # solver make progress within the iteration budget instead of stopping
    # unconverged. The scaler is fitted on the training rows only.
    model = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=iterations, random_state=RANDOM_STATE),
    )
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    print(f"Accuracy: {accuracy}")

train(100)



# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
def train_tree():
    """Fit a depth-limited decision tree and print its hold-out accuracy.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test``, ``Y_test``
    and ``RANDOM_STATE``.
    """
    tree_clf = DecisionTreeClassifier(
        max_depth=10,
        min_samples_leaf=10,
        random_state=RANDOM_STATE,
    ).fit(X_train, Y_train)
    accuracy = accuracy_score(Y_test, tree_clf.predict(X_test))
    print(f"Accuracy: {accuracy}")

train_tree()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
def train_forest():
    """Fit a small random forest and print its hold-out accuracy.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test``, ``Y_test``
    and ``RANDOM_STATE``.
    """
    forest_params = {
        "n_estimators": 10,
        "max_depth": 10,
        "min_samples_leaf": 10,
        "random_state": RANDOM_STATE,
    }
    forest_clf = RandomForestClassifier(**forest_params).fit(X_train, Y_train)
    accuracy = accuracy_score(Y_test, forest_clf.predict(X_test))
    print(f"Accuracy: {accuracy}")

train_forest()

# Multi layer Perceptron - MLP

In [None]:
# # train neural network model using sklearn 
# from sklearn.neural_network import MLPClassifier

# def train_neural_network():
#     model = MLPClassifier(
#         hidden_layer_sizes=(10, 5),
#         max_iter=100,
#         random_state=RANDOM_STATE,
#         early_stopping=True,
#     )
#     model.fit(X_train, Y_train)
#     Y_pred = model.predict(X_test)
#     accuracy = accuracy_score(Y_test, Y_pred)
#     print(f"Accuracy: {accuracy}")

# train_neural_network()

# Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with only two boosting stages, using the same depth
# and leaf-size limits as the other tree models in this notebook.
gbt_params = {
    "criterion": 'squared_error',
    "n_estimators": 2,
    "max_depth": 10,
    "min_samples_leaf": 10,
    "random_state": RANDOM_STATE,
}
gcc_model = GradientBoostingClassifier(**gbt_params).fit(X_train, Y_train)

# Score on the hold-out rows.
Y_pred = gcc_model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")



# Plot a Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree; this fitted model is the one plotted and
# explained in the cells that follow.
entropy_tree_params = {
    "criterion": 'entropy',
    "max_depth": 10,
    "min_samples_leaf": 10,
    "random_state": RANDOM_STATE,
}
decisionTreeClassifier = DecisionTreeClassifier(**entropy_tree_params).fit(X_train, Y_train)

# Score on the hold-out rows.
Y_pred = decisionTreeClassifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt
# plot a large image of the tree

# Draw the top levels of the fitted tree on an explicit, large axes.
fig, ax = plt.subplots(figsize=(20, 10))
tree.plot_tree(
    decisionTreeClassifier,
    max_depth=3,  # only the first levels; the full depth-10 tree is unreadable
    # plot_tree documents `feature_names` as a list of str; some scikit-learn
    # releases reject a pandas Index, so convert explicitly.
    feature_names=list(X_train.columns),
    class_names=['0', '1'],
    filled=True,
    ax=ax,
)
plt.show()

# Print SHAP values
SHapley Additive exPlanations (SHAP)

In [None]:
import tqdm as notebook_tqdm
import shap

# Build a tree-specific SHAP explainer for the entropy decision tree fitted
# earlier and explain every row of the hold-out set. The resulting object is
# inspected and plotted in the following cells.
ex = shap.TreeExplainer(decisionTreeClassifier)
shap_values = ex(X_test)


In [None]:
shap_x_values = shap_values.values
print(shap_x_values.shape)

# Reduce the SHAP values to a 2-D (rows, features) array for plotting.
# A classifier explanation can carry a trailing per-output axis, i.e.
# (rows, features, outputs). Reshaping that to (rows, features) only works
# when there is a single output, so select the last output column instead
# (presumably the positive class for a binary label - verify). For a single
# output this yields the same array as the reshape; 2-D input is left as is.
if shap_x_values.ndim == 3:
    shap_x_values = shap_x_values[:, :, shap_x_values.shape[2] - 1]

print(shap_x_values.shape)

In [None]:
import numpy as np
# shap.plots.bar takes an Explanation (not a raw ndarray) and has no
# `feature_names` argument; feature names travel with the Explanation itself.
# When the explanation has a trailing per-output axis, keep the last output
# (presumably the positive class - verify) so the result is (rows, features).
if len(shap_values.shape) == 3:
    class_explanation = shap_values[:, :, shap_values.shape[2] - 1]
else:
    class_explanation = shap_values

# Signed mean SHAP value per feature, kept as a plain array for inspection.
mean_shap_values = np.mean(class_explanation.values, axis=0)

# Plot the same per-feature means from the reduced Explanation.
shap.plots.bar(class_explanation.mean(0))

# shap.plots.bar(shap_values[0])

# plt.show()

In [None]:
# Global importance plot: the bar plot averages |SHAP| over rows for a
# (rows, features) explanation. If the explanation carries a trailing
# per-output axis, select the last output first (presumably the positive
# class - verify); otherwise plot it unchanged.
if len(shap_values.shape) == 3:
    shap_for_plot = shap_values[:, :, shap_values.shape[2] - 1]
else:
    shap_for_plot = shap_values
shap.plots.bar(shap_for_plot, max_display=10)

# Node Level Understanding

In [None]:
# Inspect the leaf reached by the first row of the hold-out set.
leaf_indices = []
leaf_values = []
single_example = X_test.iloc[0:1]

# Id of the leaf node that the row ends up in (array with one entry).
indices = decisionTreeClassifier.apply(single_example)
print(indices)

# Entry [0, 0] of the value array stored at that leaf.
values = decisionTreeClassifier.tree_.value[indices, 0, 0]
print(values)
leaf_values.append(values)

In [None]:
# Node-membership indicator for the first hold-out row: one row per sample,
# one column per tree node.
first_row = X_test.iloc[0:1]
sparse_matrix = decisionTreeClassifier.decision_path(first_row)
print(sparse_matrix.shape)
print(sparse_matrix)


In [None]:
# Short aliases for the fitted tree's internal arrays, used by the rule
# printing cell below.
clf = decisionTreeClassifier
fitted_tree = clf.tree_

n_nodes = fitted_tree.node_count
children_left, children_right = fitted_tree.children_left, fitted_tree.children_right
feature, threshold = fitted_tree.feature, fitted_tree.threshold
values = fitted_tree.value

In [None]:
# Print the split rules that the first hold-out row passes through.
sample_id = 0
first_row = X_test.iloc[0:1]
node_indicator = clf.decision_path(first_row)
leaf_id = clf.apply(first_row)

# Ids of the nodes `sample_id` goes through: the slice of the sparse
# indicator's `indices` delimited by consecutive `indptr` entries.
path_start = node_indicator.indptr[sample_id]
path_end = node_indicator.indptr[sample_id + 1]
node_index = node_indicator.indices[path_start:path_end]

print(node_index)
print(feature)
print(threshold)
print("Rules used to predict sample {id}:\n".format(id=sample_id))

reached_leaf = leaf_id[sample_id]
for node_id in node_index:
    # The leaf itself holds no split rule, so only internal nodes are reported.
    if node_id != reached_leaf:
        # Value of the split feature for this sample, compared to the threshold.
        sample_value = X_test.iloc[sample_id, feature[node_id]]
        threshold_sign = "<=" if sample_value <= threshold[node_id] else ">"

        print(
            "decision node {node} : (X_test[{sample}, {feature}] = {value}) "
            "{inequality} {threshold})".format(
                node=node_id,
                sample=sample_id,
                feature=feature[node_id],
                value=sample_value,
                inequality=threshold_sign,
                threshold=threshold[node_id],
            )
        )

In [None]:
# Leaf id, predicted class, class probabilities and log-probabilities for the
# first hold-out row.
first_row = X_test.iloc[0:1]
leaf_id = clf.apply(first_row)
for predict_fn in (clf.predict, clf.predict_proba, clf.predict_log_proba):
    print(predict_fn(first_row))
