In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Read the master dataset (master_data2.csv) into a DataFrame
master_csv_path = 'saved_datasets/master_data2.csv'
data2 = pd.read_csv(master_csv_path)

In [54]:
# Number of distinct user_id values in the dataset (missing values are not counted)
data2.loc[:, 'user_id'].nunique(dropna=True)

2192

In [55]:
# Share of rows with reordered == 1: matching row count divided by total row count
int(data2['reordered'].eq(1).sum()) / len(data2)

0.5835062884714314

In [56]:
# Splitting the data into train, validation and test sets using stratified sampling on reordered
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and every result derived from it) is identical on each run.
SPLIT_SEED = 42

# Hold out 20% of all rows as the test set, then 20% of the remaining rows as the
# validation set. Both splits are stratified on reordered so each part keeps the
# same share of reordered == 1 rows.
train, test = train_test_split(
    data2, test_size=0.2, stratify=data2['reordered'], random_state=SPLIT_SEED
)
train, val = train_test_split(
    train, test_size=0.2, stratify=train['reordered'], random_state=SPLIT_SEED
)

train.shape, val.shape, test.shape


((216472, 15), (54118, 15), (67648, 15))

In [57]:
# Fraction of reordered == 1 rows in each split, reported in the order (train, val, test)
tuple(
    split[split['reordered'] == 1].shape[0] / split.shape[0]
    for split in (train, val, test)
)

(0.5835073358217229, 0.5835027162866329, 0.5835057947019867)

In [58]:
# Preview the first five rows of the training split
train.iloc[:5]

Unnamed: 0,order_id,product_id,add_to_cart_order,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department,reordered
18563622,1958229,8736,3,786,prior,72,3,16,3.0,Synergy Organic & Raw Cosmic Cranberry,31,7,refrigerated,beverages,1
1456372,153549,43026,1,454,prior,10,4,15,6.0,Deli Fresh Honey Shaved Smoked Ham,96,20,lunch meat,deli,1
32959543,1289521,13176,5,2159,train,8,5,7,3.0,Bag of Organic Bananas,24,4,fresh fruits,produce,1
8533702,900872,44560,8,1494,prior,11,3,13,9.0,Kids Organic Chocolate Chip ZBars,3,19,energy granola bars,snacks,1
3556608,375755,29944,20,1267,prior,3,6,12,15.0,Sriracha Potato Chips,107,19,chips pretzels,snacks,0


In [59]:
#function to drop order_id, eval_set, product_id, aisle_id and department_id
def drop_columns(data):
    """Return ``data`` without the identifier columns.

    Removes order_id, eval_set, product_id, aisle_id and department_id.
    All five must be present, otherwise pandas raises ``KeyError``.
    The frame passed in is not modified; a new frame is returned.
    """
    unwanted = ['order_id', 'eval_set', 'product_id', 'aisle_id', 'department_id']
    return data.drop(columns=unwanted)

#function to separate features and target
def feature_target(data):
    """Split ``data`` into a feature frame and the ``reordered`` target.

    Returns a tuple ``(X, y)`` where ``X`` is ``data`` without the reordered
    column and ``y`` is the reordered column itself.
    """
    target_name = 'reordered'
    features = data.drop(columns=target_name)
    target = data[target_name]
    return features, target



In [60]:
# Remove the identifier columns from the training split and preview the result
train = train.pipe(drop_columns)
train.head()



Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,reordered
18563622,3,786,72,3,16,3.0,Synergy Organic & Raw Cosmic Cranberry,refrigerated,beverages,1
1456372,1,454,10,4,15,6.0,Deli Fresh Honey Shaved Smoked Ham,lunch meat,deli,1
32959543,5,2159,8,5,7,3.0,Bag of Organic Bananas,fresh fruits,produce,1
8533702,8,1494,11,3,13,9.0,Kids Organic Chocolate Chip ZBars,energy granola bars,snacks,1
3556608,20,1267,3,6,12,15.0,Sriracha Potato Chips,chips pretzels,snacks,0


In [61]:
#adding a feature: average days_since_prior_order for each user_id

def avg_days_since_prior_order(data):
    """Attach each user's mean days_since_prior_order to every row.

    The mean is computed over all rows of ``data`` that share a user_id and
    stored in a new column, avg_days_since_prior_order. The result comes from
    a left merge on user_id, so a new frame is returned.
    """
    per_user_mean = (
        data.groupby('user_id')['days_since_prior_order']
        .mean()
        .reset_index()
    )
    # Both frames carry days_since_prior_order, so the merge tags the row-level
    # value with _x and the per-user mean with _y; the rename restores the names.
    merged = pd.merge(data, per_user_mean, how='left', on='user_id', suffixes=('_x', '_y'))
    final_names = {
        'days_since_prior_order_x': 'days_since_prior_order',
        'days_since_prior_order_y': 'avg_days_since_prior_order',
    }
    return merged.rename(columns=final_names)

# Add the per-user average days_since_prior_order to the training split and preview it
train = train.pipe(avg_days_since_prior_order)
train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,reordered,avg_days_since_prior_order
0,3,786,72,3,16,3.0,Synergy Organic & Raw Cosmic Cranberry,refrigerated,beverages,1,3.257274
1,1,454,10,4,15,6.0,Deli Fresh Honey Shaved Smoked Ham,lunch meat,deli,1,6.525
2,5,2159,8,5,7,3.0,Bag of Organic Bananas,fresh fruits,produce,1,11.425532
3,8,1494,11,3,13,9.0,Kids Organic Chocolate Chip ZBars,energy granola bars,snacks,1,6.423077
4,20,1267,3,6,12,15.0,Sriracha Potato Chips,chips pretzels,snacks,0,13.010256


In [62]:
#most_common order_dow for each user_id : mode of order_dow for each user_id
def most_common_order_dow(data):
    """Attach each user's most frequent order_dow to every row.

    For every user_id the order_dow values are counted and the first entry of
    the resulting ``value_counts()`` is taken. That value is stored in the new
    column most_common_order_dow. A new frame (left merge on user_id) is returned.
    """
    def _top_value(values):
        # value_counts() lists the most frequent value first.
        return values.value_counts().index[0]

    per_user_mode = (
        data.groupby('user_id')['order_dow']
        .agg(_top_value)
        .reset_index()
    )
    # order_dow exists on both sides: _x is the row's own value, _y the per-user mode.
    merged = pd.merge(data, per_user_mode, how='left', on='user_id', suffixes=('_x', '_y'))
    final_names = {
        'order_dow_x': 'order_dow',
        'order_dow_y': 'most_common_order_dow',
    }
    return merged.rename(columns=final_names)

# Add the per-user most common order_dow to the training split and preview it
train = train.pipe(most_common_order_dow)
train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,reordered,avg_days_since_prior_order,most_common_order_dow
0,3,786,72,3,16,3.0,Synergy Organic & Raw Cosmic Cranberry,refrigerated,beverages,1,3.257274,4
1,1,454,10,4,15,6.0,Deli Fresh Honey Shaved Smoked Ham,lunch meat,deli,1,6.525,4
2,5,2159,8,5,7,3.0,Bag of Organic Bananas,fresh fruits,produce,1,11.425532,2
3,8,1494,11,3,13,9.0,Kids Organic Chocolate Chip ZBars,energy granola bars,snacks,1,6.423077,2
4,20,1267,3,6,12,15.0,Sriracha Potato Chips,chips pretzels,snacks,0,13.010256,6


In [63]:
# List the distinct order_hour_of_day values present in the training split
pd.unique(train['order_hour_of_day'])

array([16, 15,  7, 13, 12,  9, 17, 10, 11, 20,  8, 14, 18, 21,  2, 22,  6,
       19, 23,  0,  4,  5,  1,  3])

In [64]:
#splitting order_hour_of_day into 4 categories
def split_order_hour_of_day(data):
    """Replace the numeric order_hour_of_day with a time-of-day category.

    Hours are bucketed as follows (both ends inclusive for whole hours):

        0-6   -> 'night'
        7-12  -> 'morning'
        13-18 -> 'afternoon'
        19-24 -> 'evening'

    Values outside 0-24 become NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric order_hour_of_day column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which order_hour_of_day is categorical. ``data`` itself
        is left unchanged.
    """
    hour_edges = [0, 6, 12, 18, 24]
    period_labels = ['night', 'morning', 'afternoon', 'evening']
    # pd.cut builds right-closed intervals, so by default the first bin is (0, 6]
    # and hour 0 matches no bin (-> NaN). include_lowest=True closes that first
    # bin on the left so midnight orders are labelled 'night'.
    periods = pd.cut(
        data['order_hour_of_day'],
        bins=hour_edges,
        labels=period_labels,
        include_lowest=True,
    )
    # assign() returns a new frame, so the caller's numeric column is not overwritten.
    return data.assign(order_hour_of_day=periods)

# Convert order_hour_of_day to its time-of-day category in the training split and preview it
train = train.pipe(split_order_hour_of_day)
train.head()


Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,reordered,avg_days_since_prior_order,most_common_order_dow
0,3,786,72,3,afternoon,3.0,Synergy Organic & Raw Cosmic Cranberry,refrigerated,beverages,1,3.257274,4
1,1,454,10,4,afternoon,6.0,Deli Fresh Honey Shaved Smoked Ham,lunch meat,deli,1,6.525,4
2,5,2159,8,5,morning,3.0,Bag of Organic Bananas,fresh fruits,produce,1,11.425532,2
3,8,1494,11,3,afternoon,9.0,Kids Organic Chocolate Chip ZBars,energy granola bars,snacks,1,6.423077,2
4,20,1267,3,6,morning,15.0,Sriracha Potato Chips,chips pretzels,snacks,0,13.010256,6


In [65]:
#total number of orders for each user_id
def total_orders(data):
    """Attach the largest order_number seen for each user_id as total_orders.

    The maximum is taken over the rows present in ``data`` only. A new frame
    (left merge on user_id) is returned.
    """
    highest_order = data.groupby('user_id')['order_number'].max().reset_index()
    # order_number appears on both sides: _x is the row's own order number,
    # _y the per-user maximum.
    merged = pd.merge(data, highest_order, how='left', on='user_id', suffixes=('_x', '_y'))
    return merged.rename(
        columns={
            'order_number_x': 'order_number',
            'order_number_y': 'total_orders',
        }
    )

# Add the per-user total_orders feature to the training split and preview it
train = train.pipe(total_orders)
train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,reordered,avg_days_since_prior_order,most_common_order_dow,total_orders
0,3,786,72,3,afternoon,3.0,Synergy Organic & Raw Cosmic Cranberry,refrigerated,beverages,1,3.257274,4,100
1,1,454,10,4,afternoon,6.0,Deli Fresh Honey Shaved Smoked Ham,lunch meat,deli,1,6.525,4,55
2,5,2159,8,5,morning,3.0,Bag of Organic Bananas,fresh fruits,produce,1,11.425532,2,8
3,8,1494,11,3,afternoon,9.0,Kids Organic Chocolate Chip ZBars,energy granola bars,snacks,1,6.423077,2,14
4,20,1267,3,6,morning,15.0,Sriracha Potato Chips,chips pretzels,snacks,0,13.010256,6,27


In [66]:
#average number of products in each order for each user_id
def avg_products(data):
    """Attach the per-user mean of add_to_cart_order as avg_products.

    The mean is taken over all rows of ``data`` sharing a user_id. A new frame
    (left merge on user_id) is returned.

    NOTE(review): this averages the add_to_cart_order values row by row; it
    does not count the products of each order and then average those counts.
    Confirm that this is the intended "average number of products per order".
    """
    per_user_mean = data.groupby('user_id')['add_to_cart_order'].mean().reset_index()
    # add_to_cart_order appears on both sides: _x is the row's own value,
    # _y the per-user mean.
    merged = pd.merge(data, per_user_mean, how='left', on='user_id', suffixes=('_x', '_y'))
    final_names = {
        'add_to_cart_order_x': 'add_to_cart_order',
        'add_to_cart_order_y': 'avg_products',
    }
    return merged.rename(columns=final_names)

# Add the per-user avg_products feature to the training split and preview it
train = train.pipe(avg_products)
train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,reordered,avg_days_since_prior_order,most_common_order_dow,total_orders,avg_products
0,3,786,72,3,afternoon,3.0,Synergy Organic & Raw Cosmic Cranberry,refrigerated,beverages,1,3.257274,4,100,8.188822
1,1,454,10,4,afternoon,6.0,Deli Fresh Honey Shaved Smoked Ham,lunch meat,deli,1,6.525,4,55,6.953086
2,5,2159,8,5,morning,3.0,Bag of Organic Bananas,fresh fruits,produce,1,11.425532,2,8,7.615385
3,8,1494,11,3,afternoon,9.0,Kids Organic Chocolate Chip ZBars,energy granola bars,snacks,1,6.423077,2,14,9.505376
4,20,1267,3,6,morning,15.0,Sriracha Potato Chips,chips pretzels,snacks,0,13.010256,6,27,8.153465


In [71]:
# Count missing values per column.
# NOTE(review): this cell's execution count (71) is higher than those of the two
# imputation cells that follow it (68 and 70), so the zeros displayed here
# reflect the state after imputation, not before.
train.isna().sum()


add_to_cart_order             0
user_id                       0
order_number                  0
order_dow                     0
order_hour_of_day             0
days_since_prior_order        0
product_name                  0
aisle                         0
department                    0
reordered                     0
avg_days_since_prior_order    0
most_common_order_dow         0
total_orders                  0
avg_products                  0
dtype: int64

In [68]:
# Flag the rows where days_since_prior_order is NaN before they are overwritten
missing_prior_gap = train['days_since_prior_order'].isnull()
# Imputing NaN values with 0, as this is the first order
train['days_since_prior_order'] = train['days_since_prior_order'].fillna(0)
# Last expression of the cell, so it is displayed: number of rows that were imputed
int(missing_prior_gap.sum())

In [70]:
# Fill missing order_hour_of_day entries with the column's most frequent value
most_frequent_period = train['order_hour_of_day'].mode()[0]
train['order_hour_of_day'] = train['order_hour_of_day'].fillna(most_frequent_period)

In [72]:
# Separating the feature frame from the reordered target
X_train, y_train = train.pipe(feature_target)

In [73]:
# One-hot encode the categorical features; drop_first=True omits the first level
# of each encoded column.
categorical_columns = [
    'order_dow',
    'order_hour_of_day',
    'product_name',
    'aisle',
    'department',
    'most_common_order_dow',
]
X_train = pd.get_dummies(data=X_train, columns=categorical_columns, drop_first=True)
X_train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,days_since_prior_order,avg_days_since_prior_order,total_orders,avg_products,order_dow_1,order_dow_2,order_dow_3,...,department_personal care,department_pets,department_produce,department_snacks,most_common_order_dow_1,most_common_order_dow_2,most_common_order_dow_3,most_common_order_dow_4,most_common_order_dow_5,most_common_order_dow_6
0,3,786,72,3.0,3.257274,100,8.188822,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,454,10,6.0,6.525,55,6.953086,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,5,2159,8,3.0,11.425532,8,7.615385,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,8,1494,11,9.0,6.423077,14,9.505376,0,0,1,...,0,0,0,1,0,1,0,0,0,0
4,20,1267,3,15.0,13.010256,27,8.153465,0,0,0,...,0,0,0,1,0,0,0,0,0,1
