In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
#load master data
data = pd.read_csv('data/master_data.csv')
data.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department,reordered
0,2,33120,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs,1
1,2,28985,2,202279,prior,3,5,9,8.0,Michigan Organic Kale,83,4,fresh vegetables,produce,1
2,2,9327,3,202279,prior,3,5,9,8.0,Garlic Powder,104,13,spices seasonings,pantry,0
3,2,45918,4,202279,prior,3,5,9,8.0,Coconut Butter,19,13,oils vinegars,pantry,1
4,2,30035,5,202279,prior,3,5,9,8.0,Natural Sweetener,17,13,baking ingredients,pantry,0


In [9]:
data.shape

(33819106, 15)

In [10]:
data.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id,reordered
count,33819110.0,33819110.0,33819110.0,33819110.0,33819110.0,33819110.0,33819110.0,31741040.0,33819110.0,33819110.0,33819110.0
mean,1710566.0,25575.51,8.367738,102944.4,17.13998,2.737285,13.43123,11.36415,71.21799,9.918544,0.5900617
std,987400.8,14097.7,7.13954,59467.33,17.49829,2.093296,4.246149,8.9405,38.19898,6.281655,0.491822
min,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,855413.0,13519.0,3.0,51435.0,5.0,1.0,10.0,5.0,31.0,4.0,0.0
50%,1710660.0,25256.0,6.0,102626.0,11.0,3.0,13.0,8.0,83.0,9.0,1.0
75%,2565587.0,37935.0,11.0,154412.0,24.0,5.0,16.0,15.0,107.0,16.0,1.0
max,3421083.0,49688.0,145.0,206209.0,100.0,6.0,23.0,30.0,134.0,21.0,1.0


In [11]:
#total unique user_id
data['user_id'].nunique()

206209

In [12]:
#sorting data by user_id and droping the data where user_id > 25 percentile
data2 = data.sort_values('user_id')
data2 = data[data['user_id'] <= data['user_id'].quantile(0.01)]
data2.shape

(338238, 15)

In [13]:
#total unique user_id
data2['user_id'].nunique()

2192

In [18]:
#reordered count/total rows : reordered = 1 percentage
data2[data2['reordered'] == 1].shape[0]/data2.shape[0]

0.5835062884714314

In [25]:
# Spliting the data train validation and test using stratified sampling on reordered
from sklearn.model_selection import train_test_split
train, test = train_test_split(data2, test_size=0.2, stratify=data2['reordered'])
train, val = train_test_split(train, test_size=0.2, stratify=train['reordered'])

train.shape, val.shape, test.shape


((216472, 15), (54118, 15), (67648, 15))

In [26]:
# percentage of reordered = 1 in train, val and test
train[train['reordered'] == 1].shape[0]/train.shape[0], val[val['reordered'] == 1].shape[0]/val.shape[0], test[test['reordered'] == 1].shape[0]/test.shape[0]

(0.5835073358217229, 0.5835027162866329, 0.5835057947019867)

In [27]:
train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department,reordered
31152342,3285797,32426,10,36,prior,1,4,17,,1 Step Kashmir Spinach Indian Cuisine,76,6,indian foods,international,0
7322226,772953,8518,1,1027,prior,42,1,8,1.0,Organic Red Onion,83,4,fresh vegetables,produce,1
28535648,3009469,5922,6,1170,prior,12,5,13,3.0,"Honest Face, Hand, & Baby Wipes",56,18,diapers wipes,babies,0
12114592,1278717,27117,1,1164,prior,12,3,16,2.0,Refresh Herbal Tea,94,7,tea,beverages,0
18521816,1953848,12254,9,464,prior,25,0,10,5.0,Organic Cherries,123,4,packaged vegetables fruits,produce,1


In [28]:
#function to drop order_id, eval_set, product_id, aisle_id and department_id
def drop_columns(data):
    data = data.drop(['order_id', 'eval_set', 'product_id', 'aisle_id', 'department_id'], axis=1)
    return data

#function to seperate features and target
def feature_target(data):
    X = data.drop('reordered', axis=1)
    y = data['reordered']
    return X, y



In [29]:
#dropping columns
train = drop_columns(train)
train.head()

#seperating features and target
X_train, y_train = feature_target(train)

In [30]:
#adding a feature: average days_since_prior_order for each user_id

def avg_days_since_prior_order(data):
    days_since_prior_order = data.groupby('user_id')['days_since_prior_order'].mean().reset_index()
    data = data.merge(days_since_prior_order, on='user_id', how='left')
    data = data.rename(columns={'days_since_prior_order_x':'days_since_prior_order', 'days_since_prior_order_y':'avg_days_since_prior_order'})
    return data

X_train = avg_days_since_prior_order(X_train)
X_train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,avg_days_since_prior_order
0,10,36,1,4,17,,1 Step Kashmir Spinach Indian Cuisine,indian foods,international,11.416667
1,1,1027,42,1,8,1.0,Organic Red Onion,fresh vegetables,produce,5.601329
2,6,1170,12,5,13,3.0,"Honest Face, Hand, & Baby Wipes",diapers wipes,babies,15.469027
3,1,1164,12,3,16,2.0,Refresh Herbal Tea,tea,beverages,5.70088
4,9,464,25,0,10,5.0,Organic Cherries,packaged vegetables fruits,produce,7.45


In [32]:
#most_common order_dow for each user_id : mode of order_dow for each user_id
def most_common_order_dow(data):
    order_dow = data.groupby('user_id')['order_dow'].agg(lambda x:x.value_counts().index[0]).reset_index()
    data = data.merge(order_dow, on='user_id', how='left')
    data = data.rename(columns={'order_dow_x':'order_dow', 'order_dow_y':'most_common_order_dow'})
    return data

X_train = most_common_order_dow(X_train)
X_train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,avg_days_since_prior_order,most_common_order_dow
0,10,36,1,4,17,,1 Step Kashmir Spinach Indian Cuisine,indian foods,international,11.416667,4
1,1,1027,42,1,8,1.0,Organic Red Onion,fresh vegetables,produce,5.601329,0
2,6,1170,12,5,13,3.0,"Honest Face, Hand, & Baby Wipes",diapers wipes,babies,15.469027,1
3,1,1164,12,3,16,2.0,Refresh Herbal Tea,tea,beverages,5.70088,5
4,9,464,25,0,10,5.0,Organic Cherries,packaged vegetables fruits,produce,7.45,0


In [34]:
# check unique values of order_hour_of_day
X_train['order_hour_of_day'].unique()

array([17,  8, 13, 16, 10,  9, 19,  7, 14, 11, 18, 20, 15, 22, 12, 21,  6,
       23,  0,  4,  1,  5,  3,  2])

In [35]:
#spliting order_hour_of_day into 4 categories
def split_order_hour_of_day(data):
    data['order_hour_of_day'] = pd.cut(data['order_hour_of_day'], bins=[0, 6, 12, 18, 24], labels=['night', 'morning', 'afternoon', 'evening'])
    return data

X_train = split_order_hour_of_day(X_train)
X_train.head()


Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,avg_days_since_prior_order,most_common_order_dow
0,10,36,1,4,afternoon,,1 Step Kashmir Spinach Indian Cuisine,indian foods,international,11.416667,4
1,1,1027,42,1,morning,1.0,Organic Red Onion,fresh vegetables,produce,5.601329,0
2,6,1170,12,5,afternoon,3.0,"Honest Face, Hand, & Baby Wipes",diapers wipes,babies,15.469027,1
3,1,1164,12,3,afternoon,2.0,Refresh Herbal Tea,tea,beverages,5.70088,5
4,9,464,25,0,morning,5.0,Organic Cherries,packaged vegetables fruits,produce,7.45,0


In [36]:
#total number of orders for each user_id
def total_orders(data):
    total_orders = data.groupby('user_id')['order_number'].max().reset_index()
    data = data.merge(total_orders, on='user_id', how='left')
    data = data.rename(columns={'order_number_x':'order_number', 'order_number_y':'total_orders'})
    return data

X_train = total_orders(X_train)
X_train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,avg_days_since_prior_order,most_common_order_dow,total_orders
0,10,36,1,4,afternoon,,1 Step Kashmir Spinach Indian Cuisine,indian foods,international,11.416667,4,37
1,1,1027,42,1,morning,1.0,Organic Red Onion,fresh vegetables,produce,5.601329,0,48
2,6,1170,12,5,afternoon,3.0,"Honest Face, Hand, & Baby Wipes",diapers wipes,babies,15.469027,1,17
3,1,1164,12,3,afternoon,2.0,Refresh Herbal Tea,tea,beverages,5.70088,5,79
4,9,464,25,0,morning,5.0,Organic Cherries,packaged vegetables fruits,produce,7.45,0,49


In [37]:
#average number of products in each order for each user_id
def avg_products(data):
    avg_products = data.groupby('user_id')['add_to_cart_order'].mean().reset_index()
    data = data.merge(avg_products, on='user_id', how='left')
    data = data.rename(columns={'add_to_cart_order_x':'add_to_cart_order', 'add_to_cart_order_y':'avg_products'})
    return data

X_train = avg_products(X_train)
X_train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department,avg_days_since_prior_order,most_common_order_dow,total_orders,avg_products
0,10,36,1,4,afternoon,,1 Step Kashmir Spinach Indian Cuisine,indian foods,international,11.416667,4,37,3.307692
1,1,1027,42,1,morning,1.0,Organic Red Onion,fresh vegetables,produce,5.601329,0,48,7.115756
2,6,1170,12,5,afternoon,3.0,"Honest Face, Hand, & Baby Wipes",diapers wipes,babies,15.469027,1,17,5.942623
3,1,1164,12,3,afternoon,2.0,Refresh Herbal Tea,tea,beverages,5.70088,5,79,5.238372
4,9,464,25,0,morning,5.0,Organic Cherries,packaged vegetables fruits,produce,7.45,0,49,5.212245


In [43]:
# check for nan values
X_train.isna().sum()
#drop nan values
X_train = X_train.dropna()

In [44]:
# one hot encoding
categorical_columns = ['order_dow', 'order_hour_of_day','product_name', 'aisle', 'department', 'most_common_order_dow']
X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_train.head()

Unnamed: 0,add_to_cart_order,user_id,order_number,days_since_prior_order,avg_days_since_prior_order,total_orders,avg_products,order_dow_1,order_dow_2,order_dow_3,...,department_personal care,department_pets,department_produce,department_snacks,most_common_order_dow_1,most_common_order_dow_2,most_common_order_dow_3,most_common_order_dow_4,most_common_order_dow_5,most_common_order_dow_6
1,1,1027,42,1.0,5.601329,48,7.115756,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,6,1170,12,3.0,15.469027,17,5.942623,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,1164,12,2.0,5.70088,79,5.238372,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,9,464,25,5.0,7.45,49,5.212245,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,3,2117,6,16.0,9.212121,19,9.754902,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
!pip install autofeat

In [45]:
from autofeat import AutoFeatClassifier


# Assume X_train and y_train are your training data
# Assume X_test and y_test are your test data


# Create an AutoFeatClassifier instance
af = AutoFeatClassifier(verbose=1)

# Fit AutoFeat on the training set and transform both sets
X_train_trans = af.fit_transform(X_train, y_train)



ValueError: Found input variables with inconsistent numbers of samples: [201741, 216472]