In [31]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import os 
from statsmodels.graphics.mosaicplot import mosaic
# Seaborn's current colour palette; presumably consumed by plotting cells outside this excerpt.
color = sns.color_palette()

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

#from library.sb_utils import save_file

# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [32]:
# Location of the Instacart CSV. Defaults to the original Colab path; set the
# INSTACART_DATA_PATH environment variable to run the notebook elsewhere
# without editing this cell.
DATA_PATH = os.environ.get('INSTACART_DATA_PATH', '/content/instacart_data.csv')
df = pd.read_csv(DATA_PATH)

In [33]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle,department
0,1,49302,1,1,Bulgarian Yogurt,120,16,112108,train,4,4,10,9.0,yogurt,dairy eggs
1,816049,49302,7,1,Bulgarian Yogurt,120,16,47901,train,14,4,6,16.0,yogurt,dairy eggs
2,1242203,49302,1,1,Bulgarian Yogurt,120,16,2993,train,15,0,7,7.0,yogurt,dairy eggs
3,1383349,49302,11,1,Bulgarian Yogurt,120,16,41425,train,4,3,8,14.0,yogurt,dairy eggs
4,1787378,49302,8,0,Bulgarian Yogurt,120,16,187205,train,5,4,14,30.0,yogurt,dairy eggs


In [34]:
# Column dtypes and non-null counts; the object-dtype columns are the
# categorical candidates examined in the following cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 15 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   order_id                1384617 non-null  int64  
 1   product_id              1384617 non-null  int64  
 2   add_to_cart_order       1384617 non-null  int64  
 3   reordered               1384617 non-null  int64  
 4   product_name            1384617 non-null  object 
 5   aisle_id                1384617 non-null  int64  
 6   department_id           1384617 non-null  int64  
 7   user_id                 1384617 non-null  int64  
 8   eval_set                1384617 non-null  object 
 9   order_number            1384617 non-null  int64  
 10  order_dow               1384617 non-null  int64  
 11  order_hour_of_day       1384617 non-null  int64  
 12  days_since_prior_order  1384617 non-null  float64
 13  aisle                   1384617 non-null  object 
 14  de

In [35]:
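# Disabled on purpose: this would replace the whole DataFrame with a single
# Series. `order_dow` is converted to a categorical column further down instead.
# df = df['order_dow'].astype(object)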

In [36]:
# (rows, columns) of the loaded frame.
df.shape

(1384617, 15)

In [37]:
# Isolate the string-valued (object dtype) columns as candidates for encoding.
dfo = df.select_dtypes(include='object')

In [38]:
# Confirm which columns were selected and their non-null counts.
dfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 4 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   product_name  1384617 non-null  object
 1   eval_set      1384617 non-null  object
 2   aisle         1384617 non-null  object
 3   department    1384617 non-null  object
dtypes: object(4)
memory usage: 42.3+ MB


In [39]:
# Cardinality check: the number of distinct values in each categorical column
# decides whether turning it into indicator variables is practical.
dfo['product_name'].nunique()

39123

In [40]:
# Distinct aisle labels.
dfo['aisle'].nunique()

134

In [41]:
# Distinct department labels.
dfo['department'].nunique()


21

In [42]:
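# Disabled on purpose: this would turn `dfo` into a single Series; the unwanted
# columns are dropped explicitly a few cells below instead.
#dfo = dfo['department']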

In [43]:
# order_dow (day of the week the order was placed) is stored as an integer
# but is really categorical, so cast it and carry it alongside the text columns.
dfo = dfo.assign(order_dow=df['order_dow'].astype('category'))


In [44]:
# Keep only the two features that will be one-hot encoded. Selecting them
# explicitly (rather than dropping 'eval_set', 'aisle' and 'product_name' in
# place) yields the same columns, and re-running the cell on its own output is
# a no-op instead of a KeyError.
dfo = dfo[['department', 'order_dow']]

In [45]:
# Check which columns remain before encoding.
dfo.head()

Unnamed: 0,department,order_dow
0,dairy eggs,4
1,dairy eggs,4
2,dairy eggs,0
3,dairy eggs,3
4,dairy eggs,4


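`department` and `order_dow` are the only categorical features carried into preprocessing. `department` has a manageable number of unique values (21), and `order_dow` has seven (one per day of the week). The other categorical columns examined above, `aisle` (134) and `product_name` (39,123), have hundreds to tens of thousands of unique values; one-hot encoding them would be exceptionally complex for this project.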

In [46]:
# One-hot encode the remaining columns (object and category dtypes are encoded
# by default). The dtype is pinned because pandas >= 2.0 emits bool indicators
# by default, whereas the 0/1 integers shown in this notebook's output come
# from the older uint8 default.
dfo = pd.get_dummies(dfo, dtype='uint8')

In [47]:
# Swap the raw categorical columns for their indicator columns. `eval_set` is
# dropped as well and has no encoded replacement.
# NOTE: this overwrites `df`; re-running the cell without reloading the data
# raises KeyError because the dropped columns are already gone.
# Column-wise concat aligns on the index, so guard against a mismatch that
# would silently introduce NaN-filled rows.
assert df.index.equals(dfo.index), "df and dfo must share the same row index"
df = pd.concat([df.drop(columns=['department', 'order_dow', 'eval_set']), dfo], axis=1)

In [48]:
# Inspect the widened frame with the new indicator columns.
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,order_number,order_hour_of_day,days_since_prior_order,aisle,department_alcohol,department_babies,department_bakery,department_beverages,department_breakfast,department_bulk,department_canned goods,department_dairy eggs,department_deli,department_dry goods pasta,department_frozen,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks,order_dow_0,order_dow_1,order_dow_2,order_dow_3,order_dow_4,order_dow_5,order_dow_6
0,1,49302,1,1,Bulgarian Yogurt,120,16,112108,4,10,9.0,yogurt,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,816049,49302,7,1,Bulgarian Yogurt,120,16,47901,14,6,16.0,yogurt,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1242203,49302,1,1,Bulgarian Yogurt,120,16,2993,15,7,7.0,yogurt,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,1383349,49302,11,1,Bulgarian Yogurt,120,16,41425,4,8,14.0,yogurt,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,1787378,49302,8,0,Bulgarian Yogurt,120,16,187205,5,14,30.0,yogurt,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [48]:
#Convert "Order_hour