In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

#### Questions related to products: 
- Is there a relationship between how much people buy, how often they reorder, and the day of week (dow) / hour of day (hod) at which they order?
- For users who buy large amounts - do they consistently buy large amounts for every order? Or is it more of a one-off? Vice versa for those who buy in small amounts
- How many unique items do customers buy across their order history? Do they buy a different mix of the same 10 items, or are they buying a wide range of items?
- Do people use the service mostly to buy essentials (produce, bread, milk, etc) or hard to find items (niche products, organic foods, lactose free, etc)?
- Are most orders re-orders? How much does the reorder ratio (# of reordered items / total # of items in order) fluctuate on average?
- What are the top items bought? Does this fluctuate based on the dow / hod?
- What are the similarities of the products (by aisle / department) in a given order? Does an order usually contain a majority of a product group? (Ex. Order has 60% of fruits and veggies)
- What are the least/most frequently purchased items? (Need to know for inventory)

In [2]:
# Load the orders file and the combined order/product file. The preview of the latter
# shows one row per (order_id, product_id) with product_name, aisle and department attached.
orders = pd.read_csv('../data/orders.csv')
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like a
# saved index being read back as data — confirm whether index_col=0 is wanted here.
orders_products = pd.read_csv('../data/order_products_combined_aisledept.csv')


In [35]:
# Product lookup table; used below to translate product ids into product names.
products = pd.read_csv('../data/products.csv')

In [5]:
# Preview the first rows to see which columns are available for the analysis.
orders_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle,department
0,2,33120,1,1,Organic Egg Whites,eggs,dairy eggs
1,26,33120,5,0,Organic Egg Whites,eggs,dairy eggs
2,120,33120,13,0,Organic Egg Whites,eggs,dairy eggs
3,327,33120,5,1,Organic Egg Whites,eggs,dairy eggs
4,390,33120,28,1,Organic Egg Whites,eggs,dairy eggs


### EDA

- What are the top 10 most ordered items / aisle / department? 
- What are the top 10 most reordered items / aisle / department? 




1. The top 10 ordered and top 10 reordered items are nearly the same (9 of the 10 products appear in both lists)

In [9]:
# Per product: 'count' is the number of purchase rows, 'sum' adds up the 0/1 reordered flag.
top_items = (
    orders_products
    .groupby('product_id')['reordered']
    .agg(['count', 'sum'])
)


In [13]:
# Give the aggregate columns descriptive names: 'count' -> number_of_buys, 'sum' -> number_of_reorder.
top_items.columns = ['number_of_buys', 'number_of_reorder']

In [41]:
# Ten products with the highest purchase count, most bought first.
top_items_buys = (
    top_items
    .sort_values(by='number_of_buys', ascending=False)
    .head(10)
)
top_items_buys

Unnamed: 0_level_0,number_of_buys,number_of_reorder
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
24852,491291,415166
13176,394930,329275
21137,275577,214448
21903,251705,194939
47209,220877,176173
47766,184224,140270
47626,160792,112178
16797,149445,104588
26209,146660,100002
27845,142813,118684


In [42]:
# Product ids of the ten most bought items, in rank order.
top_items_buys_id = top_items_buys.index.tolist()

In [43]:
# Ten products with the highest reorder count, most reordered first.
top_items_reorder = (
    top_items
    .sort_values(by='number_of_reorder', ascending=False)
    .head(10)
)
top_items_reorder

Unnamed: 0_level_0,number_of_buys,number_of_reorder
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
24852,491291,415166
13176,394930,329275
21137,275577,214448
21903,251705,194939
47209,220877,176173
47766,184224,140270
27845,142813,118684
47626,160792,112178
27966,142603,109688
16797,149445,104588


In [44]:
# Product ids of the ten most reordered items, in rank order.
# Read from `top_items_reorder` (not a scratch variable) so the cell runs on a fresh kernel.
top_items_reorder_id = list(top_items_reorder.index)
top_items_reorder_id

[24852, 13176, 21137, 21903, 47209, 47766, 27845, 47626, 27966, 16797]

In [38]:
# Look up the names of the ten most bought products.
# NOTE(review): the output saved with this cell includes product 27966, which is not among the
# top-10 buys computed above, so it appears to predate the current `top_items_buys_id` — re-run.
is_top_bought = products['product_id'].isin(top_items_buys_id)
products[is_top_bought]

Unnamed: 0,product_id,product_name,aisle_id,department_id
13175,13176,Bag of Organic Bananas,24,4
16796,16797,Strawberries,24,4
21136,21137,Organic Strawberries,24,4
21902,21903,Organic Baby Spinach,123,4
24851,24852,Banana,24,4
27844,27845,Organic Whole Milk,84,16
27965,27966,Organic Raspberries,123,4
47208,47209,Organic Hass Avocado,24,4
47625,47626,Large Lemon,24,4
47765,47766,Organic Avocado,24,4


In [63]:
# Number of products that were bought exactly once (102 in the saved run).
bought_once_mask = top_items['number_of_buys'] == 1
len(top_items[bought_once_mask])

102

In [66]:
# Product ids (index labels of top_items) for products bought exactly once.
bottom_buys_id = top_items.loc[top_items['number_of_buys'] == 1].index

In [68]:
# Sample of products bought only once: the first ten matching rows of the product table.
is_bought_once = products['product_id'].isin(bottom_buys_id)
products[is_bought_once].head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id
1001,1002,All Natural Stevia Liquid Extract Sweetener,17,13
1907,1908,Greek Blended Cherry Fat Free Yogurt,120,16
2768,2769,Pappardelle Nests Pasta,131,9
3116,3117,Fruit Me Up! Applesauce Pouches,99,15
3425,3426,Organic Better Rest Tea Blend,94,7
3717,3718,Wasabi Cheddar Spreadable Cheese,21,16
3830,3831,Jamaican Allspice,104,13
4416,4417,Organic Veggie Ground,14,20
5254,5255,Chardonnay Carneros,62,5
6061,6062,Multigrain Penne Rigate,131,9


2. The top ordered aisles are: 

In [51]:
# Purchases per aisle, counted as the number of non-null 'reordered' values in each group.
top_aisle = (
    orders_products
    .groupby('aisle')['reordered']
    .count()
)


In [52]:
# Ten aisles with the most purchased items, largest first.
top_aisle.sort_values(ascending=False).head(10)

aisle
fresh fruits                     3792661
fresh vegetables                 3568630
packaged vegetables fruits       1843806
yogurt                           1507583
packaged cheese                  1021462
milk                              923659
water seltzer sparkling water     878150
chips pretzels                    753739
soy lactosefree                   664493
bread                             608469
Name: reordered, dtype: int64

In [59]:
# Least bought aisles: sort descending and keep the last ten rows,
# so the aisle with the fewest purchases is listed last.
top_aisle.sort_values(ascending=False).tail(10)

aisle
specialty wines champagnes    11659
first aid                     11411
shave needs                   10876
skin care                     10698
kitchen supplies               9620
eye ear care                   9522
baby bath body care            8909
baby accessories               8466
beauty                         6455
frozen juice                   5147
Name: reordered, dtype: int64

3. The top ordered departments are: 

In [54]:
# Purchases per department, counted as the number of non-null 'reordered' values in each group.
top_dept = (
    orders_products
    .groupby('department')['reordered']
    .count()
)

In [55]:
# Ten departments with the most purchased items, largest first.
top_dept.sort_values(ascending=False).head(10)

department
produce            9888378
dairy eggs         5631067
snacks             3006412
beverages          2804175
frozen             2336858
pantry             1956819
bakery             1225181
canned goods       1114857
deli               1095540
dry goods pasta     905340
Name: reordered, dtype: int64

In [69]:
# Least bought departments: sort descending and keep the last ten rows,
# so the department with the fewest purchases is listed last.
top_dept.sort_values(ascending=False).tail(10)

department
meat seafood     739238
breakfast        739069
personal care    468693
babies           438743
international    281155
alcohol          159294
pets             102221
missing           77396
other             38086
bulk              35932
Name: reordered, dtype: int64

- Organic vs Non Organic

Overall, non-organic items make up the larger share of purchases (about 68% vs. 32%), but organic purchases are reordered at a higher rate (about 64% vs. 57%).

In [71]:
# Re-check the columns; product_name is used below for the organic / non-organic split.
orders_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle,department
0,2,33120,1,1,Organic Egg Whites,eggs,dairy eggs
1,26,33120,5,0,Organic Egg Whites,eggs,dairy eggs
2,120,33120,13,0,Organic Egg Whites,eggs,dairy eggs
3,327,33120,5,1,Organic Egg Whites,eggs,dairy eggs
4,390,33120,28,1,Organic Egg Whites,eggs,dairy eggs


In [77]:
# Purchase rows whose product name contains 'Organic' (case-sensitive match).
# na=False treats rows without a usable name as non-matching.
name_has_organic = orders_products['product_name'].str.contains('Organic', na=False)
organic_df = orders_products[name_has_organic]

In [76]:
# Share of all purchase rows whose product name contains 'Organic'.
# Computed from organic_df rather than a hard-coded row count so it stays correct if the data changes.
organic_df.shape[0] / orders_products.shape[0]

0.31508780864875613

In [79]:
# Number of organic purchase rows flagged as reorders (sum of the 0/1 'reordered' column).
organic_df['reordered'].sum()

6770700

In [81]:
# Reorder rate among organic purchases: reordered rows divided by all organic rows.
organic_reorders = organic_df['reordered'].sum()
organic_reorders / len(organic_df)

0.6353892290419246

In [78]:
# Purchase rows whose product name does not contain 'Organic' (case-sensitive match).
# The '== False' comparison keeps only explicit non-matches, so a row with a missing
# product name would land in neither the organic nor the non-organic frame.
contains_organic = orders_products['product_name'].str.contains('Organic')
not_organic_df = orders_products[contains_organic == False]


In [84]:
# Share of all purchase rows that are non-organic.
len(not_organic_df) / len(orders_products)

0.6849121913512439

In [83]:
# Reorder rate among non-organic purchases: reordered rows divided by all non-organic rows.
not_organic_reorders = not_organic_df['reordered'].sum()
not_organic_reorders / len(not_organic_df)

0.5692092057727288