In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Load Data

In [3]:
products = pd.read_csv('../data/products.csv')
departments = pd.read_csv('../data/departments.csv')
aisles = pd.read_csv('../data/aisles.csv')

In [4]:
prod = pd.merge(products, aisles, how='left', left_on='aisle_id', right_on='aisle_id')
prod = pd.merge(prod, departments, how='left', left_on='department_id', right_on='department_id')
prod

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry
...,...,...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,spirits,alcohol
49684,49685,En Croute Roast Hazelnut Cranberry,42,1,frozen vegan vegetarian,frozen
49685,49686,Artisan Baguette,112,3,bread,bakery
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,cat food care,pets


### High Markups

In [5]:
# Products
allproduce = prod[prod['department']=='produce']
p_organic_produce = allproduce[allproduce['product_name'].str.contains('Organic')]
p_organic_produce = p_organic_produce[['product_id', 'product_name']]

p_batteries = prod[prod['product_name'].str.contains('Batteries')]

In [6]:
# Aisles
p_water = prod[prod['aisle']=='water seltzer sparkling water'][['product_id','product_name']]
p_cereal = prod[prod['aisle']=='cereal'][['product_id','product_name']]
p_spices = prod[prod['aisle']== 'spices seasonings'][['product_id','product_name']]

In [7]:
# Departments
p_personal_care = prod[prod['department']=='personal care'][['product_id','product_name']]
p_household = prod[prod['department']=='household'][['product_id','product_name']]
p_babies = prod[prod['department']=='babies'][['product_id','product_name']]
p_pets = prod[prod['department']=='pets'][['product_id','product_name']]
p_alcohol = prod[prod['department']=='alcohol'][['product_id','product_name']]

### Concat and Add 'High Margin' Feature

In [8]:
#Products
print(p_organic_produce.shape)
print(p_batteries.shape)
#Aisles
print(p_water.shape)
print(p_cereal.shape)
print(p_spices.shape)
#Dept
print(p_personal_care.shape)
print(p_household.shape)
print(p_babies.shape)
print(p_pets.shape)
print(p_alcohol.shape)

(473, 2)
(27, 6)
(344, 2)
(454, 2)
(797, 2)
(6563, 2)
(3085, 2)
(1081, 2)
(972, 2)
(1054, 2)


In [9]:
high_margin = pd.concat([p_organic_produce, p_organic_produce, p_batteries, p_water, p_cereal, p_spices,\
          p_personal_care, p_household, p_babies, p_pets, p_alcohol])
high_margin['high_margin'] = 1
high_margin = high_margin[['product_id', 'high_margin']]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [29]:
high_margin.shape

(15323, 2)

### Merge with Prod

In [10]:
product_margins = pd.merge(prod, high_margin, how='left', left_on='product_id', right_on='product_id')

In [11]:
product_margins['high_margin'] = product_margins['high_margin'].fillna(0)

In [12]:
product_margins

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department,high_margin
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,0.0
1,2,All-Seasons Salt,104,13,spices seasonings,pantry,1.0
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages,0.0
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen,0.0
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry,0.0
...,...,...,...,...,...,...,...
50179,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,spirits,alcohol,1.0
50180,49685,En Croute Roast Hazelnut Cranberry,42,1,frozen vegan vegetarian,frozen,0.0
50181,49686,Artisan Baguette,112,3,bread,bakery,0.0
50182,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,cat food care,pets,1.0


In [13]:
# product_margins.to_csv('../data/product_margins.csv') 

### Exploring

In [14]:
prod['department'].unique()

array(['snacks', 'pantry', 'beverages', 'frozen', 'personal care',
       'dairy eggs', 'household', 'babies', 'meat seafood',
       'dry goods pasta', 'pets', 'breakfast', 'canned goods', 'produce',
       'missing', 'international', 'deli', 'alcohol', 'bakery', 'other',
       'bulk'], dtype=object)

In [19]:
prod['aisle'].nunique()

134

In [27]:
prod['aisle'].unique()

array(['cookies cakes', 'spices seasonings', 'tea', 'frozen meals',
       'marinades meat preparation', 'cold flu allergy', 'juice nectars',
       'frozen produce', 'yogurt', 'water seltzer sparkling water',
       'refrigerated', 'frozen dessert', 'dish detergents',
       'diapers wipes', 'ice cream toppings', 'poultry counter',
       'frozen pizza', 'grains rice dried goods', 'dog food care',
       'oral hygiene', 'packaged poultry', 'vitamins supplements',
       'energy granola bars', 'cat food care', 'body lotions soap',
       'cereal', 'canned jarred vegetables', 'packaged vegetables fruits',
       'chips pretzels', 'dry pasta', 'hot dogs bacon sausage',
       'soup broth bouillon', 'missing', 'energy sports drinks',
       'crackers', 'fresh vegetables', 'frozen breads doughs',
       'asian foods', 'more household', 'tofu meat alternatives',
       'doughs gelatins bake mixes', 'hair care',
       'specialty wines champagnes', 'popcorn jerky',
       'plates bowls cups 

In [16]:
# prod[prod['department']=='produce']['aisle'].unique()

In [17]:
# prod[prod['aisle']== 'spices seasonings']['product_name'].unique()

In [18]:
# specific_product('Decoration')