In [1]:
import os
os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask

import modin.pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm

tqdm.pandas()

# from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

In [2]:
orders = pd.read_csv('instacart-market-basket-analysis/orders.csv/orders.csv')
products = pd.read_csv('instacart-market-basket-analysis/products.csv/products.csv')
order_products = pd.read_csv('instacart-market-basket-analysis/order_products__train.csv/order_products__train.csv')
order_products_prior = pd.read_csv('instacart-market-basket-analysis/order_products__prior.csv/order_products__prior.csv')
aisles = pd.read_csv('instacart-market-basket-analysis/aisles.csv/aisles.csv')
departments = pd.read_csv('instacart-market-basket-analysis/departments.csv/departments.csv')
sample = pd.read_csv('instacart-market-basket-analysis/sample_submission.csv/sample_submission.csv')

In [3]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [4]:
order_products_total = pd.concat([order_products_prior,order_products])
order_products_join = pd.merge(left=order_products_total, right=products, how='left', left_on='product_id', right_on='product_id')
order_products_join = pd.merge(left=order_products_join, right=aisles, how='left', left_on='aisle_id', right_on='aisle_id')
order_products_join = pd.merge(left=order_products_join, right=departments, how='left', left_on='department_id', right_on='department_id')
order_products_join = pd.merge(left=order_products_join, right=orders, how='left', left_on='order_id', right_on='order_id')
order_products_join = order_products_join.drop([ 'eval_set'], axis = 1)

In [5]:
order_products_join.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,Organic Egg Whites,86,16,eggs,dairy eggs,202279,3,5,9,8.0
1,2,28985,2,1,Michigan Organic Kale,83,4,fresh vegetables,produce,202279,3,5,9,8.0
2,2,9327,3,0,Garlic Powder,104,13,spices seasonings,pantry,202279,3,5,9,8.0
3,2,45918,4,1,Coconut Butter,19,13,oils vinegars,pantry,202279,3,5,9,8.0
4,2,30035,5,0,Natural Sweetener,17,13,baking ingredients,pantry,202279,3,5,9,8.0


In [6]:
users = list(order_products_join['user_id'].value_counts()[:10000].index)
order_products_join = order_products_join[order_products_join['user_id'].isin(users)]
print(order_products_join.shape)
order_products_join.head()

(8594383, 14)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
17,4,46842,1,0,Plain Pre-Sliced Bagels,93,3,breakfast bakery,bakery,178520,36,1,9,7.0
18,4,26434,2,1,Honey/Lemon Cough Drops,11,11,cold flu allergy,personal care,178520,36,1,9,7.0
19,4,39758,3,1,Chewy 25% Low Sugar Chocolate Chip Granola,3,19,energy granola bars,snacks,178520,36,1,9,7.0
20,4,27761,4,1,Oats & Chocolate Chewy Bars,48,14,breakfast bars pastries,breakfast,178520,36,1,9,7.0
21,4,10054,5,1,Kellogg's Nutri-Grain Apple Cinnamon Cereal,48,14,breakfast bars pastries,breakfast,178520,36,1,9,7.0


In [7]:
# prod_ids = list(order_products_join['product_name'].value_counts()[order_products_join['product_name'].value_counts().values>1000].index)
prod_ids = list(order_products_join['product_name'].value_counts()[:500].index)

In [8]:
len(prod_ids)

500

In [9]:
order_products_join['product_name'] = order_products_join['product_name'].apply(lambda x: 'Others' if x not in prod_ids else x)

In [10]:
order_products_join_prod = order_products_join[['order_id', 'user_id', 'product_name', 'product_id']] 
# order_products_join_dept = order_products_join[['order_id', 'user_id', 'department', 'department_id']] 
# order_products_join_aisle = order_products_join[['order_id', 'user_id', 'aisle','aisle_id']] 

In [11]:
order_products_join_all = order_products_join[['order_id', 'user_id','order_number', 'order_dow','order_hour_of_day','days_since_prior_order']] 
order_products_join_all = order_products_join_all.drop_duplicates(subset='order_id').set_index(['order_id'])
order_products_join_all.head()

Unnamed: 0_level_0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,178520,36,1,9,7.0
5,156122,42,6,16,9.0
15,54901,51,3,11,2.0
18,118860,3,4,20,6.0
28,98256,29,3,13,6.0


In [12]:
chunk_size = 10000

chunks = [x for x in range(0,order_products_join_prod.shape[0] , chunk_size)]
order_products_join_prod_pivot = pd.concat([order_products_join_prod.iloc[ chunks[i]:chunks[i + 1] - 1 ].pivot_table (index='order_id', columns='product_name', values='product_id', aggfunc = 'count', fill_value=0) for i in tqdm(range(0, len(chunks) - 1))], copy=False)

# chunks = [x for x in range(0, 10000000, chunk_size)]
# order_products_join_prod_pivot1 = pd.concat([order_products_join_prod.iloc[ chunks[i]:chunks[i + 1] - 1 ].pivot_table (index='order_id', columns='product_name', values='product_id', aggfunc = 'count', fill_value=0) for i in tqdm(range(0, len(chunks) - 1))], copy=False)

# chunks = [x for x in range(10000000,20000000 , chunk_size)]
# order_products_join_prod_pivot2 = pd.concat([order_products_join_prod.iloc[ chunks[i]:chunks[i + 1] - 1 ].pivot_table (index='order_id', columns='product_name', values='product_id', aggfunc = 'count', fill_value=0) for i in tqdm(range(0, len(chunks) - 1))], copy=False)

# chunks = [x for x in range(20000000,order_products_join_prod.shape[0] , chunk_size)]
# order_products_join_prod_pivot3 = pd.concat([order_products_join_prod.iloc[ chunks[i]:chunks[i + 1] - 1 ].pivot_table (index='order_id', columns='product_name', values='product_id', aggfunc = 'count', fill_value=0) for i in tqdm(range(0, len(chunks) - 1))], copy=False)

# order_products_join_prod_pivot = pd.concat([order_products_join_prod_pivot1, order_products_join_prod_pivot2, order_products_join_prod_pivot3], axis=0, copy=False)

# order_products_join_prod_pivot = order_products_join_prod.pivot_table(index = 'order_id', columns ='product_name', values='product_id', aggfunc = 'count', fill_value=0)
# order_products_join_dept = pd.pivot_table(data = order_products_join_dept.sample(10000), index = ['order_id','user_id'], columns ='department', values='department_id', aggfunc = 'count', fill_value=0, copy=False)
# order_products_join_aisle = pd.pivot_table(data = order_products_join_aisle.sample(10000), index = ['order_id','user_id'], columns ='aisle', values='aisle_id', aggfunc = 'count', fill_value=0, copy=False)

100%|████████████████████████████████████████████████████████████████████████████████| 859/859 [04:11<00:00,  3.42it/s]


In [13]:
order_products_join_prod_pivot.head()

product_name,1% Lowfat Milk,100 Calorie Per Bag Popcorn,100% Raw Coconut Water,100% Recycled Paper Towels,100% Whole Wheat Bread,2% Reduced Fat DHA Omega-3 Reduced Fat Milk,2% Reduced Fat Milk,2% Reduced Fat Organic Milk,Aged White Cheddar Baked Rice & Corn Puffs Gluten Free Lunch Packs,Air Chilled Breaded Chicken Breast Nuggets,...,YoBaby Blueberry Apple Yogurt,YoBaby Peach Pear Yogurt,YoKids Blueberry & Strawberry/Vanilla Yogurt,YoKids Squeeze! Organic Strawberry Flavor Yogurt,"YoKids Squeezers Organic Low-Fat Yogurt, Strawberry",Yobaby Organic Plain Yogurt,"Yogurt, Lowfat, Strawberry","Yogurt, Strained Low-Fat, Coconut",ZBar Organic Chocolate Brownie Energy Snack,Crackers Cheddar Bunnies Snack Packs
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
18,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
28,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [14]:
order_products_join_prod_pivot.shape

(595303, 501)

In [15]:
order_products_join_prod_pivot.to_csv('order_products_join_prod_pivot.csv')
# order_products_join_prod_pivot = pd.read_csv('order_products_join_prod_pivot.csv')

In [16]:
# prod_pca = PCA(n_components = 2000)
# prod_pca_vals = prod_pca.fit_transform(order_products_join_prod)
# print(sum(prod_pca.explained_variance_ratio_))
# prod_pca_vals.shape

In [17]:
# prod_pca_df = pd.DataFrame(prod_pca_vals, index=order_products_join_prod.index)
# prod_pca_df

In [18]:
order_products_merge = pd.concat([order_products_join_all, order_products_join_prod_pivot], axis=1, copy=False)

In [19]:
order_products_merge.head()

Unnamed: 0_level_0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,1% Lowfat Milk,100 Calorie Per Bag Popcorn,100% Raw Coconut Water,100% Recycled Paper Towels,100% Whole Wheat Bread,...,YoBaby Blueberry Apple Yogurt,YoBaby Peach Pear Yogurt,YoKids Blueberry & Strawberry/Vanilla Yogurt,YoKids Squeeze! Organic Strawberry Flavor Yogurt,"YoKids Squeezers Organic Low-Fat Yogurt, Strawberry",Yobaby Organic Plain Yogurt,"Yogurt, Lowfat, Strawberry","Yogurt, Strained Low-Fat, Coconut",ZBar Organic Chocolate Brownie Energy Snack,Crackers Cheddar Bunnies Snack Packs
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,178520,36,1,9,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
5,156122,42,6,16,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
15,54901,51,3,11,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
18,118860,3,4,20,6.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
28,98256,29,3,13,6.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [20]:
order_products_merge.shape

(595542, 506)

In [21]:
order_products_merge.to_csv('order_products_merge.csv')