**Table of contents**<a id='toc0_'></a>    
- [Importing Datasets](#toc1_)    
- [Combining Orders and Order Prior DataFrames](#toc2_)    
  - [Merging using Dask](#toc2_1_)    
  - [Changing Data Types](#toc2_2_)    
- [Combining Products and Combined Orders DataFrames](#toc3_)    
  - [Merging using Pandas](#toc3_1_)    
- [Saving combined DataFrame as pickle file](#toc4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Importing Datasets](#toc0_)

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import os

In [None]:
import dask.dataframe as dd

In [2]:
# Root folder holding the 'Original Data' and 'Prepared Data' sub-folders.
# Set the INSTACART_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original local path is used unchanged.
Path = os.environ.get(
    'INSTACART_DATA_DIR',
    r'D:\Data Analysis\01-08-2025 Instacart Basket Analysis\Data',
)

In [3]:
# Read the prior order/product rows from 'Original Data' as a Dask DataFrame.
ord_prior_csv = os.path.join(Path, 'Original Data', 'orders_products_prior.csv')
df_ord_prior = dd.read_csv(ord_prior_csv)

In [4]:
# Read the wrangled orders table from 'Prepared Data' as a Dask DataFrame.
orders_csv = os.path.join(Path, 'Prepared Data', 'orders_wrangled.csv')
df_ord = dd.read_csv(orders_csv)

In [14]:
# Row count of the orders frame: take the length of every partition, then add them up.
ord_partition_sizes = df_ord.map_partitions(len).compute()
rowCount = ord_partition_sizes.sum()
print(rowCount)

3421083


In [15]:
# Row count of the prior order/product frame, summed over its partitions.
prior_partition_sizes = df_ord_prior.map_partitions(len).compute()
rowCountprior = prior_partition_sizes.sum()
print(rowCountprior)

32434489



# <a id='toc2_'></a>[Combining Orders and Order Prior DataFrames](#toc0_)

## <a id='toc2_1_'></a>[Merging using Dask](#toc0_)

In [None]:
# Join each order with its prior product rows on order_id. The join is an inner
# join (the default), and indicator=True adds a `_merge` column that records
# which side(s) each row came from.
dfMergedLarge = df_ord.merge(
    df_ord_prior,
    how='inner',
    on='order_id',
    indicator=True,
)
dfMergedLarge.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,1746375,2597314,104839,prior,12,0,19,5.0,30337,1,0,both
1,1746375,2597314,104839,prior,12,0,19,5.0,19987,2,0,both
2,1746375,2597314,104839,prior,12,0,19,5.0,32740,3,0,both
3,1746375,2597314,104839,prior,12,0,19,5.0,13712,4,0,both
4,1746375,2597314,104839,prior,12,0,19,5.0,15923,5,0,both


In [18]:
# Count the rows of the merged frame itself (not df_ord_prior), so the figure
# reflects what the join actually produced and can be compared with the input
# row counts printed above.
rowCountmerged = dfMergedLarge.map_partitions(len).compute().sum()
print(rowCountmerged)

32434489


In [None]:
# Execute the lazy Dask merge and collect the result in memory as `df_merged`.
df_merged = dfMergedLarge.compute()

In [None]:
# Save the merged orders/products frame to 'Prepared Data' as orders_products_combined.pkl
combined_pkl = os.path.join(Path, 'Prepared Data', 'orders_products_combined.pkl')
df_merged.to_pickle(combined_pkl)

In [None]:
# Continue in plain pandas: reload orders_products_combined.pkl and preview 25 rows.
combined_pkl = os.path.join(Path, 'Prepared Data', 'orders_products_combined.pkl')
df_merged = pd.read_pickle(combined_pkl)
df_merged.head(25)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,7,3108588,1,prior,8,1,14,14.0,12427,1,1,both
1,7,3108588,1,prior,8,1,14,14.0,196,2,1,both
2,7,3108588,1,prior,8,1,14,14.0,10258,3,1,both
3,7,3108588,1,prior,8,1,14,14.0,25133,4,1,both
4,7,3108588,1,prior,8,1,14,14.0,46149,5,0,both
5,7,3108588,1,prior,8,1,14,14.0,49235,6,0,both
6,13,1901567,2,prior,3,1,10,3.0,47766,1,1,both
7,13,1901567,2,prior,3,1,10,3.0,32792,2,1,both
8,13,1901567,2,prior,3,1,10,3.0,20574,3,1,both
9,13,1901567,2,prior,3,1,10,3.0,7781,4,0,both


In [4]:
# Summary statistics of the numeric columns; the min/max values shown here
# are what the later dtype downcasts rely on.
df_merged.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,30356420.0,32434490.0,32434490.0,32434490.0
mean,1709858.0,1710749.0,102937.2,17.14205,2.738818,13.42498,11.10407,25576.34,8.351076,0.5896975
std,986463.6,987300.7,59466.48,17.53504,2.090049,4.246365,8.778914,14096.69,7.126671,0.4918886
min,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,855617.0,855943.0,51421.0,5.0,1.0,10.0,5.0,13530.0,3.0,0.0
50%,1709137.0,1711048.0,102611.0,11.0,3.0,13.0,8.0,25256.0,6.0,1.0
75%,2565949.0,2565514.0,154391.0,24.0,5.0,16.0,15.0,37935.0,11.0,1.0
max,3421081.0,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0


In [5]:
# Remove the 'Unnamed: 0' column (presumably the row index written into the
# orders CSV -- it is not an order attribute), then list the remaining dtypes.
df_merged = df_merged.drop('Unnamed: 0', axis=1)
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32434489 entries, 0 to 3606933
Data columns (total 11 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   eval_set                string  
 3   order_number            int64   
 4   order_day_of_week       int64   
 5   order_hour_of_day       int64   
 6   days_since_prior_order  float64 
 7   product_id              int64   
 8   add_to_cart_order       int64   
 9   reordered               int64   
 10  _merge                  category
dtypes: category(1), float64(1), int64(8), string(1)
memory usage: 2.8 GB


## <a id='toc2_2_'></a>[Changing Data Types](#toc0_)

In [6]:
# Cast the ID columns to str so they are no longer treated as numeric, and shrink
# the small-range counters to int8 (describe() above shows maxima of 99, 6, 23
# and 1, all within the int8 limit of 127).
id_cols = ['order_id', 'user_id']
small_int_cols = ['order_number', 'order_day_of_week', 'order_hour_of_day', 'reordered']
df_merged[id_cols] = df_merged[id_cols].astype('str')
df_merged[small_int_cols] = df_merged[small_int_cols].astype('int8')
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32434489 entries, 0 to 3606933
Data columns (total 11 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                object  
 1   user_id                 object  
 2   eval_set                string  
 3   order_number            int8    
 4   order_day_of_week       int8    
 5   order_hour_of_day       int8    
 6   days_since_prior_order  float64 
 7   product_id              int64   
 8   add_to_cart_order       int64   
 9   reordered               int8    
 10  _merge                  category
dtypes: category(1), float64(1), int64(2), int8(4), object(2), string(1)
memory usage: 2.0+ GB


In [7]:
# Summary statistics after the first round of casts; compare with the earlier
# describe() output to check that the int8 columns kept their values.
df_merged.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,30356420.0,32434490.0,32434490.0,32434490.0
mean,17.14205,2.738818,13.42498,11.10407,25576.34,8.351076,0.5896975
std,17.53504,2.090049,4.246365,8.778914,14096.69,7.126671,0.4918886
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,5.0,1.0,10.0,5.0,13530.0,3.0,0.0
50%,11.0,3.0,13.0,8.0,25256.0,6.0,1.0
75%,24.0,5.0,16.0,15.0,37935.0,11.0,1.0
max,99.0,6.0,23.0,30.0,49688.0,145.0,1.0


In [8]:
# Downcast the remaining numeric columns. Per describe() above, add_to_cart_order
# peaks at 145 (too large for int8, fits int16) and product_id at 49688 (too large
# for int16, fits int32). days_since_prior_order stays a float because it has
# missing values (its count is lower than the row count).
narrow_dtypes = {
    'add_to_cart_order': 'int16',
    'product_id': 'int32',
    'days_since_prior_order': 'float32',
}
for col_name, target_dtype in narrow_dtypes.items():
    df_merged[col_name] = df_merged[col_name].astype(target_dtype)
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32434489 entries, 0 to 3606933
Data columns (total 11 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                object  
 1   user_id                 object  
 2   eval_set                string  
 3   order_number            int8    
 4   order_day_of_week       int8    
 5   order_hour_of_day       int8    
 6   days_since_prior_order  float32 
 7   product_id              int32   
 8   add_to_cart_order       int16   
 9   reordered               int8    
 10  _merge                  category
dtypes: category(1), float32(1), int16(1), int32(1), int8(4), object(2), string(1)
memory usage: 1.6+ GB


In [9]:
# Summary statistics after the second round of casts.
# NOTE(review): the std of days_since_prior_order reads 8.377 here versus 8.779
# before the float32 cast, while count/min/quartiles/max are unchanged -- likely
# float32 accumulation error in the std computation rather than altered data;
# TODO confirm (e.g. compute the std on a float64 copy of the column).
df_merged.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,30356420.0,32434490.0,32434490.0,32434490.0
mean,17.14205,2.738818,13.42498,11.10406,25576.34,8.351076,0.5896975
std,17.53504,2.090049,4.246365,8.377258,14096.69,7.126671,0.4918886
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,5.0,1.0,10.0,5.0,13530.0,3.0,0.0
50%,11.0,3.0,13.0,8.0,25256.0,6.0,1.0
75%,24.0,5.0,16.0,15.0,37935.0,11.0,1.0
max,99.0,6.0,23.0,30.0,49688.0,145.0,1.0


In [10]:
# Preview the first 25 rows after the dtype changes.
df_merged.head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,3108588,1,prior,8,1,14,14.0,12427,1,1,both
1,3108588,1,prior,8,1,14,14.0,196,2,1,both
2,3108588,1,prior,8,1,14,14.0,10258,3,1,both
3,3108588,1,prior,8,1,14,14.0,25133,4,1,both
4,3108588,1,prior,8,1,14,14.0,46149,5,0,both
5,3108588,1,prior,8,1,14,14.0,49235,6,0,both
6,1901567,2,prior,3,1,10,3.0,47766,1,1,both
7,1901567,2,prior,3,1,10,3.0,32792,2,1,both
8,1901567,2,prior,3,1,10,3.0,20574,3,1,both
9,1901567,2,prior,3,1,10,3.0,7781,4,0,both


In [12]:
# (rows, columns) of the frame about to be exported.
df_merged.shape

(32434489, 11)

In [None]:
# 2. Exporting the downcast orders combined DataFrame as a pickle file.
# Note the file name: 'order_product_combined.pkl' is a different file from the
# 'orders_products_combined.pkl' written right after the Dask merge.
downcast_pkl = os.path.join(Path, 'Prepared Data', 'order_product_combined.pkl')
df_merged.to_pickle(downcast_pkl)

In [None]:
# 3. Importing the orders combined DataFrame back from its pickle and checking its shape
downcast_pkl = os.path.join(Path, 'Prepared Data', 'order_product_combined.pkl')
df_merged = pd.read_pickle(downcast_pkl)
df_merged.shape

(32434489, 11)

In [None]:
# 4. Importing the cleaned products DataFrame and checking its shape.
# (The shape of the re-imported combined orders DataFrame shown just above
# coincides with the previously saved version.)
products_csv = os.path.join(Path, 'Prepared Data', 'df_pro_NoNANDups.csv')
df_pro = pd.read_csv(products_csv, index_col=False)
df_pro.shape

(49672, 6)

In [6]:
# Preview the first 20 product rows.
df_pro.head(20)

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3
5,5,6,Dry Nose Oil,11,11,2.6
6,6,7,Pure Coconut Water With Orange,98,7,4.4
7,7,8,Cut Russet Potatoes Steam N' Mash,116,1,1.1
8,8,9,Light Strawberry Blueberry Yogurt,120,16,7.0
9,9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4


In [7]:
# Remove the 'Unnamed: 0' column (it simply repeats the row position in the
# preview above), then list the remaining columns and dtypes.
df_pro = df_pro.drop('Unnamed: 0', axis=1)
df_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49672 entries, 0 to 49671
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  int64  
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  int64  
 3   department_id  49672 non-null  int64  
 4   prices         49672 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.9+ MB


In [8]:
# Summary statistics of the product table; the maxima guide the downcasts below.
# NOTE(review): prices has a max of 99999.0 against a 75th percentile of 11.1 and
# a std of ~453 -- looks like a sentinel or outlier value; confirm before relying
# on price statistics.
df_pro.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [9]:
# Downcast the product columns. product_id uses int32, the same dtype as in
# df_merged. Per describe() above, aisle_id peaks at 134 (too large for int8,
# fits int16) and department_id at 21 (fits int8).
product_dtypes = {
    'product_id': 'int32',
    'aisle_id': 'int16',
    'department_id': 'int8',
    'prices': 'float32',
}
for col_name, target_dtype in product_dtypes.items():
    df_pro[col_name] = df_pro[col_name].astype(target_dtype)
df_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49672 entries, 0 to 49671
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  int32  
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  int16  
 3   department_id  49672 non-null  int8   
 4   prices         49672 non-null  float32
dtypes: float32(1), int16(1), int32(1), int8(1), object(1)
memory usage: 921.8+ KB


In [10]:
# Summary statistics after the downcast; min, quartiles and max match the
# earlier describe() output.
# NOTE(review): the prices std moves slightly (453.6155 -> 453.6073) after the
# float32 cast -- presumably reduced float precision; confirm if exact
# statistics matter.
df_pro.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993281
std,14340.705287,38.315784,5.850779,453.6073
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


# <a id='toc3_'></a>[Combining Products and Combined Orders DataFrames](#toc0_)

## <a id='toc3_1_'></a>[Merging using Pandas](#toc0_)

In [15]:
# Attach the product attributes to every order line. how='inner' is the pandas
# default: only product_ids present in both frames are kept, so the result has
# 32,404,859 rows versus 32,434,489 in df_merged (29,630 order lines have no
# matching row in df_pro). indicator='pro_merged' stores the merge flag under
# that column name, next to the earlier `_merge` flag.
Merged_ord_pro = pd.merge(
    df_merged,
    df_pro,
    how='inner',
    on='product_id',
    indicator='pro_merged',
    suffixes=('_combined', '_pro'),
)
Merged_ord_pro.shape

(32404859, 16)

In [16]:
# Preview the first 20 rows of the orders + products frame.
Merged_ord_pro.head(20)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,pro_merged
0,3108588,1,prior,8,1,14,14.0,12427,1,1,both,Original Beef Jerky,23,19,4.4,both
1,3108588,1,prior,8,1,14,14.0,196,2,1,both,Soda,77,7,9.0,both
2,3108588,1,prior,8,1,14,14.0,10258,3,1,both,Pistachios,117,19,3.0,both
3,3108588,1,prior,8,1,14,14.0,25133,4,1,both,Organic String Cheese,21,16,8.6,both
4,3108588,1,prior,8,1,14,14.0,46149,5,0,both,Zero Calorie Cola,77,7,13.4,both
5,3108588,1,prior,8,1,14,14.0,49235,6,0,both,Organic Half & Half,53,16,1.8,both
6,1901567,2,prior,3,1,10,3.0,47766,1,1,both,Organic Avocado,24,4,6.3,both
7,1901567,2,prior,3,1,10,3.0,32792,2,1,both,Chipotle Beef & Pork Realstick,23,19,5.2,both
8,1901567,2,prior,3,1,10,3.0,20574,3,1,both,Roasted Turkey,96,20,2.3,both
9,1901567,2,prior,3,1,10,3.0,7781,4,0,both,Organic Sticks Low Moisture Part Skim Mozzarel...,21,16,6.6,both


In [17]:
# Summary statistics of the numeric columns after joining the product attributes.
Merged_ord_pro.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id,prices
count,32404860.0,32404860.0,32404860.0,30328760.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0
mean,17.1423,2.738867,13.42515,11.10409,25598.66,8.352547,0.5895873,71.19612,9.919792,11.98022
std,17.53532,2.090077,4.24638,8.377718,14084.0,7.127071,0.4919087,38.21139,6.281485,495.6431
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
25%,5.0,1.0,10.0,5.0,13544.0,3.0,0.0,31.0,4.0,4.2
50%,11.0,3.0,13.0,8.0,25302.0,6.0,1.0,83.0,9.0,7.4
75%,24.0,5.0,16.0,15.0,37947.0,11.0,1.0,107.0,16.0,11.3
max,99.0,6.0,23.0,30.0,49688.0,145.0,1.0,134.0,21.0,99999.0


In [18]:
# Column dtypes and memory footprint of the final frame.
Merged_ord_pro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 16 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                object  
 1   user_id                 object  
 2   eval_set                string  
 3   order_number            int8    
 4   order_day_of_week       int8    
 5   order_hour_of_day       int8    
 6   days_since_prior_order  float32 
 7   product_id              int32   
 8   add_to_cart_order       int16   
 9   reordered               int8    
 10  _merge                  category
 11  product_name            object  
 12  aisle_id                int16   
 13  department_id           int8    
 14  prices                  float32 
 15  pro_merged              category
dtypes: category(2), float32(2), int16(2), int32(1), int8(5), object(3), string(1)
memory usage: 1.8+ GB


In [None]:
# 6. Confirming results with merge flag
# Both merges were inner joins, so 'both'/'both' is the only combination that can
# appear; unmatched rows were already dropped and are not counted here.
merge_flag_cols = ['_merge', 'pro_merged']
Merged_ord_pro[merge_flag_cols].value_counts()

_merge  pro_merged
both    both          32404859
Name: count, dtype: int64

# <a id='toc4_'></a>[Saving combined DataFrame as pickle file](#toc0_)

In [None]:
# Save the final orders + products frame to 'Prepared Data' as ord_pro_merge.pkl
final_pkl = os.path.join(Path, 'Prepared Data', 'ord_pro_merge.pkl')
Merged_ord_pro.to_pickle(final_pkl)