## 4.6 Combining and Exporting Data Part 2

#### This script contains the following points:

### 0.0 Preparational Steps
### 1.0 Combining Dataframes
### 2.0 Data Export

## 0.0 Preparational Steps

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
# defining path
# Project root folder. The machine-specific default below can be overridden by
# setting the INSTACART_PROJECT_PATH environment variable, so the notebook can
# run on another machine without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\chris\OneDrive\Desktop\Data Analytics CF\202203_Instacart Basket Analysis',
)

In [3]:
# Load the previously combined orders/products frame from its pickle file
combined_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
df_ords_prods_combined = pd.read_pickle(combined_pickle_file)

In [4]:
# Load the cleaned products table from the prepared-data CSV
products_csv_file = os.path.join(path, '02 Data', 'Prepared Data', 'df_prods_checked_final.csv')
df_prods_clean = pd.read_csv(products_csv_file)

In [5]:
# Row and column count of the combined orders/products frame
df_ords_prods_combined.shape

(32434489, 11)

In [6]:
# Row and column count of the cleaned products frame
df_prods_clean.shape

(49672, 6)

In [7]:
# Summary statistics of the numeric columns (sanity check of value ranges)
df_ords_prods_combined.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,30356420.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,102937.2,17.14205,2.738818,13.42498,11.10407,25576.34,8.351076,0.5896975
std,987300.7,59466.48,17.53504,2.090049,4.246365,8.778914,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,855943.0,51421.0,5.0,1.0,10.0,5.0,13530.0,3.0,0.0
50%,1711048.0,102611.0,11.0,3.0,13.0,8.0,25256.0,6.0,1.0
75%,2565514.0,154391.0,24.0,5.0,16.0,15.0,37935.0,11.0,1.0
max,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0


In prior attempts to merge the dataframes with the 'indicator = True' argument, the merge could not be executed because a '_merge' column already existed. A cell was therefore added to check the column names of both dataframes and find out where this column came from.

In [8]:
# List the column names to locate the pre-existing '_merge' column
print(df_ords_prods_combined.columns)

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', '_merge'],
      dtype='object')


In [9]:
# Remove the '_merge' indicator column already present in the loaded frame so
# that indicator=True can be used in the merge below. errors='ignore' keeps this
# cell idempotent: re-running it after the column is gone no longer raises a
# KeyError.
df_ords_prods_combined = df_ords_prods_combined.drop(columns=['_merge'], errors='ignore')

In [10]:
# Summary statistics of the numeric columns of the products frame
df_prods_clean.describe()

Unnamed: 0.1,Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0,49672.0
mean,24852.005053,24850.349775,67.762442,11.728942,9.993282
std,14342.265579,14340.705287,38.315784,5.850779,453.615536
min,0.0,1.0,1.0,1.0,1.0
25%,12432.75,12432.75,35.0,7.0,4.1
50%,24851.5,24850.5,69.0,13.0,7.1
75%,37272.25,37268.25,100.0,17.0,11.1
max,49692.0,49688.0,134.0,21.0,99999.0


In [11]:
# List the column names of the products frame
print(df_prods_clean.columns)

Index(['Unnamed: 0', 'product_id', 'product_name', 'aisle_id', 'department_id',
       'prices'],
      dtype='object')


In [12]:
# Drop the 'Unnamed: 0' column that came in with the CSV (presumably the index
# written out when the file was saved - TODO confirm in the exporting notebook).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# gone no longer raises a KeyError.
df_prods_clean = df_prods_clean.drop(columns=['Unnamed: 0'], errors='ignore')

In [13]:
# Preview the first rows of the products frame after dropping the extra column
df_prods_clean.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


## 1.0 Combining Dataframes

The appropriate key column for combining the dataframes is: product_id
As all order information should be kept, I will use a left join.

In [14]:
# defining new df for the merged data
# Left join: keep every row of the orders frame and attach the product
# attributes where the product_id is known; the '_merge' indicator column
# records whether a match was found.
df_orders_products_merged = df_ords_prods_combined.merge(
    df_prods_clean, on='product_id', how='left', indicator=True
)

# A left join only preserves the left row count when the right-hand key is
# unique. Report any surplus rows instead of letting them pass silently
# (pass validate='many_to_one' to merge() to turn this into a hard failure).
extra_rows = len(df_orders_products_merged) - len(df_ords_prods_combined)
if extra_rows:
    print(f'WARNING: left join added {extra_rows} rows - product_id is not unique in df_prods_clean')

In [15]:
# Count rows per merge outcome (both / left_only / right_only) for the left join
df_orders_products_merged['_merge'].value_counts()

both          32404859
left_only        30200
right_only           0
Name: _merge, dtype: int64

In [16]:
# Outer join of the same two frames on product_id; the '_merge' indicator
# column shows which side each row came from
df_outer_merge = pd.merge(
    df_ords_prods_combined,
    df_prods_clean,
    how='outer',
    on='product_id',
    indicator=True,
)

In [17]:
# Count rows per merge outcome (both / left_only / right_only) for the outer join
df_outer_merge['_merge'].value_counts()

both          32404859
left_only        30200
right_only          11
Name: _merge, dtype: int64

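The results tell me that 30200 rows (ordered items) in the orders df have a product_id with no matching product definition in the products df.
Moreover, 11 product entries have no orders in the orders df.
Note that 'both' and 'left_only' add up to 32,435,059 rows, which is 570 more than the 32,434,489 rows of the orders df; this indicates that some product_id values occur more than once in the products df and should be checked.
As I'm only interested in the left join, I will keep and export df_orders_products_merged.
Due to its size, I will export it as a pickle file.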

## 2.0 Data Export

In [18]:
# Write the left-joined frame to the prepared-data folder; pickle format is
# used because of the frame's size
merged_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
df_orders_products_merged.to_pickle(merged_pickle_file)