# 01. Merging datasets (orders_products_combined & products_checked)

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [3]:
# Import the first dataset - orders_products_combined as df_merged

df_merged = pd.read_pickle(r'C:\Users\IDONG\Prepared data\orders_products_combined.pkl')

In [4]:
# Checking the dimensions(rows and columns) of the imported dataset

df_merged.shape

(32434489, 10)

In [5]:
# Having an overview of the dataset as well

df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


### Steps will be taken to adjust the memory space consumed by this dataset

In [7]:
# Conserving memory space by changing the storage of integer values from int64 to a lower value (int32, int16 or int8) 

df_merged[['order_hour_of_day']] = df_merged[['order_hour_of_day']].astype('int8')

In [9]:
# Confirmation of the memory space consumption

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 10 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   order_number            int64   
 3   orders_day_of_week      int64   
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float64 
 6   product_id              int64   
 7   add_to_cart_order       int64   
 8   reordered               int64   
 9   _merge                  category
dtypes: category(1), float64(1), int64(7), int8(1)
memory usage: 2.2 GB


In [10]:
df_merged[['orders_day_of_week']] = df_merged[['orders_day_of_week']].astype('int8')

In [11]:
# review of the rare end of the dataset

df_merged.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
32434484,2977660,206209,13,1,12,7.0,14197,5,1,both
32434485,2977660,206209,13,1,12,7.0,38730,6,0,both
32434486,2977660,206209,13,1,12,7.0,31477,7,0,both
32434487,2977660,206209,13,1,12,7.0,6567,8,0,both
32434488,2977660,206209,13,1,12,7.0,22920,9,0,both


In [12]:
# Further reduction in storage space consumed to improve efficiency

df_merged[['order_number']] = df_merged[['order_number']].astype('int8')

In [13]:
df_merged[['add_to_cart_order']] = df_merged[['add_to_cart_order']].astype('int8')

In [14]:
df_merged[['reordered']] = df_merged[['reordered']].astype('int8')

In [15]:
# Check to reconfirm these measures are adequate 
# N.B reduction from 2.2 GB to 1.4 GB has been achieved

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 10 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   order_number            int8    
 3   orders_day_of_week      int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float64 
 6   product_id              int64   
 7   add_to_cart_order       int8    
 8   reordered               int8    
 9   _merge                  category
dtypes: category(1), float64(1), int64(3), int8(5)
memory usage: 1.4 GB


In [17]:
df_merged[['days_since_prior_order']] = df_merged[['days_since_prior_order']].astype('float16')

In [18]:
df_merged[['product_id']] = df_merged[['product_id']].astype('int32')

In [19]:
df_merged[['user_id']] = df_merged[['user_id']].astype('int32')

In [20]:
df_merged[['order_id']] = df_merged[['order_id']].astype('int32')

In [21]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 10 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   user_id                 int32   
 2   order_number            int8    
 3   orders_day_of_week      int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float16 
 6   product_id              int32   
 7   add_to_cart_order       int8    
 8   reordered               int8    
 9   _merge                  category
dtypes: category(1), float16(1), int32(3), int8(5)
memory usage: 866.1 MB


In [22]:
# The _merge was not required so it was removed, also to acquire more memoory space. Current dataset consumes 835.2 MB

df_merged = df_merged.drop(columns = ['_merge'])

In [23]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 9 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   orders_day_of_week      int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
 6   product_id              int32  
 7   add_to_cart_order       int8   
 8   reordered               int8   
dtypes: float16(1), int32(3), int8(5)
memory usage: 835.2 MB


### Importing second dataset (the same memory space technique would be applied)

In [26]:
# Import the second dataset - products_checked as df_prods_ch

df_prods_ch = pd.read_csv(r'C:\Users\IDONG\Prepared data\products_checked.csv', index_col = 0)

In [27]:
# Review of the dataset

df_prods_ch.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [28]:
# Examining the rows and columns

df_prods_ch.shape

(49688, 5)

In [29]:
# Confirming the memory space consumed by the dataset 

df_prods_ch.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49688 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49688 non-null  int64  
 1   product_name   49672 non-null  object 
 2   aisle_id       49688 non-null  int64  
 3   department_id  49688 non-null  int64  
 4   prices         49688 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 2.3+ MB


In [30]:
# Checking the rare end of the set as well

df_prods_ch.tail()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49692,49688,Fresh Foaming Cleanser,73,11,13.5


In [31]:
# Commencement of reduction of space consumed by dataset

df_prods_ch[['product_id']] = df_prods_ch[['product_id']].astype('int32')

In [33]:
# For proper reduction of memory space consumed, the stats of the dataset need to be ascertained. So as to determine if int8,
# int16 or int32 would be best appropriated to the values per given column

df_prods_ch.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49688.0,49688.0,49688.0,49688.0
mean,24844.50004,67.769582,11.728687,9.994254
std,14343.834402,38.316162,5.85041,453.542503
min,1.0,1.0,1.0,1.0
25%,12422.75,35.0,7.0,4.1
50%,24844.5,69.0,13.0,7.1
75%,37266.25,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


In [34]:
df_prods_ch[['aisle_id']] = df_prods_ch[['aisle_id']].astype('int16')

In [35]:
df_prods_ch[['department_id']] = df_prods_ch[['department_id']].astype('int8')

In [36]:
df_prods_ch[['prices']] = df_prods_ch[['prices']].astype('float16')

In [37]:
# Reduction of space consumed was reduced from 2.3+ to 1.2+ M.B

df_prods_ch.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49688 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49688 non-null  int32  
 1   product_name   49672 non-null  object 
 2   aisle_id       49688 non-null  int16  
 3   department_id  49688 non-null  int8   
 4   prices         49688 non-null  float16
dtypes: float16(1), int16(1), int32(1), int8(1), object(1)
memory usage: 1.2+ MB


## Merging of the datasets proper

In [39]:
# 'Product_id' serves as the common key column between the two datasets. The merged sets will represented as df_mega

df_mega = df_prods_ch.merge(df_merged, on = 'product_id', indicator = True)

In [40]:
# A review of the newly merged dataset

df_mega.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.800781,3139998,138,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.800781,1977647,138,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.800781,389851,709,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.800781,652770,764,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.800781,1813452,764,3,4,17,9.0,11,1,both


In [41]:
# Confirming the result of the merge using the merge flag

df_mega['_merge'].value_counts()

both          32433030
left_only            0
right_only           0
Name: _merge, dtype: int64

In [43]:
# It would be wrong to assume that the merge (using the 'product_id' key column) was complete in both data frames, despite the 
# output of the last command. Using the 'how=outer' is sure to confirm that the merge was not a full match

In [42]:
# Ascertaining the total number of rows and columns 

df_mega.shape

(32433030, 14)

### Exporting the merged data in pickle format as orders_products_merged

In [44]:
# First define path

path = r'C:\Users\IDONG'

In [45]:
# Export data to pkl

df_mega.to_pickle(os.path.join(path, 'Prepared Data', 'orders_products_merged.pkl'))