# Exercise Walkthrough (with memory mitigation plan A)

In [1]:
import pandas as pd
import os

In [2]:
# Root folder holding the project's data, relative to this notebook
data_path = r'../2_Data/'

# Build the CSV location first, then load the prior order-product lines from it
prior_csv = os.path.join(data_path, '1_Original_Data', 'orders_products_prior.csv')
df_orders_prior = pd.read_csv(prior_csv, index_col=False)

In [3]:
# (rows, columns) of the freshly loaded orders-prior dataframe
df_orders_prior.shape

(32434489, 4)

In [4]:
# Preview the first rows to see which columns were loaded
df_orders_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


After reaching the final task of the achievement, I understood that `add_to_cart_order` and `reordered` will not be used. Since memory optimization is of the essence, I'm dropping these two fields at this stage, before merging.

In [5]:
# Drop the two columns that are not needed downstream, to save memory before merging.
# Rebinding the name (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer raises KeyError.
df_orders_prior = df_orders_prior.drop(
    columns=['add_to_cart_order', 'reordered'],
    errors='ignore',
)

In [6]:
# Load the already-checked orders table from the prepared-data folder
orders_csv = os.path.join(data_path, '2_Prepared_Data', 'orders_checked.csv')
df_orders = pd.read_csv(orders_csv, index_col=False)

In [7]:
# (rows, columns) of the orders dataframe
df_orders.shape

(3421083, 6)

In [8]:
# Preview the first rows of the orders dataframe
df_orders.head()

Unnamed: 0,order_id,user_id,customer_sequential_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [9]:
# Checking the content of days_since_prior_order.
# dropna=False keeps NaN as its own row in the counts, so missing values are visible too.
df_orders['days_since_prior_order'].value_counts(dropna=False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

Because this column has empty values, pandas imports them as NaN values.
Therefore, by default, it assigns a float64 dtype. But when we inspect the content, every value is a whole number, so this could perfectly well be an integer column. Fortunately, there is an extension datatype called `pd.Int8Dtype()` that stores integers together with missing values.

Read further: https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#integer-na

In [10]:
# Checking dtypes and memory usage of the orders dataframe before any dtype changes
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   order_id                          int64  
 1   user_id                           int64  
 2   customer_sequential_order_number  int64  
 3   orders_day_of_week                int64  
 4   order_hour_of_day                 int64  
 5   days_since_prior_order            float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


In [11]:
# Revise datatypes so the orders dataframe consumes less memory.
# All target dtypes are applied in one astype call through a column -> dtype mapping.
orders_dtypes = {
    'order_id': 'int32',
    'user_id': 'int32',
    'customer_sequential_order_number': 'int8',
    'orders_day_of_week': 'int8',
    'order_hour_of_day': 'int8',
    # This column contains NaN, so it uses pandas' nullable integer extension dtype,
    # which handles integers with possible NaN values.
    # See section "Support for Integer NA" here: https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html
    'days_since_prior_order': pd.Int8Dtype(),
}
df_orders = df_orders.astype(orders_dtypes)

In [12]:
# Checking dtypes and memory usage of the orders dataframe after the dtype changes
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                            Dtype
---  ------                            -----
 0   order_id                          int32
 1   user_id                           int32
 2   customer_sequential_order_number  int8 
 3   orders_day_of_week                int8 
 4   order_hour_of_day                 int8 
 5   days_since_prior_order            Int8 
dtypes: Int8(1), int32(2), int8(3)
memory usage: 42.4 MB


This doesn't look like much, but it will have a relevant impact when merging with the orders_prior dataframe.

In [13]:
# Revise datatypes so the orders-prior dataframe consumes less memory.
# Only the two remaining columns are converted, one column at a time.
# 'reordered' and 'add_to_cart_order' were dropped earlier in this notebook; if they are
# ever taken back, convert them here as well (the earlier plan was int8 for 'reordered'
# and int32 for 'add_to_cart_order').
for column in ('product_id', 'order_id'):
    df_orders_prior[column] = df_orders_prior[column].astype('int32')

In [14]:
# Checking dtypes and memory usage of each dataframe right before the merge

for frame in (df_orders, df_orders_prior):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                            Dtype
---  ------                            -----
 0   order_id                          int32
 1   user_id                           int32
 2   customer_sequential_order_number  int8 
 3   orders_day_of_week                int8 
 4   order_hour_of_day                 int8 
 5   days_since_prior_order            Int8 
dtypes: Int8(1), int32(2), int8(3)
memory usage: 42.4 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 2 columns):
 #   Column      Dtype
---  ------      -----
 0   order_id    int32
 1   product_id  int32
dtypes: int32(2)
memory usage: 247.5 MB


In [15]:
# Merging: attach each order's attributes to its product lines.
# how='inner' is pandas' default and is spelled out here so the join type is visible.
# validate='one_to_many' makes pandas raise a MergeError if order_id is duplicated in
# df_orders, which would silently multiply product lines.
# indicator=True is kept because later cells read and then drop the `_merge` column.

df_merged_large = df_orders.merge(
    df_orders_prior,
    on='order_id',
    how='inner',
    validate='one_to_many',
    indicator=True,
)

# An inner join only keeps matching keys, so `_merge` is 'both' on every surviving row and
# cannot reveal product lines that were dropped. With unique order_ids on the orders side,
# comparing row counts is what shows that every product line found its order.
assert len(df_merged_large) == len(df_orders_prior), (
    f'{len(df_orders_prior) - len(df_merged_large)} product lines had no matching order'
)

In [16]:
# Checking the merge indicator.
# NOTE: the merge above is an inner join (pandas' default `how`), which keeps only matching
# keys, so every row is flagged 'both'. Zero left_only/right_only here does not by itself
# prove that no rows were dropped; compare the 'both' count with the row count of
# df_orders_prior to confirm that every product line matched an order.

df_merged_large['_merge'].value_counts(dropna = False)

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64

In [17]:
# After successful verification, drop the helper indicator column.
# Rebinding the name (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once `_merge` is already gone no longer raises KeyError.
df_merged_large = df_merged_large.drop(columns='_merge', errors='ignore')

In [18]:
# Export the merged dataframe to pkl in the prepared-data folder

combined_pkl = os.path.join(data_path, '2_Prepared_Data', 'orders_products_combined.pkl')
df_merged_large.to_pickle(combined_pkl)