# Creation of new columns in orders_products_merged dataset

## List of Contents
##### 1.1 Creating price_label column {using the If-statements with the loc () function}
##### 1.2 Creating busiest_day column {using If- statements with For-Loops}
##### 2.1 Clients prefer the inclusion of two Busiest and Slowest days (not 'Busiest day')
##### 2.2 Clients also need to know the busiest periods in the day (according to level of orders by clients)
- Exporting dataframe in Pickle format

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [3]:
# Import dataset for the exercise: orders_products_merged as df_mega.
# The project folder is kept in `path` and the file location is assembled with
# os.path.join, the same way the export cell at the end of the notebook builds
# its target, so the machine-specific folder is stated in one obvious place.
# NOTE(review): read_pickle can execute arbitrary code while unpickling - only
# load pickle files that were produced by this project.

path = r'C:\Users\IDONG'

df_mega = pd.read_pickle(os.path.join(path, 'Prepared data', 'orders_products_merged.pkl'))

In [4]:
# Preview the dataframe: the first five rows here, the last five rows in the next cell

df_mega.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.800781,3139998,138,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.800781,1977647,138,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.800781,389851,709,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.800781,652770,764,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.800781,1813452,764,3,4,17,9.0,11,1,both


In [5]:
# Preview the last five rows of the dataframe
df_mega.tail()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
32433025,49688,Fresh Foaming Cleanser,73,11,13.5,1788356,200215,2,0,9,5.0,27,0,both
32433026,49688,Fresh Foaming Cleanser,73,11,13.5,3401313,200377,1,4,11,,5,0,both
32433027,49688,Fresh Foaming Cleanser,73,11,13.5,809510,200873,5,3,8,15.0,12,0,both
32433028,49688,Fresh Foaming Cleanser,73,11,13.5,2359893,200873,9,3,15,5.0,11,1,both
32433029,49688,Fresh Foaming Cleanser,73,11,13.5,2385091,205926,11,1,15,6.0,35,0,both


In [6]:
# Determine the number of rows and columns, returned as a (rows, columns) tuple

df_mega.shape

(32433030, 14)

In [7]:
# Also check the dtype of each column and the memory space consumed by the dataframe

df_mega.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32433030 entries, 0 to 32433029
Data columns (total 14 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   product_id              int32   
 1   product_name            object  
 2   aisle_id                int16   
 3   department_id           int8    
 4   prices                  float16 
 5   order_id                int32   
 6   user_id                 int32   
 7   order_number            int8    
 8   orders_day_of_week      int8    
 9   order_hour_of_day       int8    
 10  days_since_prior_order  float16 
 11  add_to_cart_order       int8    
 12  reordered               int8    
 13  _merge                  category
dtypes: category(1), float16(2), int16(1), int32(3), int8(6), object(1)
memory usage: 1.2+ GB


In [8]:
# At this point (for the sake of memory space), the '_merge' column can be dropped because it does not appear relevant.
# errors = 'ignore' keeps this cell safe to re-run: once '_merge' has been removed,
# running the cell again is a no-op instead of raising a KeyError.

df_mega = df_mega.drop(columns = ['_merge'], errors = 'ignore')

##  1.1 Creating price_label column {using the If-statements with the loc () function}

In [9]:
# .loc with a boolean mask was chosen because it is efficient and labels every
# matching row of the entire dataframe (not just a subset) in a single assignment.
# Three price labels will be assigned, one cell per label:
#   prices > 15         -> 'High range product'
#   5 < prices <= 15    -> 'Mid range product'
#   prices <= 5         -> 'Low range product'

df_mega.loc[df_mega['prices'].gt(15), 'price_range_loc'] = 'High range product'

In [10]:
# Prices above 5 and up to (and including) 15 are labelled as mid range
df_mega.loc[df_mega['prices'].gt(5) & df_mega['prices'].le(15), 'price_range_loc'] = 'Mid range product'

In [11]:
# Prices of 5 or less are labelled as low range
df_mega.loc[df_mega['prices'].le(5), 'price_range_loc'] = 'Low range product'

In [12]:
# Cross-check the counts to make sure all rows were assigned a label
# (dropna = False means any unlabelled rows would be listed as NaN)

df_mega['price_range_loc'].value_counts(dropna = False)

Mid range product     21889009
Low range product     10126339
High range product      417682
Name: price_range_loc, dtype: int64

In [13]:
# 21889009 + 10126339 + 417682 = 32433030, which is the total number of rows, so every row has a price label.
# We can also review the updated dataframe

df_mega.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,price_range_loc
0,1,Chocolate Sandwich Cookies,61,19,5.800781,3139998,138,28,6,11,3.0,5,0,Mid range product
1,1,Chocolate Sandwich Cookies,61,19,5.800781,1977647,138,30,6,17,20.0,1,1,Mid range product
2,1,Chocolate Sandwich Cookies,61,19,5.800781,389851,709,2,0,21,6.0,20,0,Mid range product
3,1,Chocolate Sandwich Cookies,61,19,5.800781,652770,764,1,3,13,,10,0,Mid range product
4,1,Chocolate Sandwich Cookies,61,19,5.800781,1813452,764,3,4,17,9.0,11,1,Mid range product


In [14]:
# The newly created column is at the extreme right. We can also reconfirm the number of rows and columns

# N.B. The number of columns is still 14 because the '_merge' column was removed before 'price_range_loc' was added

df_mega.shape

(32433030, 14)

## 1.2 Creating busiest_day column {using If- statements with For-Loops}

In [15]:
# First we establish how many rows of df_mega fall on each day of the week
# (this row count is used below as the measure of how busy a day is)

df_mega['orders_day_of_week'].value_counts(dropna = False)

0    6209410
1    5665604
6    4500101
2    4217610
5    4209334
3    3843929
4    3787042
Name: orders_day_of_week, dtype: int64

In [16]:
# It has been established that the numbers 0 to 6 each represent a day of the week.
# From the counts above, day 0 is the busiest and day 4 is the least busy;
# every other day is tagged as regularly busy.
# The if / else test is applied to each value while looping over the column,
# producing one label per row in 'result'.

result = [
    'Busiest day' if day == 0 else 'Least busy' if day == 4 else 'Regularly busy'
    for day in df_mega['orders_day_of_week']
]

In [17]:
# To include the labels in the df_mega dataframe, 'result' is assigned to a new 'activity_level' column

df_mega['activity_level'] = result

In [18]:
# Confirm all rows were assigned an activity level tag
# (dropna = False means any unlabelled rows would be listed as NaN)

df_mega['activity_level'].value_counts(dropna = False)

Regularly busy    22436578
Busiest day        6209410
Least busy         3787042
Name: activity_level, dtype: int64

In [19]:
# Just like the price label, all rows have an activity level assigned (22436578 + 6209410 + 3787042 = 32433030).
# Review the dataframe to confirm the new column was included (situated at the far right end)

df_mega.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,price_range_loc,activity_level
0,1,Chocolate Sandwich Cookies,61,19,5.800781,3139998,138,28,6,11,3.0,5,0,Mid range product,Regularly busy
1,1,Chocolate Sandwich Cookies,61,19,5.800781,1977647,138,30,6,17,20.0,1,1,Mid range product,Regularly busy
2,1,Chocolate Sandwich Cookies,61,19,5.800781,389851,709,2,0,21,6.0,20,0,Mid range product,Busiest day
3,1,Chocolate Sandwich Cookies,61,19,5.800781,652770,764,1,3,13,,10,0,Mid range product,Regularly busy
4,1,Chocolate Sandwich Cookies,61,19,5.800781,1813452,764,3,4,17,9.0,11,1,Mid range product,Least busy


In [20]:
# Confirming the new number of columns

df_mega.shape

(32433030, 15)

## 2.1 Clients prefer the inclusion of two Busiest and Slowest days (not 'Busiest day')

In [21]:
# For the sake of memory space conservation, the 'activity_level' column is removed; it is replaced below with the client's preference.
# errors = 'ignore' keeps this cell safe to re-run: once 'activity_level' has been
# removed, running the cell again is a no-op instead of raising a KeyError.

df_mega = df_mega.drop(columns = ['activity_level'], errors = 'ignore')

In [22]:
# Confirming this removal: the column count should be back to 14

df_mega.shape

(32433030, 14)

In [23]:
# Now to modify the dataframe to include a new variable that highlights the two busiest and the two slowest days
# Let's recap the row counts for each day of the week

df_mega['orders_day_of_week'].value_counts(dropna = False)

0    6209410
1    5665604
6    4500101
2    4217610
5    4209334
3    3843929
4    3787042
Name: orders_day_of_week, dtype: int64

In [24]:
# 0 & 1 are the two busiest days, while 3 & 4 are the slowest days (2, 5 and 6 represent regular days).
# The if / elif / else rules are evaluated for the whole column at once with
# np.select instead of a row-by-row Python loop over ~32 million values.
# Conditions are tested in order and the first match wins; rows matching neither
# condition receive the default.

day_of_week = df_mega['orders_day_of_week'].to_numpy()

# dtype = object keeps the labels as ordinary Python strings, so the column
# created from 'result' has the same object dtype as one built from a list.
day_labels = np.array(['Busiest days', 'Slowest days', 'Regular days'], dtype = object)

# np.select returns, for every row, the position of its label in day_labels;
# indexing day_labels with those positions yields one label per row.
result = day_labels[
    np.select(
        [day_of_week <= 1, (day_of_week >= 3) & (day_of_week <= 4)],
        [0, 1],
        default = 2,
    )
]

In [25]:
# The new variable/column will be 'activity_levels', holding one label from 'result' per row

df_mega['activity_levels'] = result

In [26]:
# Cross-check the counts to ensure accuracy (dropna = False means any unlabelled rows would be listed as NaN)

df_mega['activity_levels'].value_counts(dropna = False)

Regular days    12927045
Busiest days    11875014
Slowest days     7630971
Name: activity_levels, dtype: int64

##### **The three counts add up to 32433030, the total number of rows. The counts for days 0 & 1 add up to 11875014 (busiest days); likewise days 3 & 4 add up to 7630971 (slowest days) and days 2, 5 & 6 add up to 12927045 (regular days).**

In [27]:
# Check the first rows to confirm the 'activity_levels' column was included

df_mega.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,price_range_loc,activity_levels
0,1,Chocolate Sandwich Cookies,61,19,5.800781,3139998,138,28,6,11,3.0,5,0,Mid range product,Regular days
1,1,Chocolate Sandwich Cookies,61,19,5.800781,1977647,138,30,6,17,20.0,1,1,Mid range product,Regular days
2,1,Chocolate Sandwich Cookies,61,19,5.800781,389851,709,2,0,21,6.0,20,0,Mid range product,Busiest days
3,1,Chocolate Sandwich Cookies,61,19,5.800781,652770,764,1,3,13,,10,0,Mid range product,Slowest days
4,1,Chocolate Sandwich Cookies,61,19,5.800781,1813452,764,3,4,17,9.0,11,1,Mid range product,Slowest days


In [28]:
# The newest column is located at the extreme right end; confirm the updated number of columns

df_mega.shape

(32433030, 15)

## 2.2 Clients also need to know the busiest periods in the day (according to level of orders by clients)

In [29]:
# For this, a similar procedure will be carried out on the 'order_hour_of_day' column.
# First, the frequency is determined for each hour of the day

df_mega['order_hour_of_day'].value_counts(dropna = False)

10    2764288
11    2738483
14    2691448
15    2664420
13    2663169
12    2620719
16    2537358
9     2456591
17    2089385
8     1719888
18    1637858
19    1259335
20     976991
7      891900
21     796341
22     634715
23     402593
6      290763
0      218925
1      115780
5       88054
2       69429
4       53280
3       51317
Name: order_hour_of_day, dtype: int64

In [30]:
# The client requires three categories - 'Most orders', 'Average orders' & 'Fewest orders'.
# Reading the frequency table above from top to bottom splits it into 3 zones:
#   - from the top row down to hour 9 (hours 9 to 16)       -> 'Most orders'
#   - from hour 6 down to the bottom row (hours 0 to 6)     -> 'Fewest orders'
#   - the rows in between (hours 7, 8 and 17 to 23)         -> 'Average orders'
# The if / elif / else rules are evaluated for the whole column at once with
# np.select instead of a row-by-row Python loop over ~32 million values.
# Conditions are tested in order and the first match wins; rows matching neither
# condition receive the default.

hour_of_day = df_mega['order_hour_of_day'].to_numpy()

# dtype = object keeps the labels as ordinary Python strings, so the column
# created from 'result' has the same object dtype as one built from a list.
period_labels = np.array(['Most orders', 'Fewest orders', 'Average orders'], dtype = object)

# np.select returns, for every row, the position of its label in period_labels;
# indexing period_labels with those positions yields one label per row.
result = period_labels[
    np.select(
        [(hour_of_day >= 9) & (hour_of_day <= 16), hour_of_day <= 6],
        [0, 1],
        default = 2,
    )
]

In [31]:
# The new variable/column will be 'busy_period_levels', holding one label from 'result' per row

df_mega['busy_period_levels'] = result

In [32]:
# Cross-check the counts to ensure accuracy (dropna = False means any unlabelled rows would be listed as NaN)

df_mega['busy_period_levels'].value_counts(dropna = False)

Most orders       21136476
Average orders    10409006
Fewest orders       887548
Name: busy_period_levels, dtype: int64

In [33]:
# N.B. - It is quite convenient that the busiest hours form one continuous range (9am to 4pm) and so do the slowest hours (midnight to 6am).
# This made it easy to express the busiest hours as 9 to 16 and the slowest hours as 0 to 6 when labelling the rows

# A quick overview of the dataframe to be presented to the clients

df_mega.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,price_range_loc,activity_levels,busy_period_levels
0,1,Chocolate Sandwich Cookies,61,19,5.800781,3139998,138,28,6,11,3.0,5,0,Mid range product,Regular days,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.800781,1977647,138,30,6,17,20.0,1,1,Mid range product,Regular days,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.800781,389851,709,2,0,21,6.0,20,0,Mid range product,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.800781,652770,764,1,3,13,,10,0,Mid range product,Slowest days,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.800781,1813452,764,3,4,17,9.0,11,1,Mid range product,Slowest days,Average orders


In [34]:
# Confirm the final shape as well: the three derived columns bring the total to 16

df_mega.shape

(32433030, 16)

##### Exporting dataframe in Pickle format

In [35]:
# First define the project folder that the export location is built from

path = r'C:\Users\IDONG'

In [36]:
# Export the dataframe as a pickle file into the 'Prepared Data' folder under `path`
# NOTE(review): the import cell reads from 'Prepared data' (lower-case 'd') - confirm both spellings refer to the same folder

df_mega.to_pickle(os.path.join(path, 'Prepared Data', 'orders_products_clients.pkl'))