# Table of Contents
## 01. Importing Libraries
## 02. Importing dataframes
## 03. If-Statements with User-Defined Functions
## 04. If-statements with the loc() function
## 05. If-statements with For-Loops
## 06. The Busiest days of the week
## 07. The Busiest period of the day
## 08. Export Data

# 01 Importing libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# 02 Importing data frame

In [2]:
# Load the merged orders/products dataframe from its pickle file
merged_pickle_path = '../02 Data/Prepared Data/orders_products_merged.pkl'
ords_prods_merged = pd.read_pickle(merged_pickle_path)

In [3]:
# Create subset with the first million rows of the dataframe to avoid memory issues.
# .copy() makes the subset an independent dataframe, so the columns added to it
# in later cells neither raise SettingWithCopyWarning nor rely on view semantics.
df = ords_prods_merged[:1000000].copy()

In [4]:
df.shape

(1000000, 15)

In [5]:
# Silence every warning category for the rest of the session
import warnings
warnings.simplefilter("ignore")

# If-Statements with User-Defined Functions

In [6]:
# Define a function that labels a row by its price bracket
def price_label(row):
    """Return the price-range label for one dataframe row.

    Args:
        row: Mapping-like object (e.g. a pandas Series) with a 'prices' entry.

    Returns:
        'Low-range product' for prices <= 5, 'Mid-range product' for prices
        above 5 and up to 15, 'High-range product' for prices above 15, and
        'Not enough data' when the price satisfies none of the comparisons
        (e.g. NaN).
    """
    price = row['prices']
    if price <= 5:
        return 'Low-range product'
    # Anything that reaches this point already failed `price <= 5`.
    if price <= 15:
        return 'Mid-range product'
    if price > 15:
        # Label spelled like its two siblings ('<X>-range product').
        return 'High-range product'
    # Reached only when every comparison is false, e.g. a NaN price.
    return 'Not enough data'

In [7]:
# Label every row of the subset by calling price_label once per row
df['price_range'] = df.apply(price_label, axis='columns')

In [8]:
df['price_range'].value_counts(dropna=False)

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

In [9]:
df['prices'].max()

14.8

# If-statements with the loc() function

In [16]:
# Rows priced above 15 get the high-range label
is_high_price = df['prices'].gt(15)
df.loc[is_high_price, 'price_range_loc'] = 'High range product'

In [17]:
# Rows priced above 5 and up to 15 get the mid-range label
is_mid_price = df['prices'].gt(5) & df['prices'].le(15)
df.loc[is_mid_price, 'price_range_loc'] = 'Mid range product'

In [18]:
# Rows priced at 5 or below get the low-range label
is_low_price = df['prices'].le(5)
df.loc[is_low_price, 'price_range_loc'] = 'Low range product'

In [19]:
df['price_range_loc'].value_counts(dropna=False)

Mid range product    756450
Low range product    243550
Name: price_range_loc, dtype: int64

In [20]:
# Apply the loc-based labelling to the entire dataframe: prices above 15
high_price_rows = ords_prods_merged['prices'].gt(15)
ords_prods_merged.loc[high_price_rows, 'price_range_loc'] = 'High range product'

In [21]:
# Entire dataframe: prices above 5 and up to 15
mid_price_rows = ords_prods_merged['prices'].gt(5) & ords_prods_merged['prices'].le(15)
ords_prods_merged.loc[mid_price_rows, 'price_range_loc'] = 'Mid range product'

In [22]:
# Entire dataframe: prices of 5 or below
low_price_rows = ords_prods_merged['prices'].le(5)
ords_prods_merged.loc[low_price_rows, 'price_range_loc'] = 'Low range product'

In [23]:
ords_prods_merged['price_range_loc'].value_counts(dropna=False)

Mid range product     21860860
Low range product     10126321
High range product      417678
Name: price_range_loc, dtype: int64

# If-statements with For-Loops

In [24]:
ords_prods_merged['orders_day_of_week'].value_counts(dropna=False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: orders_day_of_week, dtype: int64

In [25]:
# One label per row: day 0 is the busiest, day 4 the least busy, the rest regular
result = [
    'Busiest day' if day == 0
    else 'Least busy' if day == 4
    else 'Regularly busy'
    for day in ords_prods_merged['orders_day_of_week']
]

In [26]:
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Reg

In [27]:
# Store the per-row labels collected in `result` as a new 'busiest_day' column
ords_prods_merged['busiest_day']=result

In [28]:
ords_prods_merged['busiest_day'].value_counts(dropna=False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

# The Busiest days of the week

In [29]:
# Build per-row labels: days 0 and 1 are the busiest, days 3 and 4 the slowest
result_2 = [
    'Busiest days' if day in (0, 1)
    else 'Slowest days' if day in (4, 3)
    else 'Regularly busy'
    for day in ords_prods_merged['orders_day_of_week']
]

In [30]:
result_2

['Regularly busy',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Slowest days',
 'Regularly busy',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Slowest days',
 'Slowest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest 

In [31]:
# Store the per-row labels collected in `result_2` as a new 'busiest_days' column
ords_prods_merged['busiest_days']=result_2

In [32]:
ords_prods_merged['busiest_days'].value_counts(dropna=False)

Regularly busy    12916111
Busiest days      11864412
Slowest days       7624336
Name: busiest_days, dtype: int64

In [33]:
# Check output
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days
0,2539329,1,1,2,8,,Yes,196,1,0,Soda,77,7,9.0,both,Mid range product,Regularly busy,Regularly busy
1,2398795,1,2,3,7,15.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Regularly busy,Slowest days
2,473747,1,3,3,12,21.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Regularly busy,Slowest days
3,2254736,1,4,4,7,29.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Least busy,Slowest days
4,431534,1,5,4,15,28.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Least busy,Slowest days


In [34]:
ords_prods_merged.shape

(32404859, 18)

Q3: I noticed that every row has been labelled, since the value counts of the new column sum to 32,404,859, which is the same as the total number of rows in the dataframe.

# The Busiest period of the day

In [35]:
ords_prods_merged['order_hour_of_day'].value_counts(dropna=False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

In [36]:
# Build per-row labels for the order hour: peak hours, quiet hours, or in between
peak_hours = (9, 10, 11, 12, 13, 14, 15, 16, 17)
quiet_hours = (0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23)
result_3 = [
    'Most orders' if hour in peak_hours
    else 'Fewest orders' if hour in quiet_hours
    else 'Average orders'
    for hour in ords_prods_merged['order_hour_of_day']
]

In [37]:
result_3

['Average orders',
 'Fewest orders',
 'Most orders',
 'Fewest orders',
 'Most orders',
 'Fewest orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Fewest orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Fewest orders',
 'Most orders',
 'Most orders',
 'Most orders',
 '

In [38]:
# Store the per-row labels collected in `result_3` as a new 'busiest_period_of_day' column
ords_prods_merged['busiest_period_of_day']=result_3

In [39]:
ords_prods_merged['busiest_period_of_day'].value_counts(dropna=False)

Most orders       23205725
Average orders     4612925
Fewest orders      4586209
Name: busiest_period_of_day, dtype: int64

In [40]:
# Check output
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,Yes,196,1,0,Soda,77,7,9.0,both,Mid range product,Regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Regularly busy,Slowest days,Fewest orders
2,473747,1,3,3,12,21.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Regularly busy,Slowest days,Most orders
3,2254736,1,4,4,7,29.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Least busy,Slowest days,Fewest orders
4,431534,1,5,4,15,28.0,No,196,1,1,Soda,77,7,9.0,both,Mid range product,Least busy,Slowest days,Most orders


In [41]:
ords_prods_merged.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,aisle_id,department_id,prices
count,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,30328760.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0
mean,1710745.0,102937.2,17.1423,2.738867,13.42515,11.10408,25598.66,8.352547,0.5895873,71.19612,9.919792,11.98023
std,987298.8,59466.1,17.53532,2.090077,4.24638,8.779064,14084.0,7.127071,0.4919087,38.21139,6.281485,495.6554
min,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
25%,855947.0,51422.0,5.0,1.0,10.0,5.0,13544.0,3.0,0.0,31.0,4.0,4.2
50%,1711049.0,102616.0,11.0,3.0,13.0,8.0,25302.0,6.0,1.0,83.0,9.0,7.4
75%,2565499.0,154389.0,24.0,5.0,16.0,15.0,37947.0,11.0,1.0,107.0,16.0,11.3
max,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0,134.0,21.0,99999.0


# Export prepared dataframe

In [42]:
# Save the dataframe, including the newly added label columns, as a pickle file
export_path = '../02 Data/Prepared Data/orders_products_merged_additional_columns.pkl'
ords_prods_merged.to_pickle(export_path)