# 4.7 Deriving New Variables

### This script contains the following points:
1. Importing Libraries
2. Importing Data 
3. Checking Data
4. Creating 'price label' column
5. Creating 'busiest_day' column
6. Creating 'busiest_days' column
7. Creating 'busiest_period_of_day' column
8. Exporting Data

# 01. Importing Libraries

In [47]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Data

In [48]:
# Project folder path
path=r'C:\Users\maryn\Documents\Data Projects\Instacart Basket Analysis'

In [49]:
# Import dataset orders_products_merged.pkl
df_ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_1.pkl'))

# 03. Checking Data

In [50]:
# Display the information orders_products_merged.pkl
df_ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


In [51]:
df_ords_prods.shape

(32404859, 14)

In [52]:
df_ords_prods.isnull().sum()

order_id                        0
user_id                         0
order_number                    0
orders_day_of_week              0
order_hour_of_day               0
days_since_prior_order    2076096
product_id                      0
add_to_cart_order               0
reordered                       0
product_name                    0
aisle_id                        0
department_id                   0
prices                          0
_merge                          0
dtype: int64

# 04. Creating 'price label' column

### If-Statements with User-Defined Functions

In [53]:
# Create a subset of first million rows
df = df_ords_prods[:1000000]

In [54]:
df.shape

(1000000, 14)

In [55]:
# Define a function for 'price label'

def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High-range product'
  else: return 'Not enough data'

In [56]:
# Apply a function to a new column called 'price_range'
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [57]:
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

In [58]:
df['prices'].max()

14.8

### If-Statements with the loc() Function

#### Use loc() method to apply price range function to subset

In [59]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [60]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [61]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [62]:
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

#### Use loc() method to apply price range function to the entire dataframe

In [63]:
df_ords_prods.loc[df_ords_prods['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [64]:
df_ords_prods.loc[(df_ords_prods['prices'] <= 15) & (df_ords_prods['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [65]:
df_ords_prods.loc[df_ords_prods['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [66]:
df_ords_prods['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

# 05. Creating 'busiest_day' column

In [67]:
# View the frequency of a column 'orders_day_of_week'
df_ords_prods['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [68]:
# Use for-loop to label the orders as "busiest day" (0 = Saturday), "least busy" (4 = Wednesday), or "regularly busy" (other days)

result = []

for value in df_ords_prods["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [69]:
# Add the values in result to a new column in df_ords_prods
df_ords_prods['busiest_day'] = result

In [70]:
df_ords_prods['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

# 06. Creating 'busiest_days' column

In [71]:
# Use for-loop to label the orders as "busiest days" (0 = Saturday, 1 = Sunday), "lowest days" (3 = Tuesday, 4 = Wednesday), or "regularly busy days" (other days)

result_2 = []

for value in df_ords_prods['orders_day_of_week']:
  if value == 0 or value == 1:
    result_2.append('Busiest days')
  elif value == 3 or value == 4:
    result_2.append('Slowest days')
  else:
    result_2.append('Regularly busy days')

In [72]:
# Add the values in result_2 to a new column 'busiest_days' in df_ords_prods
df_ords_prods['busiest_days'] = result_2

In [73]:
df_ords_prods['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy days    12916111
Busiest days           11864412
Slowest days            7624336
Name: count, dtype: int64

### The values in this new column 'busiest days' look accurate and correspond to the values in the columns 'orders_day_of_week' and 'busiest day'.

# 07. Creating 'busiest_period_of_day' column

In [74]:
# View the frequency of a column 'order_hour_of_day'
df_ords_prods['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

### The value counts listed above are evenly divided into equal thirds for the following labels: 'Most orders' = 10, 11, 14, 15, 13, 12, 16, 9; 'Fewest orders' = 23, 6, 0, 1, 5, 2, 4, 3; 'Average orders' = all remaining values in order_hour_of_day.

In [75]:
# # Use for-loop to label the orders in 'busiest_period_of_day' as 'Most orders', 'Average orders', and 'Fewest orders'.

result_3 = []

for value in df_ords_prods['order_hour_of_day']:
  if value in [10, 11, 14, 15, 13, 12, 16, 9]:
    result_3.append('Most orders')
  elif value in [23, 6, 0, 1, 5, 2, 4, 3]:
    result_3.append('Fewest orders')
  else:
    result_3.append('Average orders')

In [76]:
# Add the values in result_3 to a new column 'busiest_period_of_day' in df_ords_prods
df_ords_prods['busiest_period_of_day'] = result_3

In [77]:
df_ords_prods['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

# 08. Exporting Data

In [78]:
# Exporting df_ords_prods_merge as a orders_products_merged.pkl in pickle format
df_ords_prods.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged_derived.pkl'))