# Table of Content

1. Importing libraries and datasets
2. User-defined function and subsetting
3. Deriving price_range variable for whole dataset
4. Deriving busiest_days variable for whole dataset
5. Deriving busiest_period_of_day variable
6. Exporting new dataframe

# Importing libraries and datasets

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import pickle

In [2]:
# Define path
path = r'/Users/frederikeschulz-mullensiefen/Desktop/Master Folder_Instacart/02_Data'

In [3]:
# Import orders_products_combined dataframe
df_ords_prods_merge = pd.read_pickle(os.path.join(path, 'Prepared Data', 'ords_prods_merge.pkl'))

# User-defined function and subset 

In [4]:
# Creating a subset of merged dataset with 1000000 rows
df = df_ords_prods_merge[:1000000]

In [5]:
# Create new variable (price_label) and user-defined function
def price_label(row):
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High range'
    else: return 'Not enough data'

In [6]:
# Running user-defined function on df subset
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [7]:
# Frequency of product price labels
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

# Deriving price_range variable for whole dataset 

In [8]:
# Defining if-statement for high-range product
df_ords_prods_merge.loc[df_ords_prods_merge['prices'] > 15, 'price_range'] = 'High-range product'

In [9]:
# Defining if-statement for mid-range product
df_ords_prods_merge.loc[(df_ords_prods_merge['prices'] <= 15) & (df_ords_prods_merge['prices'] > 5), 'price_range'] = 'Mid-range product'

In [10]:
# Defining if-statement for low-range product
df_ords_prods_merge.loc[df_ords_prods_merge['prices'] <= 5, 'price_range'] = 'Low-range product'

In [11]:
# Frequency for product price labels
df_ords_prods_merge['price_range'].value_counts(dropna = False)

price_range
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

# Deriving busiest_days variable for whole dataset 

In [12]:
# Defining if-statement for busiest days
df_ords_prods_merge.loc[df_ords_prods_merge['orders_day_of_week'].isin([0,1]), 'busiest_days'] = 'Busiest days'

In [13]:
# Defining if-statement for slowest days
df_ords_prods_merge.loc[df_ords_prods_merge['orders_day_of_week'].isin([3,4]), 'busiest_days'] = 'Slowest days'

In [14]:
# Defining if-statement for busiest days
df_ords_prods_merge.loc[df_ords_prods_merge['orders_day_of_week'].isin([6,2,5]), 'busiest_days'] = 'Regular days'

In [15]:
# Frequency of busiest days
df_ords_prods_merge['busiest_days'].value_counts(dropna = False)

busiest_days
Regular days    12916111
Busiest days    11864412
Slowest days     7624336
Name: count, dtype: int64

The values is the frequency table are correct. 

# Deriving busiest_period_of_day variable

In [16]:
# Frequency of order hours
df_ords_prods_merge['ordertime_hour_of_day'].value_counts(dropna = False)

ordertime_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [17]:
# Defining if-statement for busiest hours of day
df_ords_prods_merge.loc[df_ords_prods_merge['ordertime_hour_of_day'].isin([10,11,14,15,13,12,16,9]), 'busiest_period_of_day'] = 'Most orders'

In [18]:
# Defining if-statement for normal hours of day
df_ords_prods_merge.loc[df_ords_prods_merge['ordertime_hour_of_day'].isin([17,8,18,19,20,7,21,22]), 'busiest_period_of_day'] = 'Average orders'

In [19]:
# Defining if-statement for least busy hours of day
df_ords_prods_merge.loc[df_ords_prods_merge['ordertime_hour_of_day'].isin([23,6,0,1,5,2,4,3]), 'busiest_period_of_day'] = 'Least orders'

In [20]:
# Frequency of order hours labels
df_ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Least orders       1289137
Name: count, dtype: int64

# Exporting new dataframe

In [21]:
df_ords_prods_merge.to_pickle(os.path.join(path,'Prepared Data', 'ords_prods_new_variables.pkl'))