# This script presents the derived data from ords_prods_merge data frame, containing the next steps:
#### 01. Import the data sets into Jupyter
#### 02. Create a smaller subset of the merged data frame
#### 03. Define and use a function
#### 04. Use the 'loc' option
#### 05. Create 'busiest_day' column
#### 06. Update 'busiest day' column into 'busiest days', and categorize the days as: 'Busiest days', 'Slowest days', and 'Regularly busy'
#### 07. Check values of the new "busiest days" column for accuracy with observations
#### 08. Create a new column called 'busiest_period_of_day'.Periods of time are labeled 'Most orders', 'Average orders', and 'Fewest orders'
#### 09. Print frequency of 'busiest_period_of_day'
#### 10. Export the data frame as picke file to 'Prepared Data' folder

# 01. Import the data sets into Juypiter

In [43]:
import pandas as pd
import numpy as np
import os

In [44]:
# Turning the project folder path into a string
path = r'/Users/fatemehshahvirdi/Work-Related/Data Analysis/Data Immersion/Achievement 4/Instacart Basket Analysis'

In [45]:
# importing orders_prods_merge
ords_prods_merge = pd.read_pickle (os.path.join(path, '02 Data', 'prepared Data', 'orders_products_merged.pkl'))

# 02. Create a smaller subset of the merged data frame

In [46]:
ords_prods_merge.shape

(32404859, 15)

In [47]:
# Create a subset of data containing 1 million rows
df = ords_prods_merge[:1000000]

In [48]:
df.shape

(1000000, 15)

# 03. Define and use a function

In [49]:
# Define a function for price labels
def price_lable(row):
    if row['prices'] <= 5:
        return 'Low_range product'
    elif (row['prices']> 5) and (row ['prices'] <= 15):
        return 'Mid_range product'
    elif row['prices'] > 15:
        return 'High range'
    else: return 'Not enough data'

In [50]:
# Apply price_range function
df['price_range'] = df.apply(price_lable, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_lable, axis=1)


In [51]:
df['price_range'].value_counts(dropna= False)

price_range
Mid_range product    652638
Low_range product    338018
High range             9344
Name: count, dtype: int64

In [52]:
df['prices'].max()

24.5

# 04. Use the 'loc' option

In [53]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [54]:
df.loc[(df['prices'] <= 15) & (df['prices']> 5), 'price_range_loc'] = 'Mid-range product'

In [55]:
df.loc[df['prices']<= 5, 'price_range_loc'] = 'Low_range product'

In [56]:
df['price_range_loc'].value_counts(dropna= False)

price_range_loc
Mid-range product     652638
Low_range product     338018
High-range product      9344
Name: count, dtype: int64

In [57]:
ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [58]:
ords_prods_merge.loc[(ords_prods_merge['prices'] <= 15) & (ords_prods_merge['prices']> 5), 'price_range_loc'] = 'Mid-range product'

In [59]:
ords_prods_merge.loc[ords_prods_merge['prices']<= 5, 'price_range_loc'] = 'Low_range product'

In [60]:
ords_prods_merge ['price_range_loc'].value_counts(dropna= False)

price_range_loc
Mid-range product     21860860
Low_range product     10126321
High-range product      417678
Name: count, dtype: int64

# 05. Create 'busiest_day' column

In [61]:
# Finding the frequency of orders in the 'orders_day_of_week'
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [62]:
# Using for_loop to group data in categories: Busiest day, Least busy, Regularly busy
result = []

for value in ords_prods_merge["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [63]:
result

['Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Bus

In [64]:
# Create new column in ords_prods_merge to view the results in it
ords_prods_merge['busiest_day'] = result

In [65]:
ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

# 06. Update 'busiest day' column into 'busiest days', and categorize the days as: 'Busiest days', 'Slowest days', and 'Regularly busy'

In [66]:
result_2 = []

for value in ords_prods_merge["orders_day_of_week"]:
  if value == 0 or value == 1:
    result_2.append("Busiest days")
  elif value == 4 or value == 3:
    result_2.append("Slowest days")
  else:
    result_2.append("Regularly busy")

In [67]:
result_2

['Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Slowest days',
 'Slowest days',
 'Busiest days',
 'Regularly busy',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Slowest days',
 'Slowest days',
 'Regularly busy',
 'Slowest days',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',

# 07. Check values of the new "busiest days" column for accuracy with observations

In [68]:
# Create new 'busiest_days' column in ords_prods_merge
ords_prods_merge['busiest_days'] = result_2

In [69]:
ords_prods_merge['busiest_days'].value_counts(dropna= False)

busiest_days
Regularly busy    12916111
Busiest days      11864412
Slowest days       7624336
Name: count, dtype: int64

In [70]:
ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_days
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Regularly busy
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Regularly busy
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest day,Busiest days
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Regularly busy,Slowest days
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Slowest days


In [71]:
ords_prods_merge.shape

(32404859, 18)

#### The sum of the counts of the values in both columns 'busiest_day' and 'busiest_days' is equal to 32,404,859, which is also equal to the number of rows in the entire dataframe. This indicates that there are no missing or duplicated values in either of the columns. Overall, this consistency underscores the integrity and completeness of the dataset.

# 08. Create a new column called 'busiest_period_of_day'.Periods of time are labeled 'Most orders', 'Average orders', and 'Fewest orders'

In [72]:
# Check values in order_hour_day column
ords_prods_merge['order_hour_of_day'].value_counts(dropna= False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

#### According to the value counts, in the column'order_hour_of_day', since we have 23 values in total, we can use:
#### lable: Most orders, for values of 10, 11 ,14, 15, 13, 12, 16, 9
#### lable: Fewest orders, for vaues 6, 0, 1, 5, 2, 4, 3, 23
#### lable: Average orders, for other remaining values

In [73]:
# Creat a column with new results
result= []

for value in ords_prods_merge['order_hour_of_day']:
    if value in [10, 11 ,14, 15, 13, 12, 16, 9]:
        result.append('Most orders')
    elif value in [6, 0, 1, 5, 2, 4, 3, 23]:
        result.append('Fewest orders')
    else:
        result.append('Average orders')

In [74]:
result

['Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most ord

In [75]:
# Assign the results to a new column
ords_prods_merge['busiest_period_of_day'] = result

# 09. Print frequency of 'busiest_period_of_day'

In [76]:
ords_prods_merge['busiest_period_of_day'].value_counts(dropna= False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

In [77]:
ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Regularly busy,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest day,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Regularly busy,Slowest days,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Slowest days,Average orders


# 10. Export the data frame as picke file to 'Prepared Data' folder

In [78]:
ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_derived.pkl'))