# 01. Importing Libraries

In [4]:
#Importing Libraries

import pandas as pd
import numpy as np
import os

# 02. Importing Data

In [6]:
path = r'/Users/rkaran/Documents/careerfoundary/02 Data'

In [7]:
print(path)

/Users/rkaran/Documents/careerfoundary/02 Data


In [8]:
# Import dataset ords_prods_merge.pkl

df_ords_prods_merge = pd.read_pickle(os.path.join(path, 'Prepared Data', 'ords_prods_merge.pkl'))

In [9]:
ords_prods_merge = df_ords_prods_merge

# 03. If-Statements with User-Defined Functions

### let’s take a look at how to implement a user-defined function in Instacart project?

In [12]:
# creating a flag that sorts products in ords_prods_merge dataframe according to price. Products within different ranges can be given 
# different flags, which are stored within a new column. Need to write a user-defined function to create and assign these flags.

In [13]:
#One thing to note here is that using user-defined functions on a large dataframe can lead to memory issues or trouble with processing power.
# To avoid any potential issues, let’s work with a subset of the dataframe for now—the first one million rows.

# To create this subset, execute the following code:

df = df_ords_prods_merge[:1000000]

In [14]:
df.shape

(1000000, 14)

In [15]:
# Number after the colon indicates that subset should include everything from the beginning of the dataframe to that number—in this case,
# the first one million rows in the dataframe.

In [16]:
# Define a function

def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [17]:
# Need to start by defining it using the def syntax at the beginning of the code. Following this is the name you want to give your new 
# function: price_label. In the parentheses is row, which is a standard argument telling the function to look at each row within the 
# dataframe. Finally, everything’s finished off with a colon. The colon separates the head, where you provide the name and argument(s) for 
# your function, from the body, which is what the function will actually do.

In [18]:
# Apply a function
# function price_label will apply string label to every row within your dataframe, designating it as a low-, mid-, or high-range product 
# based on its price. 
# Rather than calling the df dataframe, the syntax df['price_range'] used. This creates a new column within the df dataframe called 
#“price_range” and designates it as the location for your labels.



df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [19]:
# Function: df.apply(price_label, axis=1), which tells Python to apply the price_label function on axis=1. Axis = 1 stands for “rows,”so this
# code tells Python to apply the function to all rows within the dataframe. Conversely, axis = 0 refer to all columns within the dataframe.

In [20]:
#run the value_counts() function to check the values in your new column

df['price_range'].value_counts(dropna=False)

price_range
Mid-range product    652593
Low-range product    338063
High range             9344
Name: count, dtype: int64

In [21]:
#use the max() function to check what the most expensive product within the subset is?

df['prices'].max()

24.5

In [22]:
# This function returns the maximum value within the “prices” column, which, as you’ll see here, is 24.5. This confirms your findings from 
# the labels—that there are high-range products within the subset.

# 04. If-Statements with the loc() Function

In [24]:
# Using loc(), you can apply the conditional logic of an if-statement to a function without explicitly creating an if-else construct.

In [25]:
# the loc() function is being called on the df dataframe. And within the brackets, the values in the “prices” column of the df dataframe are 
# being compared to a value, 15, using the > operator. You could say, “if the values in the ‘prices’ column of the df dataframe are greater 
# than 15.” After the comma comes the implied “then.” New column called “price_range_loc” is being set equal to the string “High-range product'
# the comma is key! It’s what separates the “if” from the “then.”

In [26]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [27]:
# This time, you’re dealing with two conditions—greater than 5 but less than or equal to 15. These two conditions are combined by the & sign 
# in the middle of the “if” half. Additionally, the two conditions have been placed inside parentheses. This simply ensures that they’re both
# treated as separate conditions. When you’re working with multiple conditions within the same statement, section them off with parentheses!

In [28]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product' 

In [29]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [30]:
#  value_counts() function to count the number of products within each label. This time, it’s used on the “price_range_loc” column, as 
# that’s where you’ve put your new loc()-created labels. 

In [31]:
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     652593
Low-range product     338063
High-range product      9344
Name: count, dtype: int64

In [32]:
# the loc() function locates a particular column in the dataframe it’s been assigned to. Now, a logical operator (smaller than, larger than, 
# equal to, etc.) is being added to the function to create a condition. The difference here is that there’s no explicit if in your 
# if-statement. Instead, it’s all been implied.

In [33]:
# Do you get the same results as when you used your user-defined function?
# If both methods arrive at the same results, why would you want to use loc() instead? First, using loc() won’t result in a warning message. 
# While this won’t actually interfere with your work, it’s still a sign that, for whatever reason, Python thinks you should be doing 
# something different. Second, the loc() method runs much faster; the loc() function applies the conditional filters before searching 
# through the dataframe, while your user-defined function searches through the entire dataframe and then determines where to set the filters 
# (remember axis = 1?). Now that you’ve seen how much faster loc() works, you can try repeating the process—this time, on your entire dataframe. 
# The only difference in the code is the use of ords_prods_merge instead of df

ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [34]:
ords_prods_merge.loc[(ords_prods_merge['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product' 

In [35]:
ords_prods_merge.loc[ords_prods_merge['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [36]:
ords_prods_merge['price_range_loc'].value_counts(dropna = False)

price_range_loc
NaN                   21209404
Low-range product     10126366
Mid-range product       652593
High-range product      417678
Name: count, dtype: int64

# 05. If-Statements with For-Loops

In [38]:
# For-loops, as their name implies, are loops for running the same block of code multiple times. They’re used to perform the same function on 
# multiple elements, for instance, by running through an entire dataframe and performing a function on each row within that dataframe.
# Let’s look at how you could use a for-loop in your Instacart dataframe. 
# Create a new column in your ords_prods_merge dataframe that summarizes how busy each day of the week is. This information is valuable 
# information for stakeholders as it gives them insight into what products are being bought on the busiest and slowest days.

In [39]:
ords_prods_merge['orders_day_of_week'].value_counts(dropna=False)

orders_day_of_week
0    6204404
1    5660456
6    4496635
2    4213986
5    4205906
3    3840701
4    3783953
Name: count, dtype: int64

In [40]:
# Printing the frequency of a column will quickly inform you which values appear most often within that column.
# Here, the value 0 means Saturday. This value has the highest frequency, meaning Saturday is the busiest day. Meanwhile, the 4 value has 
# the lowest frequency. A value of 4, here, refers to Wednesday, meaning Wednesday is the slowest day for Instacart app orders.

In [41]:
# You want to use this information to create a new column, “busiest day,” that will contain one of three different values: “Busiest day,” 
# “Least busy,” and “Regularly busy.” This can be done using a for-loop. The loop will run through every row in the “orders_day_of_week” 
# column, compare its value with what you know are the busiest and slowest days, and assign it the corresponding string value.The code would look like this

In [52]:
# No. of rows are going out of jupyter's bound hence, so working with smaller subset. 

ords_prods_merge1 = ords_prods_merge.head(10000)

In [81]:
#The first step is to create an empty list, result. This will act as the empty shell, into which you can place the results from your loop. 
# Then comes the loop itself. You’re telling Python that for each value within the “orders_day_of_week” column, you want to do something. 
# That something comes below it, where you’ll see a familiar if-else structure. If the value in that row is equal to 0, a “Busiest day” 
# string value is appended to your currently blank result list. If the value is equal to 4, a “Least busy” string value is appended to the 
# result list. If neither of these conditions has been met (the value is neither 0 nor 4), then a “Regularly busy” string value is appended
# to the result list.
# The one thing that might still be throwing you off is the value within the code. This value is simply acting as a placeholder. It could 
# stand for anything. And you could call it anything, too (oftentimes, an x is used, like in the simple loop you went through earlier).
# This element in the code represents every entry the loop will check. In the simple loop from earlier, it represented every possible age 
# that could exist between 30 and 45. Here, it represents every possible value within the “orders_day_of_week” column. It’s what you want 
# your loop to, well, loop through.

In [54]:
result = []

for value in ords_prods_merge1["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [83]:
# After running your loop, print the result list and see what shows up:

result

['Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Bus

In [71]:
ords_prods_merge1.loc[:, 'busiest_day'] = result

In [77]:
ords_prods_merge1['busiest_day'].value_counts(dropna=False)

busiest_day
Regularly busy    7003
Busiest day       1681
Least busy        1316
Name: count, dtype: int64

# Step 2. Update "busiest day" column to "busiest days" to identify order day of the week as "Busiest days," "Slowest days," or "Regularly busy."

In [109]:
# Create column “busiest_days" from "orders_day_of_week" to identify two busiest days (0, 1), slowest days (4, 3), or regularly busy (all other days).

result_2 = []

for value in ords_prods_merge["orders_day_of_week"]:
  if value == 0 or value == 1:
    result_2.append("Busiest days")
  elif value == 4 or value == 3:
    result_2.append("Slowest days")
  else:
    result_2.append("Regularly busy")

In [111]:
result_2

['Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Slowest days',
 'Slowest days',
 'Busiest days',
 'Regularly busy',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Slowest days',
 'Slowest days',
 'Regularly busy',
 'Slowest days',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',

# Step 3. Check values of new "busiest days" column for accuracy with observations in markdown format.

In [114]:
# Create new 'busiest_days' column in ords_prods_merge1 to view results in context.

ords_prods_merge.loc[:, 'busiest_days'] = result_2

In [116]:
ords_prods_merge.loc[:, 'busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12916527
Busiest days      11864860
Slowest days       7624654
Name: count, dtype: int64

In [118]:
# Check output of ords_prods_merge with new "busiest days" column.

ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_days
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range product,Regularly busy
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range product,Regularly busy
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range product,Busiest days
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,7.0,10,0,both,Mid-range product,Slowest days
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range product,Slowest days


The total value counts for the ords_prods_merge dataframe is equal to the total sum of all groupings (i.e. "Regularly busy" + "Busiest days" + "Slowest days"). In addition, the listed sums for each label match the sum totals for the qualifying days of the week (e.g. Total "Busiest days" = 11864860, which matches orders_day_of_week 0 + 1, or 6204404 + 5660456).

# Step 4. Create new column "busiest_period_of_day" to identify time periods “Most orders,” “Average orders,” and “Fewest orders.”

In [122]:
# Check value counts in "order_hour_of_day" column.

df_ords_prods_merge['order_hour_of_day'].value_counts()

order_hour_of_day
10    2761862
11    2736242
14    2689251
15    2662246
13    2661057
12    2618613
16    2535313
9     2454273
17    2087721
8     1718182
18    1636566
19    1258352
20     976182
7      891082
21     795658
22     634247
23     402335
6      290500
0      218786
1      115706
5       87964
2       69377
4       53245
3       51281
Name: count, dtype: int64

The value counts listed above will be split into equal thirds for the following labels: "Most orders" = 10, 11, 14, 15, 13, 12, 16, and 9; "Fewest orders" = 23, 6, 0, 1, 5, 2, 4, and 3; "Average orders" = all remaining values in order_hour_of_day.

In [126]:
# Create for-loop if statement labeling periods of time as “Most orders,” “Average orders,” and “Fewest orders.”

result_3 = []

for value in df_ords_prods_merge["order_hour_of_day"]:
  if value in [10, 11, 14, 15, 13, 12, 16, 9]:
    result_3.append("Most orders")
  elif value in [23, 6, 0, 1, 5, 2, 4, 3]:
    result_3.append("Fewest orders")
  else:
    result_3.append("Average orders")

In [128]:
result_3

['Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most ord

In [130]:
# Create new column "busiest_period_of_day" in ords_prods_merge.

df_ords_prods_merge['busiest_period_of_day'] = result_3

# Step 5. Print frequency of "busiest_period_of_day" column.

In [133]:
# Print value counts in "busiest_period_of_day" column.

df_ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118857
Average orders     9997990
Fewest orders      1289194
Name: count, dtype: int64

# Step 7. Export dataframe as a pickle file to “Prepared Data” folder.

In [136]:
df_ords_prods_merge.to_pickle(os.path.join(path, 'Prepared Data', 'orders_products_merged_derived.pkl'))