# 4.10 IC Key Business Questions Regions and High Activity Customers

### This script contains the following points: <br> <br> 
1. Importing Libraries <br> <br> 
2. Importing Data Sets <br> <br> 
3. Data Checks <br><br>
4. Regional Considerations <br><br>
5. Exclusion Flag for Low Order Numbers


## 01 Import Libraries

In [2]:
# Import Libraries with visualization capabilities
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy


## 02 Import Data

In [3]:
# Path of the main project folder.
# Defaults to the original local folder, but can be overridden by setting the
# INSTACART_PROJECT_PATH environment variable, so the notebook can run on
# another machine without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/mistystone/Library/CloudStorage/OneDrive-Personal/Documents/CF_Data_Ach4_Python/2023-05_Instacart_Basket_Analysis/',
)

In [4]:
# Load the prepared data set (pickle format) from the Prepared Data folder
orders_products_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl')
df = pd.read_pickle(orders_products_file)

## 03 Data Checks

In [5]:
# Preview the first rows to confirm the pickle loaded as expected
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,spend_flag,frequent_orders,frequent_flag,gender,state,age,date_joined,number_dependents,family_status,income
0,2539329,1,1,2,8,7.0,196,1,0,Soda,...,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423


In [6]:
# (number of rows, number of columns) of the loaded data
df.shape

(32404859, 30)

Note that I deleted the first name and surname variables during Exercise 4.9 due to PII (personally identifiable information) data ethical considerations.

## 04 Regional Considerations

Consider how region affects customer behavior, particularly in the variable 'spend_flag'. <br>
Recall that the variable spend_flag takes the value 'High spender' if the average spending of the customer is greater than or equal to 10 and the value 'Low spender' if the average spending of the customer is below 10. 

In [7]:
# States assigned to the Northeast region
Northeast_list = [
    'Maine',
    'New Hampshire',
    'Vermont',
    'Massachusetts',
    'Rhode Island',
    'Connecticut',
    'New York',
    'Pennsylvania',
    'New Jersey',
]

In [8]:
# States assigned to the Midwest region
Midwest_list = [
    'Wisconsin',
    'Michigan',
    'Illinois',
    'Indiana',
    'Ohio',
    'North Dakota',
    'South Dakota',
    'Nebraska',
    'Kansas',
    'Minnesota',
    'Iowa',
    'Missouri',
]

In [9]:
# States (plus the District of Columbia) assigned to the South region
South_list = [
    'Delaware',
    'Maryland',
    'District of Columbia',
    'Virginia',
    'West Virginia',
    'North Carolina',
    'South Carolina',
    'Georgia',
    'Florida',
    'Kentucky',
    'Tennessee',
    'Mississippi',
    'Alabama',
    'Oklahoma',
    'Texas',
    'Arkansas',
    'Louisiana',
]

In [10]:
# States assigned to the West region
West_list = [
    'Idaho',
    'Montana',
    'Wyoming',
    'Nevada',
    'Utah',
    'Colorado',
    'Arizona',
    'New Mexico',
    'Alaska',
    'Washington',
    'Oregon',
    'California',
    'Hawaii',
]

In [11]:
# Label every row whose state appears in Northeast_list
in_northeast = df['state'].isin(Northeast_list)
df.loc[in_northeast, 'region'] = "Northeast"

In [12]:
# Label every row whose state appears in Midwest_list
in_midwest = df['state'].isin(Midwest_list)
df.loc[in_midwest, 'region'] = "Midwest"

In [13]:
# Label every row whose state appears in South_list
in_south = df['state'].isin(South_list)
df.loc[in_south, 'region'] = "South"

In [14]:
# Label every row whose state appears in West_list
in_west = df['state'].isin(West_list)
df.loc[in_west, 'region'] = "West"

In [15]:
# Check region variable: dropna = False also counts rows without a region,
# so a state missing from all four lists would show up as NaN here
df['region'].value_counts(dropna = False)

South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: region, dtype: int64

In [16]:
# Check spend_flag variable: row counts per flag value, missing values included
df['spend_flag'].value_counts(dropna = False)

Low spender     32285131
High spender      119728
Name: spend_flag, dtype: int64

In [18]:
# Cross-tabulate row counts: one row per region, one column per spend_flag value
crosstab = pd.crosstab(
    index=df['region'],
    columns=df['spend_flag'],
    dropna=False,
)

In [19]:
# This is a relatively small object (one row per region, one column per
# spend_flag value), so print here
crosstab

spend_flag,High spender,Low spender
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,29265,7568060
Northeast,18642,5704094
South,40579,10751306
West,31242,8261671


We are asking the question: "How does region affect whether a customer is likely to be a high vs low spender?" Without doing an in-depth statistical analysis, the answer is "not much". The proportion of High spender rows in each region is approximately 0.4% (0.39% in the Midwest, 0.33% in the Northeast, 0.38% in the South, and 0.38% in the West). Each percentage is arrived at by dividing the number of High spender rows by the total number of rows for that region in the crosstab above; note that the crosstab counts rows of df (one per product in an order), not unique customers. Nonetheless, the Midwest has the highest percentage of high spenders and the Northeast has the lowest percentage of high spenders.

## 05 Exclusion Flag for Low Order Numbers

We aren't interested in customers who don’t generate much revenue for the app. Create an exclusion flag for low-activity customers (customers with fewer than 5 orders) and exclude them from the data. 

In [20]:
# List all column names before building the activity flag
df.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'Busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'average_spend', 'spend_flag', 'frequent_orders', 'frequent_flag',
       'gender', 'state', 'age', 'date_joined', 'number_dependents',
       'family_status', 'income', 'region'],
      dtype='object')

In [21]:
# Row counts per max_order value (missing values included), ordered by max_order
df['max_order'].value_counts(dropna = False).sort_index()

1           5
2           6
3      686741
4      753543
5      793140
       ...   
95      59877
96      40453
97      44949
98      44587
99    1171333
Name: max_order, Length: 99, dtype: int64

In [22]:
# Create the low_activity_flag column: rows of customers whose max_order is
# less than 5 get the label below, all other rows are left as NaN.
low_activity_rows = df['max_order'] < 5
df.loc[low_activity_rows, 'low_activity_flag'] = "Low activity customer"

In [23]:
# Count flagged rows versus unflagged (NaN) rows
df['low_activity_flag'].value_counts(dropna = False)

NaN                      30964564
Low activity customer     1440295
Name: low_activity_flag, dtype: int64

We need to pull the low activity customers out of the sample and export their data because we are about to delete them!

In [24]:
# Pull the rows flagged as low activity out of the bigger dataset
is_low_activity = df['low_activity_flag'] == 'Low activity customer'
df_low_activity_customers = df[is_low_activity]

In [25]:
# Check df_low_activity_customers: preview the first rows of the extracted subset
df_low_activity_customers.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,frequent_flag,gender,state,age,date_joined,number_dependents,family_status,income,region,low_activity_flag
1510,520620,120,1,3,11,7.0,196,2,0,Soda,...,Regular customer,Female,Kentucky,54,3/2/2017,2,married,99219,South,Low activity customer
1511,3273029,120,3,2,8,19.0,196,2,1,Soda,...,Regular customer,Female,Kentucky,54,3/2/2017,2,married,99219,South,Low activity customer
1512,520620,120,1,3,11,7.0,46149,1,0,Zero Calorie Cola,...,Regular customer,Female,Kentucky,54,3/2/2017,2,married,99219,South,Low activity customer
1513,3273029,120,3,2,8,19.0,46149,1,1,Zero Calorie Cola,...,Regular customer,Female,Kentucky,54,3/2/2017,2,married,99219,South,Low activity customer
1514,520620,120,1,3,11,7.0,26348,3,0,Mixed Fruit Fruit Snacks,...,Regular customer,Female,Kentucky,54,3/2/2017,2,married,99219,South,Low activity customer


In [26]:
# Check df_low_activity_customers: every max_order value listed should be below 5
df_low_activity_customers['max_order'].value_counts(dropna = False)

4    753543
3    686741
2         6
1         5
Name: max_order, dtype: int64

In [27]:
# Save the low-activity rows to the Prepared Data folder in pickle format
low_activity_file = os.path.join(path, '02 Data', 'Prepared Data', 'low_activity_customers.pkl')
df_low_activity_customers.to_pickle(low_activity_file)

In [28]:
# Update df dataframe to delete rows from low activity customers.
# The rows to drop are identified through low_activity_flag itself rather than
# by restating the cutoff on max_order, so the rows kept here are exactly the
# complement of the rows flagged as "Low activity customer": no row can be
# lost or end up in both sets (e.g. for missing or non-integer max_order).
is_low_activity = df['low_activity_flag'] == "Low activity customer"
df = df.loc[~is_low_activity]

In [29]:
# Check sizes: the retained rows plus the extracted low-activity rows must add
# back up to the 32404859 rows that were loaded. Fail loudly instead of relying
# on a visual comparison, then display the total.
total_rows = len(df) + len(df_low_activity_customers)
assert total_rows == 32404859, f"expected 32404859 rows in total, found {total_rows}"
total_rows

32404859

In [30]:
# Checking df['low_activity_flag'], hoping for all NaN (no flagged rows should remain in df)
df['low_activity_flag'].value_counts(dropna = False)

NaN    30964564
Name: low_activity_flag, dtype: int64

In [31]:
# Save df (low activity customers removed) to the Prepared Data folder in pickle format
high_activity_file = os.path.join(path, '02 Data', 'Prepared Data', 'high_activity_customers.pkl')
df.to_pickle(high_activity_file)