# Contents:
01. Import Libraries
02. Import Data
03. Data Security Check
04. Customer Behaviour by Regional Segmentation
    - Regional Segmentation
    - Spending Habits by Geographic Regions
05. Exclusion Low-Activity Customers

# 01. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib as plt
import seaborn as sns
import scipy

# 02. Import Data

In [2]:
# define path
# The project root can be overridden with the INSTACART_PROJECT_DIR environment
# variable so the notebook can run on another machine; when the variable is not
# set, the original local folder is used unchanged.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/lavinia/Documents/04-2020 Instacart Basket Analysis')

In [3]:
# load the prepared project dataframe from its pickle file
project_file_in = os.path.join(path, '02 Data', 'Prepared Data', 'project_file.pkl')
df_project = pd.read_pickle(project_file_in)

# 03. Data Security Check

In [4]:
# configure pandas to display all columns
# Use the full option name: pandas resolves abbreviated option names by pattern
# matching, which can break if a later release adds a similarly named option.
pd.set_option('display.max_columns', None)

In [5]:
# lift the row display limit so pandas prints every row
pd.options.display.max_rows = None

In [6]:
# preview the first 5 rows of the dataframe
df_project.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price_order,spending_flag,median_freq,frequency_flag,First Name,Last Name,Gender,State,Age,date_joined,n_dependants,marital_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regular busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


PII (Personally Identifiable Information) is addressed by removing both the 'First Name' and 'Last Name' columns from the dataframe. This ensures that sensitive data (customer names) remains confidential and is accessible only to authorized individuals or systems.

In [7]:
# remove both "First Name" and "Last Name" columns
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError when
# the two columns have already been removed.
pii_columns = ['First Name', 'Last Name']
df_project = df_project.drop(columns=pii_columns, errors='ignore')

In [8]:
# preview the first 5 rows of the dataframe again
df_project.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price_order,spending_flag,median_freq,frequency_flag,Gender,State,Age,date_joined,n_dependants,marital_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regular busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423


In [9]:
# save df_project as a new pickle file (project_file_v2.pkl)
export_path_v2 = os.path.join(path, '02 Data', 'Prepared Data', 'project_file_v2.pkl')
df_project.to_pickle(export_path_v2)

# 04. Customer Behaviour by Regional Segmentation

### Create Regional Segmentation

In [10]:
# group the state names into the four regions (based on the Wikipedia list)
northeast = [
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
    'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
]
midwest = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
    'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
]
south = [
    'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
    'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky',
    'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas',
    'Louisiana',
]
west = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
    'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
]

In [11]:
# assign each row a region by matching its 'State' value against the state lists;
# rows whose state appears in none of the lists are left without a value
region_state_lists = {
    'Northeast': northeast,
    'Midwest': midwest,
    'South': south,
    'West': west,
}
for region_label, member_states in region_state_lists.items():
    df_project.loc[df_project['State'].isin(member_states), 'Region'] = region_label

In [12]:
# count rows per region, keeping missing values so unassigned rows would show up
region_col = df_project['Region']
region_col.value_counts(dropna=False)

South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: Region, dtype: int64

In [13]:
# compare the number of non-null values in the "State" and "Region" columns
state_counts = df_project['State'].count()
region_counts = df_project['Region'].count()
print(
    "The total value counts of both columns are the same"
    if state_counts == region_counts
    else "Recheck the region assignment!"
)

The total value counts of both columns are the same


### Spending Habits by Geographic Regions

In [14]:
# cross-tabulate regions (rows) against spending flags (columns)
crosstab_spending_region = pd.crosstab(
    index=df_project['Region'],
    columns=df_project['spending_flag'],
    dropna=False,
)

In [15]:
# add additional columns to calculate total, % High spender, % Low spender from each region
# Sum only the spending-flag count columns, so that re-running this cell does not
# fold the derived 'Total' and '%' columns into a new, inflated total.
derived_columns = ['Total', '% High spender', '% Low spender']
count_columns = [col for col in crosstab_spending_region.columns if col not in derived_columns]
region_totals = crosstab_spending_region[count_columns].sum(axis=1)
crosstab_spending_region['Total'] = region_totals
crosstab_spending_region['% High spender'] = 100 * crosstab_spending_region['High spender'] / region_totals
crosstab_spending_region['% Low spender'] = 100 * crosstab_spending_region['Low spender'] / region_totals

In [16]:
# display crosstab_spending_region: per-region counts for each spending flag,
# plus the 'Total', '% High spender' and '% Low spender' columns added to it
crosstab_spending_region

spending_flag,High spender,Low spender,Total,% High spender,% Low spender
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Midwest,155946,7441379,7597325,2.052644,97.947356
Northeast,108218,5614518,5722736,1.891019,98.108981
South,209637,10582248,10791885,1.942543,98.057457
West,160316,8132597,8292913,1.933169,98.066831


Each US region has a high proportion of low spenders (about 98% of the rows in every region), meaning that Instacart customers across regions tend to buy products with average prices below 10.

In [29]:
# keep only the first row of each user_id so every customer appears once
is_repeat_user_row = df_project.duplicated(subset=['user_id'], keep='first')
unique_user_spending = df_project[~is_repeat_user_row]

In [30]:
# cross-tabulate region against spending flag using one row per customer
crosstab_spending_region_n = pd.crosstab(
    index=unique_user_spending['Region'],
    columns=unique_user_spending['spending_flag'],
)
crosstab_spending_region_n

spending_flag,High spender,Low spender
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,1260,47259
Northeast,882,35506
South,1816,66921
West,1393,51172


# 05. Exclusion Low-Activity Customers

In [17]:
# create an exclusion flag for low-activity customers
# A single named threshold replaces the literal 5 that was repeated in both
# conditions, so the two masks cannot drift apart if the cutoff is changed.
# Rows with a missing max_order satisfy neither condition and stay unflagged.
MIN_ORDERS_FOR_ACTIVE = 5
is_low_activity = df_project['max_order'] < MIN_ORDERS_FOR_ACTIVE
df_project.loc[is_low_activity, 'activity_flag'] = 'Low-activity customer'
df_project.loc[df_project['max_order'] >= MIN_ORDERS_FOR_ACTIVE, 'activity_flag'] = 'Non low-activity customer'

In [18]:
# keep only the rows flagged as 'Non low-activity customer'
is_active_customer = df_project['activity_flag'] == 'Non low-activity customer'
df_project_excl = df_project.loc[is_active_customer]

In [19]:
# write the filtered dataframe to a pickle file
excl_export_path = os.path.join(path, '02 Data', 'Prepared Data', 'project_file_excllowactcust.pkl')
df_project_excl.to_pickle(excl_export_path)