# Coding Etiquette & Excel Reporting (1)
# (Part 1 - Q 1-4)

## Content
#### 1. Import libraries and data
#### 2. Drop unnecessary columns and make new 'department' column
#### 3. Drop PII columns (Q2)
#### 4. Create new column 'region' (Q3)
#### 5. Create exclusion flag for low-activity customers (Q4)
#### 6. Export data

#### Note: After reading the entire task and reviewing the project brief, I decided to drop all columns that are not useful for analysis and visualizations. I will also create a new column 'department' with the names of the departments, to have all necessary data in one place and to make later visualizations more user-friendly. (See section 2 for this code.)

# 1. Import libraries and data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Create project folder path.
# If the INSTACART_PROJECT_PATH environment variable is set it overrides the default
# location, so the notebook can be run on another machine without editing this cell.
path = os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\Lara\Career Foundry Projects\21-09-2023 Instacart Basket Analysis')

In [3]:
# Load orders_products_customers_with_dow_names.pkl from the Prepared Data folder
ords_prods_cust_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_with_dow_names.pkl')
df_ords_prods_cust = pd.read_pickle(ords_prods_cust_file)

In [4]:
# Load departments_wrangled.csv, using the first CSV column as the index
# (original note: 'index_col = False' didn't work for this file)
departments_file = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_dep = pd.read_csv(departments_file, index_col = 0)

In [5]:
# Preview the first five rows of df_ords_prods_cust (info() follows in the next cell)
df_ords_prods_cust.head(n = 5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,median_days,order_frequency_flag,first_name,surname,gender,state,age,number_of_dependants,marital_status,income
0,2539329,1,1,Monday,8,,196,1,0,Soda,...,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423
1,2398795,1,2,Tuesday,7,15.0,196,1,1,Soda,...,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423
2,473747,1,3,Tuesday,12,21.0,196,1,1,Soda,...,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423
3,2254736,1,4,Wednesday,7,29.0,196,1,1,Soda,...,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423
4,431534,1,5,Wednesday,15,28.0,196,1,1,Soda,...,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423


In [6]:
# Print the index range and the per-column dtype summary of df_ords_prods_cust
df_ords_prods_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434212 entries, 0 to 32434211
Data columns (total 30 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int16  
 3   orders_day_of_week      object 
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float32
 6   product_id              int32  
 7   add_to_cart_order       int16  
 8   reordered               int8   
 9   product_name            object 
 10  aisle_id                int16  
 11  department_id           int8   
 12  prices                  float32
 13  price_range             object 
 14  busiest_days            object 
 15  busiest_period_of_day   object 
 16  max_order               int16  
 17  loyalty_flag            object 
 18  mean_price              float32
 19  spending_flag           object 
 20  median_days             float32
 21  order_frequency_flag    objec

In [7]:
# Print all departments (display the df_dep lookup table: index -> department name)
df_dep

Unnamed: 0,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [8]:
# Count rows per department_id and list the counts in id order
department_id_counts = df_ords_prods_cust['department_id'].value_counts()
department_id_counts.sort_index()

department_id
1     2236432
2       36291
3     1176787
4     9479291
5      153743
6      269253
7     2688159
8       97724
9      866627
10      34573
11     447691
12     708931
13    1875577
14     710647
15    1068058
16    5414016
17     738666
18     423802
19    2887550
20    1051249
21      69145
Name: count, dtype: int64

# 2. Drop unnecessary columns and make new 'department' column

In [9]:
# Drop columns that are not needed for the analysis.
# The frame is reassigned to itself, so a plain drop would raise KeyError if this cell
# were run a second time; errors = 'ignore' makes the cell safe to re-run.
# The shape check in the next cell confirms that all three columns were removed.
unnecessary_columns = ['add_to_cart_order', 'reordered', 'aisle_id']
df_ords_prods_cust = df_ords_prods_cust.drop(columns = unnecessary_columns, errors = 'ignore')

In [10]:
# Check shape: the column count should be 3 lower than before the drop above
df_ords_prods_cust.shape

(32434212, 27)

In [11]:
# Convert df_dep into a nested dict keyed by its index ({index: {'department': name}})
data_dict = df_dep.to_dict(orient = 'index')

In [12]:
# Check dictionary: display the nested dict built from df_dep
data_dict

{1: {'department': 'frozen'},
 2: {'department': 'other'},
 3: {'department': 'bakery'},
 4: {'department': 'produce'},
 5: {'department': 'alcohol'},
 6: {'department': 'international'},
 7: {'department': 'beverages'},
 8: {'department': 'pets'},
 9: {'department': 'dry goods pasta'},
 10: {'department': 'bulk'},
 11: {'department': 'personal care'},
 12: {'department': 'meat seafood'},
 13: {'department': 'pantry'},
 14: {'department': 'breakfast'},
 15: {'department': 'canned goods'},
 16: {'department': 'dairy eggs'},
 17: {'department': 'household'},
 18: {'department': 'babies'},
 19: {'department': 'snacks'},
 20: {'department': 'deli'},
 21: {'department': 'missing'}}

In [13]:
# Create 'dictionary' (department_id -> department name) directly from df_dep.
# Deriving it from the 'department' column, rather than retyping the printed output,
# keeps the mapping in sync with departments_wrangled.csv and avoids transcription errors.
dictionary = df_dep['department'].to_dict()

In [14]:
# Translate each department_id through 'dictionary' and store the names as column 'department'
department_names = df_ords_prods_cust['department_id'].map(dictionary)
df_ords_prods_cust['department'] = department_names

In [15]:
# Check shape: one column ('department') should have been added
df_ords_prods_cust.shape

(32434212, 28)

In [16]:
# Count rows per department name and list the counts alphabetically
department_counts = df_ords_prods_cust['department'].value_counts()
department_counts.sort_index()

department
alcohol             153743
babies              423802
bakery             1176787
beverages          2688159
breakfast           710647
bulk                 34573
canned goods       1068058
dairy eggs         5414016
deli               1051249
dry goods pasta     866627
frozen             2236432
household           738666
international       269253
meat seafood        708931
missing              69145
other                36291
pantry             1875577
personal care       447691
pets                 97724
produce            9479291
snacks             2887550
Name: count, dtype: int64

In [17]:
# Build df_all (the frame holding all data) as df_ords_prods_cust without 'department_id'
df_all = df_ords_prods_cust.drop(columns = 'department_id')

In [18]:
# Check shape: df_all should have one column fewer than df_ords_prods_cust
df_all.shape

(32434212, 27)

# 3. Drop PII columns

#### Columns 'first_name' and 'surname' need to be dropped because they contain personally identifiable information (PII).

In [19]:
# Drop the PII columns 'first_name' and 'surname'.
# df_all is reassigned to itself, so a plain drop would raise KeyError if this cell
# were run a second time; errors = 'ignore' makes the cell safe to re-run.
# The shape check in the next cell confirms that both columns were removed.
pii_columns = ['first_name', 'surname']
df_all = df_all.drop(columns = pii_columns, errors = 'ignore')

In [20]:
# Check shape: the column count should be 2 lower than before the PII drop
df_all.shape

(32434212, 25)

# 4. Create new column 'region'

#### Create regional segmentation of the data.
#### Information from this page was used to create values for this column: https://simple.wikipedia.org/wiki/List_of_regions_of_the_United_States

In [21]:
# Create four lists of states, one per region
Northeast = [
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
    'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
]
Midwest = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
    'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
]
South = [
    'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
    'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky',
    'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas',
    'Louisiana',
]
West = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
    'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
]

In [22]:
# Start filling column 'region': label rows whose state is in the Northeast list
is_northeast = df_all['state'].isin(Northeast)
df_all.loc[is_northeast, 'region'] = 'Northeast'

In [23]:
# Label rows whose state is in the Midwest list
is_midwest = df_all['state'].isin(Midwest)
df_all.loc[is_midwest, 'region'] = 'Midwest'

In [24]:
# Label rows whose state is in the South list
is_south = df_all['state'].isin(South)
df_all.loc[is_south, 'region'] = 'South'

In [25]:
# Label rows whose state is in the West list
is_west = df_all['state'].isin(West)
df_all.loc[is_west, 'region'] = 'West'

In [26]:
# Count rows per region; NaN is kept so rows without an assigned region would be visible
region_counts = df_all['region'].value_counts(dropna = False)
region_counts

region
South        10801610
West          8300445
Midwest       7603810
Northeast     5728347
Name: count, dtype: int64

#### Determine whether there’s a difference in spending habits between the different U.S. regions.

In [27]:
# Cross-tabulate region (rows) against spending_flag (columns)
crosstab = pd.crosstab(index = df_all['region'], columns = df_all['spending_flag'], dropna = False)

In [28]:
# Check output: display the region x spending_flag count table
crosstab

spending_flag,High spender,Low spender
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,29344,7574466
Northeast,18735,5709612
South,40990,10760620
West,31651,8268794


In [29]:
# Copy the crosstab to the system clipboard so it can be pasted into the Excel report
# NOTE(review): presumably needs a clipboard backend, so this cell may fail on a headless machine - confirm
crosstab.to_clipboard()

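#### Conclusion: The largest share of Low spender records comes from the South and the smallest from the Northeast, which has about half as many as the South. The West and Midwest contribute about the same share. The same goes for High spender records. Note that these are row counts (one row per ordered product), not counts of distinct users, and that High spenders make up well under 1% of the rows in every region, so the regional differences largely track the overall size of each region.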
#### This table can be found in Excel file Final report

# 5. Create exclusion flag for low-activity customers

#### Create an exclusion flag for low-activity customers (customers with fewer than 5 orders), exclude them from the data and export this sample.

In [30]:
# Create new column 'activity_flag' from the maximum order count:
# rows with max_order below 5 are marked as low-activity
is_low_activity = df_all['max_order'] < 5
df_all.loc[is_low_activity, 'activity_flag'] = 'Low-activity customer'

In [31]:
# Rows with max_order of 5 or more are marked as high-activity
is_high_activity = df_all['max_order'] >= 5
df_all.loc[is_high_activity, 'activity_flag'] = 'High-activity customer'

In [32]:
# Check that the new column was created by previewing the first five rows
df_all.head(n = 5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,product_name,prices,price_range,...,order_frequency_flag,gender,state,age,number_of_dependants,marital_status,income,department,region,activity_flag
0,2539329,1,1,Monday,8,,196,Soda,9.0,Mid-range product,...,Non-frequent customer,Female,Alabama,31,3,married,40423,beverages,South,High-activity customer
1,2398795,1,2,Tuesday,7,15.0,196,Soda,9.0,Mid-range product,...,Non-frequent customer,Female,Alabama,31,3,married,40423,beverages,South,High-activity customer
2,473747,1,3,Tuesday,12,21.0,196,Soda,9.0,Mid-range product,...,Non-frequent customer,Female,Alabama,31,3,married,40423,beverages,South,High-activity customer
3,2254736,1,4,Wednesday,7,29.0,196,Soda,9.0,Mid-range product,...,Non-frequent customer,Female,Alabama,31,3,married,40423,beverages,South,High-activity customer
4,431534,1,5,Wednesday,15,28.0,196,Soda,9.0,Mid-range product,...,Non-frequent customer,Female,Alabama,31,3,married,40423,beverages,South,High-activity customer


In [33]:
# Count rows per activity_flag value; NaN is kept so unflagged rows would be visible
activity_counts = df_all['activity_flag'].value_counts(dropna = False)
activity_counts

activity_flag
High-activity customer    30992664
Low-activity customer      1441548
Name: count, dtype: int64

In [34]:
# Keep only the rows flagged as high-activity, which excludes all low-activity customers
keep_high_activity = df_all['activity_flag'] == 'High-activity customer'
df_high_act = df_all.loc[keep_high_activity]

In [35]:
# Check that all low-activity customers were excluded: only 'High-activity customer' should
# appear, with the same count as in the value counts above
high_act_counts = df_high_act['activity_flag'].value_counts()
high_act_counts

activity_flag
High-activity customer    30992664
Name: count, dtype: int64

# 6. Export data

In [36]:
# Export df_all and df_high_act separately in pickle format; this cell writes df_all
all_users_file = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_all_users.pkl')
df_all.to_pickle(all_users_file)

In [37]:
# Write the high-activity subset to the Prepared Data folder in pickle format
high_act_users_file = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_high_act_users.pkl')
df_high_act.to_pickle(high_act_users_file)