In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

### Import Data

In [2]:
# Project root folder. Set the INSTACART_PROJECT_PATH environment variable to run the
# notebook on another machine; if it is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/henning/Documents/Data Analytics Portfolio/Instacart Project - ICP',
)

In [3]:
# Load the raw orders table; index_col=False stops pandas from using the first column as the index.
df_orders = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [4]:
# Load the raw products table; index_col=False stops pandas from using the first column as the index.
df_products = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [5]:
# Load the basket contents stored in order_products__prior.csv.
df_order_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'order_products__prior.csv'),
    index_col=False,
)

In [6]:
# Load the basket contents stored in order_products__train.csv.
df_order_recent = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'order_products__train.csv'),
    index_col=False,
)

In [7]:
# Load the raw customer table (read with pandas' default index handling).
df_customer = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
)

### Data Cleaning - Order Data

In [8]:
# Dimensions (rows, columns) of the orders table before any cleaning step.
df_orders.shape

(3421083, 7)

In [9]:
# Count how many orders fall into each eval_set category.
df_orders['eval_set'].value_counts()

eval_set
prior    3214874
train     131209
test       75000
Name: count, dtype: int64

In [10]:
# Remove the 'test' rows. These rows are in the data set to test predictions on the reordered column.
# Because this analysis doesn't attempt that, the rows can be removed.
# .copy() makes the filtered result an independent frame, so the in-place renames and
# column assignments in later cells cannot trigger pandas' SettingWithCopyWarning.
df_orders = df_orders[df_orders['eval_set'] != 'test'].copy()

In [11]:
# Give the weekday column a self-explanatory name.
df_orders = df_orders.rename(columns={'order_dow': 'order_weekday'})

In [12]:
# Give the days-between-orders column a clearer name.
df_orders = df_orders.rename(columns={'days_since_prior_order': 'days_since_last_order'})

In [13]:
# Print the name of every column that holds values whose Python type differs
# from the type of the value in the column's first row.
for col in df_orders.columns.tolist():
    column_frame = df_orders[[col]]
    first_row_type = column_frame.iloc[0].apply(type)
    weird = (column_frame.map(type) != first_row_type).any(axis=1)
    if weird.any():
        print(col)

In [14]:
# Number of missing values in each column.
df_orders.isna().sum()

order_id                      0
user_id                       0
eval_set                      0
order_number                  0
order_weekday                 0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

In [15]:
# Missing values in days_since_last_order are expected: a customer's first order
# cannot have a prior order.
# Therefore each user_id should have exactly one row where days_since_last_order is empty,
# and the number of distinct customers should equal the number of missing values.
# nunique() counts the customers directly; max() would only give that number if the
# IDs ran from 1 to N without gaps.
df_orders['user_id'].nunique()

206209

The number of customers matches the number of empty cells in the days_since_last_order column, which is consistent with every customer having exactly one first order without a preceding order.

In [16]:
# Show every row that is an exact copy of an earlier row (an empty result means no duplicates).
df_orders.loc[df_orders.duplicated()]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_weekday,order_hour_of_day,days_since_last_order


No duplicates found.

In [17]:
# Peek at five randomly chosen rows; no random_state is passed, so the rows shown differ between runs.
df_orders.sample(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_weekday,order_hour_of_day,days_since_last_order
2554452,2256666,153706,prior,11,0,8,8.0
3031979,1181868,182930,prior,36,1,7,4.0
2547018,2339596,153244,prior,78,0,12,8.0
262048,204443,15892,prior,1,5,12,
2872025,3042244,173288,prior,8,6,8,7.0


In [18]:
# Check the data type of every column (info() also reports the index range and memory usage).
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3346083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   order_id               int64  
 1   user_id                int64  
 2   eval_set               object 
 3   order_number           int64  
 4   order_weekday          int64  
 5   order_hour_of_day      int64  
 6   days_since_last_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 204.2+ MB


In [19]:
# IDs are identifiers, not quantities, so store them as strings.
df_orders = df_orders.astype({'user_id': 'str', 'order_id': 'str'})

In [20]:
# Dimensions (rows, columns) of the orders table after cleaning.
df_orders.shape

(3346083, 7)

### Data Cleaning - Products Data

In [21]:
# Dimensions (rows, columns) of the products table before cleaning.
df_products.shape

(49693, 5)

In [22]:
# Number of missing values in each column.
df_products.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [23]:
# List the products that have no name; the boolean mask is used directly, no "== True" needed.
df_products[df_products['product_name'].isnull()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [24]:
# Replace missing product names with a placeholder so the rows can be kept.
name_missing = df_products['product_name'].isnull()
df_products.loc[name_missing, 'product_name'] = 'unknown product'

In [25]:
# Show the rows that carry the placeholder name.
is_placeholder = df_products['product_name'] == 'unknown product'
df_products.loc[is_placeholder]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,unknown product,121,14,12.2
68,69,unknown product,26,7,11.8
115,116,unknown product,93,3,10.8
261,262,unknown product,110,13,12.1
525,525,unknown product,109,11,1.2
1511,1511,unknown product,84,16,14.3
1780,1780,unknown product,126,11,12.3
2240,2240,unknown product,52,1,14.2
2586,2586,unknown product,104,13,12.4
3159,3159,unknown product,126,11,13.1


In [26]:
# Show every row that is an exact copy of an earlier row.
df_products.loc[df_products.duplicated()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [27]:
# Keep only the first occurrence of each fully identical row.
df_products = df_products.loc[~df_products.duplicated()]

In [28]:
# Print the name of every column that holds values whose Python type differs
# from the type of the value in the column's first row.
for col in df_products.columns.tolist():
    column_frame = df_products[[col]]
    first_row_type = column_frame.iloc[0].apply(type)
    weird = (column_frame.map(type) != first_row_type).any(axis=1)
    if weird.any():
        print(col)

In [29]:
# Look for implausibly high prices (above 100).
too_expensive = df_products['prices'] > 100
df_products.loc[too_expensive, 'prices']

21554    14900.0
33666    99999.0
Name: prices, dtype: float64

In [30]:
# Treat prices above 100 (here 99999.0 and 14900.0) as data errors and replace them with NaN.
defective_price = df_products['prices'] > 100
df_products.loc[defective_price, 'prices'] = np.nan

In [31]:
# Check the data type and non-null count of every column.
df_products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49688 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49688 non-null  int64  
 1   product_name   49688 non-null  object 
 2   aisle_id       49688 non-null  int64  
 3   department_id  49688 non-null  int64  
 4   prices         49686 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 2.3+ MB


In [32]:
# IDs are identifiers, not quantities, so store them as strings.
df_products = df_products.astype({
    'product_id': 'str',
    'aisle_id': 'str',
    'department_id': 'str',
})

In [33]:
# Dimensions (rows, columns) of the products table after cleaning.
df_products.shape

(49688, 5)

### Cleaning order_products__prior

In [34]:
# Dimensions (rows, columns) of the order_products__prior table before cleaning.
df_order_prior.shape

(32434489, 4)

In [35]:
# Peek at five randomly chosen rows; no random_state is passed, so the rows shown differ between runs.
df_order_prior.sample(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
17686639,1865351,22947,5,0
6255514,660220,47866,11,0
9167758,967715,25072,3,0
409135,43274,24937,2,0
16686967,1760153,39877,13,0


In [36]:
# add_to_cart_order only records the sequence in which products were added to the basket;
# it is not needed for this analysis.
df_order_prior = df_order_prior.drop('add_to_cart_order', axis='columns')

In [37]:
# Number of missing values in each column.
df_order_prior.isna().sum()

order_id      0
product_id    0
reordered     0
dtype: int64

In [38]:
# Print the name of every column that holds values whose Python type differs
# from the type of the value in the column's first row.
for col in df_order_prior.columns.tolist():
    column_frame = df_order_prior[[col]]
    first_row_type = column_frame.iloc[0].apply(type)
    weird = (column_frame.map(type) != first_row_type).any(axis=1)
    if weird.any():
        print(col)

In [39]:
# Show every row that is an exact copy of an earlier row (an empty result means no duplicates).
df_order_prior.loc[df_order_prior.duplicated()]

Unnamed: 0,order_id,product_id,reordered


In [40]:
# IDs are identifiers, not quantities, so store them as strings.
df_order_prior = df_order_prior.astype({'product_id': 'str', 'order_id': 'str'})

In [41]:
# Dimensions (rows, columns) of the order_products__prior table after cleaning.
df_order_prior.shape

(32434489, 3)

### Cleaning order_products__train

In [42]:
# Dimensions (rows, columns) of the order_products__train table before cleaning.
df_order_recent.shape

(1384617, 4)

In [43]:
# add_to_cart_order only records the sequence in which products were added to the basket;
# it is not needed for this analysis.
df_order_recent = df_order_recent.drop('add_to_cart_order', axis='columns')

In [44]:
# Number of missing values in each column.
df_order_recent.isna().sum()

order_id      0
product_id    0
reordered     0
dtype: int64

In [45]:
# Print the name of every column that holds values whose Python type differs
# from the type of the value in the column's first row.
for col in df_order_recent.columns.tolist():
    column_frame = df_order_recent[[col]]
    first_row_type = column_frame.iloc[0].apply(type)
    weird = (column_frame.map(type) != first_row_type).any(axis=1)
    if weird.any():
        print(col)

In [46]:
# Show every row that is an exact copy of an earlier row (an empty result means no duplicates).
df_order_recent.loc[df_order_recent.duplicated()]

Unnamed: 0,order_id,product_id,reordered


In [47]:
# IDs are identifiers, not quantities, so store them as strings.
df_order_recent = df_order_recent.astype({'product_id': 'str', 'order_id': 'str'})

In [48]:
# Dimensions (rows, columns) of the order_products__train table after cleaning.
df_order_recent.shape

(1384617, 3)

### Cleaning Customer Data

In [49]:
# Dimensions (rows, columns) of the customer table before cleaning.
df_customer.shape

(206209, 10)

In [50]:
# The number of dependants is not used in this analysis, so the column is dropped.
df_customer = df_customer.drop('n_dependants', axis='columns')

In [51]:
# Harmonise the column names (lower case, clearer wording) in a single rename.
df_customer = df_customer.rename(columns={
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'fam_status': 'civil_status',
})

In [52]:
# Drop the name columns so customers stay anonymous.
# 'Surnam' is presumably the spelling used in the raw file's header - keep it as is.
df_customer = df_customer.drop(['First Name', 'Surnam'], axis='columns')

In [53]:
# Peek at five randomly chosen rows; no random_state is passed, so the rows shown differ between runs.
df_customer.sample(5)

Unnamed: 0,user_id,gender,state,age,date_joined,civil_status,income
31354,36850,Male,North Carolina,60,7/1/2017,married,100216
61385,176590,Male,North Carolina,51,12/20/2017,married,70050
203644,92524,Male,Florida,38,3/18/2020,married,59365
37447,109552,Female,Arkansas,39,8/5/2017,married,30462
188931,49608,Female,Minnesota,28,12/24/2019,married,29910


In [54]:
# Number of missing values in each column.
df_customer.isna().sum()

user_id         0
gender          0
state           0
age             0
date_joined     0
civil_status    0
income          0
dtype: int64

In [55]:
# IDs are identifiers, not quantities, so store user_id as a string.
df_customer = df_customer.astype({'user_id': 'str'})

In [56]:
# Show every row that is an exact copy of an earlier row (an empty result means no duplicates).
df_customer.loc[df_customer.duplicated()]

Unnamed: 0,user_id,gender,state,age,date_joined,civil_status,income


In [57]:
# Dimensions (rows, columns) of the customer table after cleaning.
df_customer.shape

(206209, 7)

### Saving Cleaned Data Sets

In [58]:
# Save the cleaned orders table as a pickle file in the Prepared Data folder.
df_orders.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_cleaned.pkl')
)

In [59]:
# Save the cleaned products table as a pickle file in the Prepared Data folder.
df_products.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_cleaned.pkl')
)

In [60]:
# Save the cleaned order_products__prior table as a pickle file in the Prepared Data folder.
df_order_prior.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'old_baskets_cleaned.pkl')
)

In [61]:
# Save the cleaned order_products__train table as a pickle file in the Prepared Data folder.
df_order_recent.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'recent_baskets_cleaned.pkl')
)

In [62]:
# export customer data set
df_customer.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'customers_cleaned.pkl'))