# Cleaning and Preparing

## Table of Contents

#### 1. Import Data

#### 2. Data Cleaning - Order Data

#### 3. Data Cleaning - Products Data

#### 4. Cleaning Data - Ordered Products

#### 5. Cleaning Data - Ordered Products train

#### 6. Saving Cleaned Data Sets

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

## 1. Import Data

In [2]:
# Project root folder; all data files are read from and written to sub-folders of this path.
# NOTE(review): machine-specific absolute path - adjust before running on another machine.
path = r'/Users/henning/Portfolio/Instacart_Basket'

In [3]:
# Load the raw orders table; index_col = False keeps pandas from using the first column as the index.
df_orders = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'orders.csv'), index_col = False)

In [4]:
# Load the raw products table; index_col = False keeps pandas from using the first column as the index.
df_products = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'products.csv'), index_col = False)

In [5]:
# Load the raw order_products__prior table (the products contained in each 'prior' order).
df_order_prior = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'order_products__prior.csv'), index_col = False)

In [6]:
# Load the raw order_products__train table (the products contained in each 'train' order).
df_order_train = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'order_products__train.csv'), index_col = False)

## 2. Data Cleaning - Order Data

In [7]:
# dimensions (rows, columns) of the raw orders table before any cleaning
df_orders.shape

(3421083, 7)

In [8]:
# Check which eval_set categories exist and how many orders fall into each of them.
df_orders['eval_set'].value_counts()

eval_set
prior    3214874
train     131209
test       75000
Name: count, dtype: int64

In [9]:
# Remove the test rows. These rows are in the data set to test predictions on the reordered column.
# Because this analysis doesn't attempt this, the rows can be removed.
# .copy() makes the filtered result an independent DataFrame, so later modifications of
# df_orders do not operate on a slice of the original frame (avoids SettingWithCopyWarning).
df_orders = df_orders[df_orders['eval_set'] != 'test'].copy()

In [10]:
# Give the order_dow column a clearer, more readable name: order_weekday.
df_orders = df_orders.rename(columns={'order_dow': 'order_weekday'})

In [11]:
# Print the name of every column in which at least one cell has a different
# Python type than the cell in the first row (i.e. a mixed-type column).
for column_name in df_orders.columns:
    single_column = df_orders[[column_name]]
    cell_types = single_column.map(type)
    first_row_type = single_column.iloc[0].apply(type)
    has_other_type = (cell_types != first_row_type).any(axis=1)
    if has_other_type.any():
        print(column_name)

In [12]:
# Count the missing values in each column of the orders table.
df_orders.isna().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_weekday                  0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [13]:
# Missing values in days_since_prior_order are expected: a customer's first order
# cannot have a prior order to measure the distance from.
# Therefore the number of missing values should match the number of customers.
# nunique() counts the distinct customers directly; max() would only give the
# customer count if the user_ids were gap-free integers starting at 1.
df_orders['user_id'].nunique()

206209

The number of customers matches the number of empty cells in the days_since_prior_order column.

In [14]:
# Show every row that is an exact duplicate of an earlier row (an empty result means there are none).
df_orders.loc[df_orders.duplicated()]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_weekday,order_hour_of_day,days_since_prior_order


no duplicates found

In [15]:
# inspect 5 randomly drawn rows as a visual sanity check (no random_state, so the sample differs per run)
df_orders.sample(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_weekday,order_hour_of_day,days_since_prior_order
1292198,159007,77699,prior,51,0,10,7.0
1054798,3417904,63499,train,6,5,10,11.0
182220,1793315,11015,prior,31,4,15,5.0
514823,208755,31056,prior,9,5,13,2.0
2897326,1002145,174837,prior,5,4,8,17.0


In [16]:
# Check the data type of every column; the ID columns are still stored as int64 at this point.
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3346083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_weekday           int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 204.2+ MB


In [17]:
# IDs are labels, not quantities: convert both ID columns to string type in one step.
df_orders = df_orders.astype({'user_id': 'str', 'order_id': 'str'})

In [18]:
# dimensions after cleaning: the test rows are gone, the number of columns is unchanged
df_orders.shape

(3346083, 7)

## 3. Data Cleaning - Products Data

In [19]:
# dimensions (rows, columns) of the raw products table before any cleaning
df_products.shape

(49688, 4)

In [20]:
# Count the missing values in each column of the products table.
df_products.isna().sum()

product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: int64

In [21]:
# Show the products whose name is missing (an empty result means every product has a name).
# isnull() already returns a boolean mask, so no comparison with True is needed.
df_products[df_products['product_name'].isnull()]

Unnamed: 0,product_id,product_name,aisle_id,department_id


In [22]:
# Show every product row that is an exact duplicate of an earlier row (empty result = none).
df_products.loc[df_products.duplicated()]

Unnamed: 0,product_id,product_name,aisle_id,department_id


In [23]:
# Print the name of every column in which at least one cell has a different
# Python type than the cell in the first row (i.e. a mixed-type column).
for column_name in df_products.columns:
    single_column = df_products[[column_name]]
    cell_types = single_column.map(type)
    first_row_type = single_column.iloc[0].apply(type)
    has_other_type = (cell_types != first_row_type).any(axis=1)
    if has_other_type.any():
        print(column_name)

In [24]:
# Check the data type of every column; the three ID columns are still stored as int64 at this point.
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49688 entries, 0 to 49687
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_id     49688 non-null  int64 
 1   product_name   49688 non-null  object
 2   aisle_id       49688 non-null  int64 
 3   department_id  49688 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [25]:
# IDs are labels, not quantities: convert all three ID columns to string type in one step.
df_products = df_products.astype({
    'product_id': 'str',
    'aisle_id': 'str',
    'department_id': 'str',
})

In [26]:
# dimensions after cleaning: no rows or columns were removed from the products table
df_products.shape

(49688, 4)

## 4. Cleaning Data - Ordered Products

Dataset is called order_products__prior

In [27]:
# dimensions (rows, columns) of the raw order_products__prior table before any cleaning
df_order_prior.shape

(32434489, 4)

In [28]:
# inspect 5 randomly drawn rows as a visual sanity check (no random_state, so the sample differs per run)
df_order_prior.sample(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
11059618,1167531,16797,1,1
29166476,3076062,12456,1,0
7028810,742062,48205,4,1
8252302,871153,47209,2,1
26422643,2786298,46969,13,0


In [29]:
# Drop add_to_cart_order: it only records the sequence in which products were
# added to the basket, and that aspect is not part of this analysis.
df_order_prior = df_order_prior.drop('add_to_cart_order', axis=1)

In [30]:
# Count the missing values in each remaining column of the prior baskets table.
df_order_prior.isna().sum()

order_id      0
product_id    0
reordered     0
dtype: int64

In [31]:
# Print the name of every column in which at least one cell has a different
# Python type than the cell in the first row (i.e. a mixed-type column).
for column_name in df_order_prior.columns:
    single_column = df_order_prior[[column_name]]
    cell_types = single_column.map(type)
    first_row_type = single_column.iloc[0].apply(type)
    has_other_type = (cell_types != first_row_type).any(axis=1)
    if has_other_type.any():
        print(column_name)

In [32]:
# Show every row that is an exact duplicate of an earlier row (an empty result means there are none).
df_order_prior.loc[df_order_prior.duplicated()]

Unnamed: 0,order_id,product_id,reordered


In [33]:
# IDs are labels, not quantities: convert both ID columns to string type in one step.
df_order_prior = df_order_prior.astype({'product_id': 'str', 'order_id': 'str'})

In [34]:
# dimensions after cleaning: same number of rows, one column fewer (add_to_cart_order was dropped)
df_order_prior.shape

(32434489, 3)

## 5. Cleaning Data - Ordered Products train

Cleaning Data - Ordered Products

Dataset is called order_products__train


In [35]:
# dimensions (rows, columns) of the raw order_products__train table before any cleaning
df_order_train.shape

(1384617, 4)

In [36]:
# Drop add_to_cart_order: it only records the sequence in which products were added to the basket.
df_order_train = df_order_train.drop('add_to_cart_order', axis=1)

In [37]:
# Count the missing values in each remaining column of the train baskets table.
df_order_train.isna().sum()

order_id      0
product_id    0
reordered     0
dtype: int64

In [38]:
# Print the name of every column in which at least one cell has a different
# Python type than the cell in the first row (i.e. a mixed-type column).
for column_name in df_order_train.columns:
    single_column = df_order_train[[column_name]]
    cell_types = single_column.map(type)
    first_row_type = single_column.iloc[0].apply(type)
    has_other_type = (cell_types != first_row_type).any(axis=1)
    if has_other_type.any():
        print(column_name)

In [39]:
# Show every row that is an exact duplicate of an earlier row (an empty result means there are none).
df_order_train.loc[df_order_train.duplicated()]

Unnamed: 0,order_id,product_id,reordered


In [40]:
# IDs are labels, not quantities: convert both ID columns to string type in one step.
df_order_train = df_order_train.astype({'product_id': 'str', 'order_id': 'str'})

In [41]:
# dimensions after cleaning: same number of rows, one column fewer (add_to_cart_order was dropped)
df_order_train.shape

(1384617, 3)

## 6. Saving Cleaned Data Sets

In [42]:
# Export the cleaned orders data set as a pickle file into the 'Prepared Data' folder.
df_orders.to_pickle(os.path.join(path, 'Data','Prepared Data', 'orders_cleaned.pkl'))

In [43]:
# Export the cleaned products data set as a pickle file into the 'Prepared Data' folder.
df_products.to_pickle(os.path.join(path, 'Data','Prepared Data', 'products_cleaned.pkl'))

In [44]:
# Export the cleaned order_products__prior data set as a pickle file into the 'Prepared Data' folder.
df_order_prior.to_pickle(os.path.join(path, 'Data','Prepared Data', 'prior_baskets_cleaned.pkl'))

In [45]:
# Export the cleaned order_products__train data set as a pickle file into the 'Prepared Data' folder.
df_order_train.to_pickle(os.path.join(path, 'Data','Prepared Data', 'train_baskets_cleaned.pkl'))