# Table of Content

1. Importing libraries
2. Importing data
3. Consistency Checks Products Dataframe
4. Consistency Checks Orders Dataframe
5. Exporting cleaned and consistent dataframes

# 1. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Importing data

In [2]:
# Define path
path = r'/Users/frederikeschulz-mullensiefen/Desktop/Master Folder_Instacart/02_Data'

In [3]:
# Import products dataframe
df_products = pd.read_csv(os.path.join(path, 'Original Data', 'products.csv'), index_col = False)

In [4]:
# Import orders dataframe
df_orders = pd.read_csv(os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

# 3. Consistency Checks Products Dataframe

In [5]:
# Descriptive statistics
df_products.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


In [6]:
# Finding missing data in products dataframe
df_products.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [7]:
# Creating subset with null values 
df_nan = df_products[df_products['product_name'].isnull() == True]

In [8]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [9]:
# Row check for original dataset
df_products.shape

(49693, 5)

In [10]:
# Creating cleaned subset without missing values
df_products_clean = df_products[df_products['product_name'].isnull() == False]

In [11]:
# Row check for cleaned subset
df_products_clean.shape

(49677, 5)

In [12]:
# Finding duplicates
df_dups = df_products_clean[df_products_clean.duplicated()]

In [13]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [14]:
# Row check before removing duplicates
df_products_clean.shape

(49677, 5)

In [15]:
# Creating cleaned subset without duplicates
df_products_clean_nodups = df_products_clean.drop_duplicates()

In [16]:
# Row check after removing duplcates
df_products_clean_nodups.shape

(49672, 5)

# 4. Consistency Checks Orders Dataframe

In [17]:
# Descriptive statistics
df_orders.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,ordertime_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


Observations: 
- Count: All columns have the same number of observations, apart from the 'days since prior order' column, which has less observations, which may indicate missing data.
- Order_id: The min and max values make sense.
- User_id: The min and max values make sense.
- Order_number: The min and max values make sense, if the order number is calculcated per customer. This would mean that the maximum order number would be 100 (meaning the maximum number a customer has ordered is 100 times). This means that there are duplicates within this column (not necessarily from the full data set - tbd). 
- Orders_day_of_week: The min and max values make sense, if 0 = Monday and 6 = Sunday (if it's true that orders can be done every day of the week).
- Ordertime_hour_of_day: The min and max values make sense, if 0 = 0:00 and 2.3 = 23:00 (if it's true that orders can be done at every time of the day).
- Days_since_prior_order: The min and max make sense, if 0 = same day multiple orders, and 30 = 30 days after previous order. 

In [18]:
# Checking for mixed-type data
for col in df_orders.columns.tolist():
  weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_orders[weird]) > 0:
    print (col)

  weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis = 1)


There are no mixed-type data in this dataset.

In [19]:
# Finding missing data in orders dataframe
df_orders.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
ordertime_hour_of_day          0
days_since_prior_order    206209
dtype: int64

In [35]:
# Creating a dataframe with nan values for checks
df_orders_null = df_orders[df_orders['days_since_prior_order'].isnull() == True]

In [36]:
# Checking if there are prior purchases for nan values
df_orders_null[df_orders_null['order_number']>1]

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,ordertime_hour_of_day,days_since_prior_order


I ran a test if the nan values are truly missing data or if it is just the first order by a customer. Therefore, I ran the logic test if there are any order numbers > 1 in the rows where days since prior order are missing. Since there are no records with order numbers of > 1, there are no missing values in the orders dataframe, since there was no previous order. Therefore, the data should remain as-is.
If there had been any records with an order number of > 1, values could have been imputed with the mean (as there are no outliers in this column). 

In [37]:
# Finding duplicates
df_dups_2 = df_orders[df_orders.duplicated()]

In [38]:
df_dups_2

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,ordertime_hour_of_day,days_since_prior_order


There are no full duplicates in the orders data set.

In [39]:
# Checking for duplicate values in order_id column (as these should be unique)
duplicate_values = df_orders['order_id'].duplicated()
print(duplicate_values)

0          False
1          False
2          False
3          False
4          False
           ...  
3421078    False
3421079    False
3421080    False
3421081    False
3421082    False
Name: order_id, Length: 3421083, dtype: bool


The only column where it is important that it is unique (the order_id) also does not contain any duplicates.

# 5. Exporting checked and cleaned dataframes

In [42]:
# Exporting checked and cleaned orders data set
df_orders.to_csv(os.path.join(path,'Prepared Data', 'orders_checked.csv'), index = False)

In [43]:
# Exporting checked and cleaned products data set
df_products_clean_nodups.to_csv(os.path.join(path,'Prepared Data', 'products_checked.csv'), index = False)