# 4.5 Data Consistency Checks

## Contents:
### Set import variable
### Import products and analyze
### Import orders and analyze

## Import libraries and set import path

In [32]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [33]:
# Set path for use in import: root of the project's '02 Data' folder.
# NOTE(review): absolute, machine-specific Windows path - update it before running elsewhere.
path = r'C:\Users\XLT2\CFProjects\2023-04-07 Instacart Basket Analysis\02 Data'

### Import products

In [34]:
# Import products.csv from the 'Original Data' folder
products_csv = os.path.join(path, 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv, index_col=False)

In [35]:
# Confirm products df
df_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


### Create NaN df - test products for NaN values

In [36]:
# Get counts of NaN / NULL in each column
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

### Write all products NaN values to df_nan

In [37]:
# Build df_nan: every product row whose product_name is missing
missing_name = df_prods['product_name'].isna()
df_nan = df_prods[missing_name]

In [38]:
# Confirm contents of df_nan
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Write all non-NaN values to df_prods_clean

In [39]:
# Build df_prods_clean: every product row that has a product_name
has_name = df_prods['product_name'].notna()
df_prods_clean = df_prods[has_name]

In [40]:
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


### Write all duplicate rows to df_dups

In [41]:
# Build df_dups: rows of df_prods_clean that fully repeat an earlier row
is_repeat = df_prods_clean.duplicated()
df_dups = df_prods_clean[is_repeat]

In [42]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [43]:
# Confirm shape of df_prods_clean before removing dups
df_prods_clean.shape

(49677, 5)

### Drop all duplicates from df_prods_clean

In [44]:
# Build df_prods_clean_no_dups: keep the first occurrence of each row and drop the repeats
df_prods_clean_no_dups = df_prods_clean[~df_prods_clean.duplicated()]

In [45]:
# Confirm size of new df with no dups
df_prods_clean_no_dups.shape

(49672, 5)

### Create test df to show mixed-type values in a single column

In [46]:
# Create a dataframe
df_test = pd.DataFrame()

In [47]:
# Add a column that mixes strings, an int and a bool to demonstrate mixed-type data
mixed_values = ['a', 'b', 1, True]
df_test['mix'] = mixed_values

In [48]:
# Confirm contents
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


### Import orders_wrangled

In [49]:
# Import orders_wrangled.csv from the 'Prepared Data' folder
orders_csv = os.path.join(path, 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_csv, index_col=False)

In [50]:
df_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
1,1,2398795,1,prior,2,3,7,15.0
2,2,473747,1,prior,3,3,12,21.0
3,3,2254736,1,prior,4,4,7,29.0
4,4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,prior,10,5,18,29.0
3421079,3421079,1854736,206209,prior,11,4,10,30.0
3421080,3421080,626363,206209,prior,12,1,12,18.0
3421081,3421081,2977660,206209,prior,13,1,12,7.0


## Submission exercises

### Step 2 - Identify areas that should be investigated further in orders

In [51]:
# Details of df_ords
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


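#### The min values should be looked at. order_id, user_id, and order_number all have a min value of 1, which makes sense. order_day_of_week, order_hour_of_day, and days_since_prior_order all have a min value of 0. For day and hour, 0 falls inside the expected ranges (0-6 and 0-23), so it appears to be a valid code; a value of 0 for days_since_prior_order would mean a second order on the same day and should be confirmed. The Unnamed: 0 column (min 0, max 3421082) looks like a leftover index from an earlier export.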
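#### It may be good to just confirm the 25%, 50%, and 75% values for days_since_prior_order (4, 7, and 15). The max value (30) is well above the 75% value and the mean (11.1) is above the median (7), so the distribution is right-skewed; it is worth confirming whether 30 is a cap on this value.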

In [52]:
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
1,1,2398795,1,prior,2,3,7,15.0
2,2,473747,1,prior,3,3,12,21.0
3,3,2254736,1,prior,4,4,7,29.0
4,4,431534,1,prior,5,4,15,28.0


### Step 3 - Check for mixed-type data

In [53]:
# Check for mixed-type data: print the name of every column that holds
# values of more than one Python type.

for col in df_ords.columns:
    # Series.map(type) gives the Python type of each cell; more than one
    # distinct type means the column is mixed. This replaces
    # DataFrame.applymap, which is deprecated in recent pandas releases,
    # and does not fail on an empty frame the way indexing row 0 does.
    if df_ords[col].map(type).nunique() > 1:
        print(col)

### Step 4 - Handle mixed-type data

#### No data required a mixed-type fix.

### Step 5 - Check for NaN values

In [54]:
# Get counts of NaN / NULL in each column
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

#### Only days_since_prior_order has NaN values.
#### In an ordering system, it would make sense that these are each customer's first order. A first order has no previous order to compare against, so this value cannot be calculated and NaN is expected. The NaN count (206,209) matches the highest user_id (206,209), which is consistent with one first order per customer.

### Step 6 - Handle missing values (days_since_prior_order)

In [55]:
# Fill every missing days_since_prior_order value with 0
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)

In [56]:
# Confirm counts of NaN / NULL in each column
df_ords.isnull().sum()

Unnamed: 0                0
order_id                  0
user_id                   0
eval_set                  0
order_number              0
order_day_of_week         0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

#### I selected to replace the NaN values in the days_since_prior_order with zero.
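#### The other values in this order list are still valuable to this data set. It doesn't make sense to delete over 206k records thus losing valuable order data. In this case 0 is treated the same as NaN. Note that 0 was already the minimum of this column before the replacement, so after this step a first order can no longer be told apart from a same-day reorder by this column alone; order_number == 1 still identifies first orders.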

### Step 7 - Check for duplicate values

In [57]:
# Create df df_ords_dups and write all records that are duplicates.
# Leftover index columns ('Unnamed: ...') picked up from an earlier CSV export
# appear to hold a different value on every row, so comparing them would mean
# no row could ever be flagged. They are left out of the comparison; if no
# other columns exist, fall back to comparing all columns.
data_cols = [col for col in df_ords.columns if not str(col).startswith('Unnamed:')]
df_ords_dups = df_ords[df_ords.duplicated(subset=data_cols or None)]

In [58]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


#### There are no duplicate records in df_ords

### Step 8 - Handle duplicate values

#### There are no duplicates to handle in this data set.

### Step 9 - Export final data sets

In [59]:
# Export the checked orders to orders_checked.csv in the 'Prepared Data' folder.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed first column - confirm downstream notebooks expect that.
orders_checked_csv = os.path.join(path, 'Prepared Data', 'orders_checked.csv')
df_ords.to_csv(orders_checked_csv)

In [60]:
# Export the de-duplicated products to products_checked.csv in the 'Prepared Data' folder.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed first column - confirm downstream notebooks expect that.
products_checked_csv = os.path.join(path, 'Prepared Data', 'products_checked.csv')
df_prods_clean_no_dups.to_csv(products_checked_csv)