# Data Immersion Task 4.5 - Data Consistency Checks

### Table of Contents

1. Importing libraries
2. Importing dataframes
3. Consistency checks
4. Exporting data
5. Task work

        5b. Consistency check on orders dataframe
        5c. Looking for mixed data types
        5d. Looking for missing data
        5e. Looking for duplicate values
6. Exporting data

# 01. Importing Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# 02. Importing dataframes

In [2]:
# Load the Instacart orders and products tables from the project folder

# Root folder of the project (machine-specific absolute path)
project = r'D:\Adam\Employment\Data Analysis Course\Python Instacart project'

# Orders data is read from the "Prepared Data" folder
df_ords = pd.read_csv(
    os.path.join(project, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

# Products data is read from the "Original Data" folder
df_prods = pd.read_csv(
    os.path.join(project, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [3]:
# Create an empty dataframe that will hold a small mixed-type example

df_test = pd.DataFrame()

In [4]:
# Create a mixed type column: two strings, one integer and one boolean

df_test['mix'] = ['a', 'b', 1, True]

# 03. Consistency Checks

In [5]:
# Check for mixed types: print the name of every column in df_test that
# holds values of more than one Python type.

for col in df_test.columns:
    # Series.map(type) gives the Python type of each value in the column.
    # It replaces DataFrame.applymap, which is deprecated in pandas >= 2.1.
    col_types = df_test[col].map(type)
    # Flag every row whose type differs from the type found in the first row
    weird = col_types != col_types.iloc[0]
    if weird.any():
        print(col)

mix


In [6]:
# Convert the mixed-type column to strings so that every value has the same type
df_test['mix'] = df_test['mix'].astype('str')

In [7]:
# Count the missing values in each column of the products data

df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [8]:
# Keep only the product rows whose product_name is missing
df_nan = df_prods.loc[df_prods['product_name'].isna()]

In [9]:
# Display the product rows that have no product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [10]:
# Row and column counts of the products data as loaded
df_prods.shape

(49693, 5)

In [11]:
# Build a products dataframe that keeps only rows with a product_name present

df_prods_clean = df_prods.loc[df_prods['product_name'].notna()]

In [12]:
# Row and column counts after removing rows with a missing product_name
df_prods_clean.shape

(49677, 5)

In [13]:
# Collect the rows that are exact repeats of an earlier row

df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [14]:
# Display the fully duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [15]:
# Build the final products dataframe: missing names already removed above,
# now keep only the first occurrence of every fully duplicated row

df_prods_clean_no_dups = df_prods_clean[~df_prods_clean.duplicated()]

In [16]:
# Row and column counts after removing duplicate rows
df_prods_clean_no_dups.shape

(49672, 5)

# 04. Exporting Data

In [17]:
# Exporting cleaned products data.
# index=False keeps the row index out of the CSV; otherwise it is written as an
# unnamed first column and comes back as an extra 'Unnamed: 0' column when the
# file is read again with pd.read_csv(..., index_col=False).

df_prods_clean_no_dups.to_csv(
    os.path.join(project, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=False,
)

# 05. Task Work

##### 5b. Looking at orders df to see if anything is off

In [18]:
# Using your new knowledge about how to interpret the output of this function, 
# share in a markdown cell whether anything about the data looks off or should be 
# investigated further.

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [19]:
# Display the first rows of the orders data
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0


In [20]:
# The max for order_number is exactly 100, which possibly means this data sample
# was limited in that way - that it only includes customers with 100 orders or fewer.
# It's also possible it was a mistake and that it was intended to be "10". Hard to know.
# Other than order_number maxing out at 100, nothing else looks off.

In [21]:
# Show every order row whose order_number is greater than 50

df_ords.loc[df_ords['order_number'] > 50]

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
364,364,933179,27,51,2,9,5.0
365,365,682914,27,52,3,15,15.0
366,366,1149184,27,53,2,9,6.0
367,367,65724,27,54,4,15,2.0
368,368,2244394,27,55,2,16,5.0
...,...,...,...,...,...,...,...
3420997,3420997,2643521,206206,64,0,17,2.0
3420998,3420998,2750895,206206,65,3,14,10.0
3420999,3420999,2986341,206206,66,3,9,14.0
3421000,3421000,1904200,206206,67,0,13,11.0


In [22]:
# A whole lot of rows have an order_number above 50, so reaching 100 orders might not be uncommon.
# Next, I'm going to check for order numbers above 80.

In [23]:
# Show every order row whose order_number is greater than 80

df_ords.loc[df_ords['order_number'] > 80]

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
394,394,1573122,27,81,1,13,6.0
395,395,2614670,27,82,5,14,4.0
3245,3245,1289149,210,81,3,17,3.0
3246,3246,3321331,210,82,0,10,4.0
3247,3247,2185732,210,83,3,8,3.0
...,...,...,...,...,...,...,...
3419497,3419497,2233698,206105,96,2,17,1.0
3419498,3419498,1211365,206105,97,4,17,2.0
3419499,3419499,1279578,206105,98,5,20,1.0
3419500,3419500,3367002,206105,99,6,7,1.0


In [24]:
# Also, a whole lot have orders above 80.  So hitting 100 orders isn't that strange.
# It must be that this dataset was limited to customers with 100 orders or less 
# (or limited in recording that number of orders per customer)

##### 5c. Looking for mixed data types

In [25]:
# Checking for mixed type data: print the name of every column in df_ords
# that holds values of more than one Python type.

for col in df_ords.columns:
    # Series.map(type) gives the Python type of each value in the column.
    # It replaces DataFrame.applymap, which is deprecated in pandas >= 2.1.
    col_types = df_ords[col].map(type)
    # Flag every row whose type differs from the type found in the first row
    weird = col_types != col_types.iloc[0]
    if weird.any():
        print(col)

In [26]:
# Apparently no mixed type data

In [27]:
# I just saw an extra line of code in another student submission, so I'm going to include it:
# this version reports every column, not only the mixed ones.

for col in df_ords.columns:
    # Series.map(type) gives the Python type of each value in the column.
    # It replaces DataFrame.applymap, which is deprecated in pandas >= 2.1.
    col_types = df_ords[col].map(type)
    # Flag every row whose type differs from the type found in the first row
    weird = col_types != col_types.iloc[0]
    if weird.any():
        print (col, 'mixed')
    else:
        print(col, ' consistent')

Unnamed: 0  consistent
order_id  consistent
user_id  consistent
order_number  consistent
orders_day_of_week  consistent
order_hour_of_day  consistent
days_since_last_order  consistent


In [28]:
# While trying to include that other line of code from the other student,
# I learned that consistent indentation makes a big difference here.

##### 5d. Looking for missing data and deciding what to do about it

In [29]:
# Count the missing values in each column of the orders data

df_ords.isna().sum()

Unnamed: 0                    0
order_id                      0
user_id                       0
order_number                  0
orders_day_of_week            0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

In [30]:
# wow, days_since_last_order has a TON of missing values.
# My guess is that this is happening because people's first order is "null" days since last order.

In [31]:
# Collect the rows with a null "days_since_last_order" to see whether they
# correspond to customers' first orders

df_nan_orders = df_ords.loc[df_ords['days_since_last_order'].isna()]

In [32]:
# Display the first 30 rows that have no days_since_last_order value
df_nan_orders.head(30)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
50,50,2086598,6,1,5,18,
54,54,2565571,7,1,3,9,
75,75,600894,8,1,6,0,
79,79,280530,9,1,1,17,
83,83,1224907,10,1,2,14,


In [33]:
# Display the last rows that have no days_since_last_order value
df_nan_orders.tail()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,
3421069,3421069,3154581,206209,1,3,11,


In [34]:
# Based on the sample, it appears my guess was correct: the nulls are people's first orders.
# I'm not sure how to fix this, because we want "days_since_last_order" to hold whole numbers of days
# (it currently shows as a float, e.g. 15.0, because of the missing values).
# I would like to make a new column flagging these rows as a "first_order", because that might
# provide valuable insights. However, we haven't learned how to do that yet.
# But I just checked other student submissions and learned how to do it, so I'll create a new column.

In [35]:
# Creating new dataframe 'df_ords_new' with a column for flagging new customers.
# Later, when we learn how, we can use the flag in this column to exclude new customers
# when analyzing habits of existing customers.

# .copy() makes df_ords_new an independent dataframe. Plain assignment would only
# give df_ords a second name, so the new column would be added to df_ords as well.
df_ords_new = df_ords.copy()
# True on rows where days_since_last_order is missing, False everywhere else
df_ords_new['new_customer'] = df_ords['days_since_last_order'].isnull()

In [36]:
# Display the orders data with the added new_customer column
df_ords_new

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,new_customer
0,0,2539329,1,1,2,8,,True
1,1,2398795,1,2,3,7,15.0,False
2,2,473747,1,3,3,12,21.0,False
3,3,2254736,1,4,4,7,29.0,False
4,4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,10,5,18,29.0,False
3421079,3421079,1854736,206209,11,4,10,30.0,False
3421080,3421080,626363,206209,12,1,12,18.0,False
3421081,3421081,2977660,206209,13,1,12,7.0,False


In [37]:
# Display the first 30 rows to check the new_customer flag against days_since_last_order
df_ords_new.head(30)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,new_customer
0,0,2539329,1,1,2,8,,True
1,1,2398795,1,2,3,7,15.0,False
2,2,473747,1,3,3,12,21.0,False
3,3,2254736,1,4,4,7,29.0,False
4,4,431534,1,5,4,15,28.0,False
5,5,3367565,1,6,2,7,19.0,False
6,6,550135,1,7,1,9,20.0,False
7,7,3108588,1,8,1,14,14.0,False
8,8,2295261,1,9,1,16,0.0,False
9,9,2550362,1,10,4,8,30.0,False


In [38]:
# It seems to have worked.

##### 5e. Checking for duplicate values

In [39]:
# Collect the order rows that are exact repeats of an earlier row
df_ords_new_dups = df_ords_new.loc[df_ords_new.duplicated()]

In [40]:
# Display the fully duplicated order rows (empty if there are none)
df_ords_new_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,new_customer


In [41]:
# Apparently no duplicates

##### 6. Exporting cleaned df_ords data

In [42]:
# Exporting checked orders data.
# index=False keeps the row index out of the CSV; otherwise it is written as an
# unnamed first column and comes back as yet another 'Unnamed: 0'-style column
# when the file is read again with pd.read_csv(..., index_col=False).
df_ords_new.to_csv(
    os.path.join(project, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index=False,
)

In [43]:
# Row and column counts of the exported orders data
df_ords_new.shape

(3421083, 8)