# 4.5 Data Consistency Checks

1. Import libraries and dataset
2. Mixed-type data
3. Missing values
4. Duplicates
5. Task answers

## 1. Import libraries and dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Load the products dataset directly from its full file path
products_csv = r'C:\Users\irikh\iCloudDrive\Data analytics\Instacart basket Analysis\02 Data\Original Data\products.csv'
df_prods = pd.read_csv(products_csv, index_col = False)

## 2. Mixed-type data

In [3]:
# Start from an empty dataframe; its column is added in a later cell
df_test = pd.DataFrame(data = None)

In [4]:
# Add a column that deliberately mixes strings, an integer and a boolean
mixed_values = ['a', 'b', 1, True]
df_test['mix'] = mixed_values

In [5]:
# Preview the first rows of the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [6]:
# Checking mixed_type data
# A column is reported when its values do not all share one Python type.
# Series.map(type) is used because DataFrame.applymap is deprecated in recent pandas releases;
# counting the distinct types also works on an empty dataframe, where reading row 0 would fail.
for col in df_test.columns.tolist():
  value_types = df_test[col].map(type)
  if value_types.nunique() > 1:
    print (col)

mix


In [7]:
# Cast every value in the 'mix' column to a string so the column holds a single type
df_test['mix'] = df_test['mix'].astype(str)

In [None]:
# if changing to integer: df_test['mix'] = df_test['mix'].astype('int64')

In [8]:
# Checking results of changing data type
# (the string column is reported with dtype "object", as the output below shows)
df_test.dtypes

mix    object
dtype: object

## 3. Missing values

In [9]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [10]:
# Subset of df_prods holding only the rows whose product_name is missing
missing_name_mask = df_prods['product_name'].isna()
df_nan = df_prods[missing_name_mask]

In [11]:
# Checking the created subset: every row shown should have a missing (NaN) product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [13]:
# Dealing with missing values:
# Var_1 Create a new variable that acts as a flag for the missing value (see below).
# Var_2 Impute the value with the mean or median of the column (only if the variable is numeric).
# MEAN: look up the mean with df.describe(), THEN df['column with missings'].fillna(mean value, inplace=True)
# MEDIAN: look up the median with df_prods.median(), THEN df['column with missings'].fillna(median value, inplace=True)
# Var_3 Remove the rows (overwrites the dataframe!):
# df_prods.dropna(inplace = True) (to remove all missing values, whatever the column)
# df_prods.dropna(subset = ['product_name'], inplace = True) (to remove missing values in a particular column)

In [14]:
# Checking shape of the main dataframe (rows, columns) before the missing values are filtered out
df_prods.shape

(49693, 5)

In [15]:
# Addressing missing values: build a NEW products dataframe that keeps only rows with a product_name
has_name_mask = df_prods['product_name'].notna()
df_prods_clean = df_prods[has_name_mask]

In [16]:
# Checking shape of the NEW products dataframe (result of filtering out the missing values)
df_prods_clean.shape

(49677, 5)

## 4. Duplicates

In [17]:
# Finding duplicates: collect the rows that pandas flags as full-row duplicates into a new subset
duplicate_row_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean[duplicate_row_mask]

In [18]:
# Checking the created subset of duplicate rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [19]:
# Checking shape of the products dataframe before the duplicates are removed
df_prods_clean.shape

(49677, 5)

In [20]:
# Addressing duplicates: build a NEW products dataframe in which each repeated row is kept only once
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep = 'first')

In [21]:
# Checking shape of the NEW products dataframe (result of filtering out the duplicates)
df_prods_clean_no_dups.shape

(49672, 5)

## 5. TASK answers (df_ords_wr check)

In [22]:
# Project root folder; file locations below are built relative to it
path = r'C:\Users\irikh\iCloudDrive\Data analytics\Instacart basket Analysis'

In [23]:
# Load the wrangled orders data from the Prepared Data folder
orders_wrangled_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords_wrangled = pd.read_csv(orders_wrangled_csv, index_col = False)

In [24]:
# 2 Descriptive statistics of df_ords_wrangled
df_ords_wrangled.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [None]:
# Investigating the describe() table:
# "days_since_last_order" ranges from 0 to 30 (days) with a median (50%) of 7, so the quartiles are consistent with the min and max.
# "order_number" ranges from 1 to 100 with a 25% value of 5, which is also consistent.
# The count of "days_since_last_order" (3214874) is lower than that of the other columns (3421083), which points to missing values.

In [25]:
# 3 Check for mixed types
# A column is reported when its values do not all share one Python type.
# Series.map(type) is used because DataFrame.applymap is deprecated in recent pandas releases;
# counting the distinct types also works on an empty dataframe, where reading row 0 would fail.
for col in df_ords_wrangled.columns.tolist():
  value_types = df_ords_wrangled[col].map(type)
  if value_types.nunique() > 1:
    print (col)

In [26]:
# 4 There are no mixed data types

In [27]:
# 5 Count the missing values in each column
df_ords_wrangled.isna().sum()

Unnamed: 0                    0
order_id                      0
user_id                       0
order_number                  0
orders_day_of_week            0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

In [28]:
# Extra check: could the missing values stand for "less than 1 day since last order"?
# "0" indicates less than 1 day since last order, and NaN is counted as its own separate value.
days_since_last = df_ords_wrangled['days_since_last_order']
days_since_last.value_counts(dropna = False)

days_since_last_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [29]:
# Extra check for the explanation of the missing values (first order of a new user):
# pick the orders of a few users to see whether order_number 1 lines up with the NaN in days_since_last_order.
sample_user_ids = [1, 52, 20]
df_users = df_ords_wrangled.loc[df_ords_wrangled['user_id'].isin(sample_user_ids)]

In [30]:
# Display the orders of the selected users; in the rows shown, the NaN sits on order_number 1
df_users

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0
5,5,3367565,1,6,2,7,19.0
6,6,550135,1,7,1,9,20.0
7,7,3108588,1,8,1,14,14.0
8,8,2295261,1,9,1,16,0.0
9,9,2550362,1,10,4,8,30.0


In [31]:
# Extra checking for explanation of missing values (first order of new client).
# The last rows show the highest user_id, 206209, which matches the 206209 NaN (missing) values.
# NOTE(review): the highest user_id only equals the number of users if the ids have no gaps;
# df_ords_wrangled['user_id'].nunique() would confirm the total directly.
df_ords_wrangled.tail()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
3421078,3421078,2266710,206209,10,5,18,29.0
3421079,3421079,1854736,206209,11,4,10,30.0
3421080,3421080,626363,206209,12,1,12,18.0
3421081,3421081,2977660,206209,13,1,12,7.0
3421082,3421082,272231,206209,14,6,14,30.0


### There are 206209 missing values in the column "days_since_last_order". They can be explained as the first order of each user, which has no previous order to measure from.

In [32]:
# 6 Address the missing values. 

### I would not change these values. Deleting the missing values or imputing other values would skew some results of the analysis.

In [33]:
# 7 Creating a new subset containing only rows that are duplicates
# 'Unnamed: 0' appears to be a leftover row counter (it matches the row index in the outputs above),
# so it is left out of the comparison; with it included, no row could ever be flagged as a duplicate.
cols_to_compare = [col for col in df_ords_wrangled.columns if col != 'Unnamed: 0']
df_ords_dups = df_ords_wrangled[df_ords_wrangled.duplicated(subset = cols_to_compare)]
# Show the size of the subset: 0 rows means there are no duplicates
df_ords_dups.shape

In [34]:
# 8 Address the duplicates.
# No duplicates

In [35]:
# 9 Exporting df_ords_wrangled as orders_clean.csv
# index = False keeps the dataframe index out of the file, so the export does not gain another unnamed index column
df_ords_wrangled.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_clean.csv'), index = False)

In [36]:
# 10 Exporting df_prods_clean_no_dups as prods_clean.csv
# index = False keeps the dataframe index out of the file, so no unnamed index column appears when the file is read back
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'prods_clean.csv'), index = False)