# 01. Consistency checks on Products dataset csv files

## List of Contents
##### 01. Consistency checks on products dataset csv files
- Observation
- Export the file

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Import products dataset csv files as df_prods

df_prods = pd.read_csv(r'C:\Users\IDONG\Original data\products.csv', index_col = False)

In [3]:
# First step is to have an idea of the data

df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [4]:
# At this juncture, there might be no need to run a 'pd_prods.describe()' command because we do not know the exact number of
# aisle_ids, department_ids or even the cost of the highest/lowest products (prices column) to fish out any possible error 

# Now we can proceed with the consistency checks proper. We start with Checking for 'MIXED-TYPE' data

for col in df_prods.columns.tolist():
  weird = (df_prods[[col]].applymap(type) != df_prods[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_prods[weird]) > 0:
    print (col)

product_name


In [5]:
# 'Product name' ougt to contain only string values so we need to correct this.

df_prods['product_name'] = df_prods['product_name'].astype('str')

In [6]:
# Next check will be for 'MISSING VALUES' (and the extent to which it occurs)

df_prods.isnull().sum()

product_id       0
product_name     0
aisle_id         0
department_id    0
prices           0
dtype: int64

In [7]:
# The dataset is certainly perfect in terms of no missing values

# Lastly, we can check for DUPLICATES. For this purpose, a new dataset is created as 'df_prods_dup'

df_prods_dup = df_prods[df_prods.duplicated()]

In [8]:
df_prods_dup

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [9]:
# 5 rows have be identified to constitute duplicates. To effect a clean up, we use the command below

df_prods_no_dup = df_prods.drop_duplicates()

In [10]:
# To ascertain the duplicates were accurately removed, we can do a comaprison between the the number of rows in 
# 'df_prods' & 'df_prods_no_dup'

df_prods.shape

(49693, 5)

In [11]:
df_prods_no_dup.shape

(49688, 5)

In [12]:
# Difference between 49693 and 49688 is 5, which proves the duplicates were actually removed.

# Finally as per instruction, we do 'df_prods_no_dup.describe()' to determine min and max values

df_prods_no_dup.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49688.0,49688.0,49688.0,49688.0
mean,24844.50004,67.769582,11.728687,9.994254
std,14343.834402,38.316162,5.85041,453.542503
min,1.0,1.0,1.0,1.0
25%,12422.75,35.0,7.0,4.1
50%,24844.5,69.0,13.0,7.1
75%,37266.25,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


## Observation

In [14]:
# While nothing looks unusal, the maximum value for prices (of $99,999) looks a bit strange. The items in stock do not involve
# heavy machinery goods or cost of housing property, so there is something definitely wrong here.
# N.B.-The maximum value in product_id matches with total number of rows (from df_prods_no_dup.shape) so this is fine

## Export the file

In [15]:
# Define the path
path = path = r'C:\Users\IDONG'

In [17]:
# Exporting and saving in Prepared data folder as products_checked

df_prods_no_dup.to_csv(os.path.join(path, 'Prepared Data', 'products_checked.csv'))