# 4.5 Instacart data consistency

# This script contains the following points:

## 1. Import libraries

## 2. Import 'products.csv' dataframe

## 3. Import 'orders_wrangled.csv' dataframe

## 4. Data quality measures

## 5. Create a dataframe

## 6. Create a mixed type column

## 7. Check for mixed-type columns using a for loop

## 8. Convert mixed datatype to 'str'

## 9. Finding missing values in 'df_prods' dataframe

## 10. Find missing values in 'product_name' column

## 11. Overwriting original dataframe and removing missing data (risky!)

## 12. Find full duplicates (multiple rows)

## 13. Verify current number of rows in df_prods_clean

## 14. Create new dataframe that doesn't include duplicate rows

## 15. Verify that number of rows has decreased

## 16. Change dataframe name and export cleaned dataframe to 'Prepared_Data' folder as 'products_checked.csv'

## 17. 4.5 Tasks. Task 2 - evaluate data quality on the 'products_checked' dataframe

## 18. Since 'product_id', 'aisle_id', and 'department_id' are descriptive values, they should be converted to 'str'

## 19. Task 3 - check for mixed-data in df_ords dataframe

## 20. Task 5 - check for missing values in 'df_ords' dataframe

## 21. Create a new dataframe with new column 'first_time_order'

## 22. Task 7 - Check for duplicate values in 'df_ords' dataframe

## 23. Export new dataframe as orders_checked.csv file to 'Prepared_Data' folder.


## 1. Import libraries

In [1]:
# import libraries

import pandas as pd
import numpy as np
import os

## 2. Import products.csv dataframe

In [2]:
# Root folder of the Instacart project; the read_csv / to_csv calls in this
# script build their file locations from it with os.path.join.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.

path = r'C:\Users\howl6\OneDrive\Certificates\CareerFoundry\Coursework\Data_Immersion\Chapter 4\Instacart Basket Analysis'

In [127]:
# Load the original products table into df_prods.
# index_col=False tells pandas not to use the first column as the index.

products_file = os.path.join(path, '02_Data', 'Original_Data', '4.3_orders_products', 'products.csv')
df_prods = pd.read_csv(products_file, index_col=False)

## 3. Import orders_wrangled.csv dataframe

In [4]:
# Load the previously wrangled orders table into df_ords.
# index_col=False tells pandas not to use the first column as the index.

orders_file = os.path.join(path, '02_Data', 'Prepared_Data', 'CSV_files', 'orders_wrangled.csv')
df_ords = pd.read_csv(orders_file, index_col=False)

In [5]:
# Show the (rows, columns) shape of df_ords as a quick check on the import

df_ords.shape

(3421083, 7)

## 4. Data quality measures

In [133]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of df_prods

df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


## 5. Create a dataframe

In [10]:
# Create an empty DataFrame, df_test, to experiment on

df_test = pd.DataFrame()

## 6. Create a mixed type column

In [11]:
# Add a 'mix' column to df_test that deliberately holds str, int and bool values

mixed_values = ['a', 'b', 1, True]
df_test['mix'] = mixed_values

In [12]:
# Preview df_test (head() returns at most the first 5 rows)

df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


## 7. Check for mixed-type columns using a for loop

In [13]:
# Print the name of every df_test column whose values are not all of one
# Python type. Each value's type is compared with the type of the column's
# first value. Series.map is used because DataFrame.applymap is deprecated
# (FutureWarning since pandas 2.1).

for col in df_test.columns.tolist():
  first_type = type(df_test[col].iloc[0])
  weird = df_test[col].map(type) != first_type
  if weird.any():
    print (col)

mix


## 8. Convert mixed datatype to 'str'

In [14]:
# Cast the 'mix' column to strings so that every value shares one type

df_test['mix'] = df_test['mix'].astype('str')

## 9. Finding missing values in 'df_prods' dataframe

In [59]:
# Count the missing values in each column of df_prods

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

## 10. Find missing values in 'product_name' column

In [130]:
# Collect the rows of df_prods whose 'product_name' is missing into df_nan

name_missing = df_prods['product_name'].isnull()
df_nan = df_prods[name_missing]

In [61]:
# Display the rows that have no product_name

df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Compare total number of rows

In [62]:
# Show the (rows, columns) shape of df_prods before the missing rows are removed

df_prods.shape

(49693, 5)

### To see if the number of rows has decreased

In [63]:
# Build df_prods_clean from the rows of df_prods that do have a product_name

name_missing = df_prods['product_name'].isnull()
df_prods_clean = df_prods[~name_missing]

In [64]:
# Show the (rows, columns) shape of df_prods_clean to compare with df_prods

df_prods_clean.shape

(49677, 5)

## 11. Overwriting original dataframe and removing missing data (risky!)

In [65]:
# Remove the rows with a missing product_name from df_prods itself.
# dropna has to be called on the DataFrame (restricted with subset=...):
# calling it on the selected column with inplace=True does not remove any
# rows from df_prods.
df_prods.dropna(subset = ['product_name'], inplace = True)

## 12. Find full duplicates (multiple rows)

In [66]:
# Collect fully duplicated rows of df_prods_clean into df_dups
# (duplicated() marks every repeat after the first occurrence)

repeat_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean[repeat_mask]

In [67]:
# Display the duplicated rows

df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


## 13. Verify current number of rows in df_prods_clean

In [68]:
# Show the (rows, columns) shape of df_prods_clean before duplicates are dropped

df_prods_clean.shape

(49677, 5)

## 14. Create new dataframe that doesn't include duplicate rows

In [69]:
# Build df_prods_clean_no_dups by removing fully duplicated rows,
# keeping the first occurrence of each

df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep = 'first')

## 15. Verify that number of rows has decreased

In [71]:
# Show the (rows, columns) shape after dropping duplicates

df_prods_clean_no_dups.shape

(49672, 5)

In [72]:
# Preview the first 5 rows (the head() default)

df_prods_clean_no_dups.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


## 16. Change dataframe name and export cleaned dataframe to 'Prepared_Data' folder as products_checked.csv

In [74]:
# Rename the cleaned dataframe to products_checked and export it.
# The later cells refer to products_checked, so it must be defined here.
# .copy() makes it an independent frame, so the later column conversions
# do not touch df_prods_clean_no_dups.
# NOTE: to_csv is called without index=False, so the index is written as an
# extra first column in the CSV.

products_checked = df_prods_clean_no_dups.copy()
products_checked.to_csv(os.path.join(path, '02_Data','Prepared_Data', 'products_checked.csv'))

## 17. 4.5 Tasks. Task 2 - evaluate data quality on the 'products_checked' dataframe

In [75]:
# Descriptive statistics for the numeric columns of products_checked

products_checked.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


 ## 18. Since 'product_id', 'aisle_id', and 'department_id' are descriptive values, they should be converted to 'str'

In [None]:
# Convert 'product_id' to str: it is an identifier, not a quantity

products_checked['product_id'] = products_checked['product_id'].astype('str')

In [None]:
# Convert 'aisle_id' to str: it is an identifier, not a quantity

products_checked['aisle_id'] = products_checked['aisle_id'].astype('str')

In [None]:
# Convert 'department_id' to str: it is an identifier, not a quantity

products_checked['department_id'] = products_checked['department_id'].astype('str')

In [105]:
# Verify that the three id columns are no longer numeric dtypes

products_checked.dtypes

product_id        object
product_name      object
aisle_id          object
department_id     object
prices           float64
dtype: object

In [106]:
# Re-run descriptive statistics; with the id columns converted to str,
# only 'prices' is summarised

products_checked.describe()

Unnamed: 0,prices
count,49672.0
mean,9.993282
std,453.615536
min,1.0
25%,4.1
50%,7.1
75%,11.1
max,99999.0


In [104]:
# Display products_checked

products_checked

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [107]:
# Verify that no column of products_checked contains missing values

products_checked.isnull().sum()

product_id       0
product_name     0
aisle_id         0
department_id    0
prices           0
dtype: int64

### The max value for 'prices' looks like it may be too high and warrants further investigation.

## 19. Task 3 - check for mixed-data in df_ords dataframe

In [108]:
# Print the name of every df_ords column whose values are not all of one
# Python type. Each value's type is compared with the type of the column's
# first value. Series.map is used because DataFrame.applymap is deprecated
# (FutureWarning since pandas 2.1).

for col in df_ords.columns.tolist():
  first_type = type(df_ords[col].iloc[0])
  weird = df_ords[col].map(type) != first_type
  if weird.any():
    print (col)

### Running the for loop did not identify any mixed-type data in the 'df_ords' columns.

## 20. Task 5 - check for missing values in 'df_ords' dataframe

In [109]:
# Count the missing values in each column of df_ords

df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_time_of_day              0
days_since_prior_order    206209
dtype: int64

In [111]:
# Collect the rows of df_ords with no 'days_since_prior_order' value into df_nan_2

prior_missing = df_ords['days_since_prior_order'].isnull()
df_nan_2 = df_ords[prior_missing]

In [112]:
# Display the orders that have no days_since_prior_order value

df_nan_2

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


### There are 206,209 missing values in the 'days_since_prior_order' column. The rows shown above have order_number 1, which marks a customer's first order, so there is no prior order from which to count days.

### Task 6 - The values should not be eliminated, as they are part of the larger dataframe. Instead, a new boolean column 'first_time_order' can be created: True for a customer's first order, False otherwise.

## 21. Create a new dataframe with new column 'first_time_order'

In [119]:
# Create df_ords_checked as an independent copy of df_ords.
# Plain assignment would only bind a second name to the same object, so
# columns added to df_ords_checked would also appear in df_ords.

df_ords_checked = df_ords.copy()

In [122]:
# Add the boolean column 'first_time_order': True where
# days_since_prior_order is missing, False otherwise

no_prior_order = df_ords['days_since_prior_order'].isnull()
df_ords_checked['first_time_order'] = no_prior_order

In [123]:
# Preview the first 5 rows to check the new 'first_time_order' column

df_ords_checked.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_time_order
0,0,2539329,1,1,2,8,,True
1,1,2398795,1,2,3,7,15.0,False
2,2,473747,1,3,3,12,21.0,False
3,3,2254736,1,4,4,7,29.0,False
4,4,431534,1,5,4,15,28.0,False


 ## 22. Task 7 - Check for duplicate values in 'df_ords' dataframe

In [124]:
# Collect fully duplicated rows of df_ords_checked into df_dups_2
# (duplicated() marks every repeat after the first occurrence)

repeat_mask = df_ords_checked.duplicated()
df_dups_2 = df_ords_checked[repeat_mask]

In [118]:
# Display the duplicated rows (an empty result means there are none)

df_dups_2

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order


### There are no duplicate rows in the 'df_ords' dataframe

## 23. Export new dataframe as orders_checked.csv file to 'Prepared_Data' folder.

In [125]:
# Export df_ords_checked to the 'Prepared_Data' folder as orders_checked.csv.
# NOTE: to_csv is called without index=False, so the index is written as an
# extra first column in the CSV.
orders_checked_file = os.path.join(path, '02_Data','Prepared_Data', 'orders_checked.csv')
df_ords_checked.to_csv(orders_checked_file)