# 01. Importing libraries

In [80]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing and analyzing data

In [81]:
# Define path
# Project root folder; every data file read or written below is joined onto this
# NOTE(review): absolute, machine-specific path - adjust when running on another computer
path = r'/Users/elisabetta/Documents/12-2022 Instacart Basket Analysis'

In [82]:
# Import products.csv and orders_wrangled.csv
# products.csv comes from 'Original Data', orders_wrangled.csv from 'Prepared Data'
# index_col=False tells pandas not to use the first column as the row index
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

In [83]:
# Create test dataframe
# Starts empty; a column is added to it in the next cell
df_test = pd.DataFrame()

In [84]:
# Create mixed type column
# The values deliberately combine str ('a', 'b'), int (1) and bool (True) in a single column
df_test['mix'] = ['a', 'b', 1, True]

In [85]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [86]:
# Check for mixed type columns
# A column is reported when its values hold more than one Python type
# Series.map(type) is used because DataFrame.applymap is deprecated since pandas 2.1;
# counting distinct types also avoids the .iloc[0] lookup, which fails on an empty dataframe
for col in df_test.columns:
    if df_test[col].map(type).nunique() > 1:
        print(col)

mix


In [87]:
# Update column type
# df_test['mix'] = df_test['mix'].astype('str')

In [88]:
# Find missing values (check for nulls) in the whole dataframe
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [89]:
# Create subset to see only missing values
# isnull() already returns a boolean mask, so it can be used directly as the row filter
df_nan = df_prods[df_prods['product_name'].isnull()]

In [90]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [91]:
# Impute using mean
# Find it using df.describe() and then use:
# df['column with missings'].fillna(mean value, inplace=True)

In [92]:
# Impute using median
# Find it using df_prods.median() and then use:
# df['column with missings'].fillna(median value, inplace=True)

In [93]:
# Compare total rows and rows in subset
df_prods.shape

(49693, 5)

In [94]:
# Create new cleaned dataframe
# Keep only the rows whose product_name is present; notnull() is the direct inverse of isnull()
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [95]:
# Check shape of new dataframe
df_prods_clean.shape

(49677, 5)

In [96]:
# Alternative command to drop ALL rows with missing values. THIS OVERWRITES THE DATAFRAME because of inplace = True
# Without inplace or with inplace = False the command returns a new dataframe and the original dataframe is not modified
# df_prods.dropna(inplace = True)

# Alternative command to drop rows with missing values IN ONE COLUMN. THIS OVERWRITES THE DATAFRAME because of inplace = True
# Without inplace or with inplace = False the command returns a new dataframe and the original dataframe is not modified
# df_prods.dropna(subset = ['product_name'], inplace = True)

In [97]:
# Look for duplicates in dataframe
# Only FULL DUPLICATE ROWS are flagged (all columns equal) and they are collected in a new dataframe
# keep='first' is the pandas default: the first occurrence is not flagged, only its repeats
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [98]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [99]:
# Create new dataframe excluding duplicates
# Defaults spelled out: compare on all columns (subset=None) and keep the first occurrence of each row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(subset=None, keep='first')

In [100]:
df_prods_clean_no_dups.shape

(49672, 5)

In [101]:
# Export new dataframe to csv file
# index=True is the pandas default, spelled out here: the row index is written as an unnamed first column
# NOTE(review): that column comes back as 'Unnamed: 0' on re-import - consider index=False if nothing downstream needs it
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=True,
)

# 03. Task 4.5 starts here

In [102]:
# STEP 2 run df.describe() on df_prods and comment
# max product_id does not match the count product_id but it's alright since we dropped a bunch of rows
# max prices seems a bit off as 99999 seems quite high a price for a supermarket business
# The other stats look fine
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [103]:
# STEP 2 CONTINUED
# Investigating prices value 99999
# Check what products have a price higher than 50, which is already a lot for a supermarket
# Only two products are returned (prices 14900.0 and 99999.0). Most probably typos
# Check with the client before dropping rows/editing values
df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] > 50]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [104]:
# STEP 3 Check for mixed-type data within df_ords
# No mixed-type columns found
# A column is reported when its values hold more than one Python type
# Series.map(type) is used because DataFrame.applymap is deprecated since pandas 2.1;
# counting distinct types also avoids the .iloc[0] lookup, which fails on an empty dataframe
for col in df_ords.columns:
    if df_ords[col].map(type).nunique() > 1:
        print(col)

In [105]:
# Check for mixed-type data within df_prods_clean_no_dups
# No mixed-type columns found
# A column is reported when its values hold more than one Python type
# Series.map(type) is used because DataFrame.applymap is deprecated since pandas 2.1;
# counting distinct types also avoids the .iloc[0] lookup, which fails on an empty dataframe
for col in df_prods_clean_no_dups.columns:
    if df_prods_clean_no_dups[col].map(type).nunique() > 1:
        print(col)

In [106]:
# STEP 5 Find missing values (check for nulls) within df_ords
# No nulls except for column days_since_prior_order
# All those 206209 orders are most probably first orders from new customers. There is no prior order to count days from
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [107]:
# STEP 6 The missing values in column days_since_prior_order are correct. No action needed

In [108]:
# STEP 7 Check for duplicate values within df_ords
# Rows that fully repeat an earlier row are collected; keep='first' (the default) leaves the first occurrence unflagged
df_ords_dups = df_ords.loc[df_ords.duplicated(keep='first')]

In [109]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


In [110]:
df_ords_dups.shape

(0, 7)

In [111]:
# STEP 7 CONTINUED
# As we can see from the lines above, there are no duplicate rows within df_ords