# This script contains the following:
- Import libraries
- Load datasets
- Checking mixed types on test dataframe
- Data consistency check on Products
- Exporting new product dataframe
- Data consistency check on Orders
- Export

# Import libraries

In [8]:
import pandas as pd
import numpy as np
import os

# Load datasets

In [9]:
# Project data folder and the names of the two CSV files to load
path = r'/Users/macbook/Dropbox/Mac/Documents/Pro/Data Analyst/Course_Career foundry/A4_Python/2023.08_Instacart basket analysis/02_data'
orders_filename, products_filename = 'orders_wrangled.csv', 'products.csv'

# Orders live in the 'prepared data' subfolder, products in 'original data'
orders_file_path = os.path.join(path, 'prepared data', orders_filename)
products_file_path = os.path.join(path, 'original data', products_filename)

# Read both CSVs the same way; index_col=False stops pandas from using the
# first column of the file as the index
df_ords, df_prods = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (orders_file_path, products_file_path)
)

# Descriptive stats for orders & products dataframes

In [10]:
# Summary statistics of the orders dataframe, rounded to 2 decimals
df_ords.describe().round(2)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,number_order_client,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.21,17.15,2.78,13.45,11.11
std,987581.74,987581.74,59533.72,17.73,2.05,4.23,9.21
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565811.5,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [11]:
# Summary statistics of the products dataframe, rounded to 2 decimals
df_prods.describe().round(2)

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.35,67.77,11.73,9.99
std,14343.72,38.32,5.85,453.52
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


### There is an outlier with a price of 99999.00. Let's check which products are affected.

In [12]:
# Median of the product prices
prices_median = df_prods.loc[:, 'prices'].median()
prices_median

7.1

In [13]:
# Show the products whose price equals the outlier value 99999.00
is_outlier_price = df_prods['prices'].eq(99999.00)
df_prods[is_outlier_price]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


### NOTE: 2 % Reduced Fat Milk is the outlier. This needs to be addressed.

# Checking mixed types on Test dataframe

In [14]:
# Build a small test dataframe with a single column, 'mix', that
# deliberately holds values of different types (str, int and bool)
df_test = pd.DataFrame({'mix': ['a', 'b', 1, True]})

# Preview the first rows
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [15]:
# Check for mixed types: print the name of every column of df_test whose
# cells do not all share the same Python type
for col in df_test.columns.tolist():
    # Series.map(type) gives the Python type of each cell. More than one
    # distinct type means the column is mixed. This avoids the deprecated
    # DataFrame.applymap and does not fail when the dataframe has no rows.
    if df_test[col].map(type).nunique() > 1:
        print(col)

mix


# Data consistency check on Products dataframe

## Finding missing values

In [16]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [17]:
# Rows of df_prods whose product_name is missing.
# isnull() already returns a boolean mask, so no comparison with True is needed.
df_nan = df_prods[df_prods['product_name'].isnull()]
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [18]:
# Show the dimensions of df_prods as (number of rows, number of columns)
df_prods.shape

(49693, 5)

In [19]:
# Keep only the rows of df_prods that have a product_name.
# notna() is the direct boolean mask for "not missing", replacing the
# `isnull() == False` comparison.
df_prods_clean = df_prods[df_prods['product_name'].notna()]

In [20]:
# Confirm that the rows with a missing product_name were excluded
df_prods_clean.shape # the 16 rows with a NaN product_name are no longer in the df

(49677, 5)

## Check for duplicates

In [21]:
# Collect every row of df_prods_clean that repeats an earlier row in full
repeated_rows = df_prods_clean.duplicated()
df_dups = df_prods_clean.loc[repeated_rows]
df_dups


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [22]:
# Row and column count of df_prods_clean before the duplicates are dropped
df_prods_clean.shape

(49677, 5)

In [23]:
# Keep only the first occurrence of each fully duplicated row
df_prods_clean_no_dups = df_prods_clean[~df_prods_clean.duplicated()]

# Confirm the result: 5 records have been dropped
df_prods_clean_no_dups.shape

(49672, 5)

## Exporting consistency checked Product dataframe

In [24]:
# Export the cleaned, de-duplicated products dataframe to the prepared data folder
# NOTE(review): to_csv writes the dataframe index as an extra unnamed first
# column by default; pass index=False if downstream steps do not need it.
products_checked_path = os.path.join(path, 'prepared data', 'products_checked.csv')
df_prods_clean_no_dups.to_csv(products_checked_path)

# Data consistency check on Orders dataframe

In [25]:
# Summary statistics of the orders dataframe, rounded to whole numbers
round(df_ords.describe())

Unnamed: 0.1,Unnamed: 0,order_id,user_id,number_order_client,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.0,17.0,3.0,13.0,11.0
std,987582.0,987582.0,59534.0,18.0,2.0,4.0,9.0
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.0,855272.0,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


### NOTE: I don't see any weird data pattern here.


## Checking mixed types

In [26]:
# Print the column names, their dtypes and the memory usage of df_ords
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   order_id                int64  
 2   user_id                 int64  
 3   number_order_client     int64  
 4   orders_day_of_week      int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(6)
memory usage: 182.7 MB


In [27]:
# Check for mixed types: print the name of every column of df_ords whose
# cells do not all share the same Python type
for col in df_ords.columns.tolist():
    # Series.map(type) gives the Python type of each cell. More than one
    # distinct type means the column is mixed. This avoids the deprecated
    # DataFrame.applymap and does not fail when the dataframe has no rows.
    if df_ords[col].map(type).nunique() > 1:
        print(col)

### NOTE: no mixed types 

## Check for missing values

In [28]:
# Count the missing values in each column of df_ords
df_ords.isna().sum()


Unnamed: 0                     0
order_id                       0
user_id                        0
number_order_client            0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

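### NOTE: The missing values are all in days_since_prior_order, the number of days since the client's previous order. They most likely belong to each client's first order, for which there is no prior order: the number of missing values (206209) matches the highest user_id in the descriptive statistics above.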
 

In [29]:
# Show the dimensions of df_ords as (number of rows, number of columns)
df_ords.shape

(3421083, 7)

### NOTE: What to do with missing values?
There is no reason to delete or impute these values. We could flag them with a label such as "unique_order_client", or leave them as they are and explain them in the data dictionary.
Below I also calculate their percentage of the total number of orders.

### Calculating the percentage of missing values

In [30]:
# Number of missing values in days_since_prior_order, counted from the data
# instead of being copied by hand from an earlier output
missing_values = int(df_ords['days_since_prior_order'].isnull().sum())

# Total number of rows in df_ords
total_rows = len(df_ords)

# Percentage of missing values, rounded to the nearest whole number
percent_missing_values = round(missing_values / total_rows * 100)
percent_missing_values

6

### About 6% of the values in days_since_prior_order are missing.

## Duplicates

In [31]:
# Collect every row of df_ords that repeats an earlier row in full
duplicate_rows = df_ords.duplicated()
df_dups = df_ords.loc[duplicate_rows]
df_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,number_order_client,orders_day_of_week,order_hour_of_day,days_since_prior_order


### NOTE: no duplicates

# Export

In [32]:
# Write the consistency-checked orders dataframe to the prepared data folder
# NOTE(review): df_ords already carries an 'Unnamed: 0' column (see the info()
# output), and to_csv writes the index as another unnamed column by default;
# consider index=False if the extra column is not wanted downstream.
orders_checked_path = os.path.join(path, 'prepared data', 'orders_checked.csv')
df_ords.to_csv(orders_checked_path)
