### Exercise 01. What are Data Consistency Checks

In [83]:
# Import libraries 

In [84]:
import pandas as pd
import numpy as np
import os

In [85]:
# Set path

In [86]:
# Project root folder. Defaults to the original author's location; set the
# ACHIEVEMENT4_PATH environment variable to run the notebook on another machine
# without editing this cell.
path = os.environ.get('ACHIEVEMENT4_PATH', r'C:\Users\lance\Documents\Achievement 4 Project')

In [87]:
# Import and define dataframes ('products.csv' and 'orders_wrangled.csv')

In [88]:
# Load the raw product catalogue from the 'Original Data' folder.
# index_col=False tells pandas not to use the first column as the index.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [89]:
# Load the previously wrangled orders from the 'Prepared Data' folder.
# index_col=False tells pandas not to use the first column as the index.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

In [90]:
# Begin data consistency check by reviewing descriptive statistics

In [91]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [92]:
# Remove the stray 'Unnamed: 0' column (presumably a row index saved by an earlier export).
# Re-binding the result with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError once the column is gone.
df_ords = df_ords.drop(columns=['Unnamed: 0'], errors='ignore')

In [93]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [94]:
# Carefully analyzing your data and knowing what the columns are supposed to contain can help greatly when it comes to verifying your data’s consistency

### Exercise 02. Mixed-Data Type

In [95]:
# The current dataframes don't have any mixed-type columns so create one to work with

In [96]:
# Create a dataframe
df_test = pd.DataFrame()

In [97]:
# Create a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [98]:
# the first command 'df_test = pd.DataFrame()' creates a new dataframe called df_test. The second creates a new column with mixed-data

In [99]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [100]:
# Check whether a dataframe contains any mixed-type columns.
# A column is mixed when its cells hold more than one Python type, which is the
# same condition as "some cell's type differs from the first cell's type".
# Series.map is used so the check does not depend on DataFrame.map (pandas >= 2.1),
# and an empty dataframe simply prints nothing instead of failing on .iloc[0].
for col in df_test.columns.tolist():
    if df_test[col].map(type).nunique() > 1:
        print(col)

mix


In [101]:
# Note that the original '.applymap' method from the exercise has been deprecated and '.map' should be used instead

In [102]:
# Once you determine that mixed-type data is present, you must decide what single data type the column should be

In [103]:
# Once the target type has been chosen, cast the whole column to it (here: text)
df_test['mix'] = df_test['mix'].astype(str)

In [104]:
# Sometimes the values will be either strings/objects or int64/floats

### Exercise 03. Missing Values

In [105]:
# Missing data can be attributed to two reasons: data corruption or the data was never recorded in the first place

In [106]:
# Two functions are useful here: 'isnull()', which is used to find missing observations and returns a True or False value for each cell,
# and 'sum()'. True values can be interpreted numerically as 1, so summing the result of 'isnull()' returns the total number of missing values per column
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [107]:
# The columns in df_prods are listed on the left, while the number of missing values in each column is on the right

In [108]:
# To view the 16 missing values, create a subset of the dataframe containing only the values in question

In [109]:
# Create a new dataframe, df_nan, containing only the rows whose 'product_name' value is missing
df_nan = df_prods.loc[df_prods['product_name'].isnull()]

In [110]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Exercise 04. Addressing Missing Values

In [111]:
# There are several ways to deal with missing data:
# 1. Create a new variable that acts like a flag based on the missing value
# 2. Impute the value with the mean or median of the column (if the variable is numeric)
# 3. Remove or filter out the missing data

In [112]:
# If you choose to impute using the mean, use the following code to replace the missing values:
# df['column with missings'].fillna(mean value, inplace=True)
# If you choose to impute using the median, use the following code to replace the missing values:
# df['column with missings'].fillna(median value, inplace=True)

In [113]:
# Looking at df_nan it's clear imputation isn't an option because the data-type is a string
# You can either remove the missing values entirely or filter the non-missing values into a subset dataframe

In [114]:
# Compare the number of rows in the current dataframe with the number in your subset once the missing rows are removed
df_prods.shape

(49693, 5)

In [115]:
# Create a new dataframe that keeps only the rows where 'product_name' is present (not missing)
df_prods_clean = df_prods.loc[df_prods['product_name'].notnull()]

In [116]:
# Check the number of rows again to make sure it decreased
df_prods_clean.shape

(49677, 5)

In [117]:
# An alternative method to drop all missing values is via the following command
df_prods.dropna(inplace = True)

In [118]:
# To drop only the rows with NaNs in a particular column, use the following command
# Note: once the unrestricted dropna() in the previous cell has run, df_prods has no rows with missing values left, so this call removes nothing here
df_prods.dropna(subset = ['product_name'], inplace = True)

In [119]:
# In these cases, rather than creating a new dataframe, you're overwriting df_prods with a new version of df_prods that doesn't contain missing values
# This is done using the 'inplace = True' argument, which modifies the dataframe directly
# If you don't specify an 'inplace' argument, the default is False, which returns a new dataframe with the changes and leaves the original untouched

### Exercise 05. Duplicates

In [120]:
# For this project, look for full duplicates (multiple rows that have the same exact values in every column) 
# Single duplicates aren't inconsistencies in your data

In [121]:
# Collect the full duplicates: rows whose values in every column repeat an earlier row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [122]:
# This code creates a new subset dataframe only containing rows that are duplicates

In [123]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


### Exercise 06. Addressing Duplicates

In [124]:
# Once the duplicates are identified, they must be removed using 'df.drop_duplicates()'

In [125]:
# Before make sure to compare row counts between df_prods_clean before and after duplicate removal
df_prods_clean.shape

(49677, 5)

In [126]:
# Now create a new dataframe that doesn't include duplicates 
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [127]:
df_prods_clean_no_dups.shape

(49672, 5)

### Exercise 07. Tidying Up and Exporting Changes

In [46]:
# Export the de-duplicated products table to the 'Prepared Data' folder.
# index=False keeps the row index out of the file; otherwise it comes back as an
# extra 'Unnamed: 0' column the next time the CSV is read.
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=False,
)

### Task 02. Run the df.describe() function on your df_ords dataframe. Interpret the output of this function

In [47]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [72]:
# Let's look at the values for 'order_number', 'orders_day_of_week', and 'days_since_prior_order'.
# 'order_number' has a maximum value of 100, which seems very high depending on the time frame
# 'orders_day_of_week' has a minimum of '0' and a maximum of '6', suggesting the days are zero-indexed (0 for Monday or Sunday)
# 'days_since_prior_order' has a maximum value of 30, which may suggest the value is capped at one month
# Now, let's look at zero values for 'days_since_prior_order'
# The minimum value for 'days_since_prior_order' is 0, suggesting multiple orders on the same day or a placeholder for customers' first orders
# The count is also lower compared to the other columns, suggesting missing data
# Finally, 'order_id', 'user_id', and 'order_number' should be considered string values instead of numerical values.

### Task 03. Check for mixed-type data in your df_ords dataframe

In [49]:
# Print the name of every column in df_ords that holds more than one Python type.
# Series.map is used so the check does not depend on DataFrame.map (pandas >= 2.1),
# and an empty dataframe simply prints nothing instead of failing on .iloc[0].
for col in df_ords.columns.tolist():
    if df_ords[col].map(type).nunique() > 1:
        print(col)

In [50]:
# There's no mixed data present in the df_ords dataframe
df_ords.dtypes

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

### Task 05. Run a check for missing values in your df_ords dataframe

In [51]:
df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [52]:
# The 'days_since_prior_order' column is missing 206209 values
# An explanation for this quantity of missing values could be that customers' first orders don't have a value
# However, this is relevant information that should be preserved instead of removed

### Task 06. Address the missing values using an appropriate method

In [53]:
# To decide how to handle the missing values, first isolate the rows where 'days_since_prior_order' is missing
df_ords_nan = df_ords.loc[df_ords['days_since_prior_order'].isnull()]

In [54]:
df_ords_nan

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [55]:
# Now we must verify whether all 'order_number' values equal to 1 are missing values in the 'days_since_prior_order' column

In [56]:
# Filter the full orders dataframe where 'order_number' equals 1.
# 'order_number' is int64, so it must be compared with the integer 1: comparing
# with the string '1' matches no rows, and .all() on an empty selection is
# vacuously True. Filtering df_ords (rather than df_ords_nan, which only holds
# rows already known to be missing) makes the following check meaningful.
df_first_orders = df_ords[df_ords['order_number'] == 1]

In [57]:
# Check if all of these have missing values in 'days_since_prior_order'
df_first_orders_missing = df_first_orders['days_since_prior_order'].isna().all()

In [58]:
# Print the result
print("All first orders have missing 'days_since_prior_order':", df_first_orders_missing) 

All first orders have missing 'days_since_prior_order': True


In [59]:
# We know for certain that every row with 'order_number' equal to 1 has a missing value in the'days_since_prior_order' column

In [60]:
# Address the missing values by creating a True/False flag column
# The flag is True where 'days_since_prior_order' is missing, which the notebook treats as marking a customer's first order ('order_number' = 1)
df_ords['first_order'] = df_ords['days_since_prior_order'].isna()

In [61]:
df_ords

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0,False
3421079,1854736,206209,11,4,10,30.0,False
3421080,626363,206209,12,1,12,18.0,False
3421081,2977660,206209,13,1,12,7.0,False


In [62]:
# Check data-type of new column and update if necessary
df_ords['first_order'].dtype

dtype('bool')

In [63]:
# Re-check data to make sure totals are accurate
df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
first_order                    0
dtype: int64

In [64]:
# Missing values have been addressed appropriately; now the client can see why there are missing values in the 'days_since_prior_order' column
# Removing/filtering these values would've been incorrect because the missing data is quite valuable

### Task 07. Run a check for duplicate values in your df_ords data

In [65]:
df_ords_dup = df_ords[df_ords.duplicated()]

In [66]:
df_ords_dup

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order


In [67]:
# The dataframe created to check duplicates has returned empty, meaning there are no duplicates in the data set

### Task 08. Run the df.describe() function on your df_prods dataframe. Interpret the output of this function and address any errors

In [79]:
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [None]:
# Let's look at the maximum value in the 'prices' column, which is unusually high: 99999.
# It's likely that such an extreme outlier is either a placeholder or an error
# The 'product_id' column has a maximum value of 49688 but there are only 49672 rows, so some ids in that range are absent; there may also be errors and/or duplicate values

In [70]:
# Export the cleaned orders table (the same export is repeated at the end of the notebook).
# index=False keeps the row index out of the file; otherwise it comes back as an
# extra 'Unnamed: 0' column the next time the CSV is read.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_cleaned.csv'),
    index=False,
)

In [132]:
# Check number of unique product ids
df_unique_product_ids = df_prods_clean_no_dups['product_id'].nunique()

In [133]:
print(f"Number of unique product IDs: {df_unique_product_ids}")

Number of unique product IDs: 49670


In [134]:
# Check if product_id values are sequential and without gaps
df_all_ids = set(range(1, df_prods_clean_no_dups['product_id'].max() + 1))

In [137]:
# Create a set of actual ids in the dataframe
df_actual_ids = set(df_prods_clean_no_dups['product_id'].unique())

In [138]:
# Find any missing ids
missing_ids = df_all_ids - df_actual_ids

In [139]:
# Report the gaps in ascending order, or confirm that the ids form an unbroken sequence
print(
    f"There are missing product IDs: {sorted(missing_ids)}"
    if missing_ids
    else "No missing product IDs; they are sequential."
)

There are missing product IDs: [34, 69, 116, 262, 525, 1511, 1780, 2240, 2586, 3159, 3230, 3736, 4283, 4790, 6799, 26519, 38183, 40440]


In [None]:
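# These results indicate that some ids are absent from the cleaned data: either the data collection process never captured them, or they were removed
# Note that the first ten ids listed (34 through 3159) match the product_ids of the rows shown in df_nan, i.e. products dropped earlier in this notebook for having no 'product_name'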

In [None]:
# The maximum value of 99999 also needs to be investigated 

In [144]:
outliers = df_prods_clean_no_dups[df_prods_clean_no_dups['prices'] == 99999]

In [145]:
print(outliers)

       product_id           product_name  aisle_id  department_id   prices
33666       33664  2 % Reduced Fat  Milk        84             16  99999.0


In [147]:
# With the outlier located, there are several approaches that can be taken
# In this case, since the price skews the data immensely and it's clear that it's an error and not an expensive product, the row is filtered out
df_prods_extra_clean_no_dups = df_prods_clean_no_dups.loc[
    df_prods_clean_no_dups['prices'].ne(99999)
]

In [148]:
df_prods_extra_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49671.0,49671.0,49671.0,49671.0
mean,24850.172334,67.762115,11.728856,7.980256
std,14340.795118,38.3161,5.850806,66.952504
min,1.0,1.0,1.0,1.0
25%,12432.5,35.0,7.0,4.1
50%,24850.0,69.0,13.0,7.1
75%,37268.5,100.0,17.0,11.1
max,49688.0,134.0,21.0,14900.0


In [149]:
# The maximum is still implausibly high, so there is at least one more outlier.
# Order the rows from most to least expensive so the highest prices can be inspected
df_prods_sorted = df_prods_extra_clean_no_dups.sort_values('prices', ascending=False)

In [150]:
top_priced_products = df_prods_sorted.head()

In [151]:
print(top_priced_products)

       product_id                      product_name  aisle_id  department_id  \
21554       21553  Lowfat 2% Milkfat Cottage Cheese       108             16   
9020         9020  Boneless Skinless Chicken Thighs        35             12   
40490       40486                   Chicken Tenders        49             12   
21468       21467            Wild Caught Raw Shrimp        15             12   
25580       25579     Naturally Smoked Trout Fillet        15             12   

        prices  
21554  14900.0  
9020      25.0  
40490     25.0  
21468     25.0  
25580     25.0  


In [None]:
# Now we know that the product priced 14900 is the only other outlier that we must remove

In [154]:
# Filter out the remaining outlier: the row priced at 14900
df_prods_extra_clean_no_dups_2 = df_prods_extra_clean_no_dups.loc[
    df_prods_extra_clean_no_dups['prices'].ne(14900)
]

In [155]:
df_prods_extra_clean_no_dups_2.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49670.0,49670.0,49670.0,49670.0
mean,24850.238716,67.761305,11.72877,7.680437
std,14340.93185,38.31606,5.850834,4.199381
min,1.0,1.0,1.0,1.0
25%,12432.25,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.75,100.0,17.0,11.1
max,49688.0,134.0,21.0,25.0


In [None]:
# The descriptive statistics now appear more normal and no visible errors are present

### Task 09. Export your final, cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folder

In [156]:
# Export the final products table (missing names, duplicates and price outliers removed).
# index=False keeps the row index out of the file; otherwise it comes back as an
# extra 'Unnamed: 0' column the next time the CSV is read.
df_prods_extra_clean_no_dups_2.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_rechecked.csv'),
    index=False,
)

In [158]:
# Export the final orders table, including the 'first_order' flag column.
# index=False keeps the row index out of the file; otherwise it comes back as an
# extra 'Unnamed: 0' column the next time the CSV is read.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_cleaned.csv'),
    index=False,
)